Vowpal Wabbit
bfgs.cc
1 /*
2 Copyright (c) by respective owners including Yahoo!, Microsoft, and
3 individual contributors. All rights reserved. Released under a BSD (revised)
4 license as described in the file LICENSE.
5  */
6 /*
7 The algorithm here is generally based on Nocedal 1980, Liu and Nocedal 1989.
8 Implementation by Miro Dudik.
9  */
10 #include <cmath>
11 #include <fstream>
12 #include <float.h>
13 #ifndef _WIN32
14 #include <netdb.h>
15 #endif
16 #include <string.h>
17 #include <stdio.h>
18 #include <assert.h>
19 #include <sys/timeb.h>
20 #include "accumulate.h"
21 #include "reductions.h"
22 #include "gd.h"
23 #include "vw_exception.h"
24 #include <exception>
25 
26 using namespace LEARNER;
27 using namespace VW::config;
28 
29 #define CG_EXTRA 1
30 
31 #define MEM_GT 0
32 #define MEM_XT 1
33 #define MEM_YT 0
34 #define MEM_ST 1
35 
36 #define W_XT 0
37 #define W_GT 1
38 #define W_DIR 2
39 #define W_COND 3
40 
41 #define LEARN_OK 0
42 #define LEARN_CURV 1
43 #define LEARN_CONV 2
44 
45 class curv_exception : public std::exception
46 {
47 } curv_ex;
48 
49 /********************************************************************/
50 /* mem & w definition ***********************************************/
51 /********************************************************************/
52 // mem[2*i] = y_t
53 // mem[2*i+1] = s_t
54 //
55 // w[0] = weight
56 // w[1] = accumulated first derivative
57 // w[2] = step direction
58 // w[3] = preconditioner
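//
// mem holds mem_stride floats per weight: with --mem m > 0 it is a ring buffer of
// the last m (y_t, s_t) pairs consumed by the L-BFGS two-loop recursion (origin is
// rotated by 2 each iteration so the newest pair replaces the oldest); with m == 0
// only the previous gradient is kept (CG_EXTRA) for conjugate gradient.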
59 
60 constexpr float max_precond_ratio = 10000.f;
61 
62 struct bfgs
63 {
64  vw* all; // prediction, regressor
65  int m;
66  float rel_threshold; // termination threshold
67 
68  double wolfe1_bound;
69 
70  size_t final_pass;
71  struct timeb t_start, t_end;
72  double net_comm_time;
73 
74  struct timeb t_start_global, t_end_global;
75  double net_time;
76 
 77  v_array<float> predictions;
 78  size_t example_number;
 79  size_t current_pass;
 80  size_t no_win_counter;
 81  size_t early_stop_thres;
 82 
 83  // default transition behavior
 84  bool first_hessian_on;
 85  bool backstep_on;
 86 
 87  // set by initializer
 88  int mem_stride;
 89  bool output_regularizer;
 90  float* mem;
 91  double* rho;
 92  double* alpha;
 93 
 94  weight* regularizers;
 95  // the below needs to be included when resetting, in addition to preconditioner and derivative
 96  int lastj, origin;
 97  double loss_sum, previous_loss_sum;
 98  float step_size;
 99  double importance_weight_sum;
 100  double curvature;
 101 
 102  // first pass specification
 103  bool first_pass;
 104  bool gradient_pass;
 105  bool preconditioner_pass;
 106 
 107  ~bfgs()
 108  {
109  predictions.delete_v();
110  free(mem);
111  free(rho);
112  free(alpha);
113  }
114 };
115 
116 constexpr const char* curv_message =
117  "Zero or negative curvature detected.\n"
118  "To increase curvature you can increase regularization or rescale features.\n"
119  "It is also possible that you have reached numerical accuracy\n"
120  "and further decrease in the objective cannot be reliably detected.\n";
121 
122 void zero_derivative(vw& all) { all.weights.set_zero(W_GT); }
123 
 124 void zero_preconditioner(vw& all) { all.weights.set_zero(W_COND); }
 125 
126 void reset_state(vw& all, bfgs& b, bool zero)
127 {
128  b.lastj = b.origin = 0;
129  b.loss_sum = b.previous_loss_sum = 0.;
130  b.importance_weight_sum = 0.;
131  b.curvature = 0.;
132  b.first_pass = true;
133  b.gradient_pass = true;
134  b.preconditioner_pass = true;
135  if (zero)
136  {
137  zero_derivative(all);
138  zero_preconditioner(all);
139  }
140 }
141 
142 // w[0] = weight
143 // w[1] = accumulated first derivative
144 // w[2] = step direction
145 // w[3] = preconditioner
146 
147 constexpr bool test_example(example& ec) noexcept { return ec.l.simple.label == FLT_MAX; }
148 
149 float bfgs_predict(vw& all, example& ec)
150 {
 151  ec.partial_prediction = GD::inline_predict(all, ec);
 152  return GD::finalize_prediction(all.sd, ec.partial_prediction);
 153 }
154 
155 inline void add_grad(float& d, float f, float& fw) { (&fw)[W_GT] += d * f; }
156 
 157 float predict_and_gradient(vw& all, example& ec)
 158 {
159  float fp = bfgs_predict(all, ec);
160  label_data& ld = ec.l.simple;
161  all.set_minmax(all.sd, ld.label);
162 
163  float loss_grad = all.loss->first_derivative(all.sd, fp, ld.label) * ec.weight;
164  GD::foreach_feature<float, add_grad>(all, ec, loss_grad);
165 
166  return fp;
167 }
168 
169 inline void add_precond(float& d, float f, float& fw) { (&fw)[W_COND] += d * f * f; }
170 
 171 void update_preconditioner(vw& all, example& ec)
 172 {
173  float curvature = all.loss->second_derivative(all.sd, ec.pred.scalar, ec.l.simple.label) * ec.weight;
174  GD::foreach_feature<float, add_precond>(all, ec, curvature);
175 }
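
 // The preconditioner slot w[3] accumulates the per-feature diagonal curvature
 // sum_i x_i^2 * loss''_i over examples; finalize_preconditioner later adds the
 // regularizer and inverts each entry, so w[3] ends up approximating the inverse
 // diagonal of the Hessian.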
176 
177 inline void add_DIR(float& p, const float fx, float& fw) { p += (&fw)[W_DIR] * fx; }
178 
179 float dot_with_direction(vw& all, example& ec)
180 {
181  float temp = ec.l.simple.initial;
182  GD::foreach_feature<float, add_DIR>(all, ec, temp);
183  return temp;
184 }
185 
186 template <class T>
187 double regularizer_direction_magnitude(vw& /* all */, bfgs& b, double regularizer, T& weights)
188 {
189  double ret = 0.;
190  if (b.regularizers == nullptr)
191  for (typename T::iterator iter = weights.begin(); iter != weights.end(); ++iter)
192  ret += regularizer * (&(*iter))[W_DIR] * (&(*iter))[W_DIR];
193 
194  else
195  {
196  for (typename T::iterator iter = weights.begin(); iter != weights.end(); ++iter)
197  ret += ((double)b.regularizers[2 * (iter.index() >> weights.stride_shift())]) * (&(*iter))[W_DIR] *
198  (&(*iter))[W_DIR];
199  }
200  return ret;
201 }
202 
203 double regularizer_direction_magnitude(vw& all, bfgs& b, float regularizer)
204 {
205  // compute direction magnitude
206  double ret = 0.;
207 
208  if (regularizer == 0.)
209  return ret;
210 
211  if (all.weights.sparse)
212  return regularizer_direction_magnitude(all, b, regularizer, all.weights.sparse_weights);
213  else
214  return regularizer_direction_magnitude(all, b, regularizer, all.weights.dense_weights);
215 }
216 
217 template <class T>
218 float direction_magnitude(vw& /* all */, T& weights)
219 {
220  // compute direction magnitude
221  double ret = 0.;
222  for (typename T::iterator iter = weights.begin(); iter != weights.end(); ++iter)
223  ret += ((double)(&(*iter))[W_DIR]) * (&(*iter))[W_DIR];
224 
225  return (float)ret;
226 }
227 
 228 double direction_magnitude(vw& all)
 229 {
230  // compute direction magnitude
231  if (all.weights.sparse)
232  return direction_magnitude(all, all.weights.sparse_weights);
233  else
234  return direction_magnitude(all, all.weights.dense_weights);
235 }
236 
237 template <class T>
238 void bfgs_iter_start(vw& all, bfgs& b, float* mem, int& lastj, double importance_weight_sum, int& origin, T& weights)
239 {
240  double g1_Hg1 = 0.;
241  double g1_g1 = 0.;
242 
243  origin = 0;
244  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
245  {
246  float* mem1 = mem + (w.index() >> weights.stride_shift()) * b.mem_stride;
247  if (b.m > 0)
248  mem1[(MEM_XT + origin) % b.mem_stride] = (&(*w))[W_XT];
249  mem1[(MEM_GT + origin) % b.mem_stride] = (&(*w))[W_GT];
250  g1_Hg1 += ((double)(&(*w))[W_GT]) * ((&(*w))[W_GT]) * ((&(*w))[W_COND]);
251  g1_g1 += ((double)((&(*w))[W_GT])) * ((&(*w))[W_GT]);
252  (&(*w))[W_DIR] = -(&(*w))[W_COND] * ((&(*w))[W_GT]);
253  ((&(*w))[W_GT]) = 0;
254  }
255  lastj = 0;
256  if (!all.quiet)
257  fprintf(stderr, "%-10.5f\t%-10.5f\t%-10s\t%-10s\t%-10s\t", g1_g1 / (importance_weight_sum * importance_weight_sum),
258  g1_Hg1 / importance_weight_sum, "", "", "");
259 }
260 
261 void bfgs_iter_start(vw& all, bfgs& b, float* mem, int& lastj, double importance_weight_sum, int& origin)
262 {
263  if (all.weights.sparse)
264  bfgs_iter_start(all, b, mem, lastj, importance_weight_sum, origin, all.weights.sparse_weights);
265  else
266  bfgs_iter_start(all, b, mem, lastj, importance_weight_sum, origin, all.weights.dense_weights);
267 }
268 
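// bfgs_iter_middle computes the next search direction. With m == 0 it performs
// preconditioned nonlinear conjugate gradient (essentially a preconditioned
// Polak-Ribiere beta, clipped at zero). Otherwise it runs the classic L-BFGS
// two-loop recursion of Nocedal: the forward loop subtracts alpha_j * y_j from
// the direction, the result is scaled by gamma = (y's)/(y'Hy) times the diagonal
// preconditioner, the backward loop adds (alpha_j - rho_j * y'r) * s_j, and the
// final negation yields a descent direction. The (y, s) ring buffer and rho are
// then shifted to make room for the next pair.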
269 template <class T>
270 void bfgs_iter_middle(vw& all, bfgs& b, float* mem, double* rho, double* alpha, int& lastj, int& origin, T& weights)
271 {
272  float* mem0 = mem;
273  uint32_t length = 1 << all.num_bits;
274  // implement conjugate gradient
275  if (b.m == 0)
276  {
277  double g_Hy = 0.;
278  double g_Hg = 0.;
279  double y = 0.;
280 
281  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
282  {
283  mem = mem0 + (w.index() >> weights.stride_shift()) * b.mem_stride;
284  y = (&(*w))[W_GT] - mem[(MEM_GT + origin) % b.mem_stride];
285  g_Hy += ((double)(&(*w))[W_GT]) * ((&(*w))[W_COND]) * y;
286  g_Hg +=
287  ((double)mem[(MEM_GT + origin) % b.mem_stride]) * ((&(*w))[W_COND]) * mem[(MEM_GT + origin) % b.mem_stride];
288  }
289 
290  float beta = (float)(g_Hy / g_Hg);
291 
292  if (beta < 0.f || std::isnan(beta))
293  beta = 0.f;
294 
295  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
296  {
297  mem = mem0 + (w.index() >> weights.stride_shift()) * b.mem_stride;
298  mem[(MEM_GT + origin) % b.mem_stride] = (&(*w))[W_GT];
299 
300  (&(*w))[W_DIR] *= beta;
301  (&(*w))[W_DIR] -= ((&(*w))[W_COND]) * ((&(*w))[W_GT]);
302  (&(*w))[W_GT] = 0;
303  }
304  if (!all.quiet)
305  fprintf(stderr, "%f\t", beta);
306  return;
307 
308  mem = mem0 + (length - 1) * b.mem_stride;
309  }
310  else
311  {
312  if (!all.quiet)
313  fprintf(stderr, "%-10s\t", "");
314  }
315 
316  // implement bfgs
317  double y_s = 0.;
318  double y_Hy = 0.;
319  double s_q = 0.;
320 
321  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
322  {
323  float* mem1 = mem + (w.index() >> weights.stride_shift()) * b.mem_stride;
324  mem1[(MEM_YT + origin) % b.mem_stride] = (&(*w))[W_GT] - mem1[(MEM_GT + origin) % b.mem_stride];
325  mem1[(MEM_ST + origin) % b.mem_stride] = (&(*w))[W_XT] - mem1[(MEM_XT + origin) % b.mem_stride];
326  (&(*w))[W_DIR] = (&(*w))[W_GT];
327  y_s += ((double)mem1[(MEM_YT + origin) % b.mem_stride]) * mem1[(MEM_ST + origin) % b.mem_stride];
328  y_Hy +=
329  ((double)mem1[(MEM_YT + origin) % b.mem_stride]) * mem1[(MEM_YT + origin) % b.mem_stride] * ((&(*w))[W_COND]);
330  s_q += ((double)mem1[(MEM_ST + origin) % b.mem_stride]) * ((&(*w))[W_GT]);
331  }
332 
333  if (y_s <= 0. || y_Hy <= 0.)
334  throw curv_ex;
335  rho[0] = 1 / y_s;
336 
337  float gamma = (float)(y_s / y_Hy);
338 
339  for (int j = 0; j < lastj; j++)
340  {
341  alpha[j] = rho[j] * s_q;
342  s_q = 0.;
343  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
344  {
345  mem = mem0 + (w.index() >> weights.stride_shift()) * b.mem_stride;
346  (&(*w))[W_DIR] -= (float)alpha[j] * mem[(2 * j + MEM_YT + origin) % b.mem_stride];
347  s_q += ((double)mem[(2 * j + 2 + MEM_ST + origin) % b.mem_stride]) * ((&(*w))[W_DIR]);
348  }
349  }
350 
351  alpha[lastj] = rho[lastj] * s_q;
352  double y_r = 0.;
353 
354  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
355  {
356  mem = mem0 + (w.index() >> weights.stride_shift()) * b.mem_stride;
357  (&(*w))[W_DIR] -= (float)alpha[lastj] * mem[(2 * lastj + MEM_YT + origin) % b.mem_stride];
358  (&(*w))[W_DIR] *= gamma * ((&(*w))[W_COND]);
359  y_r += ((double)mem[(2 * lastj + MEM_YT + origin) % b.mem_stride]) * ((&(*w))[W_DIR]);
360  }
361 
362  double coef_j;
363 
364  for (int j = lastj; j > 0; j--)
365  {
366  coef_j = alpha[j] - rho[j] * y_r;
367  y_r = 0.;
368  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
369  {
370  mem = mem0 + (w.index() >> weights.stride_shift()) * b.mem_stride;
371  (&(*w))[W_DIR] += (float)coef_j * mem[(2 * j + MEM_ST + origin) % b.mem_stride];
372  y_r += ((double)mem[(2 * j - 2 + MEM_YT + origin) % b.mem_stride]) * ((&(*w))[W_DIR]);
373  }
374  }
375 
376  coef_j = alpha[0] - rho[0] * y_r;
377  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
378  {
379  mem = mem0 + (w.index() >> weights.stride_shift()) * b.mem_stride;
380  (&(*w))[W_DIR] = -(&(*w))[W_DIR] - (float)coef_j * mem[(MEM_ST + origin) % b.mem_stride];
381  }
382 
383  /*********************
384  ** shift
385  ********************/
386 
387  lastj = (lastj < b.m - 1) ? lastj + 1 : b.m - 1;
388  origin = (origin + b.mem_stride - 2) % b.mem_stride;
389 
390  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
391  {
392  mem = mem0 + (w.index() >> weights.stride_shift()) * b.mem_stride;
393  mem[(MEM_GT + origin) % b.mem_stride] = (&(*w))[W_GT];
394  mem[(MEM_XT + origin) % b.mem_stride] = (&(*w))[W_XT];
395  (&(*w))[W_GT] = 0;
396  }
397  for (int j = lastj; j > 0; j--) rho[j] = rho[j - 1];
398 }
399 
400 void bfgs_iter_middle(vw& all, bfgs& b, float* mem, double* rho, double* alpha, int& lastj, int& origin)
401 {
402  if (all.weights.sparse)
403  bfgs_iter_middle(all, b, mem, rho, alpha, lastj, origin, all.weights.sparse_weights);
404  else
405  bfgs_iter_middle(all, b, mem, rho, alpha, lastj, origin, all.weights.dense_weights);
406 }
407 
408 template <class T>
409 double wolfe_eval(vw& all, bfgs& b, float* mem, double loss_sum, double previous_loss_sum, double step_size,
410  double importance_weight_sum, int& origin, double& wolfe1, T& weights)
411 {
412  double g0_d = 0.;
413  double g1_d = 0.;
414  double g1_Hg1 = 0.;
415  double g1_g1 = 0.;
416 
417  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
418  {
419  float* mem1 = mem + (w.index() >> weights.stride_shift()) * b.mem_stride;
420  g0_d += ((double)mem1[(MEM_GT + origin) % b.mem_stride]) * ((&(*w))[W_DIR]);
421  g1_d += ((double)(&(*w))[W_GT]) * (&(*w))[W_DIR];
422  g1_Hg1 += ((double)(&(*w))[W_GT]) * (&(*w))[W_GT] * ((&(*w))[W_COND]);
423  g1_g1 += ((double)(&(*w))[W_GT]) * (&(*w))[W_GT];
424  }
425 
426  wolfe1 = (loss_sum - previous_loss_sum) / (step_size * g0_d);
427  double wolfe2 = g1_d / g0_d;
428  // double new_step_cross = (loss_sum-previous_loss_sum-g1_d*step)/(g0_d-g1_d);
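  // wolfe1 is the ratio of the actual decrease in loss to the first-order predicted
  // decrease step_size * g0'd (the sufficient-decrease ratio checked against
  // wolfe1_bound in process_pass); wolfe2 = g1'd / g0'd is the curvature ratio.
  // The function only reports them and returns half the current step as the
  // backtracking proposal used when the line search fails.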
429 
430  if (!all.quiet)
431  fprintf(stderr, "%-10.5f\t%-10.5f\t%s%-10f\t%-10f\t", g1_g1 / (importance_weight_sum * importance_weight_sum),
432  g1_Hg1 / importance_weight_sum, " ", wolfe1, wolfe2);
433  return 0.5 * step_size;
434 }
435 
436 double wolfe_eval(vw& all, bfgs& b, float* mem, double loss_sum, double previous_loss_sum, double step_size,
437  double importance_weight_sum, int& origin, double& wolfe1)
438 {
439  if (all.weights.sparse)
440  return wolfe_eval(all, b, mem, loss_sum, previous_loss_sum, step_size, importance_weight_sum, origin, wolfe1,
441  all.weights.sparse_weights);
442  else
443  return wolfe_eval(all, b, mem, loss_sum, previous_loss_sum, step_size, importance_weight_sum, origin, wolfe1,
444  all.weights.dense_weights);
445 }
446 
447 template <class T>
448 double add_regularization(vw& all, bfgs& b, float regularization, T& weights)
449 {
450  // compute the derivative difference
451  double ret = 0.;
452 
453  if (b.regularizers == nullptr)
454  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
455  {
456  (&(*w))[W_GT] += regularization * (*w);
457  ret += 0.5 * regularization * (*w) * (*w);
458  }
459  else
460  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
461  {
462  uint64_t i = w.index() >> weights.stride_shift();
463  weight delta_weight = *w - b.regularizers[2 * i + 1];
464  (&(*w))[W_GT] += b.regularizers[2 * i] * delta_weight;
465  ret += 0.5 * b.regularizers[2 * i] * delta_weight * delta_weight;
466  }
467 
468  // if we're not regularizing the intercept term, then subtract it off from the result above
469  // when accessing weights[constant], always use weights.strided_index(constant)
470  if (all.no_bias)
471  {
472  if (b.regularizers == nullptr)
473  {
474  (&weights.strided_index(constant))[W_GT] -= regularization * (weights.strided_index(constant));
475  ret -= 0.5 * regularization * (weights.strided_index(constant)) * (weights.strided_index(constant));
476  }
477  else
478  {
479  uint64_t i = constant >> weights.stride_shift();
480  weight delta_weight = (weights.strided_index(constant)) - b.regularizers[2 * i + 1];
481  (&weights.strided_index(constant))[W_GT] -= b.regularizers[2 * i] * delta_weight;
482  ret -= 0.5 * b.regularizers[2 * i] * delta_weight * delta_weight;
483  }
484  }
485 
486  return ret;
487 }
488 
489 double add_regularization(vw& all, bfgs& b, float regularization)
490 {
491  if (all.weights.sparse)
492  return add_regularization(all, b, regularization, all.weights.sparse_weights);
493  else
494  return add_regularization(all, b, regularization, all.weights.dense_weights);
495 }
496 
497 template <class T>
498 void finalize_preconditioner(vw& /* all */, bfgs& b, float regularization, T& weights)
499 {
500  float max_hessian = 0.f;
501 
502  if (b.regularizers == nullptr)
503  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
504  {
505  (&(*w))[W_COND] += regularization;
506  if ((&(*w))[W_COND] > max_hessian)
507  max_hessian = (&(*w))[W_COND];
508  if ((&(*w))[W_COND] > 0)
509  (&(*w))[W_COND] = 1.f / (&(*w))[W_COND];
510  }
511  else
512  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
513  {
514  (&(*w))[W_COND] += b.regularizers[2 * (w.index() >> weights.stride_shift())];
515  if ((&(*w))[W_COND] > max_hessian)
516  max_hessian = (&(*w))[W_COND];
517  if ((&(*w))[W_COND] > 0)
518  (&(*w))[W_COND] = 1.f / (&(*w))[W_COND];
519  }
520 
521  float max_precond = (max_hessian == 0.f) ? 0.f : max_precond_ratio / max_hessian;
522 
523  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
524  {
525  if (std::isinf(*w) || *w > max_precond)
526  (&(*w))[W_COND] = max_precond;
527  }
528 }
529 void finalize_preconditioner(vw& all, bfgs& b, float regularization)
530 {
531  if (all.weights.sparse)
532  finalize_preconditioner(all, b, regularization, all.weights.sparse_weights);
533  else
534  finalize_preconditioner(all, b, regularization, all.weights.dense_weights);
535 }
536 
537 template <class T>
538 void preconditioner_to_regularizer(vw& all, bfgs& b, float regularization, T& weights)
539 {
540  uint32_t length = 1 << all.num_bits;
541 
542  if (b.regularizers == nullptr)
543  {
544  b.regularizers = calloc_or_throw<weight>(2 * length);
545 
546  if (b.regularizers == nullptr)
547  THROW("Failed to allocate weight array: try decreasing -b <bits>");
548 
549  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
550  {
551  uint64_t i = w.index() >> weights.stride_shift();
552  b.regularizers[2 * i] = regularization;
553  if ((&(*w))[W_COND] > 0.f)
554  b.regularizers[2 * i] += 1.f / (&(*w))[W_COND];
555  }
556  }
557  else
558  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
559  {
560  if ((&(*w))[W_COND] > 0.f)
561  b.regularizers[2 * (w.index() >> weights.stride_shift())] += 1.f / (&(*w))[W_COND];
562  }
563 
564  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
565  b.regularizers[2 * (w.index() >> weights.stride_shift()) + 1] = *w;
566 }
567 void preconditioner_to_regularizer(vw& all, bfgs& b, float regularization)
568 {
569  if (all.weights.sparse)
570  preconditioner_to_regularizer(all, b, regularization, all.weights.sparse_weights);
571  else
572  preconditioner_to_regularizer(all, b, regularization, all.weights.dense_weights);
573 }
574 
575 template <class T>
576 void regularizer_to_weight(vw& /* all */, bfgs& b, T& weights)
577 {
578  if (b.regularizers != nullptr)
579  {
580  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
581  {
582  uint64_t i = w.index() >> weights.stride_shift();
583  (&(*w))[W_COND] = b.regularizers[2 * i];
584  *w = b.regularizers[2 * i + 1];
585  }
586  }
587 }
588 
 589 void regularizer_to_weight(vw& all, bfgs& b)
 590 {
 591  if (all.weights.sparse)
 592  regularizer_to_weight(all, b, all.weights.sparse_weights);
 593  else
 594  regularizer_to_weight(all, b, all.weights.dense_weights);
 595 }
596 
597 void zero_state(vw& all)
598 {
599  all.weights.set_zero(W_GT);
600  all.weights.set_zero(W_DIR);
601  all.weights.set_zero(W_COND);
602 }
603 
604 template <class T>
605 double derivative_in_direction(vw& /* all */, bfgs& b, float* mem, int& origin, T& weights)
606 {
607  double ret = 0.;
608  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
609  {
610  float* mem1 = mem + (w.index() >> weights.stride_shift()) * b.mem_stride;
611  ret += ((double)mem1[(MEM_GT + origin) % b.mem_stride]) * (&(*w))[W_DIR];
612  }
613  return ret;
614 }
615 
616 double derivative_in_direction(vw& all, bfgs& b, float* mem, int& origin)
617 {
618  if (all.weights.sparse)
619  return derivative_in_direction(all, b, mem, origin, all.weights.sparse_weights);
620  else
621  return derivative_in_direction(all, b, mem, origin, all.weights.dense_weights);
622 }
623 
624 template <class T>
625 void update_weight(vw& /* all */, float step_size, T& w)
626 {
627  for (typename T::iterator iter = w.begin(); iter != w.end(); ++iter)
628  (&(*iter))[W_XT] += step_size * (&(*iter))[W_DIR];
629 }
630 
631 void update_weight(vw& all, float step_size)
632 {
633  if (all.weights.sparse)
634  update_weight(all, step_size, all.weights.sparse_weights);
635  else
636  update_weight(all, step_size, all.weights.dense_weights);
637 }
638 
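// process_pass runs once per pass over the data. The first pass accumulates the
// gradient and the preconditioner and either starts a curvature pass or takes a
// fixed initial step; afterwards passes alternate between gradient passes
// (evaluate the line search via wolfe_eval, then build a new direction with
// bfgs_iter_middle) and, when hessian_on is set, curvature passes that measure
// d'Hd and produce a Newton-style step along the direction. The return value
// (LEARN_OK / LEARN_CURV / LEARN_CONV) tells end_pass whether to keep optimizing.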
639 int process_pass(vw& all, bfgs& b)
640 {
641  int status = LEARN_OK;
642 
643  finalize_preconditioner(all, b, all.l2_lambda);
644  /********************************************************************/
645  /* A) FIRST PASS FINISHED: INITIALIZE FIRST LINE SEARCH *************/
646  /********************************************************************/
647  if (b.first_pass)
648  {
649  if (all.all_reduce != nullptr)
650  {
651  accumulate(all, all.weights, W_COND); // Accumulate preconditioner
652  float temp = (float)b.importance_weight_sum;
 653  b.importance_weight_sum = accumulate_scalar(all, temp);
 654  }
655  // finalize_preconditioner(all, b, all.l2_lambda);
656  if (all.all_reduce != nullptr)
657  {
658  float temp = (float)b.loss_sum;
659  b.loss_sum = accumulate_scalar(all, temp); // Accumulate loss_sums
660  accumulate(all, all.weights, 1); // Accumulate gradients from all nodes
661  }
662  if (all.l2_lambda > 0.)
663  b.loss_sum += add_regularization(all, b, all.l2_lambda);
664  if (!all.quiet)
665  fprintf(stderr, "%2lu %-10.5f\t", (long unsigned int)b.current_pass + 1, b.loss_sum / b.importance_weight_sum);
666 
 667  b.previous_loss_sum = b.loss_sum;
 668  b.loss_sum = 0.;
669  b.example_number = 0;
670  b.curvature = 0;
 671  bfgs_iter_start(all, b, b.mem, b.lastj, b.importance_weight_sum, b.origin);
 672  if (b.first_hessian_on)
673  {
674  b.gradient_pass = false; // now start computing curvature
675  }
676  else
677  {
678  b.step_size = 0.5;
679  float d_mag = direction_magnitude(all);
680  ftime(&b.t_end_global);
681  b.net_time = (int)(1000.0 * (b.t_end_global.time - b.t_start_global.time) +
682  (b.t_end_global.millitm - b.t_start_global.millitm));
683  if (!all.quiet)
684  fprintf(stderr, "%-10s\t%-10.5f\t%-.5f\n", "", d_mag, b.step_size);
685  b.predictions.clear();
686  update_weight(all, b.step_size);
687  }
688  }
689  else
690  /********************************************************************/
691  /* B) GRADIENT CALCULATED *******************************************/
692  /********************************************************************/
693  if (b.gradient_pass) // We just finished computing all gradients
694  {
695  if (all.all_reduce != nullptr)
696  {
697  float t = (float)b.loss_sum;
698  b.loss_sum = accumulate_scalar(all, t); // Accumulate loss_sums
699  accumulate(all, all.weights, 1); // Accumulate gradients from all nodes
700  }
701  if (all.l2_lambda > 0.)
702  b.loss_sum += add_regularization(all, b, all.l2_lambda);
703  if (!all.quiet)
704  {
705  if (!all.holdout_set_off && b.current_pass >= 1)
706  {
 707  if (all.sd->holdout_sum_loss_since_last_pass == 0. && all.sd->weighted_holdout_examples_since_last_pass == 0.)
 708  {
709  fprintf(stderr, "%2lu ", (long unsigned int)b.current_pass + 1);
710  fprintf(stderr, "h unknown ");
711  }
712  else
713  fprintf(stderr, "%2lu h%-10.5f\t", (long unsigned int)b.current_pass + 1,
 714  all.sd->holdout_sum_loss_since_last_pass / all.sd->weighted_holdout_examples_since_last_pass);
 715  }
716  else
717  fprintf(stderr, "%2lu %-10.5f\t", (long unsigned int)b.current_pass + 1, b.loss_sum / b.importance_weight_sum);
718  }
719  double wolfe1;
720  double new_step = wolfe_eval(
721  all, b, b.mem, b.loss_sum, b.previous_loss_sum, b.step_size, b.importance_weight_sum, b.origin, wolfe1);
722 
723  /********************************************************************/
724  /* B0) DERIVATIVE ZERO: MINIMUM FOUND *******************************/
725  /********************************************************************/
726  if (std::isnan((float)wolfe1))
727  {
728  fprintf(stderr, "\n");
729  fprintf(stdout, "Derivative 0 detected.\n");
730  b.step_size = 0.0;
731  status = LEARN_CONV;
732  }
733  /********************************************************************/
734  /* B1) LINE SEARCH FAILED *******************************************/
735  /********************************************************************/
736  else if (b.backstep_on && (wolfe1 < b.wolfe1_bound || b.loss_sum > b.previous_loss_sum))
737  {
738  // curvature violated, or we stepped too far last time: step back
739  ftime(&b.t_end_global);
740  b.net_time = (int)(1000.0 * (b.t_end_global.time - b.t_start_global.time) +
741  (b.t_end_global.millitm - b.t_start_global.millitm));
742  float ratio = (b.step_size == 0.f) ? 0.f : (float)new_step / (float)b.step_size;
743  if (!all.quiet)
744  fprintf(stderr, "%-10s\t%-10s\t(revise x %.1f)\t%-.5f\n", "", "", ratio, new_step);
745  b.predictions.clear();
746  update_weight(all, (float)(-b.step_size + new_step));
747  b.step_size = (float)new_step;
748  zero_derivative(all);
749  b.loss_sum = 0.;
750  }
751 
752  /********************************************************************/
753  /* B2) LINE SEARCH SUCCESSFUL OR DISABLED ******************/
754  /* DETERMINE NEXT SEARCH DIRECTION ******************/
755  /********************************************************************/
756  else
757  {
758  double rel_decrease = (b.previous_loss_sum - b.loss_sum) / b.previous_loss_sum;
759  if (!std::isnan((float)rel_decrease) && b.backstep_on && fabs(rel_decrease) < b.rel_threshold)
760  {
761  fprintf(stdout,
762  "\nTermination condition reached in pass %ld: decrease in loss less than %.3f%%.\n"
763  "If you want to optimize further, decrease termination threshold.\n",
764  (long int)b.current_pass + 1, b.rel_threshold * 100.0);
765  status = LEARN_CONV;
766  }
 767  b.previous_loss_sum = b.loss_sum;
 768  b.loss_sum = 0.;
769  b.example_number = 0;
770  b.curvature = 0;
771  b.step_size = 1.0;
772 
773  try
774  {
775  bfgs_iter_middle(all, b, b.mem, b.rho, b.alpha, b.lastj, b.origin);
776  }
777  catch (const curv_exception&)
778  {
779  fprintf(stdout, "In bfgs_iter_middle: %s", curv_message);
780  b.step_size = 0.0;
781  status = LEARN_CURV;
782  }
783 
784  if (all.hessian_on)
785  {
786  b.gradient_pass = false; // now start computing curvature
787  }
788  else
789  {
790  float d_mag = direction_magnitude(all);
791  ftime(&b.t_end_global);
792  b.net_time = (int)(1000.0 * (b.t_end_global.time - b.t_start_global.time) +
793  (b.t_end_global.millitm - b.t_start_global.millitm));
794  if (!all.quiet)
795  fprintf(stderr, "%-10s\t%-10.5f\t%-.5f\n", "", d_mag, b.step_size);
796  b.predictions.clear();
797  update_weight(all, b.step_size);
798  }
799  }
800  }
801 
802  /********************************************************************/
803  /* C) NOT FIRST PASS, CURVATURE CALCULATED **************************/
804  /********************************************************************/
805  else // just finished all second gradients
806  {
807  if (all.all_reduce != nullptr)
808  {
809  float t = (float)b.curvature;
810  b.curvature = accumulate_scalar(all, t); // Accumulate curvatures
811  }
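  // b.curvature now holds d'Hd accumulated from per-example second derivatives
  // (plus the l2 term added just below), and dd = g'd from the stored gradient,
  // so step_size = -dd / curvature is the minimizer of the quadratic model of the
  // loss along the search direction.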
812  if (all.l2_lambda > 0.)
 813  b.curvature += regularizer_direction_magnitude(all, b, all.l2_lambda);
 814  float dd = (float)derivative_in_direction(all, b, b.mem, b.origin);
815  if (b.curvature == 0. && dd != 0.)
816  {
817  fprintf(stdout, "%s", curv_message);
818  b.step_size = 0.0;
819  status = LEARN_CURV;
820  }
821  else if (dd == 0.)
822  {
823  fprintf(stdout, "Derivative 0 detected.\n");
824  b.step_size = 0.0;
825  status = LEARN_CONV;
826  }
827  else
828  b.step_size = -dd / (float)b.curvature;
829 
830  float d_mag = direction_magnitude(all);
831 
832  b.predictions.clear();
833  update_weight(all, b.step_size);
834  ftime(&b.t_end_global);
835  b.net_time = (int)(1000.0 * (b.t_end_global.time - b.t_start_global.time) +
836  (b.t_end_global.millitm - b.t_start_global.millitm));
837 
838  if (!all.quiet)
839  fprintf(stderr, "%-10.5f\t%-10.5f\t%-.5f\n", b.curvature / b.importance_weight_sum, d_mag, b.step_size);
840  b.gradient_pass = true;
841  } // now start computing derivatives.
842  b.current_pass++;
843  b.first_pass = false;
844  b.preconditioner_pass = false;
845 
846  if (b.output_regularizer) // need to accumulate and place the regularizer.
847  {
848  if (all.all_reduce != nullptr)
849  accumulate(all, all.weights, W_COND); // Accumulate preconditioner
850  // preconditioner_to_regularizer(all, b, all.l2_lambda);
851  }
852  ftime(&b.t_end_global);
853  b.net_time = (int)(1000.0 * (b.t_end_global.time - b.t_start_global.time) +
854  (b.t_end_global.millitm - b.t_start_global.millitm));
855 
856  if (all.save_per_pass)
 857  save_predictor(all, all.final_regressor_name, b.current_pass);
 858  return status;
859 }
860 
861 void process_example(vw& all, bfgs& b, example& ec)
862 {
863  label_data& ld = ec.l.simple;
864  if (b.first_pass)
 865  b.importance_weight_sum += ec.weight;
 866 
867  /********************************************************************/
868  /* I) GRADIENT CALCULATION ******************************************/
869  /********************************************************************/
870  if (b.gradient_pass)
871  {
872  ec.pred.scalar = predict_and_gradient(all, ec); // w[0] & w[1]
873  ec.loss = all.loss->getLoss(all.sd, ec.pred.scalar, ld.label) * ec.weight;
874  b.loss_sum += ec.loss;
 875  b.predictions.push_back(ec.pred.scalar);
 876  }
877  /********************************************************************/
878  /* II) CURVATURE CALCULATION ****************************************/
879  /********************************************************************/
880  else // computing curvature
881  {
882  float d_dot_x = dot_with_direction(all, ec); // w[2]
883  if (b.example_number >= b.predictions.size()) // Make things safe in case example source is strange.
884  b.example_number = b.predictions.size() - 1;
 885  ec.pred.scalar = b.predictions[b.example_number];
 886  ec.partial_prediction = b.predictions[b.example_number];
 887  ec.loss = all.loss->getLoss(all.sd, ec.pred.scalar, ld.label) * ec.weight;
888  float sd = all.loss->second_derivative(all.sd, b.predictions[b.example_number++], ld.label);
889  b.curvature += ((double)d_dot_x) * d_dot_x * sd * ec.weight;
890  }
 891  ec.updated_prediction = ec.pred.scalar;
 892 
893  if (b.preconditioner_pass)
894  update_preconditioner(all, ec); // w[3]
895 }
896 
897 void end_pass(bfgs& b)
898 {
899  vw* all = b.all;
900 
901  if (b.current_pass <= b.final_pass)
902  {
903  if (b.current_pass < b.final_pass)
904  {
905  int status = process_pass(*all, b);
906 
907  // reaching the max number of passes regardless of convergence
908  if (b.final_pass == b.current_pass)
909  {
910  b.all->trace_message << "Maximum number of passes reached. ";
911  if (!b.output_regularizer)
912  b.all->trace_message << "If you want to optimize further, increase the number of passes\n";
913  if (b.output_regularizer)
914  {
915  b.all->trace_message << "\nRegular model file has been created. ";
916  b.all->trace_message << "Output feature regularizer file is created only when the convergence is reached. "
917  "Try increasing the number of passes for convergence\n";
918  b.output_regularizer = false;
919  }
920  }
921 
922  // attain convergence before reaching max iterations
923  if (status != LEARN_OK && b.final_pass > b.current_pass)
924  {
925  b.final_pass = b.current_pass;
926  }
927  else
928  {
929  // Not converged yet.
930  // Reset preconditioner to zero so that it is correctly recomputed in the next pass
931  zero_preconditioner(*all);
932  }
933  if (!all->holdout_set_off)
934  {
 935  if (summarize_holdout_set(*all, b.no_win_counter))
 936  finalize_regressor(*all, all->final_regressor_name);
 937  if (b.early_stop_thres == b.no_win_counter)
938  {
939  set_done(*all);
940  b.all->trace_message << "Early termination reached w.r.t. holdout set error";
941  }
942  }
943  if (b.final_pass == b.current_pass)
944  {
 945  finalize_regressor(*all, all->final_regressor_name);
 946  set_done(*all);
947  }
948  }
949  else // reaching convergence in the previous pass
950  b.current_pass++;
951  }
952 }
953 
954 // placeholder
955 template <bool audit>
 956 void predict(bfgs& b, base_learner&, example& ec)
 957 {
958  vw* all = b.all;
959  ec.pred.scalar = bfgs_predict(*all, ec);
960  if (audit)
961  GD::print_audit_features(*(b.all), ec);
962 }
963 
964 template <bool audit>
965 void learn(bfgs& b, base_learner& base, example& ec)
966 {
967  vw* all = b.all;
968  assert(ec.in_use);
969 
970  if (b.current_pass <= b.final_pass)
971  {
972  if (test_example(ec))
973  predict<audit>(b, base, ec);
974  else
975  process_example(*all, b, ec);
976  }
977 }
978 
979 void save_load_regularizer(vw& all, bfgs& b, io_buf& model_file, bool read, bool text)
980 {
981  int c = 0;
982  uint32_t length = 2 * (1 << all.num_bits);
983  uint32_t i = 0;
984  size_t brw = 1;
985 
986  if (b.output_regularizer && !read)
 987  preconditioner_to_regularizer(*(b.all), b, b.all->l2_lambda);
 988 
989  do
990  {
991  brw = 1;
992  weight* v;
993  if (read)
994  {
995  c++;
996  brw = model_file.bin_read_fixed((char*)&i, sizeof(i), "");
997  if (brw > 0)
998  {
999  assert(i < length);
1000  v = &(b.regularizers[i]);
1001  brw += model_file.bin_read_fixed((char*)v, sizeof(*v), "");
1002  }
1003  }
1004  else // write binary or text
1005  {
1006  v = &(b.regularizers[i]);
1007  if (*v != 0.)
1008  {
1009  c++;
1010  std::stringstream msg;
1011  msg << i;
1012  brw = bin_text_write_fixed(model_file, (char*)&i, sizeof(i), msg, text);
1013 
1014  msg << ":" << *v << "\n";
1015  brw += bin_text_write_fixed(model_file, (char*)v, sizeof(*v), msg, text);
1016  }
1017  }
1018  if (!read)
1019  i++;
1020  } while ((!read && i < length) || (read && brw > 0));
1021 
1022  if (read)
1023  regularizer_to_weight(all, b);
1024 }
1025 
1026 void save_load(bfgs& b, io_buf& model_file, bool read, bool text)
1027 {
1028  vw* all = b.all;
1029 
1030  uint32_t length = 1 << all->num_bits;
1031 
1032  if (read)
1033  {
1034  initialize_regressor(*all);
1035  if (all->per_feature_regularizer_input != "")
1036  {
1037  b.regularizers = calloc_or_throw<weight>(2 * length);
1038  if (b.regularizers == nullptr)
1039  THROW("Failed to allocate regularizers array: try decreasing -b <bits>");
1040  }
1041  int m = b.m;
1042 
1043  b.mem_stride = (m == 0) ? CG_EXTRA : 2 * m;
1044  b.mem = calloc_or_throw<float>(all->length() * b.mem_stride);
1045  b.rho = calloc_or_throw<double>(m);
1046  b.alpha = calloc_or_throw<double>(m);
1047 
1048  uint32_t stride_shift = all->weights.stride_shift();
1049 
1050  if (!all->quiet)
1051  std::cerr << "m = " << m << std::endl
1052  << "Allocated "
1053  << ((long unsigned int)all->length() *
1054  (sizeof(float) * (b.mem_stride) + (sizeof(weight) << stride_shift)) >>
1055  20)
1056  << "M for weights and mem" << std::endl;
1057 
1058  b.net_time = 0.0;
1059  ftime(&b.t_start_global);
1060 
1061  if (!all->quiet)
1062  {
1063  const char* header_fmt = "%2s %-10s\t%-10s\t%-10s\t %-10s\t%-10s\t%-10s\t%-10s\t%-10s\t%-s\n";
1064  fprintf(stderr, header_fmt, "##", "avg. loss", "der. mag.", "d. m. cond.", "wolfe1", "wolfe2", "mix fraction",
1065  "curvature", "dir. magnitude", "step size");
1066  std::cerr.precision(5);
1067  }
1068 
1069  if (b.regularizers != nullptr)
1070  all->l2_lambda = 1; // To make sure we are adding the regularization
 1071  b.output_regularizer = (all->per_feature_regularizer_output != "" || all->per_feature_regularizer_text != "");
 1072  reset_state(*all, b, false);
1073  }
1074 
1075  // bool reg_vector = b.output_regularizer || all->per_feature_regularizer_input.length() > 0;
1076  bool reg_vector = (b.output_regularizer && !read) || (all->per_feature_regularizer_input.length() > 0 && read);
1077 
1078  if (model_file.files.size() > 0)
1079  {
1080  std::stringstream msg;
1081  msg << ":" << reg_vector << "\n";
1082  bin_text_read_write_fixed(model_file, (char*)&reg_vector, sizeof(reg_vector), "", read, msg, text);
1083 
1084  if (reg_vector)
1085  save_load_regularizer(*all, b, model_file, read, text);
1086  else
1087  GD::save_load_regressor(*all, model_file, read, text);
1088  }
1089 }
1090 
1091 void init_driver(bfgs& b) { b.backstep_on = true; }
1092 
 1093 base_learner* bfgs_setup(options_i& options, vw& all)
 1094 {
1095  auto b = scoped_calloc_or_throw<bfgs>();
1096  bool conjugate_gradient = false;
1097  bool bfgs_option = false;
1098  option_group_definition bfgs_outer_options("LBFGS and Conjugate Gradient options");
1099  bfgs_outer_options.add(
1100  make_option("conjugate_gradient", conjugate_gradient).keep().help("use conjugate gradient based optimization"));
1101 
1102  option_group_definition bfgs_inner_options("LBFGS and Conjugate Gradient options");
1103  bfgs_inner_options.add(make_option("bfgs", bfgs_option).keep().help("use conjugate gradient based optimization"));
1104  bfgs_inner_options.add(make_option("hessian_on", all.hessian_on).help("use second derivative in line search"));
1105  bfgs_inner_options.add(make_option("mem", b->m).default_value(15).help("memory in bfgs"));
1106  bfgs_inner_options.add(
1107  make_option("termination", b->rel_threshold).default_value(0.001f).help("Termination threshold"));
1108 
1109  options.add_and_parse(bfgs_outer_options);
1110  if (!conjugate_gradient)
1111  {
1112  options.add_and_parse(bfgs_inner_options);
1113  if (!bfgs_option)
1114  {
1115  return nullptr;
1116  }
1117  }
1118 
1119  b->all = &all;
1120  b->wolfe1_bound = 0.01;
1121  b->first_hessian_on = true;
1122  b->first_pass = true;
1123  b->gradient_pass = true;
1124  b->preconditioner_pass = true;
1125  b->backstep_on = false;
1126  b->final_pass = all.numpasses;
1127  b->no_win_counter = 0;
1128 
1129  if (!all.holdout_set_off)
1130  {
1131  all.sd->holdout_best_loss = FLT_MAX;
1132  b->early_stop_thres = options.get_typed_option<size_t>("early_terminate").value();
1133  }
1134 
1135  if (b->m == 0)
1136  all.hessian_on = true;
1137 
1138  if (!all.quiet)
1139  {
1140  if (b->m > 0)
1141  b->all->trace_message << "enabling BFGS based optimization ";
1142  else
1143  b->all->trace_message << "enabling conjugate gradient optimization via BFGS ";
1144  if (all.hessian_on)
1145  b->all->trace_message << "with curvature calculation" << std::endl;
1146  else
1147  b->all->trace_message << "**without** curvature calculation" << std::endl;
1148  }
1149 
1150  if (all.numpasses < 2 && all.training)
1151  THROW("you must make at least 2 passes to use BFGS");
1152 
1153  all.bfgs = true;
1154  all.weights.stride_shift(2);
1155 
1156  void (*learn_ptr)(bfgs&, base_learner&, example&) = nullptr;
1157  if (all.audit)
1158  learn_ptr = learn<true>;
1159  else
1160  learn_ptr = learn<false>;
1161 
 1162  learner<bfgs, example>* l;
 1163  if (all.audit || all.hash_inv)
1164  l = &init_learner(b, learn_ptr, predict<true>, all.weights.stride());
1165  else
1166  l = &init_learner(b, learn_ptr, predict<false>, all.weights.stride());
1167 
1168  l->set_save_load(save_load);
1169  l->set_init_driver(init_driver);
1170  l->set_end_pass(end_pass);
1171 
1172  return make_base(*l);
1173 }