Vowpal Wabbit
Classes | Macros | Functions | Variables
bfgs.cc File Reference
#include <cmath>
#include <fstream>
#include <float.h>
#include <netdb.h>
#include <string.h>
#include <stdio.h>
#include <assert.h>
#include <sys/timeb.h>
#include "accumulate.h"
#include "reductions.h"
#include "gd.h"
#include "vw_exception.h"
#include <exception>

Go to the source code of this file.

Classes

class  curv_exception
 
struct  bfgs
 

Macros

#define CG_EXTRA   1
 
#define MEM_GT   0
 
#define MEM_XT   1
 
#define MEM_YT   0
 
#define MEM_ST   1
 
#define W_XT   0
 
#define W_GT   1
 
#define W_DIR   2
 
#define W_COND   3
 
#define LEARN_OK   0
 
#define LEARN_CURV   1
 
#define LEARN_CONV   2
 

Functions

void zero_derivative (vw &all)
 
void zero_preconditioner (vw &all)
 
void reset_state (vw &all, bfgs &b, bool zero)
 
constexpr bool test_example (example &ec) noexcept
 
float bfgs_predict (vw &all, example &ec)
 
void add_grad (float &d, float f, float &fw)
 
float predict_and_gradient (vw &all, example &ec)
 
void add_precond (float &d, float f, float &fw)
 
void update_preconditioner (vw &all, example &ec)
 
void add_DIR (float &p, const float fx, float &fw)
 
float dot_with_direction (vw &all, example &ec)
 
template<class T >
double regularizer_direction_magnitude (vw &, bfgs &b, double regularizer, T &weights)
 
double regularizer_direction_magnitude (vw &all, bfgs &b, float regularizer)
 
template<class T >
float direction_magnitude (vw &, T &weights)
 
float direction_magnitude (vw &all)
 
template<class T >
void bfgs_iter_start (vw &all, bfgs &b, float *mem, int &lastj, double importance_weight_sum, int &origin, T &weights)
 
void bfgs_iter_start (vw &all, bfgs &b, float *mem, int &lastj, double importance_weight_sum, int &origin)
 
template<class T >
void bfgs_iter_middle (vw &all, bfgs &b, float *mem, double *rho, double *alpha, int &lastj, int &origin, T &weights)
 
void bfgs_iter_middle (vw &all, bfgs &b, float *mem, double *rho, double *alpha, int &lastj, int &origin)
 
template<class T >
double wolfe_eval (vw &all, bfgs &b, float *mem, double loss_sum, double previous_loss_sum, double step_size, double importance_weight_sum, int &origin, double &wolfe1, T &weights)
 
double wolfe_eval (vw &all, bfgs &b, float *mem, double loss_sum, double previous_loss_sum, double step_size, double importance_weight_sum, int &origin, double &wolfe1)
 
template<class T >
double add_regularization (vw &all, bfgs &b, float regularization, T &weights)
 
double add_regularization (vw &all, bfgs &b, float regularization)
 
template<class T >
void finalize_preconditioner (vw &, bfgs &b, float regularization, T &weights)
 
void finalize_preconditioner (vw &all, bfgs &b, float regularization)
 
template<class T >
void preconditioner_to_regularizer (vw &all, bfgs &b, float regularization, T &weights)
 
void preconditioner_to_regularizer (vw &all, bfgs &b, float regularization)
 
template<class T >
void regularizer_to_weight (vw &, bfgs &b, T &weights)
 
void regularizer_to_weight (vw &all, bfgs &b)
 
void zero_state (vw &all)
 
template<class T >
double derivative_in_direction (vw &, bfgs &b, float *mem, int &origin, T &weights)
 
double derivative_in_direction (vw &all, bfgs &b, float *mem, int &origin)
 
template<class T >
void update_weight (vw &, float step_size, T &w)
 
void update_weight (vw &all, float step_size)
 
int process_pass (vw &all, bfgs &b)
 
void process_example (vw &all, bfgs &b, example &ec)
 
void end_pass (bfgs &b)
 
template<bool audit>
void predict (bfgs &b, base_learner &, example &ec)
 
template<bool audit>
void learn (bfgs &b, base_learner &base, example &ec)
 
void save_load_regularizer (vw &all, bfgs &b, io_buf &model_file, bool read, bool text)
 
void save_load (bfgs &b, io_buf &model_file, bool read, bool text)
 
void init_driver (bfgs &b)
 
base_learner * bfgs_setup (options_i &options, vw &all)
 

Variables

curv_exception curv_ex
 
constexpr float max_precond_ratio = 10000.f
 
constexpr const char * curv_message
 

Macro Definition Documentation

◆ CG_EXTRA

#define CG_EXTRA   1

Definition at line 29 of file bfgs.cc.

Referenced by save_load().

◆ LEARN_CONV

#define LEARN_CONV   2

Definition at line 43 of file bfgs.cc.

Referenced by process_pass().

◆ LEARN_CURV

#define LEARN_CURV   1

Definition at line 42 of file bfgs.cc.

Referenced by process_pass().

◆ LEARN_OK

#define LEARN_OK   0

Definition at line 41 of file bfgs.cc.

Referenced by end_pass(), and process_pass().

◆ MEM_GT

#define MEM_GT   0

Definition at line 31 of file bfgs.cc.

Referenced by bfgs_iter_middle(), bfgs_iter_start(), derivative_in_direction(), and wolfe_eval().

◆ MEM_ST

#define MEM_ST   1

Definition at line 34 of file bfgs.cc.

Referenced by bfgs_iter_middle().

◆ MEM_XT

#define MEM_XT   1

Definition at line 32 of file bfgs.cc.

Referenced by bfgs_iter_middle(), and bfgs_iter_start().

◆ MEM_YT

#define MEM_YT   0

Definition at line 33 of file bfgs.cc.

Referenced by bfgs_iter_middle().

◆ W_COND

#define W_COND   3

◆ W_DIR

#define W_DIR   2

◆ W_GT

#define W_GT   1

◆ W_XT

#define W_XT   0

Definition at line 36 of file bfgs.cc.

Referenced by bfgs_iter_middle(), bfgs_iter_start(), and update_weight().

Function Documentation

◆ add_DIR()

void add_DIR ( float &  p,
const float  fx,
float &  fw 
)
inline

Definition at line 177 of file bfgs.cc.

References W_DIR.

177 { p += (&fw)[W_DIR] * fx; }
#define W_DIR
Definition: bfgs.cc:38

◆ add_grad()

void add_grad ( float &  d,
float  f,
float &  fw 
)
inline

Definition at line 155 of file bfgs.cc.

References W_GT.

155 { (&fw)[W_GT] += d * f; }
#define W_GT
Definition: bfgs.cc:37
float f
Definition: cache.cc:40

◆ add_precond()

void add_precond ( float &  d,
float  f,
float &  fw 
)
inline

Definition at line 169 of file bfgs.cc.

References W_COND.

169 { (&fw)[W_COND] += d * f * f; }
#define W_COND
Definition: bfgs.cc:39
float f
Definition: cache.cc:40

◆ add_regularization() [1/2]

template<class T >
double add_regularization ( vw &  all,
bfgs &  b,
float  regularization,
T &  weights 
)

Definition at line 448 of file bfgs.cc.

References constant, vw::no_bias, bfgs::regularizers, and W_GT.

Referenced by add_regularization(), and process_pass().

449 {
450  // compute the derivative difference
451  double ret = 0.;
452 
453  if (b.regularizers == nullptr)
454  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
455  {
456  (&(*w))[W_GT] += regularization * (*w);
457  ret += 0.5 * regularization * (*w) * (*w);
458  }
459  else
460  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
461  {
462  uint64_t i = w.index() >> weights.stride_shift();
463  weight delta_weight = *w - b.regularizers[2 * i + 1];
464  (&(*w))[W_GT] += b.regularizers[2 * i] * delta_weight;
465  ret += 0.5 * b.regularizers[2 * i] * delta_weight * delta_weight;
466  }
467 
468  // if we're not regularizing the intercept term, then subtract it off from the result above
469  // when accessing weights[constant], always use weights.strided_index(constant)
470  if (all.no_bias)
471  {
472  if (b.regularizers == nullptr)
473  {
474  (&weights.strided_index(constant))[W_GT] -= regularization * (weights.strided_index(constant));
475  ret -= 0.5 * regularization * (weights.strided_index(constant)) * (weights.strided_index(constant));
476  }
477  else
478  {
479  uint64_t i = constant >> weights.stride_shift();
480  weight delta_weight = (weights.strided_index(constant)) - b.regularizers[2 * i + 1];
481  (&weights.strided_index(constant))[W_GT] -= b.regularizers[2 * i] * delta_weight;
482  ret -= 0.5 * b.regularizers[2 * i] * delta_weight * delta_weight;
483  }
484  }
485 
486  return ret;
487 }
#define W_GT
Definition: bfgs.cc:37
weight * regularizers
Definition: bfgs.cc:94
bool no_bias
Definition: global_data.h:446
constexpr uint64_t constant
Definition: constant.h:11
float weight

◆ add_regularization() [2/2]

double add_regularization ( vw &  all,
bfgs &  b,
float  regularization 
)

Definition at line 489 of file bfgs.cc.

References add_regularization(), parameters::dense_weights, parameters::sparse, parameters::sparse_weights, and vw::weights.

490 {
491  if (all.weights.sparse)
492  return add_regularization(all, b, regularization, all.weights.sparse_weights);
493  else
494  return add_regularization(all, b, regularization, all.weights.dense_weights);
495 }
parameters weights
Definition: global_data.h:537
double add_regularization(vw &all, bfgs &b, float regularization, T &weights)
Definition: bfgs.cc:448
dense_parameters dense_weights
sparse_parameters sparse_weights

◆ bfgs_iter_middle() [1/2]

template<class T >
void bfgs_iter_middle ( vw &  all,
bfgs &  b,
float *  mem,
double *  rho,
double *  alpha,
int &  lastj,
int &  origin,
T &  weights 
)

Definition at line 270 of file bfgs.cc.

References curv_ex, f, bfgs::m, MEM_GT, MEM_ST, bfgs::mem_stride, MEM_XT, MEM_YT, vw::num_bits, vw::quiet, W_COND, W_DIR, W_GT, and W_XT.

Referenced by bfgs_iter_middle(), and process_pass().

271 {
272  float* mem0 = mem;
273  uint32_t length = 1 << all.num_bits;
274  // implement conjugate gradient
275  if (b.m == 0)
276  {
277  double g_Hy = 0.;
278  double g_Hg = 0.;
279  double y = 0.;
280 
281  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
282  {
283  mem = mem0 + (w.index() >> weights.stride_shift()) * b.mem_stride;
284  y = (&(*w))[W_GT] - mem[(MEM_GT + origin) % b.mem_stride];
285  g_Hy += ((double)(&(*w))[W_GT]) * ((&(*w))[W_COND]) * y;
286  g_Hg +=
287  ((double)mem[(MEM_GT + origin) % b.mem_stride]) * ((&(*w))[W_COND]) * mem[(MEM_GT + origin) % b.mem_stride];
288  }
289 
290  float beta = (float)(g_Hy / g_Hg);
291 
292  if (beta < 0.f || std::isnan(beta))
293  beta = 0.f;
294 
295  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
296  {
297  mem = mem0 + (w.index() >> weights.stride_shift()) * b.mem_stride;
298  mem[(MEM_GT + origin) % b.mem_stride] = (&(*w))[W_GT];
299 
300  (&(*w))[W_DIR] *= beta;
301  (&(*w))[W_DIR] -= ((&(*w))[W_COND]) * ((&(*w))[W_GT]);
302  (&(*w))[W_GT] = 0;
303  }
304  if (!all.quiet)
305  fprintf(stderr, "%f\t", beta);
306  return;
307 
308  mem = mem0 + (length - 1) * b.mem_stride;
309  }
310  else
311  {
312  if (!all.quiet)
313  fprintf(stderr, "%-10s\t", "");
314  }
315 
316  // implement bfgs
317  double y_s = 0.;
318  double y_Hy = 0.;
319  double s_q = 0.;
320 
321  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
322  {
323  float* mem1 = mem + (w.index() >> weights.stride_shift()) * b.mem_stride;
324  mem1[(MEM_YT + origin) % b.mem_stride] = (&(*w))[W_GT] - mem1[(MEM_GT + origin) % b.mem_stride];
325  mem1[(MEM_ST + origin) % b.mem_stride] = (&(*w))[W_XT] - mem1[(MEM_XT + origin) % b.mem_stride];
326  (&(*w))[W_DIR] = (&(*w))[W_GT];
327  y_s += ((double)mem1[(MEM_YT + origin) % b.mem_stride]) * mem1[(MEM_ST + origin) % b.mem_stride];
328  y_Hy +=
329  ((double)mem1[(MEM_YT + origin) % b.mem_stride]) * mem1[(MEM_YT + origin) % b.mem_stride] * ((&(*w))[W_COND]);
330  s_q += ((double)mem1[(MEM_ST + origin) % b.mem_stride]) * ((&(*w))[W_GT]);
331  }
332 
333  if (y_s <= 0. || y_Hy <= 0.)
334  throw curv_ex;
335  rho[0] = 1 / y_s;
336 
337  float gamma = (float)(y_s / y_Hy);
338 
339  for (int j = 0; j < lastj; j++)
340  {
341  alpha[j] = rho[j] * s_q;
342  s_q = 0.;
343  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
344  {
345  mem = mem0 + (w.index() >> weights.stride_shift()) * b.mem_stride;
346  (&(*w))[W_DIR] -= (float)alpha[j] * mem[(2 * j + MEM_YT + origin) % b.mem_stride];
347  s_q += ((double)mem[(2 * j + 2 + MEM_ST + origin) % b.mem_stride]) * ((&(*w))[W_DIR]);
348  }
349  }
350 
351  alpha[lastj] = rho[lastj] * s_q;
352  double y_r = 0.;
353 
354  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
355  {
356  mem = mem0 + (w.index() >> weights.stride_shift()) * b.mem_stride;
357  (&(*w))[W_DIR] -= (float)alpha[lastj] * mem[(2 * lastj + MEM_YT + origin) % b.mem_stride];
358  (&(*w))[W_DIR] *= gamma * ((&(*w))[W_COND]);
359  y_r += ((double)mem[(2 * lastj + MEM_YT + origin) % b.mem_stride]) * ((&(*w))[W_DIR]);
360  }
361 
362  double coef_j;
363 
364  for (int j = lastj; j > 0; j--)
365  {
366  coef_j = alpha[j] - rho[j] * y_r;
367  y_r = 0.;
368  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
369  {
370  mem = mem0 + (w.index() >> weights.stride_shift()) * b.mem_stride;
371  (&(*w))[W_DIR] += (float)coef_j * mem[(2 * j + MEM_ST + origin) % b.mem_stride];
372  y_r += ((double)mem[(2 * j - 2 + MEM_YT + origin) % b.mem_stride]) * ((&(*w))[W_DIR]);
373  }
374  }
375 
376  coef_j = alpha[0] - rho[0] * y_r;
377  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
378  {
379  mem = mem0 + (w.index() >> weights.stride_shift()) * b.mem_stride;
380  (&(*w))[W_DIR] = -(&(*w))[W_DIR] - (float)coef_j * mem[(MEM_ST + origin) % b.mem_stride];
381  }
382 
383  /*********************
384  ** shift
385  ********************/
386 
387  lastj = (lastj < b.m - 1) ? lastj + 1 : b.m - 1;
388  origin = (origin + b.mem_stride - 2) % b.mem_stride;
389 
390  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
391  {
392  mem = mem0 + (w.index() >> weights.stride_shift()) * b.mem_stride;
393  mem[(MEM_GT + origin) % b.mem_stride] = (&(*w))[W_GT];
394  mem[(MEM_XT + origin) % b.mem_stride] = (&(*w))[W_XT];
395  (&(*w))[W_GT] = 0;
396  }
397  for (int j = lastj; j > 0; j--) rho[j] = rho[j - 1];
398 }
#define W_GT
Definition: bfgs.cc:37
#define W_DIR
Definition: bfgs.cc:38
int m
Definition: bfgs.cc:65
bool quiet
Definition: global_data.h:487
uint32_t num_bits
Definition: global_data.h:398
#define MEM_XT
Definition: bfgs.cc:32
#define MEM_ST
Definition: bfgs.cc:34
#define W_XT
Definition: bfgs.cc:36
int mem_stride
Definition: bfgs.cc:88
curv_exception curv_ex
#define MEM_YT
Definition: bfgs.cc:33
#define W_COND
Definition: bfgs.cc:39
#define MEM_GT
Definition: bfgs.cc:31
float f
Definition: cache.cc:40

◆ bfgs_iter_middle() [2/2]

void bfgs_iter_middle ( vw &  all,
bfgs &  b,
float *  mem,
double *  rho,
double *  alpha,
int &  lastj,
int &  origin 
)

Definition at line 400 of file bfgs.cc.

References bfgs_iter_middle(), parameters::dense_weights, parameters::sparse, parameters::sparse_weights, and vw::weights.

401 {
402  if (all.weights.sparse)
403  bfgs_iter_middle(all, b, mem, rho, alpha, lastj, origin, all.weights.sparse_weights);
404  else
405  bfgs_iter_middle(all, b, mem, rho, alpha, lastj, origin, all.weights.dense_weights);
406 }
parameters weights
Definition: global_data.h:537
void bfgs_iter_middle(vw &all, bfgs &b, float *mem, double *rho, double *alpha, int &lastj, int &origin, T &weights)
Definition: bfgs.cc:270
dense_parameters dense_weights
sparse_parameters sparse_weights

◆ bfgs_iter_start() [1/2]

template<class T >
void bfgs_iter_start ( vw &  all,
bfgs &  b,
float *  mem,
int &  lastj,
double  importance_weight_sum,
int &  origin,
T &  weights 
)

Definition at line 238 of file bfgs.cc.

References bfgs::m, MEM_GT, bfgs::mem_stride, MEM_XT, vw::quiet, W_COND, W_DIR, W_GT, and W_XT.

Referenced by bfgs_iter_start(), and process_pass().

239 {
240  double g1_Hg1 = 0.;
241  double g1_g1 = 0.;
242 
243  origin = 0;
244  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
245  {
246  float* mem1 = mem + (w.index() >> weights.stride_shift()) * b.mem_stride;
247  if (b.m > 0)
248  mem1[(MEM_XT + origin) % b.mem_stride] = (&(*w))[W_XT];
249  mem1[(MEM_GT + origin) % b.mem_stride] = (&(*w))[W_GT];
250  g1_Hg1 += ((double)(&(*w))[W_GT]) * ((&(*w))[W_GT]) * ((&(*w))[W_COND]);
251  g1_g1 += ((double)((&(*w))[W_GT])) * ((&(*w))[W_GT]);
252  (&(*w))[W_DIR] = -(&(*w))[W_COND] * ((&(*w))[W_GT]);
253  ((&(*w))[W_GT]) = 0;
254  }
255  lastj = 0;
256  if (!all.quiet)
257  fprintf(stderr, "%-10.5f\t%-10.5f\t%-10s\t%-10s\t%-10s\t", g1_g1 / (importance_weight_sum * importance_weight_sum),
258  g1_Hg1 / importance_weight_sum, "", "", "");
259 }
#define W_GT
Definition: bfgs.cc:37
#define W_DIR
Definition: bfgs.cc:38
int m
Definition: bfgs.cc:65
bool quiet
Definition: global_data.h:487
#define MEM_XT
Definition: bfgs.cc:32
#define W_XT
Definition: bfgs.cc:36
int mem_stride
Definition: bfgs.cc:88
#define W_COND
Definition: bfgs.cc:39
#define MEM_GT
Definition: bfgs.cc:31

◆ bfgs_iter_start() [2/2]

void bfgs_iter_start ( vw &  all,
bfgs &  b,
float *  mem,
int &  lastj,
double  importance_weight_sum,
int &  origin 
)

Definition at line 261 of file bfgs.cc.

References bfgs_iter_start(), parameters::dense_weights, parameters::sparse, parameters::sparse_weights, and vw::weights.

262 {
263  if (all.weights.sparse)
264  bfgs_iter_start(all, b, mem, lastj, importance_weight_sum, origin, all.weights.sparse_weights);
265  else
266  bfgs_iter_start(all, b, mem, lastj, importance_weight_sum, origin, all.weights.dense_weights);
267 }
parameters weights
Definition: global_data.h:537
dense_parameters dense_weights
sparse_parameters sparse_weights
void bfgs_iter_start(vw &all, bfgs &b, float *mem, int &lastj, double importance_weight_sum, int &origin, T &weights)
Definition: bfgs.cc:238

◆ bfgs_predict()

float bfgs_predict ( vw &  all,
example &  ec 
)

Definition at line 149 of file bfgs.cc.

References GD::finalize_prediction(), GD::inline_predict(), example::partial_prediction, and vw::sd.

Referenced by predict(), and predict_and_gradient().

150 {
153 }
float finalize_prediction(shared_data *sd, float ret)
Definition: gd.cc:339
float partial_prediction
Definition: example.h:68
float inline_predict(vw &all, example &ec)
Definition: gd.h:98
shared_data * sd
Definition: global_data.h:375

◆ bfgs_setup()

base_learner* bfgs_setup ( options_i &  options,
vw &  all 
)

Definition at line 1093 of file bfgs.cc.

References VW::config::option_group_definition::add(), VW::config::options_i::add_and_parse(), vw::audit, vw::bfgs, LEARNER::end_pass(), VW::config::options_i::get_typed_option(), vw::hash_inv, vw::hessian_on, shared_data::holdout_best_loss, vw::holdout_set_off, init_driver(), LEARNER::init_learner(), LEARNER::make_base(), VW::config::make_option(), vw::numpasses, vw::quiet, save_load(), vw::sd, parameters::stride(), parameters::stride_shift(), THROW, vw::training, and vw::weights.

Referenced by parse_reductions().

1094 {
1095  auto b = scoped_calloc_or_throw<bfgs>();
1096  bool conjugate_gradient = false;
1097  bool bfgs_option = false;
1098  option_group_definition bfgs_outer_options("LBFGS and Conjugate Gradient options");
1099  bfgs_outer_options.add(
1100  make_option("conjugate_gradient", conjugate_gradient).keep().help("use conjugate gradient based optimization"));
1101 
1102  option_group_definition bfgs_inner_options("LBFGS and Conjugate Gradient options");
1103  bfgs_inner_options.add(make_option("bfgs", bfgs_option).keep().help("use conjugate gradient based optimization"));
1104  bfgs_inner_options.add(make_option("hessian_on", all.hessian_on).help("use second derivative in line search"));
1105  bfgs_inner_options.add(make_option("mem", b->m).default_value(15).help("memory in bfgs"));
1106  bfgs_inner_options.add(
1107  make_option("termination", b->rel_threshold).default_value(0.001f).help("Termination threshold"));
1108 
1109  options.add_and_parse(bfgs_outer_options);
1110  if (!conjugate_gradient)
1111  {
1112  options.add_and_parse(bfgs_inner_options);
1113  if (!bfgs_option)
1114  {
1115  return nullptr;
1116  }
1117  }
1118 
1119  b->all = &all;
1120  b->wolfe1_bound = 0.01;
1121  b->first_hessian_on = true;
1122  b->first_pass = true;
1123  b->gradient_pass = true;
1124  b->preconditioner_pass = true;
1125  b->backstep_on = false;
1126  b->final_pass = all.numpasses;
1127  b->no_win_counter = 0;
1128 
1129  if (!all.holdout_set_off)
1130  {
1131  all.sd->holdout_best_loss = FLT_MAX;
1132  b->early_stop_thres = options.get_typed_option<size_t>("early_terminate").value();
1133  }
1134 
1135  if (b->m == 0)
1136  all.hessian_on = true;
1137 
1138  if (!all.quiet)
1139  {
1140  if (b->m > 0)
1141  b->all->trace_message << "enabling BFGS based optimization ";
1142  else
1143  b->all->trace_message << "enabling conjugate gradient optimization via BFGS ";
1144  if (all.hessian_on)
1145  b->all->trace_message << "with curvature calculation" << std::endl;
1146  else
1147  b->all->trace_message << "**without** curvature calculation" << std::endl;
1148  }
1149 
1150  if (all.numpasses < 2 && all.training)
1151  THROW("you must make at least 2 passes to use BFGS");
1152 
1153  all.bfgs = true;
1154  all.weights.stride_shift(2);
1155 
1156  void (*learn_ptr)(bfgs&, base_learner&, example&) = nullptr;
1157  if (all.audit)
1158  learn_ptr = learn<true>;
1159  else
1160  learn_ptr = learn<false>;
1161 
1163  if (all.audit || all.hash_inv)
1164  l = &init_learner(b, learn_ptr, predict<true>, all.weights.stride());
1165  else
1166  l = &init_learner(b, learn_ptr, predict<false>, all.weights.stride());
1167 
1168  l->set_save_load(save_load);
1169  l->set_init_driver(init_driver);
1170  l->set_end_pass(end_pass);
1171 
1172  return make_base(*l);
1173 }
parameters weights
Definition: global_data.h:537
bool hash_inv
Definition: global_data.h:541
uint32_t stride()
double holdout_best_loss
Definition: global_data.h:161
base_learner * make_base(learner< T, E > &base)
Definition: learner.h:462
bool quiet
Definition: global_data.h:487
Definition: bfgs.cc:62
virtual void add_and_parse(const option_group_definition &group)=0
bool holdout_set_off
Definition: global_data.h:499
bool training
Definition: global_data.h:488
bool hessian_on
Definition: global_data.h:413
void end_pass(bfgs &b)
Definition: bfgs.cc:897
void save_load(bfgs &b, io_buf &model_file, bool read, bool text)
Definition: bfgs.cc:1026
learner< T, E > & init_learner(free_ptr< T > &dat, L *base, void(*learn)(T &, L &, E &), void(*predict)(T &, L &, E &), size_t ws, prediction_type::prediction_type_t pred_type)
Definition: learner.h:369
shared_data * sd
Definition: global_data.h:375
typed_option< T > & get_typed_option(const std::string &key)
Definition: options.h:120
bool bfgs
Definition: global_data.h:412
size_t numpasses
Definition: global_data.h:451
void init_driver(bfgs &b)
Definition: bfgs.cc:1091
typed_option< T > make_option(std::string name, T &location)
Definition: options.h:80
uint32_t stride_shift()
bool audit
Definition: global_data.h:486
#define THROW(args)
Definition: vw_exception.h:181

◆ derivative_in_direction() [1/2]

template<class T >
double derivative_in_direction ( vw & ,
bfgs &  b,
float *  mem,
int &  origin,
T &  weights 
)

Definition at line 605 of file bfgs.cc.

References MEM_GT, bfgs::mem_stride, and W_DIR.

Referenced by derivative_in_direction(), and process_pass().

606 {
607  double ret = 0.;
608  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
609  {
610  float* mem1 = mem + (w.index() >> weights.stride_shift()) * b.mem_stride;
611  ret += ((double)mem1[(MEM_GT + origin) % b.mem_stride]) * (&(*w))[W_DIR];
612  }
613  return ret;
614 }
#define W_DIR
Definition: bfgs.cc:38
int mem_stride
Definition: bfgs.cc:88
#define MEM_GT
Definition: bfgs.cc:31

◆ derivative_in_direction() [2/2]

double derivative_in_direction ( vw &  all,
bfgs &  b,
float *  mem,
int &  origin 
)

Definition at line 616 of file bfgs.cc.

References parameters::dense_weights, derivative_in_direction(), parameters::sparse, parameters::sparse_weights, and vw::weights.

617 {
618  if (all.weights.sparse)
619  return derivative_in_direction(all, b, mem, origin, all.weights.sparse_weights);
620  else
621  return derivative_in_direction(all, b, mem, origin, all.weights.dense_weights);
622 }
parameters weights
Definition: global_data.h:537
double derivative_in_direction(vw &, bfgs &b, float *mem, int &origin, T &weights)
Definition: bfgs.cc:605
dense_parameters dense_weights
sparse_parameters sparse_weights

◆ direction_magnitude() [1/2]

template<class T >
float direction_magnitude ( vw & ,
T &  weights 
)

Definition at line 218 of file bfgs.cc.

References W_DIR.

Referenced by direction_magnitude(), and process_pass().

219 {
220  // compute direction magnitude
221  double ret = 0.;
222  for (typename T::iterator iter = weights.begin(); iter != weights.end(); ++iter)
223  ret += ((double)(&(*iter))[W_DIR]) * (&(*iter))[W_DIR];
224 
225  return (float)ret;
226 }
#define W_DIR
Definition: bfgs.cc:38

◆ direction_magnitude() [2/2]

float direction_magnitude ( vw &  all)

Definition at line 228 of file bfgs.cc.

References parameters::dense_weights, direction_magnitude(), parameters::sparse, parameters::sparse_weights, and vw::weights.

229 {
230  // compute direction magnitude
231  if (all.weights.sparse)
232  return direction_magnitude(all, all.weights.sparse_weights);
233  else
234  return direction_magnitude(all, all.weights.dense_weights);
235 }
parameters weights
Definition: global_data.h:537
float direction_magnitude(vw &, T &weights)
Definition: bfgs.cc:218
dense_parameters dense_weights
sparse_parameters sparse_weights

◆ dot_with_direction()

float dot_with_direction ( vw &  all,
example &  ec 
)

Definition at line 179 of file bfgs.cc.

References label_data::initial, example::l, and polylabel::simple.

Referenced by process_example().

180 {
181  float temp = ec.l.simple.initial;
182  GD::foreach_feature<float, add_DIR>(all, ec, temp);
183  return temp;
184 }
label_data simple
Definition: example.h:28
float initial
Definition: simple_label.h:16
polylabel l
Definition: example.h:57

◆ end_pass()

void end_pass ( bfgs &  b)

Definition at line 897 of file bfgs.cc.

References bfgs::all, bfgs::current_pass, bfgs::early_stop_thres, bfgs::final_pass, vw::final_regressor_name, finalize_regressor(), vw::holdout_set_off, LEARN_OK, bfgs::no_win_counter, bfgs::output_regularizer, process_pass(), set_done(), summarize_holdout_set(), vw::trace_message, and zero_preconditioner().

898 {
899  vw* all = b.all;
900 
901  if (b.current_pass <= b.final_pass)
902  {
903  if (b.current_pass < b.final_pass)
904  {
905  int status = process_pass(*all, b);
906 
907  // reaching the max number of passes regardless of convergence
908  if (b.final_pass == b.current_pass)
909  {
910  b.all->trace_message << "Maximum number of passes reached. ";
911  if (!b.output_regularizer)
912  b.all->trace_message << "If you want to optimize further, increase the number of passes\n";
913  if (b.output_regularizer)
914  {
915  b.all->trace_message << "\nRegular model file has been created. ";
916  b.all->trace_message << "Output feature regularizer file is created only when the convergence is reached. "
917  "Try increasing the number of passes for convergence\n";
918  b.output_regularizer = false;
919  }
920  }
921 
922  // attain convergence before reaching max iterations
923  if (status != LEARN_OK && b.final_pass > b.current_pass)
924  {
925  b.final_pass = b.current_pass;
926  }
927  else
928  {
929  // Not converged yet.
930  // Reset preconditioner to zero so that it is correctly recomputed in the next pass
931  zero_preconditioner(*all);
932  }
933  if (!all->holdout_set_off)
934  {
937  if (b.early_stop_thres == b.no_win_counter)
938  {
939  set_done(*all);
940  b.all->trace_message << "Early termination reached w.r.t. holdout set error";
941  }
942  }
943  if (b.final_pass == b.current_pass)
944  {
946  set_done(*all);
947  }
948  }
949  else // reaching convergence in the previous pass
950  b.current_pass++;
951  }
952 }
void zero_preconditioner(vw &all)
Definition: bfgs.cc:124
void set_done(vw &all)
Definition: parser.cc:578
bool output_regularizer
Definition: bfgs.cc:89
int process_pass(vw &all, bfgs &b)
Definition: bfgs.cc:639
void finalize_regressor(vw &all, std::string reg_name)
#define LEARN_OK
Definition: bfgs.cc:41
bool holdout_set_off
Definition: global_data.h:499
bool summarize_holdout_set(vw &all, size_t &no_win_counter)
vw_ostream trace_message
Definition: global_data.h:424
vw * all
Definition: bfgs.cc:64
size_t final_pass
Definition: bfgs.cc:70
size_t current_pass
Definition: bfgs.cc:79
size_t no_win_counter
Definition: bfgs.cc:80
size_t early_stop_thres
Definition: bfgs.cc:81
std::string final_regressor_name
Definition: global_data.h:535

◆ finalize_preconditioner() [1/2]

template<class T >
void finalize_preconditioner ( vw & ,
bfgs &  b,
float  regularization,
T &  weights 
)

Definition at line 498 of file bfgs.cc.

References f, max_precond_ratio, bfgs::regularizers, and W_COND.

Referenced by finalize_preconditioner(), and process_pass().

499 {
500  float max_hessian = 0.f;
501 
502  if (b.regularizers == nullptr)
503  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
504  {
505  (&(*w))[W_COND] += regularization;
506  if ((&(*w))[W_COND] > max_hessian)
507  max_hessian = (&(*w))[W_COND];
508  if ((&(*w))[W_COND] > 0)
509  (&(*w))[W_COND] = 1.f / (&(*w))[W_COND];
510  }
511  else
512  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
513  {
514  (&(*w))[W_COND] += b.regularizers[2 * (w.index() >> weights.stride_shift())];
515  if ((&(*w))[W_COND] > max_hessian)
516  max_hessian = (&(*w))[W_COND];
517  if ((&(*w))[W_COND] > 0)
518  (&(*w))[W_COND] = 1.f / (&(*w))[W_COND];
519  }
520 
521  float max_precond = (max_hessian == 0.f) ? 0.f : max_precond_ratio / max_hessian;
522 
523  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
524  {
525  if (std::isinf(*w) || *w > max_precond)
526  (&(*w))[W_COND] = max_precond;
527  }
528 }
weight * regularizers
Definition: bfgs.cc:94
constexpr float max_precond_ratio
Definition: bfgs.cc:60
#define W_COND
Definition: bfgs.cc:39
float f
Definition: cache.cc:40

◆ finalize_preconditioner() [2/2]

void finalize_preconditioner ( vw &  all,
bfgs &  b,
float  regularization 
)

Definition at line 529 of file bfgs.cc.

References parameters::dense_weights, finalize_preconditioner(), parameters::sparse, parameters::sparse_weights, and vw::weights.

530 {
531  if (all.weights.sparse)
532  finalize_preconditioner(all, b, regularization, all.weights.sparse_weights);
533  else
534  finalize_preconditioner(all, b, regularization, all.weights.dense_weights);
535 }
parameters weights
Definition: global_data.h:537
dense_parameters dense_weights
void finalize_preconditioner(vw &, bfgs &b, float regularization, T &weights)
Definition: bfgs.cc:498
sparse_parameters sparse_weights

◆ init_driver()

void init_driver ( bfgs &  b)

Definition at line 1091 of file bfgs.cc.

References bfgs::backstep_on.

Referenced by bfgs_setup().

1091 { b.backstep_on = true; }
bool backstep_on
Definition: bfgs.cc:85

◆ learn()

template<bool audit>
void learn ( bfgs &  b,
base_learner &  base,
example &  ec 
)

Definition at line 965 of file bfgs.cc.

References bfgs::all, bfgs::current_pass, bfgs::final_pass, example::in_use, process_example(), and test_example().

Referenced by LEARNER::init_cost_sensitive_learner(), LEARNER::learner< CB_EXPLORE::cb_explore, example >::init_learner(), LEARNER::init_learner(), LEARNER::init_multiclass_learner(), MWT::predict_or_learn(), VW::cb_explore_adf::softmax::setup(), VW::cb_explore_adf::greedy::setup(), VW::cb_explore_adf::first::setup(), VW::cb_explore_adf::bag::setup(), VW::cb_explore_adf::cover::setup(), and VW::cb_explore_adf::regcb::setup().

966 {
967  vw* all = b.all;
968  assert(ec.in_use);
969 
970  if (b.current_pass <= b.final_pass)
971  {
972  if (test_example(ec))
973  predict<audit>(b, base, ec);
974  else
975  process_example(*all, b, ec);
976  }
977 }
constexpr bool test_example(example &ec) noexcept
Definition: bfgs.cc:147
void process_example(vw &all, bfgs &b, example &ec)
Definition: bfgs.cc:861
vw * all
Definition: bfgs.cc:64
size_t final_pass
Definition: bfgs.cc:70
bool in_use
Definition: example.h:79
size_t current_pass
Definition: bfgs.cc:79

◆ preconditioner_to_regularizer() [1/2]

template<class T >
void preconditioner_to_regularizer ( vw all,
bfgs b,
float  regularization,
T &  weights 
)

Definition at line 538 of file bfgs.cc.

References f, vw::num_bits, bfgs::regularizers, THROW, and W_COND.

Referenced by preconditioner_to_regularizer(), and save_load_regularizer().

539 {
540  uint32_t length = 1 << all.num_bits;
541 
542  if (b.regularizers == nullptr)
543  {
544  b.regularizers = calloc_or_throw<weight>(2 * length);
545 
546  if (b.regularizers == nullptr)
547  THROW("Failed to allocate weight array: try decreasing -b <bits>");
548 
549  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
550  {
551  uint64_t i = w.index() >> weights.stride_shift();
552  b.regularizers[2 * i] = regularization;
553  if ((&(*w))[W_COND] > 0.f)
554  b.regularizers[2 * i] += 1.f / (&(*w))[W_COND];
555  }
556  }
557  else
558  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
559  {
560  if ((&(*w))[W_COND] > 0.f)
561  b.regularizers[2 * (w.index() >> weights.stride_shift())] += 1.f / (&(*w))[W_COND];
562  }
563 
564  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
565  b.regularizers[2 * (w.index() >> weights.stride_shift()) + 1] = *w;
566 }
uint32_t num_bits
Definition: global_data.h:398
weight * regularizers
Definition: bfgs.cc:94
#define W_COND
Definition: bfgs.cc:39
#define THROW(args)
Definition: vw_exception.h:181
float f
Definition: cache.cc:40

◆ preconditioner_to_regularizer() [2/2]

void preconditioner_to_regularizer ( vw all,
bfgs b,
float  regularization 
)

Definition at line 567 of file bfgs.cc.

References parameters::dense_weights, preconditioner_to_regularizer(), parameters::sparse, parameters::sparse_weights, and vw::weights.

568 {
569  if (all.weights.sparse)
570  preconditioner_to_regularizer(all, b, regularization, all.weights.sparse_weights);
571  else
572  preconditioner_to_regularizer(all, b, regularization, all.weights.dense_weights);
573 }
void preconditioner_to_regularizer(vw &all, bfgs &b, float regularization, T &weights)
Definition: bfgs.cc:538
parameters weights
Definition: global_data.h:537
dense_parameters dense_weights
sparse_parameters sparse_weights

◆ predict()

template<bool audit>
void predict ( bfgs b,
base_learner ,
example ec 
)

◆ predict_and_gradient()

float predict_and_gradient ( vw all,
example ec 
)

Definition at line 157 of file bfgs.cc.

References bfgs_predict(), loss_function::first_derivative(), example::l, label_data::label, vw::loss, vw::sd, vw::set_minmax, polylabel::simple, and example::weight.

Referenced by process_example().

158 {
159  float fp = bfgs_predict(all, ec);
160  label_data& ld = ec.l.simple;
161  all.set_minmax(all.sd, ld.label);
162 
163  float loss_grad = all.loss->first_derivative(all.sd, fp, ld.label) * ec.weight;
164  GD::foreach_feature<float, add_grad>(all, ec, loss_grad);
165 
166  return fp;
167 }
float bfgs_predict(vw &all, example &ec)
Definition: bfgs.cc:149
loss_function * loss
Definition: global_data.h:523
float label
Definition: simple_label.h:14
label_data simple
Definition: example.h:28
virtual float first_derivative(shared_data *, float prediction, float label)=0
void(* set_minmax)(shared_data *sd, float label)
Definition: global_data.h:394
shared_data * sd
Definition: global_data.h:375
polylabel l
Definition: example.h:57
float weight
Definition: example.h:62

◆ process_example()

void process_example ( vw all,
bfgs b,
example ec 
)

Definition at line 861 of file bfgs.cc.

References bfgs::curvature, dot_with_direction(), bfgs::example_number, bfgs::first_pass, loss_function::getLoss(), bfgs::gradient_pass, bfgs::importance_weight_sum, example::l, label_data::label, example::loss, vw::loss, bfgs::loss_sum, example::partial_prediction, bfgs::preconditioner_pass, example::pred, predict_and_gradient(), bfgs::predictions, v_array< T >::push_back(), polyprediction::scalar, vw::sd, loss_function::second_derivative(), polylabel::simple, v_array< T >::size(), update_preconditioner(), example::updated_prediction, and example::weight.

Referenced by learn().

862 {
863  label_data& ld = ec.l.simple;
864  if (b.first_pass)
865  b.importance_weight_sum += ec.weight;
866 
867  /********************************************************************/
868  /* I) GRADIENT CALCULATION ******************************************/
869  /********************************************************************/
870  if (b.gradient_pass)
871  {
872  ec.pred.scalar = predict_and_gradient(all, ec); // w[0] & w[1]
873  ec.loss = all.loss->getLoss(all.sd, ec.pred.scalar, ld.label) * ec.weight;
874  b.loss_sum += ec.loss;
875  b.predictions.push_back(ec.pred.scalar);
876  }
877  /********************************************************************/
878  /* II) CURVATURE CALCULATION ****************************************/
879  /********************************************************************/
880  else // computing curvature
881  {
882  float d_dot_x = dot_with_direction(all, ec); // w[2]
883  if (b.example_number >= b.predictions.size()) // Make things safe in case example source is strange.
884  b.example_number = b.predictions.size() - 1;
885  ec.pred.scalar = b.predictions[b.example_number];
886  ec.partial_prediction = b.predictions[b.example_number];
887  ec.loss = all.loss->getLoss(all.sd, ec.pred.scalar, ld.label) * ec.weight;
888  float sd = all.loss->second_derivative(all.sd, b.predictions[b.example_number++], ld.label);
889  b.curvature += ((double)d_dot_x) * d_dot_x * sd * ec.weight;
890  }
891  ec.updated_prediction = ec.pred.scalar;
892 
893  if (b.preconditioner_pass)
894  update_preconditioner(all, ec); // w[3]
895 }
loss_function * loss
Definition: global_data.h:523
float scalar
Definition: example.h:45
virtual float second_derivative(shared_data *, float prediction, float label)=0
float dot_with_direction(vw &all, example &ec)
Definition: bfgs.cc:179
float partial_prediction
Definition: example.h:68
double curvature
Definition: bfgs.cc:100
float label
Definition: simple_label.h:14
float updated_prediction
Definition: example.h:69
label_data simple
Definition: example.h:28
size_t size() const
Definition: v_array.h:68
v_array< float > predictions
Definition: bfgs.cc:77
virtual float getLoss(shared_data *, float prediction, float label)=0
void push_back(const T &new_ele)
Definition: v_array.h:107
shared_data * sd
Definition: global_data.h:375
void update_preconditioner(vw &all, example &ec)
Definition: bfgs.cc:171
bool gradient_pass
Definition: bfgs.cc:104
float loss
Definition: example.h:70
polylabel l
Definition: example.h:57
size_t example_number
Definition: bfgs.cc:78
float predict_and_gradient(vw &all, example &ec)
Definition: bfgs.cc:157
polyprediction pred
Definition: example.h:60
double loss_sum
Definition: bfgs.cc:97
double importance_weight_sum
Definition: bfgs.cc:99
float weight
Definition: example.h:62
bool preconditioner_pass
Definition: bfgs.cc:105
bool first_pass
Definition: bfgs.cc:103

◆ process_pass()

int process_pass ( vw all,
bfgs b 
)

Definition at line 639 of file bfgs.cc.

References accumulate(), accumulate_scalar(), add_regularization(), vw::all_reduce, bfgs::alpha, bfgs::backstep_on, bfgs_iter_middle(), bfgs_iter_start(), v_array< T >::clear(), bfgs::current_pass, curv_message, bfgs::curvature, derivative_in_direction(), direction_magnitude(), bfgs::example_number, f, vw::final_regressor_name, finalize_preconditioner(), bfgs::first_hessian_on, bfgs::first_pass, bfgs::gradient_pass, vw::hessian_on, vw::holdout_set_off, shared_data::holdout_sum_loss_since_last_pass, bfgs::importance_weight_sum, vw::l2_lambda, bfgs::lastj, LEARN_CONV, LEARN_CURV, LEARN_OK, bfgs::loss_sum, bfgs::mem, bfgs::net_time, bfgs::origin, bfgs::output_regularizer, bfgs::preconditioner_pass, bfgs::predictions, bfgs::previous_loss_sum, vw::quiet, regularizer_direction_magnitude(), bfgs::rel_threshold, bfgs::rho, vw::save_per_pass, save_predictor(), vw::sd, bfgs::step_size, bfgs::t_end_global, update_weight(), W_COND, shared_data::weighted_holdout_examples_since_last_pass, vw::weights, wolfe_eval(), and zero_derivative().

Referenced by end_pass().

640 {
641  int status = LEARN_OK;
642 
643  finalize_preconditioner(all, b, all.l2_lambda);
644  /********************************************************************/
645  /* A) FIRST PASS FINISHED: INITIALIZE FIRST LINE SEARCH *************/
646  /********************************************************************/
647  if (b.first_pass)
648  {
649  if (all.all_reduce != nullptr)
650  {
651  accumulate(all, all.weights, W_COND); // Accumulate preconditioner
652  float temp = (float)b.importance_weight_sum;
653  b.importance_weight_sum = accumulate_scalar(all, temp); // Accumulate importance weights
654  }
655  // finalize_preconditioner(all, b, all.l2_lambda);
656  if (all.all_reduce != nullptr)
657  {
658  float temp = (float)b.loss_sum;
659  b.loss_sum = accumulate_scalar(all, temp); // Accumulate loss_sums
660  accumulate(all, all.weights, 1); // Accumulate gradients from all nodes
661  }
662  if (all.l2_lambda > 0.)
663  b.loss_sum += add_regularization(all, b, all.l2_lambda);
664  if (!all.quiet)
665  fprintf(stderr, "%2lu %-10.5f\t", (long unsigned int)b.current_pass + 1, b.loss_sum / b.importance_weight_sum);
666 
667  b.previous_loss_sum = b.loss_sum;
668  b.loss_sum = 0.;
669  b.example_number = 0;
670  b.curvature = 0;
671  bfgs_iter_start(all, b, b.mem, b.lastj, b.importance_weight_sum, b.origin);
672  if (b.first_hessian_on)
673  {
674  b.gradient_pass = false; // now start computing curvature
675  }
676  else
677  {
678  b.step_size = 0.5;
679  float d_mag = direction_magnitude(all);
680  ftime(&b.t_end_global);
681  b.net_time = (int)(1000.0 * (b.t_end_global.time - b.t_start_global.time) +
682  (b.t_end_global.millitm - b.t_start_global.millitm));
683  if (!all.quiet)
684  fprintf(stderr, "%-10s\t%-10.5f\t%-.5f\n", "", d_mag, b.step_size);
685  b.predictions.clear();
686  update_weight(all, b.step_size);
687  }
688  }
689  else
690  /********************************************************************/
691  /* B) GRADIENT CALCULATED *******************************************/
692  /********************************************************************/
693  if (b.gradient_pass) // We just finished computing all gradients
694  {
695  if (all.all_reduce != nullptr)
696  {
697  float t = (float)b.loss_sum;
698  b.loss_sum = accumulate_scalar(all, t); // Accumulate loss_sums
699  accumulate(all, all.weights, 1); // Accumulate gradients from all nodes
700  }
701  if (all.l2_lambda > 0.)
702  b.loss_sum += add_regularization(all, b, all.l2_lambda);
703  if (!all.quiet)
704  {
705  if (!all.holdout_set_off && b.current_pass >= 1)
706  {
707  if (all.sd->holdout_sum_loss_since_last_pass == 0. && all.sd->weighted_holdout_examples_since_last_pass == 0.)
708  {
709  fprintf(stderr, "%2lu ", (long unsigned int)b.current_pass + 1);
710  fprintf(stderr, "h unknown ");
711  }
712  else
713  fprintf(stderr, "%2lu h%-10.5f\t", (long unsigned int)b.current_pass + 1,
714  all.sd->holdout_sum_loss_since_last_pass / all.sd->weighted_holdout_examples_since_last_pass);
715  }
716  else
717  fprintf(stderr, "%2lu %-10.5f\t", (long unsigned int)b.current_pass + 1, b.loss_sum / b.importance_weight_sum);
718  }
719  double wolfe1;
720  double new_step = wolfe_eval(
721  all, b, b.mem, b.loss_sum, b.previous_loss_sum, b.step_size, b.importance_weight_sum, b.origin, wolfe1);
722 
723  /********************************************************************/
724  /* B0) DERIVATIVE ZERO: MINIMUM FOUND *******************************/
725  /********************************************************************/
726  if (std::isnan((float)wolfe1))
727  {
728  fprintf(stderr, "\n");
729  fprintf(stdout, "Derivative 0 detected.\n");
730  b.step_size = 0.0;
731  status = LEARN_CONV;
732  }
733  /********************************************************************/
734  /* B1) LINE SEARCH FAILED *******************************************/
735  /********************************************************************/
736  else if (b.backstep_on && (wolfe1 < b.wolfe1_bound || b.loss_sum > b.previous_loss_sum))
737  {
738  // curvature violated, or we stepped too far last time: step back
739  ftime(&b.t_end_global);
740  b.net_time = (int)(1000.0 * (b.t_end_global.time - b.t_start_global.time) +
741  (b.t_end_global.millitm - b.t_start_global.millitm));
742  float ratio = (b.step_size == 0.f) ? 0.f : (float)new_step / (float)b.step_size;
743  if (!all.quiet)
744  fprintf(stderr, "%-10s\t%-10s\t(revise x %.1f)\t%-.5f\n", "", "", ratio, new_step);
745  b.predictions.clear();
746  update_weight(all, (float)(-b.step_size + new_step));
747  b.step_size = (float)new_step;
748  zero_derivative(all);
749  b.loss_sum = 0.;
750  }
751 
752  /********************************************************************/
753  /* B2) LINE SEARCH SUCCESSFUL OR DISABLED ******************/
754  /* DETERMINE NEXT SEARCH DIRECTION ******************/
755  /********************************************************************/
756  else
757  {
758  double rel_decrease = (b.previous_loss_sum - b.loss_sum) / b.previous_loss_sum;
759  if (!std::isnan((float)rel_decrease) && b.backstep_on && fabs(rel_decrease) < b.rel_threshold)
760  {
761  fprintf(stdout,
762  "\nTermination condition reached in pass %ld: decrease in loss less than %.3f%%.\n"
763  "If you want to optimize further, decrease termination threshold.\n",
764  (long int)b.current_pass + 1, b.rel_threshold * 100.0);
765  status = LEARN_CONV;
766  }
767  b.previous_loss_sum = b.loss_sum;
768  b.loss_sum = 0.;
769  b.example_number = 0;
770  b.curvature = 0;
771  b.step_size = 1.0;
772 
773  try
774  {
775  bfgs_iter_middle(all, b, b.mem, b.rho, b.alpha, b.lastj, b.origin);
776  }
777  catch (const curv_exception&)
778  {
779  fprintf(stdout, "In bfgs_iter_middle: %s", curv_message);
780  b.step_size = 0.0;
781  status = LEARN_CURV;
782  }
783 
784  if (all.hessian_on)
785  {
786  b.gradient_pass = false; // now start computing curvature
787  }
788  else
789  {
790  float d_mag = direction_magnitude(all);
791  ftime(&b.t_end_global);
792  b.net_time = (int)(1000.0 * (b.t_end_global.time - b.t_start_global.time) +
793  (b.t_end_global.millitm - b.t_start_global.millitm));
794  if (!all.quiet)
795  fprintf(stderr, "%-10s\t%-10.5f\t%-.5f\n", "", d_mag, b.step_size);
796  b.predictions.clear();
797  update_weight(all, b.step_size);
798  }
799  }
800  }
801 
802  /********************************************************************/
803  /* C) NOT FIRST PASS, CURVATURE CALCULATED **************************/
804  /********************************************************************/
805  else // just finished all second gradients
806  {
807  if (all.all_reduce != nullptr)
808  {
809  float t = (float)b.curvature;
810  b.curvature = accumulate_scalar(all, t); // Accumulate curvatures
811  }
812  if (all.l2_lambda > 0.)
813  b.curvature += regularizer_direction_magnitude(all, b, all.l2_lambda);
814  float dd = (float)derivative_in_direction(all, b, b.mem, b.origin);
815  if (b.curvature == 0. && dd != 0.)
816  {
817  fprintf(stdout, "%s", curv_message);
818  b.step_size = 0.0;
819  status = LEARN_CURV;
820  }
821  else if (dd == 0.)
822  {
823  fprintf(stdout, "Derivative 0 detected.\n");
824  b.step_size = 0.0;
825  status = LEARN_CONV;
826  }
827  else
828  b.step_size = -dd / (float)b.curvature;
829 
830  float d_mag = direction_magnitude(all);
831 
832  b.predictions.clear();
833  update_weight(all, b.step_size);
834  ftime(&b.t_end_global);
835  b.net_time = (int)(1000.0 * (b.t_end_global.time - b.t_start_global.time) +
836  (b.t_end_global.millitm - b.t_start_global.millitm));
837 
838  if (!all.quiet)
839  fprintf(stderr, "%-10.5f\t%-10.5f\t%-.5f\n", b.curvature / b.importance_weight_sum, d_mag, b.step_size);
840  b.gradient_pass = true;
841  } // now start computing derivatives.
842  b.current_pass++;
843  b.first_pass = false;
844  b.preconditioner_pass = false;
845 
846  if (b.output_regularizer) // need to accumulate and place the regularizer.
847  {
848  if (all.all_reduce != nullptr)
849  accumulate(all, all.weights, W_COND); // Accumulate preconditioner
850  // preconditioner_to_regularizer(all, b, all.l2_lambda);
851  }
852  ftime(&b.t_end_global);
853  b.net_time = (int)(1000.0 * (b.t_end_global.time - b.t_start_global.time) +
854  (b.t_end_global.millitm - b.t_start_global.millitm));
855 
856  if (all.save_per_pass)
857  save_predictor(all, all.final_regressor_name, b.current_pass);
858  return status;
859 }
parameters weights
Definition: global_data.h:537
double add_regularization(vw &all, bfgs &b, float regularization, T &weights)
Definition: bfgs.cc:448
bool backstep_on
Definition: bfgs.cc:85
void accumulate(vw &all, parameters &weights, size_t offset)
Definition: accumulate.cc:20
int origin
Definition: bfgs.cc:96
bool output_regularizer
Definition: bfgs.cc:89
double holdout_sum_loss_since_last_pass
Definition: global_data.h:163
float direction_magnitude(vw &, T &weights)
Definition: bfgs.cc:218
float * mem
Definition: bfgs.cc:90
#define LEARN_CURV
Definition: bfgs.cc:42
#define LEARN_CONV
Definition: bfgs.cc:43
double * alpha
Definition: bfgs.cc:92
bool quiet
Definition: global_data.h:487
double curvature
Definition: bfgs.cc:100
void bfgs_iter_middle(vw &all, bfgs &b, float *mem, double *rho, double *alpha, int &lastj, int &origin, T &weights)
Definition: bfgs.cc:270
#define LEARN_OK
Definition: bfgs.cc:41
bool holdout_set_off
Definition: global_data.h:499
float step_size
Definition: bfgs.cc:98
bool hessian_on
Definition: global_data.h:413
void save_predictor(vw &all, std::string reg_name, size_t current_pass)
double derivative_in_direction(vw &, bfgs &b, float *mem, int &origin, T &weights)
Definition: bfgs.cc:605
AllReduce * all_reduce
Definition: global_data.h:381
constexpr const char * curv_message
Definition: bfgs.cc:116
v_array< float > predictions
Definition: bfgs.cc:77
shared_data * sd
Definition: global_data.h:375
float l2_lambda
Definition: global_data.h:445
void clear()
Definition: v_array.h:88
double weighted_holdout_examples_since_last_pass
Definition: global_data.h:162
double wolfe_eval(vw &all, bfgs &b, float *mem, double loss_sum, double previous_loss_sum, double step_size, double importance_weight_sum, int &origin, double &wolfe1, T &weights)
Definition: bfgs.cc:409
double net_time
Definition: bfgs.cc:75
double regularizer_direction_magnitude(vw &, bfgs &b, double regularizer, T &weights)
Definition: bfgs.cc:187
void zero_derivative(vw &all)
Definition: bfgs.cc:122
bool first_hessian_on
Definition: bfgs.cc:84
bool gradient_pass
Definition: bfgs.cc:104
int lastj
Definition: bfgs.cc:96
void update_weight(vw &, float step_size, T &w)
Definition: bfgs.cc:625
#define W_COND
Definition: bfgs.cc:39
struct timeb t_start_global t_end_global
Definition: bfgs.cc:74
double * rho
Definition: bfgs.cc:91
bool save_per_pass
Definition: global_data.h:408
void finalize_preconditioner(vw &, bfgs &b, float regularization, T &weights)
Definition: bfgs.cc:498
size_t example_number
Definition: bfgs.cc:78
double previous_loss_sum
Definition: bfgs.cc:97
float accumulate_scalar(vw &all, float local_sum)
Definition: accumulate.cc:44
size_t current_pass
Definition: bfgs.cc:79
double loss_sum
Definition: bfgs.cc:97
double importance_weight_sum
Definition: bfgs.cc:99
std::string final_regressor_name
Definition: global_data.h:535
bool preconditioner_pass
Definition: bfgs.cc:105
float f
Definition: cache.cc:40
void bfgs_iter_start(vw &all, bfgs &b, float *mem, int &lastj, double importance_weight_sum, int &origin, T &weights)
Definition: bfgs.cc:238
bool first_pass
Definition: bfgs.cc:103
float rel_threshold
Definition: bfgs.cc:66

◆ regularizer_direction_magnitude() [1/2]

template<class T >
double regularizer_direction_magnitude ( vw ,
bfgs b,
double  regularizer,
T &  weights 
)

Definition at line 187 of file bfgs.cc.

References bfgs::regularizers, and W_DIR.

Referenced by process_pass(), and regularizer_direction_magnitude().

188 {
189  double ret = 0.;
190  if (b.regularizers == nullptr)
191  for (typename T::iterator iter = weights.begin(); iter != weights.end(); ++iter)
192  ret += regularizer * (&(*iter))[W_DIR] * (&(*iter))[W_DIR];
193 
194  else
195  {
196  for (typename T::iterator iter = weights.begin(); iter != weights.end(); ++iter)
197  ret += ((double)b.regularizers[2 * (iter.index() >> weights.stride_shift())]) * (&(*iter))[W_DIR] *
198  (&(*iter))[W_DIR];
199  }
200  return ret;
201 }
#define W_DIR
Definition: bfgs.cc:38
weight * regularizers
Definition: bfgs.cc:94

◆ regularizer_direction_magnitude() [2/2]

double regularizer_direction_magnitude ( vw all,
bfgs b,
float  regularizer 
)

Definition at line 203 of file bfgs.cc.

References parameters::dense_weights, regularizer_direction_magnitude(), parameters::sparse, parameters::sparse_weights, and vw::weights.

204 {
205  // compute direction magnitude
206  double ret = 0.;
207 
208  if (regularizer == 0.)
209  return ret;
210 
211  if (all.weights.sparse)
212  return regularizer_direction_magnitude(all, b, regularizer, all.weights.sparse_weights);
213  else
214  return regularizer_direction_magnitude(all, b, regularizer, all.weights.dense_weights);
215 }
parameters weights
Definition: global_data.h:537
dense_parameters dense_weights
double regularizer_direction_magnitude(vw &, bfgs &b, double regularizer, T &weights)
Definition: bfgs.cc:187
sparse_parameters sparse_weights

◆ regularizer_to_weight() [1/2]

template<class T >
void regularizer_to_weight ( vw ,
bfgs b,
T &  weights 
)

Definition at line 576 of file bfgs.cc.

References bfgs::regularizers, and W_COND.

Referenced by regularizer_to_weight(), and save_load_regularizer().

577 {
578  if (b.regularizers != nullptr)
579  {
580  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
581  {
582  uint64_t i = w.index() >> weights.stride_shift();
583  (&(*w))[W_COND] = b.regularizers[2 * i];
584  *w = b.regularizers[2 * i + 1];
585  }
586  }
587 }
weight * regularizers
Definition: bfgs.cc:94
#define W_COND
Definition: bfgs.cc:39

◆ regularizer_to_weight() [2/2]

void regularizer_to_weight ( vw all,
bfgs b 
)

Definition at line 589 of file bfgs.cc.

References parameters::dense_weights, regularizer_to_weight(), parameters::sparse, parameters::sparse_weights, and vw::weights.

590 {
591  if (all.weights.sparse)
592  regularizer_to_weight(all, b, all.weights.sparse_weights);
593  else
594  regularizer_to_weight(all, b, all.weights.dense_weights);
595 }
parameters weights
Definition: global_data.h:537
void regularizer_to_weight(vw &, bfgs &b, T &weights)
Definition: bfgs.cc:576
dense_parameters dense_weights
sparse_parameters sparse_weights

◆ reset_state()

void reset_state ( vw all,
bfgs b,
bool  zero 
)

Definition at line 126 of file bfgs.cc.

References bfgs::curvature, bfgs::first_pass, bfgs::gradient_pass, bfgs::importance_weight_sum, bfgs::lastj, bfgs::loss_sum, bfgs::origin, bfgs::preconditioner_pass, bfgs::previous_loss_sum, zero_derivative(), and zero_preconditioner().

Referenced by save_load().

127 {
128  b.lastj = b.origin = 0;
129  b.loss_sum = b.previous_loss_sum = 0.;
130  b.importance_weight_sum = 0.;
131  b.curvature = 0.;
132  b.first_pass = true;
133  b.gradient_pass = true;
134  b.preconditioner_pass = true;
135  if (zero)
136  {
137  zero_derivative(all);
138  zero_preconditioner(all);
139  }
140 }
void zero_preconditioner(vw &all)
Definition: bfgs.cc:124
int origin
Definition: bfgs.cc:96
double curvature
Definition: bfgs.cc:100
void zero_derivative(vw &all)
Definition: bfgs.cc:122
bool gradient_pass
Definition: bfgs.cc:104
int lastj
Definition: bfgs.cc:96
double previous_loss_sum
Definition: bfgs.cc:97
double loss_sum
Definition: bfgs.cc:97
double importance_weight_sum
Definition: bfgs.cc:99
bool preconditioner_pass
Definition: bfgs.cc:105
bool first_pass
Definition: bfgs.cc:103

◆ save_load()

void save_load ( bfgs b,
io_buf model_file,
bool  read,
bool  text 
)

Definition at line 1026 of file bfgs.cc.

References bfgs::all, bfgs::alpha, bin_text_read_write_fixed(), CG_EXTRA, io_buf::files, initialize_regressor(), vw::l2_lambda, vw::length(), bfgs::m, bfgs::mem, bfgs::mem_stride, bfgs::net_time, vw::num_bits, bfgs::output_regularizer, vw::per_feature_regularizer_input, vw::per_feature_regularizer_output, vw::per_feature_regularizer_text, vw::quiet, bfgs::regularizers, reset_state(), bfgs::rho, GD::save_load_regressor(), save_load_regularizer(), v_array< T >::size(), stride_shift(), parameters::stride_shift(), THROW, and vw::weights.

Referenced by bfgs_setup().

1027 {
1028  vw* all = b.all;
1029 
1030  uint32_t length = 1 << all->num_bits;
1031 
1032  if (read)
1033  {
1034  initialize_regressor(*all);
1035  if (all->per_feature_regularizer_input != "")
1036  {
1037  b.regularizers = calloc_or_throw<weight>(2 * length);
1038  if (b.regularizers == nullptr)
1039  THROW("Failed to allocate regularizers array: try decreasing -b <bits>");
1040  }
1041  int m = b.m;
1042 
1043  b.mem_stride = (m == 0) ? CG_EXTRA : 2 * m;
1044  b.mem = calloc_or_throw<float>(all->length() * b.mem_stride);
1045  b.rho = calloc_or_throw<double>(m);
1046  b.alpha = calloc_or_throw<double>(m);
1047 
1048  uint32_t stride_shift = all->weights.stride_shift();
1049 
1050  if (!all->quiet)
1051  std::cerr << "m = " << m << std::endl
1052  << "Allocated "
1053  << ((long unsigned int)all->length() *
1054  (sizeof(float) * (b.mem_stride) + (sizeof(weight) << stride_shift)) >>
1055  20)
1056  << "M for weights and mem" << std::endl;
1057 
1058  b.net_time = 0.0;
1059  ftime(&b.t_start_global);
1060 
1061  if (!all->quiet)
1062  {
1063  const char* header_fmt = "%2s %-10s\t%-10s\t%-10s\t %-10s\t%-10s\t%-10s\t%-10s\t%-10s\t%-s\n";
1064  fprintf(stderr, header_fmt, "##", "avg. loss", "der. mag.", "d. m. cond.", "wolfe1", "wolfe2", "mix fraction",
1065  "curvature", "dir. magnitude", "step size");
1066  std::cerr.precision(5);
1067  }
1068 
1069  if (b.regularizers != nullptr)
1070  all->l2_lambda = 1; // To make sure we are adding the regularization
1072  reset_state(*all, b, false);
1073  }
1074 
1075  // bool reg_vector = b.output_regularizer || all->per_feature_regularizer_input.length() > 0;
1076  bool reg_vector = (b.output_regularizer && !read) || (all->per_feature_regularizer_input.length() > 0 && read);
1077 
1078  if (model_file.files.size() > 0)
1079  {
1080  std::stringstream msg;
1081  msg << ":" << reg_vector << "\n";
1082  bin_text_read_write_fixed(model_file, (char*)&reg_vector, sizeof(reg_vector), "", read, msg, text);
1083 
1084  if (reg_vector)
1085  save_load_regularizer(*all, b, model_file, read, text);
1086  else
1087  GD::save_load_regressor(*all, model_file, read, text);
1088  }
1089 }
size_t length()
Definition: global_data.h:513
parameters weights
Definition: global_data.h:537
void initialize_regressor(vw &all, T &weights)
bool output_regularizer
Definition: bfgs.cc:89
uint64_t stride_shift(const stagewise_poly &poly, uint64_t idx)
float * mem
Definition: bfgs.cc:90
int m
Definition: bfgs.cc:65
double * alpha
Definition: bfgs.cc:92
bool quiet
Definition: global_data.h:487
uint32_t num_bits
Definition: global_data.h:398
weight * regularizers
Definition: bfgs.cc:94
size_t size() const
Definition: v_array.h:68
#define CG_EXTRA
Definition: bfgs.cc:29
int mem_stride
Definition: bfgs.cc:88
float l2_lambda
Definition: global_data.h:445
void save_load_regularizer(vw &all, bfgs &b, io_buf &model_file, bool read, bool text)
Definition: bfgs.cc:979
v_array< int > files
Definition: io_buf.h:64
std::string per_feature_regularizer_output
Definition: global_data.h:441
double net_time
Definition: bfgs.cc:75
vw * all
Definition: bfgs.cc:64
std::string per_feature_regularizer_text
Definition: global_data.h:442
float weight
double * rho
Definition: bfgs.cc:91
uint32_t stride_shift()
void reset_state(vw &all, bfgs &b, bool zero)
Definition: bfgs.cc:126
std::string per_feature_regularizer_input
Definition: global_data.h:440
void save_load_regressor(vw &all, io_buf &model_file, bool read, bool text, T &weights)
Definition: gd.cc:707
size_t bin_text_read_write_fixed(io_buf &io, char *data, size_t len, const char *read_message, bool read, std::stringstream &msg, bool text)
Definition: io_buf.h:326
#define THROW(args)
Definition: vw_exception.h:181

◆ save_load_regularizer()

void save_load_regularizer ( vw all,
bfgs b,
io_buf model_file,
bool  read,
bool  text 
)

Definition at line 979 of file bfgs.cc.

References bfgs::all, io_buf::bin_read_fixed(), bin_text_write_fixed(), c, vw::l2_lambda, vw::num_bits, bfgs::output_regularizer, preconditioner_to_regularizer(), regularizer_to_weight(), and bfgs::regularizers.

Referenced by save_load().

980 {
981  int c = 0;
982  uint32_t length = 2 * (1 << all.num_bits);
983  uint32_t i = 0;
984  size_t brw = 1;
985 
986  if (b.output_regularizer && !read)
987  preconditioner_to_regularizer(*(b.all), b, b.all->l2_lambda);
988 
989  do
990  {
991  brw = 1;
992  weight* v;
993  if (read)
994  {
995  c++;
996  brw = model_file.bin_read_fixed((char*)&i, sizeof(i), "");
997  if (brw > 0)
998  {
999  assert(i < length);
1000  v = &(b.regularizers[i]);
1001  brw += model_file.bin_read_fixed((char*)v, sizeof(*v), "");
1002  }
1003  }
1004  else // write binary or text
1005  {
1006  v = &(b.regularizers[i]);
1007  if (*v != 0.)
1008  {
1009  c++;
1010  std::stringstream msg;
1011  msg << i;
1012  brw = bin_text_write_fixed(model_file, (char*)&i, sizeof(i), msg, text);
1013 
1014  msg << ":" << *v << "\n";
1015  brw += bin_text_write_fixed(model_file, (char*)v, sizeof(*v), msg, text);
1016  }
1017  }
1018  if (!read)
1019  i++;
1020  } while ((!read && i < length) || (read && brw > 0));
1021 
1022  if (read)
1023  regularizer_to_weight(all, b);
1024 }
void preconditioner_to_regularizer(vw &all, bfgs &b, float regularization, T &weights)
Definition: bfgs.cc:538
bool output_regularizer
Definition: bfgs.cc:89
uint32_t num_bits
Definition: global_data.h:398
weight * regularizers
Definition: bfgs.cc:94
size_t bin_text_write_fixed(io_buf &io, char *data, size_t len, std::stringstream &msg, bool text)
Definition: io_buf.h:313
size_t bin_read_fixed(char *data, size_t len, const char *read_message)
Definition: io_buf.h:230
float l2_lambda
Definition: global_data.h:445
void regularizer_to_weight(vw &, bfgs &b, T &weights)
Definition: bfgs.cc:576
vw * all
Definition: bfgs.cc:64
float weight
constexpr uint64_t c
Definition: rand48.cc:12

◆ test_example()

constexpr bool test_example ( example ec)
noexcept

Definition at line 147 of file bfgs.cc.

Referenced by learn().

147 { return ec.l.simple.label == FLT_MAX; }
float label
Definition: simple_label.h:14
label_data simple
Definition: example.h:28
polylabel l
Definition: example.h:57

◆ update_preconditioner()

void update_preconditioner ( vw all,
example ec 
)

Definition at line 171 of file bfgs.cc.

References example::l, label_data::label, vw::loss, example::pred, polyprediction::scalar, vw::sd, loss_function::second_derivative(), polylabel::simple, and example::weight.

Referenced by process_example().

172 {
173  float curvature = all.loss->second_derivative(all.sd, ec.pred.scalar, ec.l.simple.label) * ec.weight;
174  GD::foreach_feature<float, add_precond>(all, ec, curvature);
175 }
loss_function * loss
Definition: global_data.h:523
float scalar
Definition: example.h:45
virtual float second_derivative(shared_data *, float prediction, float label)=0
float label
Definition: simple_label.h:14
label_data simple
Definition: example.h:28
shared_data * sd
Definition: global_data.h:375
polylabel l
Definition: example.h:57
polyprediction pred
Definition: example.h:60
float weight
Definition: example.h:62

◆ update_weight() [1/2]

template<class T >
void update_weight ( vw ,
float  step_size,
T &  w 
)

Definition at line 625 of file bfgs.cc.

References W_DIR, and W_XT.

Referenced by process_pass(), and update_weight().

626 {
627  for (typename T::iterator iter = w.begin(); iter != w.end(); ++iter)
628  (&(*iter))[W_XT] += step_size * (&(*iter))[W_DIR];
629 }
#define W_DIR
Definition: bfgs.cc:38
#define W_XT
Definition: bfgs.cc:36

◆ update_weight() [2/2]

void update_weight ( vw all,
float  step_size 
)

Definition at line 631 of file bfgs.cc.

References parameters::dense_weights, parameters::sparse, parameters::sparse_weights, update_weight(), and vw::weights.

632 {
633  if (all.weights.sparse)
634  update_weight(all, step_size, all.weights.sparse_weights);
635  else
636  update_weight(all, step_size, all.weights.dense_weights);
637 }
parameters weights
Definition: global_data.h:537
dense_parameters dense_weights
void update_weight(vw &, float step_size, T &w)
Definition: bfgs.cc:625
sparse_parameters sparse_weights

◆ wolfe_eval() [1/2]

template<class T >
double wolfe_eval ( vw all,
bfgs b,
float *  mem,
double  loss_sum,
double  previous_loss_sum,
double  step_size,
double  importance_weight_sum,
int &  origin,
double &  wolfe1,
T &  weights 
)

Definition at line 409 of file bfgs.cc.

References MEM_GT, bfgs::mem_stride, vw::quiet, W_COND, W_DIR, and W_GT.

Referenced by process_pass(), and wolfe_eval().

411 {
412  double g0_d = 0.;
413  double g1_d = 0.;
414  double g1_Hg1 = 0.;
415  double g1_g1 = 0.;
416 
417  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
418  {
419  float* mem1 = mem + (w.index() >> weights.stride_shift()) * b.mem_stride;
420  g0_d += ((double)mem1[(MEM_GT + origin) % b.mem_stride]) * ((&(*w))[W_DIR]);
421  g1_d += ((double)(&(*w))[W_GT]) * (&(*w))[W_DIR];
422  g1_Hg1 += ((double)(&(*w))[W_GT]) * (&(*w))[W_GT] * ((&(*w))[W_COND]);
423  g1_g1 += ((double)(&(*w))[W_GT]) * (&(*w))[W_GT];
424  }
425 
426  wolfe1 = (loss_sum - previous_loss_sum) / (step_size * g0_d);
427  double wolfe2 = g1_d / g0_d;
428  // double new_step_cross = (loss_sum-previous_loss_sum-g1_d*step)/(g0_d-g1_d);
429 
430  if (!all.quiet)
431  fprintf(stderr, "%-10.5f\t%-10.5f\t%s%-10f\t%-10f\t", g1_g1 / (importance_weight_sum * importance_weight_sum),
432  g1_Hg1 / importance_weight_sum, " ", wolfe1, wolfe2);
433  return 0.5 * step_size;
434 }
#define W_GT
Definition: bfgs.cc:37
#define W_DIR
Definition: bfgs.cc:38
bool quiet
Definition: global_data.h:487
int mem_stride
Definition: bfgs.cc:88
#define W_COND
Definition: bfgs.cc:39
#define MEM_GT
Definition: bfgs.cc:31

◆ wolfe_eval() [2/2]

double wolfe_eval ( vw all,
bfgs b,
float *  mem,
double  loss_sum,
double  previous_loss_sum,
double  step_size,
double  importance_weight_sum,
int &  origin,
double &  wolfe1 
)

Definition at line 436 of file bfgs.cc.

References parameters::dense_weights, parameters::sparse, parameters::sparse_weights, vw::weights, and wolfe_eval().

438 {
439  if (all.weights.sparse)
440  return wolfe_eval(all, b, mem, loss_sum, previous_loss_sum, step_size, importance_weight_sum, origin, wolfe1,
441  all.weights.sparse_weights);
442  else
443  return wolfe_eval(all, b, mem, loss_sum, previous_loss_sum, step_size, importance_weight_sum, origin, wolfe1,
444  all.weights.dense_weights);
445 }
parameters weights
Definition: global_data.h:537
double wolfe_eval(vw &all, bfgs &b, float *mem, double loss_sum, double previous_loss_sum, double step_size, double importance_weight_sum, int &origin, double &wolfe1, T &weights)
Definition: bfgs.cc:409
dense_parameters dense_weights
sparse_parameters sparse_weights

◆ zero_derivative()

void zero_derivative ( vw all)

Definition at line 122 of file bfgs.cc.

References parameters::set_zero(), W_GT, and vw::weights.

Referenced by process_pass(), and reset_state().

122 { all.weights.set_zero(W_GT); }
parameters weights
Definition: global_data.h:537
#define W_GT
Definition: bfgs.cc:37
void set_zero(size_t offset)

◆ zero_preconditioner()

void zero_preconditioner ( vw all)

Definition at line 124 of file bfgs.cc.

References parameters::set_zero(), W_COND, and vw::weights.

Referenced by end_pass(), and reset_state().

124 { all.weights.set_zero(W_COND); }
parameters weights
Definition: global_data.h:537
void set_zero(size_t offset)
#define W_COND
Definition: bfgs.cc:39

◆ zero_state()

void zero_state ( vw all)

Definition at line 597 of file bfgs.cc.

References parameters::set_zero(), W_COND, W_DIR, W_GT, and vw::weights.

598 {
599  all.weights.set_zero(W_GT);
600  all.weights.set_zero(W_DIR);
601  all.weights.set_zero(W_COND);
602 }
parameters weights
Definition: global_data.h:537
#define W_GT
Definition: bfgs.cc:37
#define W_DIR
Definition: bfgs.cc:38
void set_zero(size_t offset)
#define W_COND
Definition: bfgs.cc:39

Variable Documentation

◆ curv_ex

curv_exception curv_ex

Referenced by bfgs_iter_middle().

◆ curv_message

constexpr const char* curv_message
Initial value:
=
"Zero or negative curvature detected.\n"
"To increase curvature you can increase regularization or rescale features.\n"
"It is also possible that you have reached numerical accuracy\n"
"and further decrease in the objective cannot be reliably detected.\n"

Definition at line 116 of file bfgs.cc.

Referenced by process_pass().

◆ max_precond_ratio

constexpr float max_precond_ratio = 10000.f

Definition at line 60 of file bfgs.cc.

Referenced by finalize_preconditioner().