Vowpal Wabbit
bfgs.cc
1 /*
2 Copyright (c) by respective owners including Yahoo!, Microsoft, and
3 individual contributors. All rights reserved. Released under a BSD (revised)
4 license as described in the file LICENSE.
5  */
6 /*
7 The algorithm here is generally based on Nocedal 1980, Liu and Nocedal 1989.
8 Implementation by Miro Dudik.
9  */
10 #include <cmath>
11 #include <fstream>
12 #include <float.h>
13 #ifndef _WIN32
14 #include <netdb.h>
15 #endif
16 #include <string.h>
17 #include <stdio.h>
18 #include <assert.h>
19 #include <sys/timeb.h>
20 #include "accumulate.h"
21 #include "reductions.h"
22 #include "gd.h"
23 #include "vw_exception.h"
24 #include <exception>
25 
26 using namespace LEARNER;
27 using namespace VW::config;
28 
29 #define CG_EXTRA 1
30 
31 #define MEM_GT 0
32 #define MEM_XT 1
33 #define MEM_YT 0
34 #define MEM_ST 1
35 
36 #define W_XT 0
37 #define W_GT 1
38 #define W_DIR 2
39 #define W_COND 3
40 
41 #define LEARN_OK 0
42 #define LEARN_CURV 1
43 #define LEARN_CONV 2
44 
45 class curv_exception : public std::exception
46 {
47 } curv_ex;
48 
49 /********************************************************************/
50 /* mem & w definition ***********************************************/
51 /********************************************************************/
52 // mem[2*i] = y_t
53 // mem[2*i+1] = s_t
54 //
55 // w[0] = weight
56 // w[1] = accumulated first derivative
57 // w[2] = step direction
58 // w[3] = preconditioner
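//
// mem holds mem_stride floats per weight: with --mem m > 0 it is a ring buffer of
// the last m (y_t, s_t) pairs consumed by the L-BFGS two-loop recursion (origin is
// rotated by 2 each iteration so the newest pair replaces the oldest); with m == 0
// only the previous gradient is kept (CG_EXTRA) for conjugate gradient.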
59 
60 constexpr float max_precond_ratio = 10000.f;
61 
62 struct bfgs
63 {
64  vw* all; // prediction, regressor
65  int m;
66  float rel_threshold; // termination threshold
67 
68  double wolfe1_bound;
69 
70  size_t final_pass;
71  struct timeb t_start, t_end;
72  double net_comm_time;
73 
74  struct timeb t_start_global, t_end_global;
75  double net_time;
76 
 77  v_array<float> predictions;
 78  size_t example_number;
 79  size_t current_pass;
 80  size_t no_win_counter;
 81  size_t early_stop_thres;
 82 
 83  // default transition behavior
 84  bool first_hessian_on;
 85  bool backstep_on;
 86 
 87  // set by initializer
 88  int mem_stride;
 89  bool output_regularizer;
 90  float* mem;
 91  double* rho;
 92  double* alpha;
 93 
 94  weight* regularizers;
 95  // the below needs to be included when resetting, in addition to preconditioner and derivative
 96  int lastj, origin;
 97  double loss_sum, previous_loss_sum;
 98  float step_size;
 99  double importance_weight_sum;
 100  double curvature;
 101 
 102  // first pass specification
 103  bool first_pass;
 104  bool gradient_pass;
 105  bool preconditioner_pass;
 106 
 107  ~bfgs()
 108  {
109  predictions.delete_v();
110  free(mem);
111  free(rho);
112  free(alpha);
113  }
114 };
115 
116 constexpr const char* curv_message =
117  "Zero or negative curvature detected.\n"
118  "To increase curvature you can increase regularization or rescale features.\n"
119  "It is also possible that you have reached numerical accuracy\n"
120  "and further decrease in the objective cannot be reliably detected.\n";
121 
122 void zero_derivative(vw& all) { all.weights.set_zero(W_GT); }
123 
 124 void zero_preconditioner(vw& all) { all.weights.set_zero(W_COND); }
 125 
126 void reset_state(vw& all, bfgs& b, bool zero)
127 {
128  b.lastj = b.origin = 0;
129  b.loss_sum = b.previous_loss_sum = 0.;
130  b.importance_weight_sum = 0.;
131  b.curvature = 0.;
132  b.first_pass = true;
133  b.gradient_pass = true;
134  b.preconditioner_pass = true;
135  if (zero)
136  {
137  zero_derivative(all);
138  zero_preconditioner(all);
139  }
140 }
141 
142 // w[0] = weight
143 // w[1] = accumulated first derivative
144 // w[2] = step direction
145 // w[3] = preconditioner
146 
147 constexpr bool test_example(example& ec) noexcept { return ec.l.simple.label == FLT_MAX; }
148 
149 float bfgs_predict(vw& all, example& ec)
150 {
 151  ec.partial_prediction = GD::inline_predict(all, ec);
 152  return GD::finalize_prediction(all.sd, ec.partial_prediction);
 153 }
154 
155 inline void add_grad(float& d, float f, float& fw) { (&fw)[W_GT] += d * f; }
156 
 157 float predict_and_gradient(vw& all, example& ec)
 158 {
159  float fp = bfgs_predict(all, ec);
160  label_data& ld = ec.l.simple;
161  all.set_minmax(all.sd, ld.label);
162 
163  float loss_grad = all.loss->first_derivative(all.sd, fp, ld.label) * ec.weight;
164  GD::foreach_feature<float, add_grad>(all, ec, loss_grad);
165 
166  return fp;
167 }
168 
169 inline void add_precond(float& d, float f, float& fw) { (&fw)[W_COND] += d * f * f; }
170 
 171 void update_preconditioner(vw& all, example& ec)
 172 {
173  float curvature = all.loss->second_derivative(all.sd, ec.pred.scalar, ec.l.simple.label) * ec.weight;
174  GD::foreach_feature<float, add_precond>(all, ec, curvature);
175 }
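
 // The preconditioner slot w[3] accumulates the per-feature diagonal curvature
 // sum_i x_i^2 * loss''_i over examples; finalize_preconditioner later adds the
 // regularizer and inverts each entry, so w[3] ends up approximating the inverse
 // diagonal of the Hessian.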
176 
177 inline void add_DIR(float& p, const float fx, float& fw) { p += (&fw)[W_DIR] * fx; }
178 
179 float dot_with_direction(vw& all, example& ec)
180 {
181  float temp = ec.l.simple.initial;
182  GD::foreach_feature<float, add_DIR>(all, ec, temp);
183  return temp;
184 }
185 
186 template <class T>
187 double regularizer_direction_magnitude(vw& /* all */, bfgs& b, double regularizer, T& weights)
188 {
189  double ret = 0.;
190  if (b.regularizers == nullptr)
191  for (typename T::iterator iter = weights.begin(); iter != weights.end(); ++iter)
192  ret += regularizer * (&(*iter))[W_DIR] * (&(*iter))[W_DIR];
193 
194  else
195  {
196  for (typename T::iterator iter = weights.begin(); iter != weights.end(); ++iter)
197  ret += ((double)b.regularizers[2 * (iter.index() >> weights.stride_shift())]) * (&(*iter))[W_DIR] *
198  (&(*iter))[W_DIR];
199  }
200  return ret;
201 }
202 
203 double regularizer_direction_magnitude(vw& all, bfgs& b, float regularizer)
204 {
205  // compute direction magnitude
206  double ret = 0.;
207 
208  if (regularizer == 0.)
209  return ret;
210 
211  if (all.weights.sparse)
212  return regularizer_direction_magnitude(all, b, regularizer, all.weights.sparse_weights);
213  else
214  return regularizer_direction_magnitude(all, b, regularizer, all.weights.dense_weights);
215 }
216 
217 template <class T>
218 float direction_magnitude(vw& /* all */, T& weights)
219 {
220  // compute direction magnitude
221  double ret = 0.;
222  for (typename T::iterator iter = weights.begin(); iter != weights.end(); ++iter)
223  ret += ((double)(&(*iter))[W_DIR]) * (&(*iter))[W_DIR];
224 
225  return (float)ret;
226 }
227 
 228 double direction_magnitude(vw& all)
 229 {
230  // compute direction magnitude
231  if (all.weights.sparse)
232  return direction_magnitude(all, all.weights.sparse_weights);
233  else
234  return direction_magnitude(all, all.weights.dense_weights);
235 }
236 
237 template <class T>
238 void bfgs_iter_start(vw& all, bfgs& b, float* mem, int& lastj, double importance_weight_sum, int& origin, T& weights)
239 {
240  double g1_Hg1 = 0.;
241  double g1_g1 = 0.;
242 
243  origin = 0;
244  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
245  {
246  float* mem1 = mem + (w.index() >> weights.stride_shift()) * b.mem_stride;
247  if (b.m > 0)
248  mem1[(MEM_XT + origin) % b.mem_stride] = (&(*w))[W_XT];
249  mem1[(MEM_GT + origin) % b.mem_stride] = (&(*w))[W_GT];
250  g1_Hg1 += ((double)(&(*w))[W_GT]) * ((&(*w))[W_GT]) * ((&(*w))[W_COND]);
251  g1_g1 += ((double)((&(*w))[W_GT])) * ((&(*w))[W_GT]);
252  (&(*w))[W_DIR] = -(&(*w))[W_COND] * ((&(*w))[W_GT]);
253  ((&(*w))[W_GT]) = 0;
254  }
255  lastj = 0;
256  if (!all.quiet)
257  fprintf(stderr, "%-10.5f\t%-10.5f\t%-10s\t%-10s\t%-10s\t", g1_g1 / (importance_weight_sum * importance_weight_sum),
258  g1_Hg1 / importance_weight_sum, "", "", "");
259 }
260 
261 void bfgs_iter_start(vw& all, bfgs& b, float* mem, int& lastj, double importance_weight_sum, int& origin)
262 {
263  if (all.weights.sparse)
264  bfgs_iter_start(all, b, mem, lastj, importance_weight_sum, origin, all.weights.sparse_weights);
265  else
266  bfgs_iter_start(all, b, mem, lastj, importance_weight_sum, origin, all.weights.dense_weights);
267 }
268 
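// bfgs_iter_middle computes the next search direction. With m == 0 it performs
// preconditioned nonlinear conjugate gradient (essentially a preconditioned
// Polak-Ribiere beta, clipped at zero). Otherwise it runs the classic L-BFGS
// two-loop recursion of Nocedal: the forward loop subtracts alpha_j * y_j from
// the direction, the result is scaled by gamma = (y's)/(y'Hy) times the diagonal
// preconditioner, the backward loop adds (alpha_j - rho_j * y'r) * s_j, and the
// final negation yields a descent direction. The (y, s) ring buffer and rho are
// then shifted to make room for the next pair.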
269 template <class T>
270 void bfgs_iter_middle(vw& all, bfgs& b, float* mem, double* rho, double* alpha, int& lastj, int& origin, T& weights)
271 {
272  float* mem0 = mem;
273  uint32_t length = 1 << all.num_bits;
274  // implement conjugate gradient
275  if (b.m == 0)
276  {
277  double g_Hy = 0.;
278  double g_Hg = 0.;
279  double y = 0.;
280 
281  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
282  {
283  mem = mem0 + (w.index() >> weights.stride_shift()) * b.mem_stride;
284  y = (&(*w))[W_GT] - mem[(MEM_GT + origin) % b.mem_stride];
285  g_Hy += ((double)(&(*w))[W_GT]) * ((&(*w))[W_COND]) * y;
286  g_Hg +=
287  ((double)mem[(MEM_GT + origin) % b.mem_stride]) * ((&(*w))[W_COND]) * mem[(MEM_GT + origin) % b.mem_stride];
288  }
289 
290  float beta = (float)(g_Hy / g_Hg);
291 
292  if (beta < 0.f || std::isnan(beta))
293  beta = 0.f;
294 
295  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
296  {
297  mem = mem0 + (w.index() >> weights.stride_shift()) * b.mem_stride;
298  mem[(MEM_GT + origin) % b.mem_stride] = (&(*w))[W_GT];
299 
300  (&(*w))[W_DIR] *= beta;
301  (&(*w))[W_DIR] -= ((&(*w))[W_COND]) * ((&(*w))[W_GT]);
302  (&(*w))[W_GT] = 0;
303  }
304  if (!all.quiet)
305  fprintf(stderr, "%f\t", beta);
306  return;
307 
308  mem = mem0 + (length - 1) * b.mem_stride;
309  }
310  else
311  {
312  if (!all.quiet)
313  fprintf(stderr, "%-10s\t", "");
314  }
315 
316  // implement bfgs
317  double y_s = 0.;
318  double y_Hy = 0.;
319  double s_q = 0.;
320 
321  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
322  {
323  float* mem1 = mem + (w.index() >> weights.stride_shift()) * b.mem_stride;
324  mem1[(MEM_YT + origin) % b.mem_stride] = (&(*w))[W_GT] - mem1[(MEM_GT + origin) % b.mem_stride];
325  mem1[(MEM_ST + origin) % b.mem_stride] = (&(*w))[W_XT] - mem1[(MEM_XT + origin) % b.mem_stride];
326  (&(*w))[W_DIR] = (&(*w))[W_GT];
327  y_s += ((double)mem1[(MEM_YT + origin) % b.mem_stride]) * mem1[(MEM_ST + origin) % b.mem_stride];
328  y_Hy +=
329  ((double)mem1[(MEM_YT + origin) % b.mem_stride]) * mem1[(MEM_YT + origin) % b.mem_stride] * ((&(*w))[W_COND]);
330  s_q += ((double)mem1[(MEM_ST + origin) % b.mem_stride]) * ((&(*w))[W_GT]);
331  }
332 
333  if (y_s <= 0. || y_Hy <= 0.)
334  throw curv_ex;
335  rho[0] = 1 / y_s;
336 
337  float gamma = (float)(y_s / y_Hy);
338 
339  for (int j = 0; j < lastj; j++)
340  {
341  alpha[j] = rho[j] * s_q;
342  s_q = 0.;
343  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
344  {
345  mem = mem0 + (w.index() >> weights.stride_shift()) * b.mem_stride;
346  (&(*w))[W_DIR] -= (float)alpha[j] * mem[(2 * j + MEM_YT + origin) % b.mem_stride];
347  s_q += ((double)mem[(2 * j + 2 + MEM_ST + origin) % b.mem_stride]) * ((&(*w))[W_DIR]);
348  }
349  }
350 
351  alpha[lastj] = rho[lastj] * s_q;
352  double y_r = 0.;
353 
354  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
355  {
356  mem = mem0 + (w.index() >> weights.stride_shift()) * b.mem_stride;
357  (&(*w))[W_DIR] -= (float)alpha[lastj] * mem[(2 * lastj + MEM_YT + origin) % b.mem_stride];
358  (&(*w))[W_DIR] *= gamma * ((&(*w))[W_COND]);
359  y_r += ((double)mem[(2 * lastj + MEM_YT + origin) % b.mem_stride]) * ((&(*w))[W_DIR]);
360  }
361 
362  double coef_j;
363 
364  for (int j = lastj; j > 0; j--)
365  {
366  coef_j = alpha[j] - rho[j] * y_r;
367  y_r = 0.;
368  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
369  {
370  mem = mem0 + (w.index() >> weights.stride_shift()) * b.mem_stride;
371  (&(*w))[W_DIR] += (float)coef_j * mem[(2 * j + MEM_ST + origin) % b.mem_stride];
372  y_r += ((double)mem[(2 * j - 2 + MEM_YT + origin) % b.mem_stride]) * ((&(*w))[W_DIR]);
373  }
374  }
375 
376  coef_j = alpha[0] - rho[0] * y_r;
377  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
378  {
379  mem = mem0 + (w.index() >> weights.stride_shift()) * b.mem_stride;
380  (&(*w))[W_DIR] = -(&(*w))[W_DIR] - (float)coef_j * mem[(MEM_ST + origin) % b.mem_stride];
381  }
382 
383  /*********************
384  ** shift
385  ********************/
386 
387  lastj = (lastj < b.m - 1) ? lastj + 1 : b.m - 1;
388  origin = (origin + b.mem_stride - 2) % b.mem_stride;
389 
390  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
391  {
392  mem = mem0 + (w.index() >> weights.stride_shift()) * b.mem_stride;
393  mem[(MEM_GT + origin) % b.mem_stride] = (&(*w))[W_GT];
394  mem[(MEM_XT + origin) % b.mem_stride] = (&(*w))[W_XT];
395  (&(*w))[W_GT] = 0;
396  }
397  for (int j = lastj; j > 0; j--) rho[j] = rho[j - 1];
398 }
399 
400 void bfgs_iter_middle(vw& all, bfgs& b, float* mem, double* rho, double* alpha, int& lastj, int& origin)
401 {
402  if (all.weights.sparse)
403  bfgs_iter_middle(all, b, mem, rho, alpha, lastj, origin, all.weights.sparse_weights);
404  else
405  bfgs_iter_middle(all, b, mem, rho, alpha, lastj, origin, all.weights.dense_weights);
406 }
407 
408 template <class T>
409 double wolfe_eval(vw& all, bfgs& b, float* mem, double loss_sum, double previous_loss_sum, double step_size,
410  double importance_weight_sum, int& origin, double& wolfe1, T& weights)
411 {
412  double g0_d = 0.;
413  double g1_d = 0.;
414  double g1_Hg1 = 0.;
415  double g1_g1 = 0.;
416 
417  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
418  {
419  float* mem1 = mem + (w.index() >> weights.stride_shift()) * b.mem_stride;
420  g0_d += ((double)mem1[(MEM_GT + origin) % b.mem_stride]) * ((&(*w))[W_DIR]);
421  g1_d += ((double)(&(*w))[W_GT]) * (&(*w))[W_DIR];
422  g1_Hg1 += ((double)(&(*w))[W_GT]) * (&(*w))[W_GT] * ((&(*w))[W_COND]);
423  g1_g1 += ((double)(&(*w))[W_GT]) * (&(*w))[W_GT];
424  }
425 
426  wolfe1 = (loss_sum - previous_loss_sum) / (step_size * g0_d);
427  double wolfe2 = g1_d / g0_d;
428  // double new_step_cross = (loss_sum-previous_loss_sum-g1_d*step)/(g0_d-g1_d);
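  // wolfe1 is the ratio of the actual decrease in loss to the first-order predicted
  // decrease step_size * g0'd (the sufficient-decrease ratio checked against
  // wolfe1_bound in process_pass); wolfe2 = g1'd / g0'd is the curvature ratio.
  // The function only reports them and returns half the current step as the
  // backtracking proposal used when the line search fails.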
429 
430  if (!all.quiet)
431  fprintf(stderr, "%-10.5f\t%-10.5f\t%s%-10f\t%-10f\t", g1_g1 / (importance_weight_sum * importance_weight_sum),
432  g1_Hg1 / importance_weight_sum, " ", wolfe1, wolfe2);
433  return 0.5 * step_size;
434 }
435 
436 double wolfe_eval(vw& all, bfgs& b, float* mem, double loss_sum, double previous_loss_sum, double step_size,
437  double importance_weight_sum, int& origin, double& wolfe1)
438 {
439  if (all.weights.sparse)
440  return wolfe_eval(all, b, mem, loss_sum, previous_loss_sum, step_size, importance_weight_sum, origin, wolfe1,
441  all.weights.sparse_weights);
442  else
443  return wolfe_eval(all, b, mem, loss_sum, previous_loss_sum, step_size, importance_weight_sum, origin, wolfe1,
444  all.weights.dense_weights);
445 }
446 
447 template <class T>
448 double add_regularization(vw& all, bfgs& b, float regularization, T& weights)
449 {
450  // compute the derivative difference
451  double ret = 0.;
452 
453  if (b.regularizers == nullptr)
454  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
455  {
456  (&(*w))[W_GT] += regularization * (*w);
457  ret += 0.5 * regularization * (*w) * (*w);
458  }
459  else
460  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
461  {
462  uint64_t i = w.index() >> weights.stride_shift();
463  weight delta_weight = *w - b.regularizers[2 * i + 1];
464  (&(*w))[W_GT] += b.regularizers[2 * i] * delta_weight;
465  ret += 0.5 * b.regularizers[2 * i] * delta_weight * delta_weight;
466  }
467 
468  // if we're not regularizing the intercept term, then subtract it off from the result above
469  // when accessing weights[constant], always use weights.strided_index(constant)
470  if (all.no_bias)
471  {
472  if (b.regularizers == nullptr)
473  {
474  (&weights.strided_index(constant))[W_GT] -= regularization * (weights.strided_index(constant));
475  ret -= 0.5 * regularization * (weights.strided_index(constant)) * (weights.strided_index(constant));
476  }
477  else
478  {
479  uint64_t i = constant >> weights.stride_shift();
480  weight delta_weight = (weights.strided_index(constant)) - b.regularizers[2 * i + 1];
481  (&weights.strided_index(constant))[W_GT] -= b.regularizers[2 * i] * delta_weight;
482  ret -= 0.5 * b.regularizers[2 * i] * delta_weight * delta_weight;
483  }
484  }
485 
486  return ret;
487 }
488 
489 double add_regularization(vw& all, bfgs& b, float regularization)
490 {
491  if (all.weights.sparse)
492  return add_regularization(all, b, regularization, all.weights.sparse_weights);
493  else
494  return add_regularization(all, b, regularization, all.weights.dense_weights);
495 }
496 
497 template <class T>
498 void finalize_preconditioner(vw& /* all */, bfgs& b, float regularization, T& weights)
499 {
500  float max_hessian = 0.f;
501 
502  if (b.regularizers == nullptr)
503  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
504  {
505  (&(*w))[W_COND] += regularization;
506  if ((&(*w))[W_COND] > max_hessian)
507  max_hessian = (&(*w))[W_COND];
508  if ((&(*w))[W_COND] > 0)
509  (&(*w))[W_COND] = 1.f / (&(*w))[W_COND];
510  }
511  else
512  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
513  {
514  (&(*w))[W_COND] += b.regularizers[2 * (w.index() >> weights.stride_shift())];
515  if ((&(*w))[W_COND] > max_hessian)
516  max_hessian = (&(*w))[W_COND];
517  if ((&(*w))[W_COND] > 0)
518  (&(*w))[W_COND] = 1.f / (&(*w))[W_COND];
519  }
520 
521  float max_precond = (max_hessian == 0.f) ? 0.f : max_precond_ratio / max_hessian;
522 
523  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
524  {
525  if (std::isinf(*w) || *w > max_precond)
526  (&(*w))[W_COND] = max_precond;
527  }
528 }
529 void finalize_preconditioner(vw& all, bfgs& b, float regularization)
530 {
531  if (all.weights.sparse)
532  finalize_preconditioner(all, b, regularization, all.weights.sparse_weights);
533  else
534  finalize_preconditioner(all, b, regularization, all.weights.dense_weights);
535 }
536 
537 template <class T>
538 void preconditioner_to_regularizer(vw& all, bfgs& b, float regularization, T& weights)
539 {
540  uint32_t length = 1 << all.num_bits;
541 
542  if (b.regularizers == nullptr)
543  {
544  b.regularizers = calloc_or_throw<weight>(2 * length);
545 
546  if (b.regularizers == nullptr)
547  THROW("Failed to allocate weight array: try decreasing -b <bits>");
548 
549  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
550  {
551  uint64_t i = w.index() >> weights.stride_shift();
552  b.regularizers[2 * i] = regularization;
553  if ((&(*w))[W_COND] > 0.f)
554  b.regularizers[2 * i] += 1.f / (&(*w))[W_COND];
555  }
556  }
557  else
558  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
559  {
560  if ((&(*w))[W_COND] > 0.f)
561  b.regularizers[2 * (w.index() >> weights.stride_shift())] += 1.f / (&(*w))[W_COND];
562  }
563 
564  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
565  b.regularizers[2 * (w.index() >> weights.stride_shift()) + 1] = *w;
566 }
567 void preconditioner_to_regularizer(vw& all, bfgs& b, float regularization)
568 {
569  if (all.weights.sparse)
570  preconditioner_to_regularizer(all, b, regularization, all.weights.sparse_weights);
571  else
572  preconditioner_to_regularizer(all, b, regularization, all.weights.dense_weights);
573 }
574 
575 template <class T>
576 void regularizer_to_weight(vw& /* all */, bfgs& b, T& weights)
577 {
578  if (b.regularizers != nullptr)
579  {
580  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
581  {
582  uint64_t i = w.index() >> weights.stride_shift();
583  (&(*w))[W_COND] = b.regularizers[2 * i];
584  *w = b.regularizers[2 * i + 1];
585  }
586  }
587 }
588 
 589 void regularizer_to_weight(vw& all, bfgs& b)
 590 {
 591  if (all.weights.sparse)
 592  regularizer_to_weight(all, b, all.weights.sparse_weights);
 593  else
 594  regularizer_to_weight(all, b, all.weights.dense_weights);
 595 }
596 
597 void zero_state(vw& all)
598 {
599  all.weights.set_zero(W_GT);
600  all.weights.set_zero(W_DIR);
601  all.weights.set_zero(W_COND);
602 }
603 
604 template <class T>
605 double derivative_in_direction(vw& /* all */, bfgs& b, float* mem, int& origin, T& weights)
606 {
607  double ret = 0.;
608  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
609  {
610  float* mem1 = mem + (w.index() >> weights.stride_shift()) * b.mem_stride;
611  ret += ((double)mem1[(MEM_GT + origin) % b.mem_stride]) * (&(*w))[W_DIR];
612  }
613  return ret;
614 }
615 
616 double derivative_in_direction(vw& all, bfgs& b, float* mem, int& origin)
617 {
618  if (all.weights.sparse)
619  return derivative_in_direction(all, b, mem, origin, all.weights.sparse_weights);
620  else
621  return derivative_in_direction(all, b, mem, origin, all.weights.dense_weights);
622 }
623 
624 template <class T>
625 void update_weight(vw& /* all */, float step_size, T& w)
626 {
627  for (typename T::iterator iter = w.begin(); iter != w.end(); ++iter)
628  (&(*iter))[W_XT] += step_size * (&(*iter))[W_DIR];
629 }
630 
631 void update_weight(vw& all, float step_size)
632 {
633  if (all.weights.sparse)
634  update_weight(all, step_size, all.weights.sparse_weights);
635  else
636  update_weight(all, step_size, all.weights.dense_weights);
637 }
638 
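// process_pass runs once per pass over the data. The first pass accumulates the
// gradient and the preconditioner and either starts a curvature pass or takes a
// fixed initial step; afterwards passes alternate between gradient passes
// (evaluate the line search via wolfe_eval, then build a new direction with
// bfgs_iter_middle) and, when hessian_on is set, curvature passes that measure
// d'Hd and produce a Newton-style step along the direction. The return value
// (LEARN_OK / LEARN_CURV / LEARN_CONV) tells end_pass whether to keep optimizing.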
639 int process_pass(vw& all, bfgs& b)
640 {
641  int status = LEARN_OK;
642 
643  finalize_preconditioner(all, b, all.l2_lambda);
644  /********************************************************************/
645  /* A) FIRST PASS FINISHED: INITIALIZE FIRST LINE SEARCH *************/
646  /********************************************************************/
647  if (b.first_pass)
648  {
649  if (all.all_reduce != nullptr)
650  {
651  accumulate(all, all.weights, W_COND); // Accumulate preconditioner
652  float temp = (float)b.importance_weight_sum;
 653  b.importance_weight_sum = accumulate_scalar(all, temp);
 654  }
655  // finalize_preconditioner(all, b, all.l2_lambda);
656  if (all.all_reduce != nullptr)
657  {
658  float temp = (float)b.loss_sum;
659  b.loss_sum = accumulate_scalar(all, temp); // Accumulate loss_sums
660  accumulate(all, all.weights, 1); // Accumulate gradients from all nodes
661  }
662  if (all.l2_lambda > 0.)
663  b.loss_sum += add_regularization(all, b, all.l2_lambda);
664  if (!all.quiet)
665  fprintf(stderr, "%2lu %-10.5f\t", (long unsigned int)b.current_pass + 1, b.loss_sum / b.importance_weight_sum);
666 
 667  b.previous_loss_sum = b.loss_sum;
 668  b.loss_sum = 0.;
669  b.example_number = 0;
670  b.curvature = 0;
 671  bfgs_iter_start(all, b, b.mem, b.lastj, b.importance_weight_sum, b.origin);
 672  if (b.first_hessian_on)
673  {
674  b.gradient_pass = false; // now start computing curvature
675  }
676  else
677  {
678  b.step_size = 0.5;
679  float d_mag = direction_magnitude(all);
680  ftime(&b.t_end_global);
681  b.net_time = (int)(1000.0 * (b.t_end_global.time - b.t_start_global.time) +
682  (b.t_end_global.millitm - b.t_start_global.millitm));
683  if (!all.quiet)
684  fprintf(stderr, "%-10s\t%-10.5f\t%-.5f\n", "", d_mag, b.step_size);
685  b.predictions.clear();
686  update_weight(all, b.step_size);
687  }
688  }
689  else
690  /********************************************************************/
691  /* B) GRADIENT CALCULATED *******************************************/
692  /********************************************************************/
693  if (b.gradient_pass) // We just finished computing all gradients
694  {
695  if (all.all_reduce != nullptr)
696  {
697  float t = (float)b.loss_sum;
698  b.loss_sum = accumulate_scalar(all, t); // Accumulate loss_sums
699  accumulate(all, all.weights, 1); // Accumulate gradients from all nodes
700  }
701  if (all.l2_lambda > 0.)
702  b.loss_sum += add_regularization(all, b, all.l2_lambda);
703  if (!all.quiet)
704  {
705  if (!all.holdout_set_off && b.current_pass >= 1)
706  {
 707  if (all.sd->holdout_sum_loss_since_last_pass == 0. && all.sd->weighted_holdout_examples_since_last_pass == 0.)
 708  {
709  fprintf(stderr, "%2lu ", (long unsigned int)b.current_pass + 1);
710  fprintf(stderr, "h unknown ");
711  }
712  else
713  fprintf(stderr, "%2lu h%-10.5f\t", (long unsigned int)b.current_pass + 1,
 714  all.sd->holdout_sum_loss_since_last_pass / all.sd->weighted_holdout_examples_since_last_pass);
 715  }
716  else
717  fprintf(stderr, "%2lu %-10.5f\t", (long unsigned int)b.current_pass + 1, b.loss_sum / b.importance_weight_sum);
718  }
719  double wolfe1;
720  double new_step = wolfe_eval(
721  all, b, b.mem, b.loss_sum, b.previous_loss_sum, b.step_size, b.importance_weight_sum, b.origin, wolfe1);
722 
723  /********************************************************************/
724  /* B0) DERIVATIVE ZERO: MINIMUM FOUND *******************************/
725  /********************************************************************/
726  if (std::isnan((float)wolfe1))
727  {
728  fprintf(stderr, "\n");
729  fprintf(stdout, "Derivative 0 detected.\n");
730  b.step_size = 0.0;
731  status = LEARN_CONV;
732  }
733  /********************************************************************/
734  /* B1) LINE SEARCH FAILED *******************************************/
735  /********************************************************************/
736  else if (b.backstep_on && (wolfe1 < b.wolfe1_bound || b.loss_sum > b.previous_loss_sum))
737  {
738  // curvature violated, or we stepped too far last time: step back
739  ftime(&b.t_end_global);
740  b.net_time = (int)(1000.0 * (b.t_end_global.time - b.t_start_global.time) +
741  (b.t_end_global.millitm - b.t_start_global.millitm));
742  float ratio = (b.step_size == 0.f) ? 0.f : (float)new_step / (float)b.step_size;
743  if (!all.quiet)
744  fprintf(stderr, "%-10s\t%-10s\t(revise x %.1f)\t%-.5f\n", "", "", ratio, new_step);
745  b.predictions.clear();
746  update_weight(all, (float)(-b.step_size + new_step));
747  b.step_size = (float)new_step;
748  zero_derivative(all);
749  b.loss_sum = 0.;
750  }
751 
752  /********************************************************************/
753  /* B2) LINE SEARCH SUCCESSFUL OR DISABLED ******************/
754  /* DETERMINE NEXT SEARCH DIRECTION ******************/
755  /********************************************************************/
756  else
757  {
758  double rel_decrease = (b.previous_loss_sum - b.loss_sum) / b.previous_loss_sum;
759  if (!std::isnan((float)rel_decrease) && b.backstep_on && fabs(rel_decrease) < b.rel_threshold)
760  {
761  fprintf(stdout,
762  "\nTermination condition reached in pass %ld: decrease in loss less than %.3f%%.\n"
763  "If you want to optimize further, decrease termination threshold.\n",
764  (long int)b.current_pass + 1, b.rel_threshold * 100.0);
765  status = LEARN_CONV;
766  }
 767  b.previous_loss_sum = b.loss_sum;
 768  b.loss_sum = 0.;
769  b.example_number = 0;
770  b.curvature = 0;
771  b.step_size = 1.0;
772 
773  try
774  {
775  bfgs_iter_middle(all, b, b.mem, b.rho, b.alpha, b.lastj, b.origin);
776  }
777  catch (const curv_exception&)
778  {
779  fprintf(stdout, "In bfgs_iter_middle: %s", curv_message);
780  b.step_size = 0.0;
781  status = LEARN_CURV;
782  }
783 
784  if (all.hessian_on)
785  {
786  b.gradient_pass = false; // now start computing curvature
787  }
788  else
789  {
790  float d_mag = direction_magnitude(all);
791  ftime(&b.t_end_global);
792  b.net_time = (int)(1000.0 * (b.t_end_global.time - b.t_start_global.time) +
793  (b.t_end_global.millitm - b.t_start_global.millitm));
794  if (!all.quiet)
795  fprintf(stderr, "%-10s\t%-10.5f\t%-.5f\n", "", d_mag, b.step_size);
796  b.predictions.clear();
797  update_weight(all, b.step_size);
798  }
799  }
800  }
801 
802  /********************************************************************/
803  /* C) NOT FIRST PASS, CURVATURE CALCULATED **************************/
804  /********************************************************************/
805  else // just finished all second gradients
806  {
807  if (all.all_reduce != nullptr)
808  {
809  float t = (float)b.curvature;
810  b.curvature = accumulate_scalar(all, t); // Accumulate curvatures
811  }
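  // b.curvature now holds d'Hd accumulated from per-example second derivatives
  // (plus the l2 term added just below), and dd = g'd from the stored gradient,
  // so step_size = -dd / curvature is the minimizer of the quadratic model of the
  // loss along the search direction.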
812  if (all.l2_lambda > 0.)
 813  b.curvature += regularizer_direction_magnitude(all, b, all.l2_lambda);
 814  float dd = (float)derivative_in_direction(all, b, b.mem, b.origin);
815  if (b.curvature == 0. && dd != 0.)
816  {
817  fprintf(stdout, "%s", curv_message);
818  b.step_size = 0.0;
819  status = LEARN_CURV;
820  }
821  else if (dd == 0.)
822  {
823  fprintf(stdout, "Derivative 0 detected.\n");
824  b.step_size = 0.0;
825  status = LEARN_CONV;
826  }
827  else
828  b.step_size = -dd / (float)b.curvature;
829 
830  float d_mag = direction_magnitude(all);
831 
832  b.predictions.clear();
833  update_weight(all, b.step_size);
834  ftime(&b.t_end_global);
835  b.net_time = (int)(1000.0 * (b.t_end_global.time - b.t_start_global.time) +
836  (b.t_end_global.millitm - b.t_start_global.millitm));
837 
838  if (!all.quiet)
839  fprintf(stderr, "%-10.5f\t%-10.5f\t%-.5f\n", b.curvature / b.importance_weight_sum, d_mag, b.step_size);
840  b.gradient_pass = true;
841  } // now start computing derivatives.
842  b.current_pass++;
843  b.first_pass = false;
844  b.preconditioner_pass = false;
845 
846  if (b.output_regularizer) // need to accumulate and place the regularizer.
847  {
848  if (all.all_reduce != nullptr)
849  accumulate(all, all.weights, W_COND); // Accumulate preconditioner
850  // preconditioner_to_regularizer(all, b, all.l2_lambda);
851  }
852  ftime(&b.t_end_global);
853  b.net_time = (int)(1000.0 * (b.t_end_global.time - b.t_start_global.time) +
854  (b.t_end_global.millitm - b.t_start_global.millitm));
855 
856  if (all.save_per_pass)
 857  save_predictor(all, all.final_regressor_name, b.current_pass);
 858  return status;
859 }
860 
861 void process_example(vw& all, bfgs& b, example& ec)
862 {
863  label_data& ld = ec.l.simple;
864  if (b.first_pass)
 865  b.importance_weight_sum += ec.weight;
 866 
867  /********************************************************************/
868  /* I) GRADIENT CALCULATION ******************************************/
869  /********************************************************************/
870  if (b.gradient_pass)
871  {
872  ec.pred.scalar = predict_and_gradient(all, ec); // w[0] & w[1]
873  ec.loss = all.loss->getLoss(all.sd, ec.pred.scalar, ld.label) * ec.weight;
874  b.loss_sum += ec.loss;
 875  b.predictions.push_back(ec.pred.scalar);
 876  }
877  /********************************************************************/
878  /* II) CURVATURE CALCULATION ****************************************/
879  /********************************************************************/
880  else // computing curvature
881  {
882  float d_dot_x = dot_with_direction(all, ec); // w[2]
883  if (b.example_number >= b.predictions.size()) // Make things safe in case example source is strange.
884  b.example_number = b.predictions.size() - 1;
 885  ec.pred.scalar = b.predictions[b.example_number];
 886  ec.partial_prediction = b.predictions[b.example_number];
 887  ec.loss = all.loss->getLoss(all.sd, ec.pred.scalar, ld.label) * ec.weight;
888  float sd = all.loss->second_derivative(all.sd, b.predictions[b.example_number++], ld.label);
889  b.curvature += ((double)d_dot_x) * d_dot_x * sd * ec.weight;
890  }
 891  ec.updated_prediction = ec.pred.scalar;
 892 
893  if (b.preconditioner_pass)
894  update_preconditioner(all, ec); // w[3]
895 }
896 
897 void end_pass(bfgs& b)
898 {
899  vw* all = b.all;
900 
901  if (b.current_pass <= b.final_pass)
902  {
903  if (b.current_pass < b.final_pass)
904  {
905  int status = process_pass(*all, b);
906 
907  // reaching the max number of passes regardless of convergence
908  if (b.final_pass == b.current_pass)
909  {
910  b.all->trace_message << "Maximum number of passes reached. ";
911  if (!b.output_regularizer)
912  b.all->trace_message << "If you want to optimize further, increase the number of passes\n";
913  if (b.output_regularizer)
914  {
915  b.all->trace_message << "\nRegular model file has been created. ";
916  b.all->trace_message << "Output feature regularizer file is created only when the convergence is reached. "
917  "Try increasing the number of passes for convergence\n";
918  b.output_regularizer = false;
919  }
920  }
921 
922  // attain convergence before reaching max iterations
923  if (status != LEARN_OK && b.final_pass > b.current_pass)
924  {
925  b.final_pass = b.current_pass;
926  }
927  else
928  {
929  // Not converged yet.
930  // Reset preconditioner to zero so that it is correctly recomputed in the next pass
931  zero_preconditioner(*all);
932  }
933  if (!all->holdout_set_off)
934  {
 935  if (summarize_holdout_set(*all, b.no_win_counter))
 936  finalize_regressor(*all, all->final_regressor_name);
 937  if (b.early_stop_thres == b.no_win_counter)
938  {
939  set_done(*all);
940  b.all->trace_message << "Early termination reached w.r.t. holdout set error";
941  }
942  }
943  if (b.final_pass == b.current_pass)
944  {
 945  finalize_regressor(*all, all->final_regressor_name);
 946  set_done(*all);
947  }
948  }
949  else // reaching convergence in the previous pass
950  b.current_pass++;
951  }
952 }
953 
954 // placeholder
955 template <bool audit>
 956 void predict(bfgs& b, base_learner&, example& ec)
 957 {
958  vw* all = b.all;
959  ec.pred.scalar = bfgs_predict(*all, ec);
960  if (audit)
961  GD::print_audit_features(*(b.all), ec);
962 }
963 
964 template <bool audit>
965 void learn(bfgs& b, base_learner& base, example& ec)
966 {
967  vw* all = b.all;
968  assert(ec.in_use);
969 
970  if (b.current_pass <= b.final_pass)
971  {
972  if (test_example(ec))
973  predict<audit>(b, base, ec);
974  else
975  process_example(*all, b, ec);
976  }
977 }
978 
979 void save_load_regularizer(vw& all, bfgs& b, io_buf& model_file, bool read, bool text)
980 {
981  int c = 0;
982  uint32_t length = 2 * (1 << all.num_bits);
983  uint32_t i = 0;
984  size_t brw = 1;
985 
986  if (b.output_regularizer && !read)
 987  preconditioner_to_regularizer(*(b.all), b, b.all->l2_lambda);
 988 
989  do
990  {
991  brw = 1;
992  weight* v;
993  if (read)
994  {
995  c++;
996  brw = model_file.bin_read_fixed((char*)&i, sizeof(i), "");
997  if (brw > 0)
998  {
999  assert(i < length);
1000  v = &(b.regularizers[i]);
1001  brw += model_file.bin_read_fixed((char*)v, sizeof(*v), "");
1002  }
1003  }
1004  else // write binary or text
1005  {
1006  v = &(b.regularizers[i]);
1007  if (*v != 0.)
1008  {
1009  c++;
1010  std::stringstream msg;
1011  msg << i;
1012  brw = bin_text_write_fixed(model_file, (char*)&i, sizeof(i), msg, text);
1013 
1014  msg << ":" << *v << "\n";
1015  brw += bin_text_write_fixed(model_file, (char*)v, sizeof(*v), msg, text);
1016  }
1017  }
1018  if (!read)
1019  i++;
1020  } while ((!read && i < length) || (read && brw > 0));
1021 
1022  if (read)
1023  regularizer_to_weight(all, b);
1024 }
1025 
1026 void save_load(bfgs& b, io_buf& model_file, bool read, bool text)
1027 {
1028  vw* all = b.all;
1029 
1030  uint32_t length = 1 << all->num_bits;
1031 
1032  if (read)
1033  {
1034  initialize_regressor(*all);
1035  if (all->per_feature_regularizer_input != "")
1036  {
1037  b.regularizers = calloc_or_throw<weight>(2 * length);
1038  if (b.regularizers == nullptr)
1039  THROW("Failed to allocate regularizers array: try decreasing -b <bits>");
1040  }
1041  int m = b.m;
1042 
1043  b.mem_stride = (m == 0) ? CG_EXTRA : 2 * m;
1044  b.mem = calloc_or_throw<float>(all->length() * b.mem_stride);
1045  b.rho = calloc_or_throw<double>(m);
1046  b.alpha = calloc_or_throw<double>(m);
1047 
1048  uint32_t stride_shift = all->weights.stride_shift();
1049 
1050  if (!all->quiet)
1051  std::cerr << "m = " << m << std::endl
1052  << "Allocated "
1053  << ((long unsigned int)all->length() *
1054  (sizeof(float) * (b.mem_stride) + (sizeof(weight) << stride_shift)) >>
1055  20)
1056  << "M for weights and mem" << std::endl;
1057 
1058  b.net_time = 0.0;
1059  ftime(&b.t_start_global);
1060 
1061  if (!all->quiet)
1062  {
1063  const char* header_fmt = "%2s %-10s\t%-10s\t%-10s\t %-10s\t%-10s\t%-10s\t%-10s\t%-10s\t%-s\n";
1064  fprintf(stderr, header_fmt, "##", "avg. loss", "der. mag.", "d. m. cond.", "wolfe1", "wolfe2", "mix fraction",
1065  "curvature", "dir. magnitude", "step size");
1066  std::cerr.precision(5);
1067  }
1068 
1069  if (b.regularizers != nullptr)
1070  all->l2_lambda = 1; // To make sure we are adding the regularization
 1071  b.output_regularizer = (all->per_feature_regularizer_output != "" || all->per_feature_regularizer_text != "");
 1072  reset_state(*all, b, false);
1073  }
1074 
1075  // bool reg_vector = b.output_regularizer || all->per_feature_regularizer_input.length() > 0;
1076  bool reg_vector = (b.output_regularizer && !read) || (all->per_feature_regularizer_input.length() > 0 && read);
1077 
1078  if (model_file.files.size() > 0)
1079  {
1080  std::stringstream msg;
1081  msg << ":" << reg_vector << "\n";
1082  bin_text_read_write_fixed(model_file, (char*)&reg_vector, sizeof(reg_vector), "", read, msg, text);
1083 
1084  if (reg_vector)
1085  save_load_regularizer(*all, b, model_file, read, text);
1086  else
1087  GD::save_load_regressor(*all, model_file, read, text);
1088  }
1089 }
1090 
1091 void init_driver(bfgs& b) { b.backstep_on = true; }
1092 
 1093 base_learner* bfgs_setup(options_i& options, vw& all)
 1094 {
1095  auto b = scoped_calloc_or_throw<bfgs>();
1096  bool conjugate_gradient = false;
1097  bool bfgs_option = false;
1098  option_group_definition bfgs_outer_options("LBFGS and Conjugate Gradient options");
1099  bfgs_outer_options.add(
1100  make_option("conjugate_gradient", conjugate_gradient).keep().help("use conjugate gradient based optimization"));
1101 
1102  option_group_definition bfgs_inner_options("LBFGS and Conjugate Gradient options");
1103  bfgs_inner_options.add(make_option("bfgs", bfgs_option).keep().help("use conjugate gradient based optimization"));
1104  bfgs_inner_options.add(make_option("hessian_on", all.hessian_on).help("use second derivative in line search"));
1105  bfgs_inner_options.add(make_option("mem", b->m).default_value(15).help("memory in bfgs"));
1106  bfgs_inner_options.add(
1107  make_option("termination", b->rel_threshold).default_value(0.001f).help("Termination threshold"));
1108 
1109  options.add_and_parse(bfgs_outer_options);
1110  if (!conjugate_gradient)
1111  {
1112  options.add_and_parse(bfgs_inner_options);
1113  if (!bfgs_option)
1114  {
1115  return nullptr;
1116  }
1117  }
1118 
1119  b->all = &all;
1120  b->wolfe1_bound = 0.01;
1121  b->first_hessian_on = true;
1122  b->first_pass = true;
1123  b->gradient_pass = true;
1124  b->preconditioner_pass = true;
1125  b->backstep_on = false;
1126  b->final_pass = all.numpasses;
1127  b->no_win_counter = 0;
1128 
1129  if (!all.holdout_set_off)
1130  {
1131  all.sd->holdout_best_loss = FLT_MAX;
1132  b->early_stop_thres = options.get_typed_option<size_t>("early_terminate").value();
1133  }
1134 
1135  if (b->m == 0)
1136  all.hessian_on = true;
1137 
1138  if (!all.quiet)
1139  {
1140  if (b->m > 0)
1141  b->all->trace_message << "enabling BFGS based optimization ";
1142  else
1143  b->all->trace_message << "enabling conjugate gradient optimization via BFGS ";
1144  if (all.hessian_on)
1145  b->all->trace_message << "with curvature calculation" << std::endl;
1146  else
1147  b->all->trace_message << "**without** curvature calculation" << std::endl;
1148  }
1149 
1150  if (all.numpasses < 2 && all.training)
1151  THROW("you must make at least 2 passes to use BFGS");
1152 
1153  all.bfgs = true;
1154  all.weights.stride_shift(2);
1155 
1156  void (*learn_ptr)(bfgs&, base_learner&, example&) = nullptr;
1157  if (all.audit)
1158  learn_ptr = learn<true>;
1159  else
1160  learn_ptr = learn<false>;
1161 
 1162  learner<bfgs, example>* l;
 1163  if (all.audit || all.hash_inv)
1164  l = &init_learner(b, learn_ptr, predict<true>, all.weights.stride());
1165  else
1166  l = &init_learner(b, learn_ptr, predict<false>, all.weights.stride());
1167 
1168  l->set_save_load(save_load);
1169  l->set_init_driver(init_driver);
1170  l->set_end_pass(end_pass);
1171 
1172  return make_base(*l);
1173 }