Vowpal Wabbit
Classes | Macros | Functions | Variables
bfgs.cc File Reference
#include <cmath>
#include <fstream>
#include <float.h>
#include <netdb.h>
#include <string.h>
#include <stdio.h>
#include <assert.h>
#include <sys/timeb.h>
#include "accumulate.h"
#include "reductions.h"
#include "gd.h"
#include "vw_exception.h"
#include <exception>

Go to the source code of this file.

Classes

class  curv_exception
 
struct  bfgs
 

Macros

#define CG_EXTRA   1
 
#define MEM_GT   0
 
#define MEM_XT   1
 
#define MEM_YT   0
 
#define MEM_ST   1
 
#define W_XT   0
 
#define W_GT   1
 
#define W_DIR   2
 
#define W_COND   3
 
#define LEARN_OK   0
 
#define LEARN_CURV   1
 
#define LEARN_CONV   2
 

Functions

void zero_derivative (vw &all)
 
void zero_preconditioner (vw &all)
 
void reset_state (vw &all, bfgs &b, bool zero)
 
constexpr bool test_example (example &ec) noexcept
 
float bfgs_predict (vw &all, example &ec)
 
void add_grad (float &d, float f, float &fw)
 
float predict_and_gradient (vw &all, example &ec)
 
void add_precond (float &d, float f, float &fw)
 
void update_preconditioner (vw &all, example &ec)
 
void add_DIR (float &p, const float fx, float &fw)
 
float dot_with_direction (vw &all, example &ec)
 
template<class T >
double regularizer_direction_magnitude (vw &, bfgs &b, double regularizer, T &weights)
 
double regularizer_direction_magnitude (vw &all, bfgs &b, float regularizer)
 
template<class T >
float direction_magnitude (vw &, T &weights)
 
float direction_magnitude (vw &all)
 
template<class T >
void bfgs_iter_start (vw &all, bfgs &b, float *mem, int &lastj, double importance_weight_sum, int &origin, T &weights)
 
void bfgs_iter_start (vw &all, bfgs &b, float *mem, int &lastj, double importance_weight_sum, int &origin)
 
template<class T >
void bfgs_iter_middle (vw &all, bfgs &b, float *mem, double *rho, double *alpha, int &lastj, int &origin, T &weights)
 
void bfgs_iter_middle (vw &all, bfgs &b, float *mem, double *rho, double *alpha, int &lastj, int &origin)
 
template<class T >
double wolfe_eval (vw &all, bfgs &b, float *mem, double loss_sum, double previous_loss_sum, double step_size, double importance_weight_sum, int &origin, double &wolfe1, T &weights)
 
double wolfe_eval (vw &all, bfgs &b, float *mem, double loss_sum, double previous_loss_sum, double step_size, double importance_weight_sum, int &origin, double &wolfe1)
 
template<class T >
double add_regularization (vw &all, bfgs &b, float regularization, T &weights)
 
double add_regularization (vw &all, bfgs &b, float regularization)
 
template<class T >
void finalize_preconditioner (vw &, bfgs &b, float regularization, T &weights)
 
void finalize_preconditioner (vw &all, bfgs &b, float regularization)
 
template<class T >
void preconditioner_to_regularizer (vw &all, bfgs &b, float regularization, T &weights)
 
void preconditioner_to_regularizer (vw &all, bfgs &b, float regularization)
 
template<class T >
void regularizer_to_weight (vw &, bfgs &b, T &weights)
 
void regularizer_to_weight (vw &all, bfgs &b)
 
void zero_state (vw &all)
 
template<class T >
double derivative_in_direction (vw &, bfgs &b, float *mem, int &origin, T &weights)
 
double derivative_in_direction (vw &all, bfgs &b, float *mem, int &origin)
 
template<class T >
void update_weight (vw &, float step_size, T &w)
 
void update_weight (vw &all, float step_size)
 
int process_pass (vw &all, bfgs &b)
 
void process_example (vw &all, bfgs &b, example &ec)
 
void end_pass (bfgs &b)
 
template<bool audit>
void predict (bfgs &b, base_learner &, example &ec)
 
template<bool audit>
void learn (bfgs &b, base_learner &base, example &ec)
 
void save_load_regularizer (vw &all, bfgs &b, io_buf &model_file, bool read, bool text)
 
void save_load (bfgs &b, io_buf &model_file, bool read, bool text)
 
void init_driver (bfgs &b)
 
base_learner * bfgs_setup (options_i &options, vw &all)
 

Variables

curv_exception curv_ex
 
constexpr float max_precond_ratio = 10000.f
 
constexpr const char * curv_message
 

Macro Definition Documentation

◆ CG_EXTRA

#define CG_EXTRA   1

Definition at line 29 of file bfgs.cc.

Referenced by save_load().

◆ LEARN_CONV

#define LEARN_CONV   2

Definition at line 43 of file bfgs.cc.

Referenced by process_pass().

◆ LEARN_CURV

#define LEARN_CURV   1

Definition at line 42 of file bfgs.cc.

Referenced by process_pass().

◆ LEARN_OK

#define LEARN_OK   0

Definition at line 41 of file bfgs.cc.

Referenced by end_pass(), and process_pass().

◆ MEM_GT

#define MEM_GT   0

Definition at line 31 of file bfgs.cc.

Referenced by bfgs_iter_middle(), bfgs_iter_start(), derivative_in_direction(), and wolfe_eval().

◆ MEM_ST

#define MEM_ST   1

Definition at line 34 of file bfgs.cc.

Referenced by bfgs_iter_middle().

◆ MEM_XT

#define MEM_XT   1

Definition at line 32 of file bfgs.cc.

Referenced by bfgs_iter_middle(), and bfgs_iter_start().

◆ MEM_YT

#define MEM_YT   0

Definition at line 33 of file bfgs.cc.

Referenced by bfgs_iter_middle().

◆ W_COND

#define W_COND   3

◆ W_DIR

#define W_DIR   2

◆ W_GT

#define W_GT   1

◆ W_XT

#define W_XT   0

Definition at line 36 of file bfgs.cc.

Referenced by bfgs_iter_middle(), bfgs_iter_start(), and update_weight().

Function Documentation

◆ add_DIR()

void add_DIR ( float &  p,
const float  fx,
float &  fw 
)
inline

Definition at line 177 of file bfgs.cc.

References W_DIR.

177 { p += (&fw)[W_DIR] * fx; }
#define W_DIR
Definition: bfgs.cc:38

◆ add_grad()

void add_grad ( float &  d,
float  f,
float &  fw 
)
inline

Definition at line 155 of file bfgs.cc.

References W_GT.

155 { (&fw)[W_GT] += d * f; }
#define W_GT
Definition: bfgs.cc:37
float f
Definition: cache.cc:40

◆ add_precond()

void add_precond ( float &  d,
float  f,
float &  fw 
)
inline

Definition at line 169 of file bfgs.cc.

References W_COND.

169 { (&fw)[W_COND] += d * f * f; }
#define W_COND
Definition: bfgs.cc:39
float f
Definition: cache.cc:40

◆ add_regularization() [1/2]

template<class T >
double add_regularization ( vw &  all,
bfgs &  b,
float  regularization,
T &  weights 
)

Definition at line 448 of file bfgs.cc.

References constant, vw::no_bias, bfgs::regularizers, and W_GT.

Referenced by add_regularization(), and process_pass().

449 {
450  // compute the derivative difference
451  double ret = 0.;
452 
453  if (b.regularizers == nullptr)
454  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
455  {
456  (&(*w))[W_GT] += regularization * (*w);
457  ret += 0.5 * regularization * (*w) * (*w);
458  }
459  else
460  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
461  {
462  uint64_t i = w.index() >> weights.stride_shift();
463  weight delta_weight = *w - b.regularizers[2 * i + 1];
464  (&(*w))[W_GT] += b.regularizers[2 * i] * delta_weight;
465  ret += 0.5 * b.regularizers[2 * i] * delta_weight * delta_weight;
466  }
467 
468  // if we're not regularizing the intercept term, then subtract it off from the result above
469  // when accessing weights[constant], always use weights.strided_index(constant)
470  if (all.no_bias)
471  {
472  if (b.regularizers == nullptr)
473  {
474  (&weights.strided_index(constant))[W_GT] -= regularization * (weights.strided_index(constant));
475  ret -= 0.5 * regularization * (weights.strided_index(constant)) * (weights.strided_index(constant));
476  }
477  else
478  {
479  uint64_t i = constant >> weights.stride_shift();
480  weight delta_weight = (weights.strided_index(constant)) - b.regularizers[2 * i + 1];
481  (&weights.strided_index(constant))[W_GT] -= b.regularizers[2 * i] * delta_weight;
482  ret -= 0.5 * b.regularizers[2 * i] * delta_weight * delta_weight;
483  }
484  }
485 
486  return ret;
487 }
#define W_GT
Definition: bfgs.cc:37
weight * regularizers
Definition: bfgs.cc:94
bool no_bias
Definition: global_data.h:446
constexpr uint64_t constant
Definition: constant.h:11
float weight

◆ add_regularization() [2/2]

double add_regularization ( vw &  all,
bfgs &  b,
float  regularization 
)

Definition at line 489 of file bfgs.cc.

References add_regularization(), parameters::dense_weights, parameters::sparse, parameters::sparse_weights, and vw::weights.

490 {
491  if (all.weights.sparse)
492  return add_regularization(all, b, regularization, all.weights.sparse_weights);
493  else
494  return add_regularization(all, b, regularization, all.weights.dense_weights);
495 }
parameters weights
Definition: global_data.h:537
double add_regularization(vw &all, bfgs &b, float regularization, T &weights)
Definition: bfgs.cc:448
dense_parameters dense_weights
sparse_parameters sparse_weights

◆ bfgs_iter_middle() [1/2]

template<class T >
void bfgs_iter_middle ( vw &  all,
bfgs &  b,
float *  mem,
double *  rho,
double *  alpha,
int &  lastj,
int &  origin,
T &  weights 
)

Definition at line 270 of file bfgs.cc.

References curv_ex, f, bfgs::m, MEM_GT, MEM_ST, bfgs::mem_stride, MEM_XT, MEM_YT, vw::num_bits, vw::quiet, W_COND, W_DIR, W_GT, and W_XT.

Referenced by bfgs_iter_middle(), and process_pass().

271 {
272  float* mem0 = mem;
273  uint32_t length = 1 << all.num_bits;
274  // implement conjugate gradient
275  if (b.m == 0)
276  {
277  double g_Hy = 0.;
278  double g_Hg = 0.;
279  double y = 0.;
280 
281  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
282  {
283  mem = mem0 + (w.index() >> weights.stride_shift()) * b.mem_stride;
284  y = (&(*w))[W_GT] - mem[(MEM_GT + origin) % b.mem_stride];
285  g_Hy += ((double)(&(*w))[W_GT]) * ((&(*w))[W_COND]) * y;
286  g_Hg +=
287  ((double)mem[(MEM_GT + origin) % b.mem_stride]) * ((&(*w))[W_COND]) * mem[(MEM_GT + origin) % b.mem_stride];
288  }
289 
290  float beta = (float)(g_Hy / g_Hg);
291 
292  if (beta < 0.f || std::isnan(beta))
293  beta = 0.f;
294 
295  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
296  {
297  mem = mem0 + (w.index() >> weights.stride_shift()) * b.mem_stride;
298  mem[(MEM_GT + origin) % b.mem_stride] = (&(*w))[W_GT];
299 
300  (&(*w))[W_DIR] *= beta;
301  (&(*w))[W_DIR] -= ((&(*w))[W_COND]) * ((&(*w))[W_GT]);
302  (&(*w))[W_GT] = 0;
303  }
304  if (!all.quiet)
305  fprintf(stderr, "%f\t", beta);
306  return;
307 
308  mem = mem0 + (length - 1) * b.mem_stride;
309  }
310  else
311  {
312  if (!all.quiet)
313  fprintf(stderr, "%-10s\t", "");
314  }
315 
316  // implement bfgs
317  double y_s = 0.;
318  double y_Hy = 0.;
319  double s_q = 0.;
320 
321  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
322  {
323  float* mem1 = mem + (w.index() >> weights.stride_shift()) * b.mem_stride;
324  mem1[(MEM_YT + origin) % b.mem_stride] = (&(*w))[W_GT] - mem1[(MEM_GT + origin) % b.mem_stride];
325  mem1[(MEM_ST + origin) % b.mem_stride] = (&(*w))[W_XT] - mem1[(MEM_XT + origin) % b.mem_stride];
326  (&(*w))[W_DIR] = (&(*w))[W_GT];
327  y_s += ((double)mem1[(MEM_YT + origin) % b.mem_stride]) * mem1[(MEM_ST + origin) % b.mem_stride];
328  y_Hy +=
329  ((double)mem1[(MEM_YT + origin) % b.mem_stride]) * mem1[(MEM_YT + origin) % b.mem_stride] * ((&(*w))[W_COND]);
330  s_q += ((double)mem1[(MEM_ST + origin) % b.mem_stride]) * ((&(*w))[W_GT]);
331  }
332 
333  if (y_s <= 0. || y_Hy <= 0.)
334  throw curv_ex;
335  rho[0] = 1 / y_s;
336 
337  float gamma = (float)(y_s / y_Hy);
338 
339  for (int j = 0; j < lastj; j++)
340  {
341  alpha[j] = rho[j] * s_q;
342  s_q = 0.;
343  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
344  {
345  mem = mem0 + (w.index() >> weights.stride_shift()) * b.mem_stride;
346  (&(*w))[W_DIR] -= (float)alpha[j] * mem[(2 * j + MEM_YT + origin) % b.mem_stride];
347  s_q += ((double)mem[(2 * j + 2 + MEM_ST + origin) % b.mem_stride]) * ((&(*w))[W_DIR]);
348  }
349  }
350 
351  alpha[lastj] = rho[lastj] * s_q;
352  double y_r = 0.;
353 
354  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
355  {
356  mem = mem0 + (w.index() >> weights.stride_shift()) * b.mem_stride;
357  (&(*w))[W_DIR] -= (float)alpha[lastj] * mem[(2 * lastj + MEM_YT + origin) % b.mem_stride];
358  (&(*w))[W_DIR] *= gamma * ((&(*w))[W_COND]);
359  y_r += ((double)mem[(2 * lastj + MEM_YT + origin) % b.mem_stride]) * ((&(*w))[W_DIR]);
360  }
361 
362  double coef_j;
363 
364  for (int j = lastj; j > 0; j--)
365  {
366  coef_j = alpha[j] - rho[j] * y_r;
367  y_r = 0.;
368  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
369  {
370  mem = mem0 + (w.index() >> weights.stride_shift()) * b.mem_stride;
371  (&(*w))[W_DIR] += (float)coef_j * mem[(2 * j + MEM_ST + origin) % b.mem_stride];
372  y_r += ((double)mem[(2 * j - 2 + MEM_YT + origin) % b.mem_stride]) * ((&(*w))[W_DIR]);
373  }
374  }
375 
376  coef_j = alpha[0] - rho[0] * y_r;
377  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
378  {
379  mem = mem0 + (w.index() >> weights.stride_shift()) * b.mem_stride;
380  (&(*w))[W_DIR] = -(&(*w))[W_DIR] - (float)coef_j * mem[(MEM_ST + origin) % b.mem_stride];
381  }
382 
383  /*********************
384  ** shift
385  ********************/
386 
387  lastj = (lastj < b.m - 1) ? lastj + 1 : b.m - 1;
388  origin = (origin + b.mem_stride - 2) % b.mem_stride;
389 
390  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
391  {
392  mem = mem0 + (w.index() >> weights.stride_shift()) * b.mem_stride;
393  mem[(MEM_GT + origin) % b.mem_stride] = (&(*w))[W_GT];
394  mem[(MEM_XT + origin) % b.mem_stride] = (&(*w))[W_XT];
395  (&(*w))[W_GT] = 0;
396  }
397  for (int j = lastj; j > 0; j--) rho[j] = rho[j - 1];
398 }
#define W_GT
Definition: bfgs.cc:37
#define W_DIR
Definition: bfgs.cc:38
int m
Definition: bfgs.cc:65
bool quiet
Definition: global_data.h:487
uint32_t num_bits
Definition: global_data.h:398
#define MEM_XT
Definition: bfgs.cc:32
#define MEM_ST
Definition: bfgs.cc:34
#define W_XT
Definition: bfgs.cc:36
int mem_stride
Definition: bfgs.cc:88
curv_exception curv_ex
#define MEM_YT
Definition: bfgs.cc:33
#define W_COND
Definition: bfgs.cc:39
#define MEM_GT
Definition: bfgs.cc:31
float f
Definition: cache.cc:40

◆ bfgs_iter_middle() [2/2]

void bfgs_iter_middle ( vw &  all,
bfgs &  b,
float *  mem,
double *  rho,
double *  alpha,
int &  lastj,
int &  origin 
)

Definition at line 400 of file bfgs.cc.

References bfgs_iter_middle(), parameters::dense_weights, parameters::sparse, parameters::sparse_weights, and vw::weights.

401 {
402  if (all.weights.sparse)
403  bfgs_iter_middle(all, b, mem, rho, alpha, lastj, origin, all.weights.sparse_weights);
404  else
405  bfgs_iter_middle(all, b, mem, rho, alpha, lastj, origin, all.weights.dense_weights);
406 }
parameters weights
Definition: global_data.h:537
void bfgs_iter_middle(vw &all, bfgs &b, float *mem, double *rho, double *alpha, int &lastj, int &origin, T &weights)
Definition: bfgs.cc:270
dense_parameters dense_weights
sparse_parameters sparse_weights

◆ bfgs_iter_start() [1/2]

template<class T >
void bfgs_iter_start ( vw &  all,
bfgs &  b,
float *  mem,
int &  lastj,
double  importance_weight_sum,
int &  origin,
T &  weights 
)

Definition at line 238 of file bfgs.cc.

References bfgs::m, MEM_GT, bfgs::mem_stride, MEM_XT, vw::quiet, W_COND, W_DIR, W_GT, and W_XT.

Referenced by bfgs_iter_start(), and process_pass().

239 {
240  double g1_Hg1 = 0.;
241  double g1_g1 = 0.;
242 
243  origin = 0;
244  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
245  {
246  float* mem1 = mem + (w.index() >> weights.stride_shift()) * b.mem_stride;
247  if (b.m > 0)
248  mem1[(MEM_XT + origin) % b.mem_stride] = (&(*w))[W_XT];
249  mem1[(MEM_GT + origin) % b.mem_stride] = (&(*w))[W_GT];
250  g1_Hg1 += ((double)(&(*w))[W_GT]) * ((&(*w))[W_GT]) * ((&(*w))[W_COND]);
251  g1_g1 += ((double)((&(*w))[W_GT])) * ((&(*w))[W_GT]);
252  (&(*w))[W_DIR] = -(&(*w))[W_COND] * ((&(*w))[W_GT]);
253  ((&(*w))[W_GT]) = 0;
254  }
255  lastj = 0;
256  if (!all.quiet)
257  fprintf(stderr, "%-10.5f\t%-10.5f\t%-10s\t%-10s\t%-10s\t", g1_g1 / (importance_weight_sum * importance_weight_sum),
258  g1_Hg1 / importance_weight_sum, "", "", "");
259 }
#define W_GT
Definition: bfgs.cc:37
#define W_DIR
Definition: bfgs.cc:38
int m
Definition: bfgs.cc:65
bool quiet
Definition: global_data.h:487
#define MEM_XT
Definition: bfgs.cc:32
#define W_XT
Definition: bfgs.cc:36
int mem_stride
Definition: bfgs.cc:88
#define W_COND
Definition: bfgs.cc:39
#define MEM_GT
Definition: bfgs.cc:31

◆ bfgs_iter_start() [2/2]

void bfgs_iter_start ( vw &  all,
bfgs &  b,
float *  mem,
int &  lastj,
double  importance_weight_sum,
int &  origin 
)

Definition at line 261 of file bfgs.cc.

References bfgs_iter_start(), parameters::dense_weights, parameters::sparse, parameters::sparse_weights, and vw::weights.

262 {
263  if (all.weights.sparse)
264  bfgs_iter_start(all, b, mem, lastj, importance_weight_sum, origin, all.weights.sparse_weights);
265  else
266  bfgs_iter_start(all, b, mem, lastj, importance_weight_sum, origin, all.weights.dense_weights);
267 }
parameters weights
Definition: global_data.h:537
dense_parameters dense_weights
sparse_parameters sparse_weights
void bfgs_iter_start(vw &all, bfgs &b, float *mem, int &lastj, double importance_weight_sum, int &origin, T &weights)
Definition: bfgs.cc:238

◆ bfgs_predict()

float bfgs_predict ( vw &  all,
example &  ec 
)

Definition at line 149 of file bfgs.cc.

References GD::finalize_prediction(), GD::inline_predict(), example::partial_prediction, and vw::sd.

Referenced by predict(), and predict_and_gradient().

150 {
153 }
float finalize_prediction(shared_data *sd, float ret)
Definition: gd.cc:339
float partial_prediction
Definition: example.h:68
float inline_predict(vw &all, example &ec)
Definition: gd.h:98
shared_data * sd
Definition: global_data.h:375

◆ bfgs_setup()

base_learner* bfgs_setup ( options_i &  options,
vw &  all 
)

Definition at line 1093 of file bfgs.cc.

References VW::config::option_group_definition::add(), VW::config::options_i::add_and_parse(), vw::audit, vw::bfgs, LEARNER::end_pass(), VW::config::options_i::get_typed_option(), vw::hash_inv, vw::hessian_on, shared_data::holdout_best_loss, vw::holdout_set_off, init_driver(), LEARNER::init_learner(), LEARNER::make_base(), VW::config::make_option(), vw::numpasses, vw::quiet, save_load(), vw::sd, parameters::stride(), parameters::stride_shift(), THROW, vw::training, and vw::weights.

Referenced by parse_reductions().

1094 {
1095  auto b = scoped_calloc_or_throw<bfgs>();
1096  bool conjugate_gradient = false;
1097  bool bfgs_option = false;
1098  option_group_definition bfgs_outer_options("LBFGS and Conjugate Gradient options");
1099  bfgs_outer_options.add(
1100  make_option("conjugate_gradient", conjugate_gradient).keep().help("use conjugate gradient based optimization"));
1101 
1102  option_group_definition bfgs_inner_options("LBFGS and Conjugate Gradient options");
1103  bfgs_inner_options.add(make_option("bfgs", bfgs_option).keep().help("use conjugate gradient based optimization"));
1104  bfgs_inner_options.add(make_option("hessian_on", all.hessian_on).help("use second derivative in line search"));
1105  bfgs_inner_options.add(make_option("mem", b->m).default_value(15).help("memory in bfgs"));
1106  bfgs_inner_options.add(
1107  make_option("termination", b->rel_threshold).default_value(0.001f).help("Termination threshold"));
1108 
1109  options.add_and_parse(bfgs_outer_options);
1110  if (!conjugate_gradient)
1111  {
1112  options.add_and_parse(bfgs_inner_options);
1113  if (!bfgs_option)
1114  {
1115  return nullptr;
1116  }
1117  }
1118 
1119  b->all = &all;
1120  b->wolfe1_bound = 0.01;
1121  b->first_hessian_on = true;
1122  b->first_pass = true;
1123  b->gradient_pass = true;
1124  b->preconditioner_pass = true;
1125  b->backstep_on = false;
1126  b->final_pass = all.numpasses;
1127  b->no_win_counter = 0;
1128 
1129  if (!all.holdout_set_off)
1130  {
1131  all.sd->holdout_best_loss = FLT_MAX;
1132  b->early_stop_thres = options.get_typed_option<size_t>("early_terminate").value();
1133  }
1134 
1135  if (b->m == 0)
1136  all.hessian_on = true;
1137 
1138  if (!all.quiet)
1139  {
1140  if (b->m > 0)
1141  b->all->trace_message << "enabling BFGS based optimization ";
1142  else
1143  b->all->trace_message << "enabling conjugate gradient optimization via BFGS ";
1144  if (all.hessian_on)
1145  b->all->trace_message << "with curvature calculation" << std::endl;
1146  else
1147  b->all->trace_message << "**without** curvature calculation" << std::endl;
1148  }
1149 
1150  if (all.numpasses < 2 && all.training)
1151  THROW("you must make at least 2 passes to use BFGS");
1152 
1153  all.bfgs = true;
1154  all.weights.stride_shift(2);
1155 
1156  void (*learn_ptr)(bfgs&, base_learner&, example&) = nullptr;
1157  if (all.audit)
1158  learn_ptr = learn<true>;
1159  else
1160  learn_ptr = learn<false>;
1161 
1163  if (all.audit || all.hash_inv)
1164  l = &init_learner(b, learn_ptr, predict<true>, all.weights.stride());
1165  else
1166  l = &init_learner(b, learn_ptr, predict<false>, all.weights.stride());
1167 
1168  l->set_save_load(save_load);
1169  l->set_init_driver(init_driver);
1170  l->set_end_pass(end_pass);
1171 
1172  return make_base(*l);
1173 }
parameters weights
Definition: global_data.h:537
bool hash_inv
Definition: global_data.h:541
uint32_t stride()
double holdout_best_loss
Definition: global_data.h:161
base_learner * make_base(learner< T, E > &base)
Definition: learner.h:462
bool quiet
Definition: global_data.h:487
Definition: bfgs.cc:62
virtual void add_and_parse(const option_group_definition &group)=0
bool holdout_set_off
Definition: global_data.h:499
bool training
Definition: global_data.h:488
bool hessian_on
Definition: global_data.h:413
void end_pass(bfgs &b)
Definition: bfgs.cc:897
void save_load(bfgs &b, io_buf &model_file, bool read, bool text)
Definition: bfgs.cc:1026
learner< T, E > & init_learner(free_ptr< T > &dat, L *base, void(*learn)(T &, L &, E &), void(*predict)(T &, L &, E &), size_t ws, prediction_type::prediction_type_t pred_type)
Definition: learner.h:369
shared_data * sd
Definition: global_data.h:375
typed_option< T > & get_typed_option(const std::string &key)
Definition: options.h:120
bool bfgs
Definition: global_data.h:412
size_t numpasses
Definition: global_data.h:451
void init_driver(bfgs &b)
Definition: bfgs.cc:1091
typed_option< T > make_option(std::string name, T &location)
Definition: options.h:80
uint32_t stride_shift()
bool audit
Definition: global_data.h:486
#define THROW(args)
Definition: vw_exception.h:181

◆ derivative_in_direction() [1/2]

template<class T >
double derivative_in_direction ( vw & ,
bfgs &  b,
float *  mem,
int &  origin,
T &  weights 
)

Definition at line 605 of file bfgs.cc.

References MEM_GT, bfgs::mem_stride, and W_DIR.

Referenced by derivative_in_direction(), and process_pass().

606 {
607  double ret = 0.;
608  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
609  {
610  float* mem1 = mem + (w.index() >> weights.stride_shift()) * b.mem_stride;
611  ret += ((double)mem1[(MEM_GT + origin) % b.mem_stride]) * (&(*w))[W_DIR];
612  }
613  return ret;
614 }
#define W_DIR
Definition: bfgs.cc:38
int mem_stride
Definition: bfgs.cc:88
#define MEM_GT
Definition: bfgs.cc:31

◆ derivative_in_direction() [2/2]

double derivative_in_direction ( vw &  all,
bfgs &  b,
float *  mem,
int &  origin 
)

Definition at line 616 of file bfgs.cc.

References parameters::dense_weights, derivative_in_direction(), parameters::sparse, parameters::sparse_weights, and vw::weights.

617 {
618  if (all.weights.sparse)
619  return derivative_in_direction(all, b, mem, origin, all.weights.sparse_weights);
620  else
621  return derivative_in_direction(all, b, mem, origin, all.weights.dense_weights);
622 }
parameters weights
Definition: global_data.h:537
double derivative_in_direction(vw &, bfgs &b, float *mem, int &origin, T &weights)
Definition: bfgs.cc:605
dense_parameters dense_weights
sparse_parameters sparse_weights

◆ direction_magnitude() [1/2]

template<class T >
float direction_magnitude ( vw & ,
T &  weights 
)

Definition at line 218 of file bfgs.cc.

References W_DIR.

Referenced by direction_magnitude(), and process_pass().

219 {
220  // compute direction magnitude
221  double ret = 0.;
222  for (typename T::iterator iter = weights.begin(); iter != weights.end(); ++iter)
223  ret += ((double)(&(*iter))[W_DIR]) * (&(*iter))[W_DIR];
224 
225  return (float)ret;
226 }
#define W_DIR
Definition: bfgs.cc:38

◆ direction_magnitude() [2/2]

float direction_magnitude ( vw &  all)

Definition at line 228 of file bfgs.cc.

References parameters::dense_weights, direction_magnitude(), parameters::sparse, parameters::sparse_weights, and vw::weights.

229 {
230  // compute direction magnitude
231  if (all.weights.sparse)
232  return direction_magnitude(all, all.weights.sparse_weights);
233  else
234  return direction_magnitude(all, all.weights.dense_weights);
235 }
parameters weights
Definition: global_data.h:537
float direction_magnitude(vw &, T &weights)
Definition: bfgs.cc:218
dense_parameters dense_weights
sparse_parameters sparse_weights

◆ dot_with_direction()

float dot_with_direction ( vw &  all,
example &  ec 
)

Definition at line 179 of file bfgs.cc.

References label_data::initial, example::l, and polylabel::simple.

Referenced by process_example().

180 {
181  float temp = ec.l.simple.initial;
182  GD::foreach_feature<float, add_DIR>(all, ec, temp);
183  return temp;
184 }
label_data simple
Definition: example.h:28
float initial
Definition: simple_label.h:16
polylabel l
Definition: example.h:57

◆ end_pass()

void end_pass ( bfgs &  b)

Definition at line 897 of file bfgs.cc.

References bfgs::all, bfgs::current_pass, bfgs::early_stop_thres, bfgs::final_pass, vw::final_regressor_name, finalize_regressor(), vw::holdout_set_off, LEARN_OK, bfgs::no_win_counter, bfgs::output_regularizer, process_pass(), set_done(), summarize_holdout_set(), vw::trace_message, and zero_preconditioner().

898 {
899  vw* all = b.all;
900 
901  if (b.current_pass <= b.final_pass)
902  {
903  if (b.current_pass < b.final_pass)
904  {
905  int status = process_pass(*all, b);
906 
907  // reaching the max number of passes regardless of convergence
908  if (b.final_pass == b.current_pass)
909  {
910  b.all->trace_message << "Maximum number of passes reached. ";
911  if (!b.output_regularizer)
912  b.all->trace_message << "If you want to optimize further, increase the number of passes\n";
913  if (b.output_regularizer)
914  {
915  b.all->trace_message << "\nRegular model file has been created. ";
916  b.all->trace_message << "Output feature regularizer file is created only when the convergence is reached. "
917  "Try increasing the number of passes for convergence\n";
918  b.output_regularizer = false;
919  }
920  }
921 
922  // attain convergence before reaching max iterations
923  if (status != LEARN_OK && b.final_pass > b.current_pass)
924  {
925  b.final_pass = b.current_pass;
926  }
927  else
928  {
929  // Not converged yet.
930  // Reset preconditioner to zero so that it is correctly recomputed in the next pass
931  zero_preconditioner(*all);
932  }
933  if (!all->holdout_set_off)
934  {
937  if (b.early_stop_thres == b.no_win_counter)
938  {
939  set_done(*all);
940  b.all->trace_message << "Early termination reached w.r.t. holdout set error";
941  }
942  }
943  if (b.final_pass == b.current_pass)
944  {
946  set_done(*all);
947  }
948  }
949  else // reaching convergence in the previous pass
950  b.current_pass++;
951  }
952 }
void zero_preconditioner(vw &all)
Definition: bfgs.cc:124
void set_done(vw &all)
Definition: parser.cc:578
bool output_regularizer
Definition: bfgs.cc:89
int process_pass(vw &all, bfgs &b)
Definition: bfgs.cc:639
void finalize_regressor(vw &all, std::string reg_name)
#define LEARN_OK
Definition: bfgs.cc:41
bool holdout_set_off
Definition: global_data.h:499
bool summarize_holdout_set(vw &all, size_t &no_win_counter)
vw_ostream trace_message
Definition: global_data.h:424
vw * all
Definition: bfgs.cc:64
size_t final_pass
Definition: bfgs.cc:70
size_t current_pass
Definition: bfgs.cc:79
size_t no_win_counter
Definition: bfgs.cc:80
size_t early_stop_thres
Definition: bfgs.cc:81
std::string final_regressor_name
Definition: global_data.h:535

◆ finalize_preconditioner() [1/2]

template<class T >
void finalize_preconditioner ( vw & ,
bfgs &  b,
float  regularization,
T &  weights 
)

Definition at line 498 of file bfgs.cc.

References f, max_precond_ratio, bfgs::regularizers, and W_COND.

Referenced by finalize_preconditioner(), and process_pass().

499 {
500  float max_hessian = 0.f;
501 
502  if (b.regularizers == nullptr)
503  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
504  {
505  (&(*w))[W_COND] += regularization;
506  if ((&(*w))[W_COND] > max_hessian)
507  max_hessian = (&(*w))[W_COND];
508  if ((&(*w))[W_COND] > 0)
509  (&(*w))[W_COND] = 1.f / (&(*w))[W_COND];
510  }
511  else
512  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
513  {
514  (&(*w))[W_COND] += b.regularizers[2 * (w.index() >> weights.stride_shift())];
515  if ((&(*w))[W_COND] > max_hessian)
516  max_hessian = (&(*w))[W_COND];
517  if ((&(*w))[W_COND] > 0)
518  (&(*w))[W_COND] = 1.f / (&(*w))[W_COND];
519  }
520 
521  float max_precond = (max_hessian == 0.f) ? 0.f : max_precond_ratio / max_hessian;
522 
523  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
524  {
525  if (std::isinf(*w) || *w > max_precond)
526  (&(*w))[W_COND] = max_precond;
527  }
528 }
weight * regularizers
Definition: bfgs.cc:94
constexpr float max_precond_ratio
Definition: bfgs.cc:60
#define W_COND
Definition: bfgs.cc:39
float f
Definition: cache.cc:40

◆ finalize_preconditioner() [2/2]

void finalize_preconditioner ( vw &  all,
bfgs &  b,
float  regularization 
)

Definition at line 529 of file bfgs.cc.

References parameters::dense_weights, finalize_preconditioner(), parameters::sparse, parameters::sparse_weights, and vw::weights.

530 {
531  if (all.weights.sparse)
532  finalize_preconditioner(all, b, regularization, all.weights.sparse_weights);
533  else
534  finalize_preconditioner(all, b, regularization, all.weights.dense_weights);
535 }
parameters weights
Definition: global_data.h:537
dense_parameters dense_weights
void finalize_preconditioner(vw &, bfgs &b, float regularization, T &weights)
Definition: bfgs.cc:498
sparse_parameters sparse_weights

◆ init_driver()

void init_driver ( bfgs &  b)

Definition at line 1091 of file bfgs.cc.

References bfgs::backstep_on.

Referenced by bfgs_setup().

1091 { b.backstep_on = true; }
bool backstep_on
Definition: bfgs.cc:85

◆ learn()

template<bool audit>
void learn ( bfgs &  b,
base_learner &  base,
example &  ec 
)

Definition at line 965 of file bfgs.cc.

References bfgs::all, bfgs::current_pass, bfgs::final_pass, example::in_use, process_example(), and test_example().

Referenced by LEARNER::init_cost_sensitive_learner(), LEARNER::learner< CB_EXPLORE::cb_explore, example >::init_learner(), LEARNER::init_learner(), LEARNER::init_multiclass_learner(), MWT::predict_or_learn(), VW::cb_explore_adf::softmax::setup(), VW::cb_explore_adf::greedy::setup(), VW::cb_explore_adf::first::setup(), VW::cb_explore_adf::bag::setup(), VW::cb_explore_adf::cover::setup(), and VW::cb_explore_adf::regcb::setup().

966 {
967  vw* all = b.all;
968  assert(ec.in_use);
969 
970  if (b.current_pass <= b.final_pass)
971  {
972  if (test_example(ec))
973  predict<audit>(b, base, ec);
974  else
975  process_example(*all, b, ec);
976  }
977 }
constexpr bool test_example(example &ec) noexcept
Definition: bfgs.cc:147
void process_example(vw &all, bfgs &b, example &ec)
Definition: bfgs.cc:861
vw * all
Definition: bfgs.cc:64
size_t final_pass
Definition: bfgs.cc:70
bool in_use
Definition: example.h:79
size_t current_pass
Definition: bfgs.cc:79

◆ preconditioner_to_regularizer() [1/2]

template<class T >
void preconditioner_to_regularizer ( vw all,
bfgs b,
float  regularization,
T &  weights 
)

Definition at line 538 of file bfgs.cc.

References f, vw::num_bits, bfgs::regularizers, THROW, and W_COND.

Referenced by preconditioner_to_regularizer(), and save_load_regularizer().

539 {
540  uint32_t length = 1 << all.num_bits;
541 
542  if (b.regularizers == nullptr)
543  {
544  b.regularizers = calloc_or_throw<weight>(2 * length);
545 
546  if (b.regularizers == nullptr)
547  THROW("Failed to allocate weight array: try decreasing -b <bits>");
548 
549  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
550  {
551  uint64_t i = w.index() >> weights.stride_shift();
552  b.regularizers[2 * i] = regularization;
553  if ((&(*w))[W_COND] > 0.f)
554  b.regularizers[2 * i] += 1.f / (&(*w))[W_COND];
555  }
556  }
557  else
558  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
559  {
560  if ((&(*w))[W_COND] > 0.f)
561  b.regularizers[2 * (w.index() >> weights.stride_shift())] += 1.f / (&(*w))[W_COND];
562  }
563 
564  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
565  b.regularizers[2 * (w.index() >> weights.stride_shift()) + 1] = *w;
566 }
uint32_t num_bits
Definition: global_data.h:398
weight * regularizers
Definition: bfgs.cc:94
#define W_COND
Definition: bfgs.cc:39
#define THROW(args)
Definition: vw_exception.h:181
float f
Definition: cache.cc:40

◆ preconditioner_to_regularizer() [2/2]

void preconditioner_to_regularizer ( vw all,
bfgs b,
float  regularization 
)

Definition at line 567 of file bfgs.cc.

References parameters::dense_weights, preconditioner_to_regularizer(), parameters::sparse, parameters::sparse_weights, and vw::weights.

568 {
569  if (all.weights.sparse)
570  preconditioner_to_regularizer(all, b, regularization, all.weights.sparse_weights);
571  else
572  preconditioner_to_regularizer(all, b, regularization, all.weights.dense_weights);
573 }
void preconditioner_to_regularizer(vw &all, bfgs &b, float regularization, T &weights)
Definition: bfgs.cc:538
parameters weights
Definition: global_data.h:537
dense_parameters dense_weights
sparse_parameters sparse_weights

◆ predict()

template<bool audit>
void predict ( bfgs b,
base_learner ,
example ec 
)

◆ predict_and_gradient()

float predict_and_gradient ( vw all,
example ec 
)

Definition at line 157 of file bfgs.cc.

References bfgs_predict(), loss_function::first_derivative(), example::l, label_data::label, vw::loss, vw::sd, vw::set_minmax, polylabel::simple, and example::weight.

Referenced by process_example().

158 {
159  float fp = bfgs_predict(all, ec);
160  label_data& ld = ec.l.simple;
161  all.set_minmax(all.sd, ld.label);
162 
163  float loss_grad = all.loss->first_derivative(all.sd, fp, ld.label) * ec.weight;
164  GD::foreach_feature<float, add_grad>(all, ec, loss_grad);
165 
166  return fp;
167 }
float bfgs_predict(vw &all, example &ec)
Definition: bfgs.cc:149
loss_function * loss
Definition: global_data.h:523
float label
Definition: simple_label.h:14
label_data simple
Definition: example.h:28
virtual float first_derivative(shared_data *, float prediction, float label)=0
void(* set_minmax)(shared_data *sd, float label)
Definition: global_data.h:394
shared_data * sd
Definition: global_data.h:375
polylabel l
Definition: example.h:57
float weight
Definition: example.h:62

◆ process_example()

void process_example ( vw all,
bfgs b,
example ec 
)

Definition at line 861 of file bfgs.cc.

References bfgs::curvature, dot_with_direction(), bfgs::example_number, bfgs::first_pass, loss_function::getLoss(), bfgs::gradient_pass, bfgs::importance_weight_sum, example::l, label_data::label, example::loss, vw::loss, bfgs::loss_sum, example::partial_prediction, bfgs::preconditioner_pass, example::pred, predict_and_gradient(), bfgs::predictions, v_array< T >::push_back(), polyprediction::scalar, vw::sd, loss_function::second_derivative(), polylabel::simple, v_array< T >::size(), update_preconditioner(), example::updated_prediction, and example::weight.

Referenced by learn().

862 {
863  label_data& ld = ec.l.simple;
864  if (b.first_pass)
865  b.importance_weight_sum += ec.weight;
866 
867  /********************************************************************/
868  /* I) GRADIENT CALCULATION ******************************************/
869  /********************************************************************/
870  if (b.gradient_pass)
871  {
872  ec.pred.scalar = predict_and_gradient(all, ec); // w[0] & w[1]
873  ec.loss = all.loss->getLoss(all.sd, ec.pred.scalar, ld.label) * ec.weight;
874  b.loss_sum += ec.loss;
875  b.predictions.push_back(ec.pred.scalar);
876  }
877  /********************************************************************/
878  /* II) CURVATURE CALCULATION ****************************************/
879  /********************************************************************/
880  else // computing curvature
881  {
882  float d_dot_x = dot_with_direction(all, ec); // w[2]
883  if (b.example_number >= b.predictions.size()) // Make things safe in case example source is strange.
884  b.example_number = b.predictions.size() - 1;
885  ec.pred.scalar = b.predictions[b.example_number];
886  ec.partial_prediction = b.predictions[b.example_number];
887  ec.loss = all.loss->getLoss(all.sd, ec.pred.scalar, ld.label) * ec.weight;
888  float sd = all.loss->second_derivative(all.sd, b.predictions[b.example_number++], ld.label);
889  b.curvature += ((double)d_dot_x) * d_dot_x * sd * ec.weight;
890  }
891  ec.updated_prediction = ec.pred.scalar;
892 
893  if (b.preconditioner_pass)
894  update_preconditioner(all, ec); // w[3]
895 }
loss_function * loss
Definition: global_data.h:523
float scalar
Definition: example.h:45
virtual float second_derivative(shared_data *, float prediction, float label)=0
float dot_with_direction(vw &all, example &ec)
Definition: bfgs.cc:179
float partial_prediction
Definition: example.h:68
double curvature
Definition: bfgs.cc:100
float label
Definition: simple_label.h:14
float updated_prediction
Definition: example.h:69
label_data simple
Definition: example.h:28
size_t size() const
Definition: v_array.h:68
v_array< float > predictions
Definition: bfgs.cc:77
virtual float getLoss(shared_data *, float prediction, float label)=0
void push_back(const T &new_ele)
Definition: v_array.h:107
shared_data * sd
Definition: global_data.h:375
void update_preconditioner(vw &all, example &ec)
Definition: bfgs.cc:171
bool gradient_pass
Definition: bfgs.cc:104
float loss
Definition: example.h:70
polylabel l
Definition: example.h:57
size_t example_number
Definition: bfgs.cc:78
float predict_and_gradient(vw &all, example &ec)
Definition: bfgs.cc:157
polyprediction pred
Definition: example.h:60
double loss_sum
Definition: bfgs.cc:97
double importance_weight_sum
Definition: bfgs.cc:99
float weight
Definition: example.h:62
bool preconditioner_pass
Definition: bfgs.cc:105
bool first_pass
Definition: bfgs.cc:103

◆ process_pass()

int process_pass ( vw all,
bfgs b 
)

Definition at line 639 of file bfgs.cc.

References accumulate(), accumulate_scalar(), add_regularization(), vw::all_reduce, bfgs::alpha, bfgs::backstep_on, bfgs_iter_middle(), bfgs_iter_start(), v_array< T >::clear(), bfgs::current_pass, curv_message, bfgs::curvature, derivative_in_direction(), direction_magnitude(), bfgs::example_number, f, vw::final_regressor_name, finalize_preconditioner(), bfgs::first_hessian_on, bfgs::first_pass, bfgs::gradient_pass, vw::hessian_on, vw::holdout_set_off, shared_data::holdout_sum_loss_since_last_pass, bfgs::importance_weight_sum, vw::l2_lambda, bfgs::lastj, LEARN_CONV, LEARN_CURV, LEARN_OK, bfgs::loss_sum, bfgs::mem, bfgs::net_time, bfgs::origin, bfgs::output_regularizer, bfgs::preconditioner_pass, bfgs::predictions, bfgs::previous_loss_sum, vw::quiet, regularizer_direction_magnitude(), bfgs::rel_threshold, bfgs::rho, vw::save_per_pass, save_predictor(), vw::sd, bfgs::step_size, bfgs::t_end_global, update_weight(), W_COND, shared_data::weighted_holdout_examples_since_last_pass, vw::weights, wolfe_eval(), and zero_derivative().

Referenced by end_pass().

640 {
641  int status = LEARN_OK;
642 
643  finalize_preconditioner(all, b, all.l2_lambda);
644  /********************************************************************/
645  /* A) FIRST PASS FINISHED: INITIALIZE FIRST LINE SEARCH *************/
646  /********************************************************************/
647  if (b.first_pass)
648  {
649  if (all.all_reduce != nullptr)
650  {
651  accumulate(all, all.weights, W_COND); // Accumulate preconditioner
652  float temp = (float)b.importance_weight_sum;
653  b.importance_weight_sum = accumulate_scalar(all, temp); // Accumulate importance weights
654  }
655  // finalize_preconditioner(all, b, all.l2_lambda);
656  if (all.all_reduce != nullptr)
657  {
658  float temp = (float)b.loss_sum;
659  b.loss_sum = accumulate_scalar(all, temp); // Accumulate loss_sums
660  accumulate(all, all.weights, 1); // Accumulate gradients from all nodes
661  }
662  if (all.l2_lambda > 0.)
663  b.loss_sum += add_regularization(all, b, all.l2_lambda);
664  if (!all.quiet)
665  fprintf(stderr, "%2lu %-10.5f\t", (long unsigned int)b.current_pass + 1, b.loss_sum / b.importance_weight_sum);
666 
667  b.previous_loss_sum = b.loss_sum;
668  b.loss_sum = 0.;
669  b.example_number = 0;
670  b.curvature = 0;
671  bfgs_iter_start(all, b, b.mem, b.lastj, b.importance_weight_sum, b.origin);
672  if (b.first_hessian_on)
673  {
674  b.gradient_pass = false; // now start computing curvature
675  }
676  else
677  {
678  b.step_size = 0.5;
679  float d_mag = direction_magnitude(all);
680  ftime(&b.t_end_global);
681  b.net_time = (int)(1000.0 * (b.t_end_global.time - b.t_start_global.time) +
682  (b.t_end_global.millitm - b.t_start_global.millitm));
683  if (!all.quiet)
684  fprintf(stderr, "%-10s\t%-10.5f\t%-.5f\n", "", d_mag, b.step_size);
685  b.predictions.clear();
686  update_weight(all, b.step_size);
687  }
688  }
689  else
690  /********************************************************************/
691  /* B) GRADIENT CALCULATED *******************************************/
692  /********************************************************************/
693  if (b.gradient_pass) // We just finished computing all gradients
694  {
695  if (all.all_reduce != nullptr)
696  {
697  float t = (float)b.loss_sum;
698  b.loss_sum = accumulate_scalar(all, t); // Accumulate loss_sums
699  accumulate(all, all.weights, 1); // Accumulate gradients from all nodes
700  }
701  if (all.l2_lambda > 0.)
702  b.loss_sum += add_regularization(all, b, all.l2_lambda);
703  if (!all.quiet)
704  {
705  if (!all.holdout_set_off && b.current_pass >= 1)
706  {
707  if (all.sd->holdout_sum_loss_since_last_pass == 0. && all.sd->weighted_holdout_examples_since_last_pass == 0.)
708  {
709  fprintf(stderr, "%2lu ", (long unsigned int)b.current_pass + 1);
710  fprintf(stderr, "h unknown ");
711  }
712  else
713  fprintf(stderr, "%2lu h%-10.5f\t", (long unsigned int)b.current_pass + 1,
714  all.sd->holdout_sum_loss_since_last_pass / all.sd->weighted_holdout_examples_since_last_pass);
715  }
716  else
717  fprintf(stderr, "%2lu %-10.5f\t", (long unsigned int)b.current_pass + 1, b.loss_sum / b.importance_weight_sum);
718  }
719  double wolfe1;
720  double new_step = wolfe_eval(
721  all, b, b.mem, b.loss_sum, b.previous_loss_sum, b.step_size, b.importance_weight_sum, b.origin, wolfe1);
722 
723  /********************************************************************/
724  /* B0) DERIVATIVE ZERO: MINIMUM FOUND *******************************/
725  /********************************************************************/
726  if (std::isnan((float)wolfe1))
727  {
728  fprintf(stderr, "\n");
729  fprintf(stdout, "Derivative 0 detected.\n");
730  b.step_size = 0.0;
731  status = LEARN_CONV;
732  }
733  /********************************************************************/
734  /* B1) LINE SEARCH FAILED *******************************************/
735  /********************************************************************/
736  else if (b.backstep_on && (wolfe1 < b.wolfe1_bound || b.loss_sum > b.previous_loss_sum))
737  {
738  // curvature violated, or we stepped too far last time: step back
739  ftime(&b.t_end_global);
740  b.net_time = (int)(1000.0 * (b.t_end_global.time - b.t_start_global.time) +
741  (b.t_end_global.millitm - b.t_start_global.millitm));
742  float ratio = (b.step_size == 0.f) ? 0.f : (float)new_step / (float)b.step_size;
743  if (!all.quiet)
744  fprintf(stderr, "%-10s\t%-10s\t(revise x %.1f)\t%-.5f\n", "", "", ratio, new_step);
745  b.predictions.clear();
746  update_weight(all, (float)(-b.step_size + new_step));
747  b.step_size = (float)new_step;
748  zero_derivative(all);
749  b.loss_sum = 0.;
750  }
751 
752  /********************************************************************/
753  /* B2) LINE SEARCH SUCCESSFUL OR DISABLED ******************/
754  /* DETERMINE NEXT SEARCH DIRECTION ******************/
755  /********************************************************************/
756  else
757  {
758  double rel_decrease = (b.previous_loss_sum - b.loss_sum) / b.previous_loss_sum;
759  if (!std::isnan((float)rel_decrease) && b.backstep_on && fabs(rel_decrease) < b.rel_threshold)
760  {
761  fprintf(stdout,
762  "\nTermination condition reached in pass %ld: decrease in loss less than %.3f%%.\n"
763  "If you want to optimize further, decrease termination threshold.\n",
764  (long int)b.current_pass + 1, b.rel_threshold * 100.0);
765  status = LEARN_CONV;
766  }
767  b.previous_loss_sum = b.loss_sum;
768  b.loss_sum = 0.;
769  b.example_number = 0;
770  b.curvature = 0;
771  b.step_size = 1.0;
772 
773  try
774  {
775  bfgs_iter_middle(all, b, b.mem, b.rho, b.alpha, b.lastj, b.origin);
776  }
777  catch (const curv_exception&)
778  {
779  fprintf(stdout, "In bfgs_iter_middle: %s", curv_message);
780  b.step_size = 0.0;
781  status = LEARN_CURV;
782  }
783 
784  if (all.hessian_on)
785  {
786  b.gradient_pass = false; // now start computing curvature
787  }
788  else
789  {
790  float d_mag = direction_magnitude(all);
791  ftime(&b.t_end_global);
792  b.net_time = (int)(1000.0 * (b.t_end_global.time - b.t_start_global.time) +
793  (b.t_end_global.millitm - b.t_start_global.millitm));
794  if (!all.quiet)
795  fprintf(stderr, "%-10s\t%-10.5f\t%-.5f\n", "", d_mag, b.step_size);
796  b.predictions.clear();
797  update_weight(all, b.step_size);
798  }
799  }
800  }
801 
802  /********************************************************************/
803  /* C) NOT FIRST PASS, CURVATURE CALCULATED **************************/
804  /********************************************************************/
805  else // just finished all second gradients
806  {
807  if (all.all_reduce != nullptr)
808  {
809  float t = (float)b.curvature;
810  b.curvature = accumulate_scalar(all, t); // Accumulate curvatures
811  }
812  if (all.l2_lambda > 0.)
813  b.curvature += regularizer_direction_magnitude(all, b, all.l2_lambda);
814  float dd = (float)derivative_in_direction(all, b, b.mem, b.origin);
815  if (b.curvature == 0. && dd != 0.)
816  {
817  fprintf(stdout, "%s", curv_message);
818  b.step_size = 0.0;
819  status = LEARN_CURV;
820  }
821  else if (dd == 0.)
822  {
823  fprintf(stdout, "Derivative 0 detected.\n");
824  b.step_size = 0.0;
825  status = LEARN_CONV;
826  }
827  else
828  b.step_size = -dd / (float)b.curvature;
829 
830  float d_mag = direction_magnitude(all);
831 
832  b.predictions.clear();
833  update_weight(all, b.step_size);
834  ftime(&b.t_end_global);
835  b.net_time = (int)(1000.0 * (b.t_end_global.time - b.t_start_global.time) +
836  (b.t_end_global.millitm - b.t_start_global.millitm));
837 
838  if (!all.quiet)
839  fprintf(stderr, "%-10.5f\t%-10.5f\t%-.5f\n", b.curvature / b.importance_weight_sum, d_mag, b.step_size);
840  b.gradient_pass = true;
841  } // now start computing derivatives.
842  b.current_pass++;
843  b.first_pass = false;
844  b.preconditioner_pass = false;
845 
846  if (b.output_regularizer) // need to accumulate and place the regularizer.
847  {
848  if (all.all_reduce != nullptr)
849  accumulate(all, all.weights, W_COND); // Accumulate preconditioner
850  // preconditioner_to_regularizer(all, b, all.l2_lambda);
851  }
852  ftime(&b.t_end_global);
853  b.net_time = (int)(1000.0 * (b.t_end_global.time - b.t_start_global.time) +
854  (b.t_end_global.millitm - b.t_start_global.millitm));
855 
856  if (all.save_per_pass)
857  save_predictor(all, all.final_regressor_name, b.current_pass);
858  return status;
859 }
parameters weights
Definition: global_data.h:537
double add_regularization(vw &all, bfgs &b, float regularization, T &weights)
Definition: bfgs.cc:448
bool backstep_on
Definition: bfgs.cc:85
void accumulate(vw &all, parameters &weights, size_t offset)
Definition: accumulate.cc:20
int origin
Definition: bfgs.cc:96
bool output_regularizer
Definition: bfgs.cc:89
double holdout_sum_loss_since_last_pass
Definition: global_data.h:163
float direction_magnitude(vw &, T &weights)
Definition: bfgs.cc:218
float * mem
Definition: bfgs.cc:90
#define LEARN_CURV
Definition: bfgs.cc:42
#define LEARN_CONV
Definition: bfgs.cc:43
double * alpha
Definition: bfgs.cc:92
bool quiet
Definition: global_data.h:487
double curvature
Definition: bfgs.cc:100
void bfgs_iter_middle(vw &all, bfgs &b, float *mem, double *rho, double *alpha, int &lastj, int &origin, T &weights)
Definition: bfgs.cc:270
#define LEARN_OK
Definition: bfgs.cc:41
bool holdout_set_off
Definition: global_data.h:499
float step_size
Definition: bfgs.cc:98
bool hessian_on
Definition: global_data.h:413
void save_predictor(vw &all, std::string reg_name, size_t current_pass)
double derivative_in_direction(vw &, bfgs &b, float *mem, int &origin, T &weights)
Definition: bfgs.cc:605
AllReduce * all_reduce
Definition: global_data.h:381
constexpr const char * curv_message
Definition: bfgs.cc:116
v_array< float > predictions
Definition: bfgs.cc:77
shared_data * sd
Definition: global_data.h:375
float l2_lambda
Definition: global_data.h:445
void clear()
Definition: v_array.h:88
double weighted_holdout_examples_since_last_pass
Definition: global_data.h:162
double wolfe_eval(vw &all, bfgs &b, float *mem, double loss_sum, double previous_loss_sum, double step_size, double importance_weight_sum, int &origin, double &wolfe1, T &weights)
Definition: bfgs.cc:409
double net_time
Definition: bfgs.cc:75
double regularizer_direction_magnitude(vw &, bfgs &b, double regularizer, T &weights)
Definition: bfgs.cc:187
void zero_derivative(vw &all)
Definition: bfgs.cc:122
bool first_hessian_on
Definition: bfgs.cc:84
bool gradient_pass
Definition: bfgs.cc:104
int lastj
Definition: bfgs.cc:96
void update_weight(vw &, float step_size, T &w)
Definition: bfgs.cc:625
#define W_COND
Definition: bfgs.cc:39
struct timeb t_start_global t_end_global
Definition: bfgs.cc:74
double * rho
Definition: bfgs.cc:91
bool save_per_pass
Definition: global_data.h:408
void finalize_preconditioner(vw &, bfgs &b, float regularization, T &weights)
Definition: bfgs.cc:498
size_t example_number
Definition: bfgs.cc:78
double previous_loss_sum
Definition: bfgs.cc:97
float accumulate_scalar(vw &all, float local_sum)
Definition: accumulate.cc:44
size_t current_pass
Definition: bfgs.cc:79
double loss_sum
Definition: bfgs.cc:97
double importance_weight_sum
Definition: bfgs.cc:99
std::string final_regressor_name
Definition: global_data.h:535
bool preconditioner_pass
Definition: bfgs.cc:105
float f
Definition: cache.cc:40
void bfgs_iter_start(vw &all, bfgs &b, float *mem, int &lastj, double importance_weight_sum, int &origin, T &weights)
Definition: bfgs.cc:238
bool first_pass
Definition: bfgs.cc:103
float rel_threshold
Definition: bfgs.cc:66

◆ regularizer_direction_magnitude() [1/2]

template<class T >
double regularizer_direction_magnitude ( vw ,
bfgs b,
double  regularizer,
T &  weights 
)

Definition at line 187 of file bfgs.cc.

References bfgs::regularizers, and W_DIR.

Referenced by process_pass(), and regularizer_direction_magnitude().

188 {
189  double ret = 0.;
190  if (b.regularizers == nullptr)
191  for (typename T::iterator iter = weights.begin(); iter != weights.end(); ++iter)
192  ret += regularizer * (&(*iter))[W_DIR] * (&(*iter))[W_DIR];
193 
194  else
195  {
196  for (typename T::iterator iter = weights.begin(); iter != weights.end(); ++iter)
197  ret += ((double)b.regularizers[2 * (iter.index() >> weights.stride_shift())]) * (&(*iter))[W_DIR] *
198  (&(*iter))[W_DIR];
199  }
200  return ret;
201 }
#define W_DIR
Definition: bfgs.cc:38
weight * regularizers
Definition: bfgs.cc:94

◆ regularizer_direction_magnitude() [2/2]

double regularizer_direction_magnitude ( vw all,
bfgs b,
float  regularizer 
)

Definition at line 203 of file bfgs.cc.

References parameters::dense_weights, regularizer_direction_magnitude(), parameters::sparse, parameters::sparse_weights, and vw::weights.

204 {
205  // compute direction magnitude
206  double ret = 0.;
207 
208  if (regularizer == 0.)
209  return ret;
210 
211  if (all.weights.sparse)
212  return regularizer_direction_magnitude(all, b, regularizer, all.weights.sparse_weights);
213  else
214  return regularizer_direction_magnitude(all, b, regularizer, all.weights.dense_weights);
215 }
parameters weights
Definition: global_data.h:537
dense_parameters dense_weights
double regularizer_direction_magnitude(vw &, bfgs &b, double regularizer, T &weights)
Definition: bfgs.cc:187
sparse_parameters sparse_weights

◆ regularizer_to_weight() [1/2]

template<class T >
void regularizer_to_weight ( vw ,
bfgs b,
T &  weights 
)

Definition at line 576 of file bfgs.cc.

References bfgs::regularizers, and W_COND.

Referenced by regularizer_to_weight(), and save_load_regularizer().

577 {
578  if (b.regularizers != nullptr)
579  {
580  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
581  {
582  uint64_t i = w.index() >> weights.stride_shift();
583  (&(*w))[W_COND] = b.regularizers[2 * i];
584  *w = b.regularizers[2 * i + 1];
585  }
586  }
587 }
weight * regularizers
Definition: bfgs.cc:94
#define W_COND
Definition: bfgs.cc:39

◆ regularizer_to_weight() [2/2]

void regularizer_to_weight ( vw all,
bfgs b 
)

Definition at line 589 of file bfgs.cc.

References parameters::dense_weights, regularizer_to_weight(), parameters::sparse, parameters::sparse_weights, and vw::weights.

590 {
591  if (all.weights.sparse)
592  regularizer_to_weight(all, b, all.weights.sparse_weights);
593  else
594  regularizer_to_weight(all, b, all.weights.dense_weights);
595 }
parameters weights
Definition: global_data.h:537
void regularizer_to_weight(vw &, bfgs &b, T &weights)
Definition: bfgs.cc:576
dense_parameters dense_weights
sparse_parameters sparse_weights

◆ reset_state()

void reset_state ( vw all,
bfgs b,
bool  zero 
)

Definition at line 126 of file bfgs.cc.

References bfgs::curvature, bfgs::first_pass, bfgs::gradient_pass, bfgs::importance_weight_sum, bfgs::lastj, bfgs::loss_sum, bfgs::origin, bfgs::preconditioner_pass, bfgs::previous_loss_sum, zero_derivative(), and zero_preconditioner().

Referenced by save_load().

127 {
128  b.lastj = b.origin = 0;
129  b.loss_sum = b.previous_loss_sum = 0.;
130  b.importance_weight_sum = 0.;
131  b.curvature = 0.;
132  b.first_pass = true;
133  b.gradient_pass = true;
134  b.preconditioner_pass = true;
135  if (zero)
136  {
137  zero_derivative(all);
138  zero_preconditioner(all);
139  }
140 }
void zero_preconditioner(vw &all)
Definition: bfgs.cc:124
int origin
Definition: bfgs.cc:96
double curvature
Definition: bfgs.cc:100
void zero_derivative(vw &all)
Definition: bfgs.cc:122
bool gradient_pass
Definition: bfgs.cc:104
int lastj
Definition: bfgs.cc:96
double previous_loss_sum
Definition: bfgs.cc:97
double loss_sum
Definition: bfgs.cc:97
double importance_weight_sum
Definition: bfgs.cc:99
bool preconditioner_pass
Definition: bfgs.cc:105
bool first_pass
Definition: bfgs.cc:103

◆ save_load()

void save_load ( bfgs b,
io_buf model_file,
bool  read,
bool  text 
)

Definition at line 1026 of file bfgs.cc.

References bfgs::all, bfgs::alpha, bin_text_read_write_fixed(), CG_EXTRA, io_buf::files, initialize_regressor(), vw::l2_lambda, vw::length(), bfgs::m, bfgs::mem, bfgs::mem_stride, bfgs::net_time, vw::num_bits, bfgs::output_regularizer, vw::per_feature_regularizer_input, vw::per_feature_regularizer_output, vw::per_feature_regularizer_text, vw::quiet, bfgs::regularizers, reset_state(), bfgs::rho, GD::save_load_regressor(), save_load_regularizer(), v_array< T >::size(), stride_shift(), parameters::stride_shift(), THROW, and vw::weights.

Referenced by bfgs_setup().

1027 {
1028  vw* all = b.all;
1029 
1030  uint32_t length = 1 << all->num_bits;
1031 
1032  if (read)
1033  {
1034  initialize_regressor(*all);
1035  if (all->per_feature_regularizer_input != "")
1036  {
1037  b.regularizers = calloc_or_throw<weight>(2 * length);
1038  if (b.regularizers == nullptr)
1039  THROW("Failed to allocate regularizers array: try decreasing -b <bits>");
1040  }
1041  int m = b.m;
1042 
1043  b.mem_stride = (m == 0) ? CG_EXTRA : 2 * m;
1044  b.mem = calloc_or_throw<float>(all->length() * b.mem_stride);
1045  b.rho = calloc_or_throw<double>(m);
1046  b.alpha = calloc_or_throw<double>(m);
1047 
1048  uint32_t stride_shift = all->weights.stride_shift();
1049 
1050  if (!all->quiet)
1051  std::cerr << "m = " << m << std::endl
1052  << "Allocated "
1053  << ((long unsigned int)all->length() *
1054  (sizeof(float) * (b.mem_stride) + (sizeof(weight) << stride_shift)) >>
1055  20)
1056  << "M for weights and mem" << std::endl;
1057 
1058  b.net_time = 0.0;
1059  ftime(&b.t_start_global);
1060 
1061  if (!all->quiet)
1062  {
1063  const char* header_fmt = "%2s %-10s\t%-10s\t%-10s\t %-10s\t%-10s\t%-10s\t%-10s\t%-10s\t%-s\n";
1064  fprintf(stderr, header_fmt, "##", "avg. loss", "der. mag.", "d. m. cond.", "wolfe1", "wolfe2", "mix fraction",
1065  "curvature", "dir. magnitude", "step size");
1066  std::cerr.precision(5);
1067  }
1068 
1069  if (b.regularizers != nullptr)
1070  all->l2_lambda = 1; // To make sure we are adding the regularization
1072  reset_state(*all, b, false);
1073  }
1074 
1075  // bool reg_vector = b.output_regularizer || all->per_feature_regularizer_input.length() > 0;
1076  bool reg_vector = (b.output_regularizer && !read) || (all->per_feature_regularizer_input.length() > 0 && read);
1077 
1078  if (model_file.files.size() > 0)
1079  {
1080  std::stringstream msg;
1081  msg << ":" << reg_vector << "\n";
1082  bin_text_read_write_fixed(model_file, (char*)&reg_vector, sizeof(reg_vector), "", read, msg, text);
1083 
1084  if (reg_vector)
1085  save_load_regularizer(*all, b, model_file, read, text);
1086  else
1087  GD::save_load_regressor(*all, model_file, read, text);
1088  }
1089 }
size_t length()
Definition: global_data.h:513
parameters weights
Definition: global_data.h:537
void initialize_regressor(vw &all, T &weights)
bool output_regularizer
Definition: bfgs.cc:89
uint64_t stride_shift(const stagewise_poly &poly, uint64_t idx)
float * mem
Definition: bfgs.cc:90
int m
Definition: bfgs.cc:65
double * alpha
Definition: bfgs.cc:92
bool quiet
Definition: global_data.h:487
uint32_t num_bits
Definition: global_data.h:398
weight * regularizers
Definition: bfgs.cc:94
size_t size() const
Definition: v_array.h:68
#define CG_EXTRA
Definition: bfgs.cc:29
int mem_stride
Definition: bfgs.cc:88
float l2_lambda
Definition: global_data.h:445
void save_load_regularizer(vw &all, bfgs &b, io_buf &model_file, bool read, bool text)
Definition: bfgs.cc:979
v_array< int > files
Definition: io_buf.h:64
std::string per_feature_regularizer_output
Definition: global_data.h:441
double net_time
Definition: bfgs.cc:75
vw * all
Definition: bfgs.cc:64
std::string per_feature_regularizer_text
Definition: global_data.h:442
float weight
double * rho
Definition: bfgs.cc:91
uint32_t stride_shift()
void reset_state(vw &all, bfgs &b, bool zero)
Definition: bfgs.cc:126
std::string per_feature_regularizer_input
Definition: global_data.h:440
void save_load_regressor(vw &all, io_buf &model_file, bool read, bool text, T &weights)
Definition: gd.cc:707
size_t bin_text_read_write_fixed(io_buf &io, char *data, size_t len, const char *read_message, bool read, std::stringstream &msg, bool text)
Definition: io_buf.h:326
#define THROW(args)
Definition: vw_exception.h:181

◆ save_load_regularizer()

void save_load_regularizer ( vw all,
bfgs b,
io_buf model_file,
bool  read,
bool  text 
)

Definition at line 979 of file bfgs.cc.

References bfgs::all, io_buf::bin_read_fixed(), bin_text_write_fixed(), c, vw::l2_lambda, vw::num_bits, bfgs::output_regularizer, preconditioner_to_regularizer(), regularizer_to_weight(), and bfgs::regularizers.

Referenced by save_load().

980 {
981  int c = 0;
982  uint32_t length = 2 * (1 << all.num_bits);
983  uint32_t i = 0;
984  size_t brw = 1;
985 
986  if (b.output_regularizer && !read)
987  preconditioner_to_regularizer(*(b.all), b, b.all->l2_lambda);
988 
989  do
990  {
991  brw = 1;
992  weight* v;
993  if (read)
994  {
995  c++;
996  brw = model_file.bin_read_fixed((char*)&i, sizeof(i), "");
997  if (brw > 0)
998  {
999  assert(i < length);
1000  v = &(b.regularizers[i]);
1001  brw += model_file.bin_read_fixed((char*)v, sizeof(*v), "");
1002  }
1003  }
1004  else // write binary or text
1005  {
1006  v = &(b.regularizers[i]);
1007  if (*v != 0.)
1008  {
1009  c++;
1010  std::stringstream msg;
1011  msg << i;
1012  brw = bin_text_write_fixed(model_file, (char*)&i, sizeof(i), msg, text);
1013 
1014  msg << ":" << *v << "\n";
1015  brw += bin_text_write_fixed(model_file, (char*)v, sizeof(*v), msg, text);
1016  }
1017  }
1018  if (!read)
1019  i++;
1020  } while ((!read && i < length) || (read && brw > 0));
1021 
1022  if (read)
1023  regularizer_to_weight(all, b);
1024 }
void preconditioner_to_regularizer(vw &all, bfgs &b, float regularization, T &weights)
Definition: bfgs.cc:538
bool output_regularizer
Definition: bfgs.cc:89
uint32_t num_bits
Definition: global_data.h:398
weight * regularizers
Definition: bfgs.cc:94
size_t bin_text_write_fixed(io_buf &io, char *data, size_t len, std::stringstream &msg, bool text)
Definition: io_buf.h:313
size_t bin_read_fixed(char *data, size_t len, const char *read_message)
Definition: io_buf.h:230
float l2_lambda
Definition: global_data.h:445
void regularizer_to_weight(vw &, bfgs &b, T &weights)
Definition: bfgs.cc:576
vw * all
Definition: bfgs.cc:64
float weight
constexpr uint64_t c
Definition: rand48.cc:12

◆ test_example()

constexpr bool test_example ( example ec)
noexcept

Definition at line 147 of file bfgs.cc.

Referenced by learn().

147 { return ec.l.simple.label == FLT_MAX; }
float label
Definition: simple_label.h:14
label_data simple
Definition: example.h:28
polylabel l
Definition: example.h:57

◆ update_preconditioner()

void update_preconditioner ( vw all,
example ec 
)

Definition at line 171 of file bfgs.cc.

References example::l, label_data::label, vw::loss, example::pred, polyprediction::scalar, vw::sd, loss_function::second_derivative(), polylabel::simple, and example::weight.

Referenced by process_example().

172 {
173  float curvature = all.loss->second_derivative(all.sd, ec.pred.scalar, ec.l.simple.label) * ec.weight;
174  GD::foreach_feature<float, add_precond>(all, ec, curvature);
175 }
loss_function * loss
Definition: global_data.h:523
float scalar
Definition: example.h:45
virtual float second_derivative(shared_data *, float prediction, float label)=0
float label
Definition: simple_label.h:14
label_data simple
Definition: example.h:28
shared_data * sd
Definition: global_data.h:375
polylabel l
Definition: example.h:57
polyprediction pred
Definition: example.h:60
float weight
Definition: example.h:62

◆ update_weight() [1/2]

template<class T >
void update_weight ( vw ,
float  step_size,
T &  w 
)

Definition at line 625 of file bfgs.cc.

References W_DIR, and W_XT.

Referenced by process_pass(), and update_weight().

626 {
627  for (typename T::iterator iter = w.begin(); iter != w.end(); ++iter)
628  (&(*iter))[W_XT] += step_size * (&(*iter))[W_DIR];
629 }
#define W_DIR
Definition: bfgs.cc:38
#define W_XT
Definition: bfgs.cc:36

◆ update_weight() [2/2]

void update_weight ( vw all,
float  step_size 
)

Definition at line 631 of file bfgs.cc.

References parameters::dense_weights, parameters::sparse, parameters::sparse_weights, update_weight(), and vw::weights.

632 {
633  if (all.weights.sparse)
634  update_weight(all, step_size, all.weights.sparse_weights);
635  else
636  update_weight(all, step_size, all.weights.dense_weights);
637 }
parameters weights
Definition: global_data.h:537
dense_parameters dense_weights
void update_weight(vw &, float step_size, T &w)
Definition: bfgs.cc:625
sparse_parameters sparse_weights

◆ wolfe_eval() [1/2]

template<class T >
double wolfe_eval ( vw all,
bfgs b,
float *  mem,
double  loss_sum,
double  previous_loss_sum,
double  step_size,
double  importance_weight_sum,
int &  origin,
double &  wolfe1,
T &  weights 
)

Definition at line 409 of file bfgs.cc.

References MEM_GT, bfgs::mem_stride, vw::quiet, W_COND, W_DIR, and W_GT.

Referenced by process_pass(), and wolfe_eval().

411 {
412  double g0_d = 0.;
413  double g1_d = 0.;
414  double g1_Hg1 = 0.;
415  double g1_g1 = 0.;
416 
417  for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
418  {
419  float* mem1 = mem + (w.index() >> weights.stride_shift()) * b.mem_stride;
420  g0_d += ((double)mem1[(MEM_GT + origin) % b.mem_stride]) * ((&(*w))[W_DIR]);
421  g1_d += ((double)(&(*w))[W_GT]) * (&(*w))[W_DIR];
422  g1_Hg1 += ((double)(&(*w))[W_GT]) * (&(*w))[W_GT] * ((&(*w))[W_COND]);
423  g1_g1 += ((double)(&(*w))[W_GT]) * (&(*w))[W_GT];
424  }
425 
426  wolfe1 = (loss_sum - previous_loss_sum) / (step_size * g0_d);
427  double wolfe2 = g1_d / g0_d;
428  // double new_step_cross = (loss_sum-previous_loss_sum-g1_d*step)/(g0_d-g1_d);
429 
430  if (!all.quiet)
431  fprintf(stderr, "%-10.5f\t%-10.5f\t%s%-10f\t%-10f\t", g1_g1 / (importance_weight_sum * importance_weight_sum),
432  g1_Hg1 / importance_weight_sum, " ", wolfe1, wolfe2);
433  return 0.5 * step_size;
434 }
#define W_GT
Definition: bfgs.cc:37
#define W_DIR
Definition: bfgs.cc:38
bool quiet
Definition: global_data.h:487
int mem_stride
Definition: bfgs.cc:88
#define W_COND
Definition: bfgs.cc:39
#define MEM_GT
Definition: bfgs.cc:31

◆ wolfe_eval() [2/2]

double wolfe_eval ( vw all,
bfgs b,
float *  mem,
double  loss_sum,
double  previous_loss_sum,
double  step_size,
double  importance_weight_sum,
int &  origin,
double &  wolfe1 
)

Definition at line 436 of file bfgs.cc.

References parameters::dense_weights, parameters::sparse, parameters::sparse_weights, vw::weights, and wolfe_eval().

438 {
439  if (all.weights.sparse)
440  return wolfe_eval(all, b, mem, loss_sum, previous_loss_sum, step_size, importance_weight_sum, origin, wolfe1,
441  all.weights.sparse_weights);
442  else
443  return wolfe_eval(all, b, mem, loss_sum, previous_loss_sum, step_size, importance_weight_sum, origin, wolfe1,
444  all.weights.dense_weights);
445 }
parameters weights
Definition: global_data.h:537
double wolfe_eval(vw &all, bfgs &b, float *mem, double loss_sum, double previous_loss_sum, double step_size, double importance_weight_sum, int &origin, double &wolfe1, T &weights)
Definition: bfgs.cc:409
dense_parameters dense_weights
sparse_parameters sparse_weights

◆ zero_derivative()

void zero_derivative ( vw all)

Definition at line 122 of file bfgs.cc.

References parameters::set_zero(), W_GT, and vw::weights.

Referenced by process_pass(), and reset_state().

122 { all.weights.set_zero(W_GT); }
parameters weights
Definition: global_data.h:537
#define W_GT
Definition: bfgs.cc:37
void set_zero(size_t offset)

◆ zero_preconditioner()

void zero_preconditioner ( vw all)

Definition at line 124 of file bfgs.cc.

References parameters::set_zero(), W_COND, and vw::weights.

Referenced by end_pass(), and reset_state().

124 { all.weights.set_zero(W_COND); }
parameters weights
Definition: global_data.h:537
void set_zero(size_t offset)
#define W_COND
Definition: bfgs.cc:39

◆ zero_state()

void zero_state ( vw all)

Definition at line 597 of file bfgs.cc.

References parameters::set_zero(), W_COND, W_DIR, W_GT, and vw::weights.

598 {
599  all.weights.set_zero(W_GT);
600  all.weights.set_zero(W_DIR);
601  all.weights.set_zero(W_COND);
602 }
parameters weights
Definition: global_data.h:537
#define W_GT
Definition: bfgs.cc:37
#define W_DIR
Definition: bfgs.cc:38
void set_zero(size_t offset)
#define W_COND
Definition: bfgs.cc:39

Variable Documentation

◆ curv_ex

curv_exception curv_ex

Referenced by bfgs_iter_middle().

◆ curv_message

constexpr const char* curv_message
Initial value:
=
"Zero or negative curvature detected.\n"
"To increase curvature you can increase regularization or rescale features.\n"
"It is also possible that you have reached numerical accuracy\n"
"and further decrease in the objective cannot be reliably detected.\n"

Definition at line 116 of file bfgs.cc.

Referenced by process_pass().

◆ max_precond_ratio

constexpr float max_precond_ratio = 10000.f

Definition at line 60 of file bfgs.cc.

Referenced by finalize_preconditioner().