Vowpal Wabbit
Classes | Functions | Variables
GD Namespace Reference

Classes

struct  audit_results
 
struct  gd
 
struct  multipredict_info
 
struct  norm_data
 
struct  power_data
 
class  set_initial_gd_wrapper
 
struct  string_value
 
struct  trunc_data
 

Functions

void sync_weights (vw &all)
 
float quake_InvSqrt (float x)
 
static float InvSqrt (float x)
 
template<bool sqrt_rate, bool feature_mask_off, size_t adaptive, size_t normalized, size_t spare>
void update_feature (float &update, float x, float &fw)
 
template<bool sqrt_rate, size_t adaptive, size_t normalized>
float average_update (float total_weight, float normalized_sum_norm_x, float neg_norm_power)
 
template<bool sqrt_rate, bool feature_mask_off, size_t adaptive, size_t normalized, size_t spare>
void train (gd &g, example &ec, float update)
 
void end_pass (gd &g)
 
bool operator< (const string_value &first, const string_value &second)
 
void audit_interaction (audit_results &dat, const audit_strings *f)
 
void audit_feature (audit_results &dat, const float ft_weight, const uint64_t ft_idx)
 
void print_lda_features (vw &all, example &ec)
 
void print_features (vw &all, example &ec)
 
void print_audit_features (vw &all, example &ec)
 
float finalize_prediction (shared_data *sd, float ret)
 
void vec_add_trunc (trunc_data &p, const float fx, float &fw)
 
float trunc_predict (vw &all, example &ec, double gravity)
 
void vec_add_print (float &p, const float fx, float &fw)
 
template<bool l1, bool audit>
void predict (gd &g, base_learner &, example &ec)
 
template<class T >
void vec_add_trunc_multipredict (multipredict_info< T > &mp, const float fx, uint64_t fi)
 
template<bool l1, bool audit>
void multipredict (gd &g, base_learner &, example &ec, size_t count, size_t step, polyprediction *pred, bool finalize_predictions)
 
template<bool sqrt_rate, size_t adaptive, size_t normalized>
float compute_rate_decay (power_data &s, float &fw)
 
template<bool sqrt_rate, bool feature_mask_off, size_t adaptive, size_t normalized, size_t spare, bool stateless>
void pred_per_update_feature (norm_data &nd, float x, float &fw)
 
template<bool sqrt_rate, bool feature_mask_off, bool adax, size_t adaptive, size_t normalized, size_t spare, bool stateless>
float get_pred_per_update (gd &g, example &ec)
 
template<bool sqrt_rate, bool feature_mask_off, bool adax, size_t adaptive, size_t normalized, size_t spare, bool stateless>
float sensitivity (gd &g, example &ec)
 
template<size_t adaptive>
float get_scale (gd &g, example &, float weight)
 
template<bool sqrt_rate, bool feature_mask_off, bool adax, size_t adaptive, size_t normalized, size_t spare>
float sensitivity (gd &g, base_learner &, example &ec)
 
template<bool sparse_l2, bool invariant, bool sqrt_rate, bool feature_mask_off, bool adax, size_t adaptive, size_t normalized, size_t spare>
float compute_update (gd &g, example &ec)
 
template<bool sparse_l2, bool invariant, bool sqrt_rate, bool feature_mask_off, bool adax, size_t adaptive, size_t normalized, size_t spare>
void update (gd &g, base_learner &, example &ec)
 
template<bool sparse_l2, bool invariant, bool sqrt_rate, bool feature_mask_off, bool adax, size_t adaptive, size_t normalized, size_t spare>
void learn (gd &g, base_learner &base, example &ec)
 
size_t write_index (io_buf &model_file, std::stringstream &msg, bool text, uint32_t num_bits, uint64_t i)
 
template<class T >
void save_load_regressor (vw &all, io_buf &model_file, bool read, bool text, T &weights)
 
void save_load_regressor (vw &all, io_buf &model_file, bool read, bool text)
 
template<class T >
void save_load_online_state (vw &all, io_buf &model_file, bool read, bool text, gd *g, std::stringstream &msg, uint32_t ftrl_size, T &weights)
 
void save_load_online_state (vw &all, io_buf &model_file, bool read, bool text, double &total_weight, gd *g, uint32_t ftrl_size)
 
void save_load (gd &g, io_buf &model_file, bool read, bool text)
 
template<bool sparse_l2, bool invariant, bool sqrt_rate, bool feature_mask_off, uint64_t adaptive, uint64_t normalized, uint64_t spare, uint64_t next>
uint64_t set_learn (vw &all, gd &g)
 
template<bool sparse_l2, bool invariant, bool sqrt_rate, uint64_t adaptive, uint64_t normalized, uint64_t spare, uint64_t next>
uint64_t set_learn (vw &all, bool feature_mask_off, gd &g)
 
template<bool invariant, bool sqrt_rate, uint64_t adaptive, uint64_t normalized, uint64_t spare, uint64_t next>
uint64_t set_learn (vw &all, bool feature_mask_off, gd &g)
 
template<bool sqrt_rate, uint64_t adaptive, uint64_t normalized, uint64_t spare, uint64_t next>
uint64_t set_learn (vw &all, bool feature_mask_off, gd &g)
 
template<bool sqrt_rate, uint64_t adaptive, uint64_t spare>
uint64_t set_learn (vw &all, bool feature_mask_off, gd &g)
 
template<bool sqrt_rate>
uint64_t set_learn (vw &all, bool feature_mask_off, gd &g)
 
uint64_t ceil_log_2 (uint64_t v)
 
base_learner *setup (options_i &options, vw &all)
 
template<class T >
void vec_add_multipredict (multipredict_info< T > &mp, const float fx, uint64_t fi)
 
template<class R , typename T >
void foreach_feature (vw &all, features &fs, R &dat, uint64_t offset=0, float mult=1.)
 
template<class R , class S , void(*)(R &, float, S) T>
void foreach_feature (vw &all, example &ec, R &dat)
 
template<class R , void(*)(R &, float, float &) T>
void foreach_feature (vw &all, example &ec, R &dat)
 
float inline_predict (vw &all, example &ec)
 
float sign (float w)
 
float trunc_weight (const float w, const float gravity)
 
template<class R , void(*)(R &, float, uint64_t) T, class W >
void foreach_feature (W &, features &fs, R &dat, uint64_t offset=0, float mult=1.)
 
template<class R , void(*)(R &, const float, const float &) T, class W >
void foreach_feature (const W &weights, features &fs, R &dat, uint64_t offset=0, float mult=1.)
 
template<class R >
void dummy_func (R &, const audit_strings *)
 
template<class R , class S , void(*)(R &, float, S) T, class W >
void generate_interactions (std::vector< std::string > &interactions, bool permutations, example_predict &ec, R &dat, W &weights)
 
template<class R , class S , void(*)(R &, float, S) T, class W >
void foreach_feature (W &weights, bool ignore_some_linear, std::array< bool, NUM_NAMESPACES > &ignore_linear, std::vector< std::string > &interactions, bool permutations, example_predict &ec, R &dat)
 
void vec_add (float &p, const float fx, const float &fw)
 
template<class W >
float inline_predict (W &weights, bool ignore_some_linear, std::array< bool, NUM_NAMESPACES > &ignore_linear, std::vector< std::string > &interactions, bool permutations, example_predict &ec, float initial=0.f)
 

Variables

constexpr float x_min = 1.084202e-19f
 
constexpr float x2_min = x_min * x_min
 
constexpr float x2_max = FLT_MAX
 
bool global_print_features = false
 

Function Documentation

◆ audit_feature()

void GD::audit_feature ( audit_results & dat,
const float  ft_weight,
const uint64_t  ft_idx 
)
inline

Definition at line 241 of file gd.cc.

References parameters::adaptive, GD::audit_results::all, vw::audit, shared_data::contraction, vw::current_pass, shared_data::gravity, vw::hash_inv, parameters::mask(), vw::name_index_map, GD::audit_results::ns_pre, GD::audit_results::offset, GD::audit_results::results, vw::sd, stride_shift(), parameters::stride_shift(), vw::training, trunc_weight(), and vw::weights.

Referenced by print_features().

242 {
243  parameters& weights = dat.all.weights;
244  uint64_t index = ft_idx & weights.mask();
245  size_t stride_shift = weights.stride_shift();
246 
247  std::string ns_pre;
248  for (std::string& s : dat.ns_pre) ns_pre += s;
249 
250  if (dat.all.audit)
251  {
252  std::ostringstream tempstream;
253  tempstream << ':' << (index >> stride_shift) << ':' << ft_weight << ':'
254  << trunc_weight(weights[index], (float)dat.all.sd->gravity) * (float)dat.all.sd->contraction;
255 
256  if (weights.adaptive) // adaptive
257  tempstream << '@' << (&weights[index])[1];
258 
259  string_value sv = {weights[index] * ft_weight, ns_pre + tempstream.str()};
260  dat.results.push_back(sv);
261  }
262 
263  if ((dat.all.current_pass == 0 || dat.all.training == false) && dat.all.hash_inv)
264  {
265  // for invert_hash
266 
267  if (dat.offset != 0)
268  {
269  // otherwise --oaa output no features for class > 0.
270  std::ostringstream tempstream;
271  tempstream << '[' << (dat.offset >> stride_shift) << ']';
272  ns_pre += tempstream.str();
273  }
274 
275  if (!dat.all.name_index_map.count(ns_pre))
276  dat.all.name_index_map.insert(std::map<std::string, size_t>::value_type(ns_pre, index >> stride_shift));
277  }
278 }
uint64_t stride_shift(const stagewise_poly &poly, uint64_t idx)
float trunc_weight(const float w, const float gravity)
Definition: gd.h:114
uint32_t stride_shift()
uint64_t mask()

◆ audit_interaction()

void GD::audit_interaction ( audit_results & dat,
const audit_strings * f 
)
inline

Definition at line 208 of file gd.cc.

References GD::audit_results::ns_pre.

Referenced by print_features().

209 {
210  if (f == nullptr)
211  {
212  if (!dat.ns_pre.empty())
213  {
214  dat.ns_pre.pop_back();
215  }
216 
217  return;
218  }
219 
220  std::string ns_pre;
221  if (!dat.ns_pre.empty())
222  ns_pre += '*';
223 
224  if (f->first != "" && ((f->first) != " "))
225  {
226  ns_pre.append(f->first);
227  ns_pre += '^';
228  }
229 
230  if (f->second != "")
231  {
232  ns_pre.append(f->second);
233  }
234 
235  if (!ns_pre.empty())
236  {
237  dat.ns_pre.push_back(ns_pre);
238  }
239 }
float f
Definition: cache.cc:40

◆ average_update()

template<bool sqrt_rate, size_t adaptive, size_t normalized>
float GD::average_update ( float  total_weight,
float  normalized_sum_norm_x,
float  neg_norm_power 
)

Definition at line 122 of file gd.cc.

References ldamath::powf().

123 {
124  if (normalized)
125  {
126  if (sqrt_rate)
127  {
128  float avg_norm = (float)(total_weight / normalized_sum_norm_x);
129  if (adaptive)
130  return std::sqrt(avg_norm);
131  else
132  return avg_norm;
133  }
134  else
135  return powf((float)(normalized_sum_norm_x / total_weight), neg_norm_power);
136  }
137  return 1.f;
138 }
T powf(T, T)
Definition: lda_core.cc:428

◆ ceil_log_2()

uint64_t GD::ceil_log_2 ( uint64_t  v)

Definition at line 1111 of file gd.cc.

Referenced by setup().

1112 {
1113  if (v == 0)
1114  return 0;
1115  else
1116  return 1 + ceil_log_2(v >> 1);
1117 }
uint64_t ceil_log_2(uint64_t v)
Definition: gd.cc:1111

◆ compute_rate_decay()

template<bool sqrt_rate, size_t adaptive, size_t normalized>
float GD::compute_rate_decay ( power_data & s,
float &  fw 
)
inline

Definition at line 447 of file gd.cc.

References InvSqrt(), GD::power_data::minus_power_t, GD::power_data::neg_norm_power, and ldamath::powf().

448 {
449  weight* w = &fw;
450  float rate_decay = 1.f;
451  if (adaptive)
452  {
453  if (sqrt_rate)
454  rate_decay = InvSqrt(w[adaptive]);
455  else
456  rate_decay = powf(w[adaptive], s.minus_power_t);
457  }
458  if (normalized)
459  {
460  if (sqrt_rate)
461  {
462  float inv_norm = 1.f / w[normalized];
463  if (adaptive)
464  rate_decay *= inv_norm;
465  else
466  rate_decay *= inv_norm * inv_norm;
467  }
468  else
469  rate_decay *= powf(w[normalized] * w[normalized], s.neg_norm_power);
470  }
471  return rate_decay;
472 }
static float InvSqrt(float x)
Definition: gd.cc:80
T powf(T, T)
Definition: lda_core.cc:428
float weight

◆ compute_update()

template<bool sparse_l2, bool invariant, bool sqrt_rate, bool feature_mask_off, bool adax, size_t adaptive, size_t normalized, size_t spare>
float GD::compute_update ( gd & g,
example & ec 
)

Definition at line 609 of file gd.cc.

References GD::gd::all, shared_data::contraction, loss_function::first_derivative(), loss_function::getLoss(), loss_function::getUnsafeUpdate(), loss_function::getUpdate(), shared_data::gravity, example::l, vw::l1_lambda, vw::l2_lambda, label_data::label, vw::loss, example::pred, vw::reg_mode, polyprediction::scalar, vw::sd, polylabel::simple, GD::gd::sparse_l2, update(), example::updated_prediction, and example::weight.

610 {
611  // invariant: not a test label, importance weight > 0
612  label_data& ld = ec.l.simple;
613  vw& all = *g.all;
614 
615  float update = 0.;
616  ec.updated_prediction = ec.pred.scalar;
617  if (all.loss->getLoss(all.sd, ec.pred.scalar, ld.label) > 0.)
618  {
619  float pred_per_update = sensitivity<sqrt_rate, feature_mask_off, adax, adaptive, normalized, spare, false>(g, ec);
620  float update_scale = get_scale<adaptive>(g, ec, ec.weight);
621  if (invariant)
622  update = all.loss->getUpdate(ec.pred.scalar, ld.label, update_scale, pred_per_update);
623  else
624  update = all.loss->getUnsafeUpdate(ec.pred.scalar, ld.label, update_scale);
625  // changed from ec.partial_prediction to ld.prediction
626  ec.updated_prediction += pred_per_update * update;
627 
628  if (all.reg_mode && fabs(update) > 1e-8)
629  {
630  double dev1 = all.loss->first_derivative(all.sd, ec.pred.scalar, ld.label);
631  double eta_bar = (fabs(dev1) > 1e-8) ? (-update / dev1) : 0.0;
632  if (fabs(dev1) > 1e-8)
633  all.sd->contraction *= (1. - all.l2_lambda * eta_bar);
634  update /= (float)all.sd->contraction;
635  all.sd->gravity += eta_bar * all.l1_lambda;
636  }
637  }
638 
639  if (sparse_l2)
640  update -= g.sparse_l2 * ec.pred.scalar;
641 
642  return update;
643 }
loss_function * loss
Definition: global_data.h:523
virtual float getUpdate(float prediction, float label, float update_scale, float pred_per_update)=0
float scalar
Definition: example.h:45
double contraction
Definition: global_data.h:149
float label
Definition: simple_label.h:14
float updated_prediction
Definition: example.h:69
label_data simple
Definition: example.h:28
virtual float first_derivative(shared_data *, float prediction, float label)=0
virtual float getLoss(shared_data *, float prediction, float label)=0
shared_data * sd
Definition: global_data.h:375
float l2_lambda
Definition: global_data.h:445
polylabel l
Definition: example.h:57
virtual float getUnsafeUpdate(float prediction, float label, float eta_t)=0
double gravity
Definition: global_data.h:148
void update(gd &g, base_learner &, example &ec)
Definition: gd.cc:647
polyprediction pred
Definition: example.h:60
float weight
Definition: example.h:62
float l1_lambda
Definition: global_data.h:444
int reg_mode
Definition: global_data.h:448

◆ dummy_func()

template<class R >
void GD::dummy_func ( R &  ,
const audit_strings *  
)
inline

Definition at line 39 of file gd_predict.h.

40 {
41 } // should never be called due to call_audit overload

◆ end_pass()

void GD::end_pass ( gd & g)

Definition at line 148 of file gd.cc.

References accumulate_avg(), accumulate_weighted_avg(), parameters::adaptive, GD::gd::all, vw::all_reduce, vw::check_holdout_every_n_passes, shared_data::contraction, vw::current_pass, GD::gd::early_stop_thres, vw::eta, vw::eta_decay_rate, vw::final_regressor_name, finalize_regressor(), VW::config::options_i::get_typed_option(), shared_data::gravity, vw::holdout_set_off, GD::gd::no_win_counter, vw::options, VW::config::options_i::replace(), vw::save_per_pass, save_predictor(), vw::save_resume, vw::sd, set_done(), summarize_holdout_set(), sync_weights(), prediction_type::to_string(), and vw::weights.

149 {
150  vw& all = *g.all;
151  if (all.save_resume)
152  {
153  // TODO work out a better system to update state that will be saved in the model.
154  if (all.sd->gravity != 0.)
155  {
156  g.all->options->replace("l1_state", std::to_string(all.sd->gravity));
157  g.all->options->get_typed_option<double>("l1_state").value(all.sd->gravity);
158  }
159  if (all.sd->contraction != 1.)
160  {
161  g.all->options->replace("l2_state", std::to_string(all.sd->contraction));
162  g.all->options->get_typed_option<double>("l2_state").value(all.sd->contraction);
163  }
164  }
165  else
166  sync_weights(all);
167  if (all.all_reduce != nullptr)
168  {
169  if (all.weights.adaptive)
170  accumulate_weighted_avg(all, all.weights);
171  else
172  accumulate_avg(all, all.weights, 0);
173  }
174  all.eta *= all.eta_decay_rate;
175  if (all.save_per_pass)
176  save_predictor(all, all.final_regressor_name, all.current_pass);
177 
178  if (!all.holdout_set_off)
179  {
180  if (summarize_holdout_set(all, g.no_win_counter))
181  finalize_regressor(all, all.final_regressor_name);
182  if ((g.early_stop_thres == g.no_win_counter) &&
183  ((all.check_holdout_every_n_passes <= 1) || ((all.current_pass % all.check_holdout_every_n_passes) == 0)))
184  set_done(all);
185  }
186 }
void accumulate_weighted_avg(vw &all, parameters &weights)
Definition: accumulate.cc:117
void set_done(vw &all)
Definition: parser.cc:578
parameters weights
Definition: global_data.h:537
void sync_weights(vw &all)
Definition: gd.cc:671
VW::config::options_i * options
Definition: global_data.h:428
virtual void replace(const std::string &key, const std::string &value)=0
double contraction
Definition: global_data.h:149
void finalize_regressor(vw &all, std::string reg_name)
bool holdout_set_off
Definition: global_data.h:499
size_t check_holdout_every_n_passes
Definition: global_data.h:503
bool summarize_holdout_set(vw &all, size_t &no_win_counter)
void save_predictor(vw &all, std::string reg_name, size_t current_pass)
AllReduce * all_reduce
Definition: global_data.h:381
shared_data * sd
Definition: global_data.h:375
void accumulate_avg(vw &all, parameters &weights, size_t offset)
Definition: accumulate.cc:51
uint64_t current_pass
Definition: global_data.h:396
float eta
Definition: global_data.h:531
bool save_per_pass
Definition: global_data.h:408
double gravity
Definition: global_data.h:148
bool save_resume
Definition: global_data.h:415
std::string final_regressor_name
Definition: global_data.h:535
const char * to_string(prediction_type_t prediction_type)
Definition: learner.cc:12
float eta_decay_rate
Definition: global_data.h:532

◆ finalize_prediction()

float GD::finalize_prediction ( shared_data * sd,
float  ret 
)

Definition at line 339 of file gd.cc.

References shared_data::example_number, shared_data::max_label, and shared_data::min_label.

Referenced by bfgs_predict(), mf_predict(), multipredict(), multipredict(), SVRG::predict(), predict(), predict(), predict_or_learn_multi(), SVRG::predict_stable(), update_state_and_predict_cb(), and update_state_and_predict_pistol().

340 {
341  if (std::isnan(ret))
342  {
343  ret = 0.;
344  std::cerr << "NAN prediction in example " << sd->example_number + 1 << ", forcing " << ret << std::endl;
345  return ret;
346  }
347  if (ret > sd->max_label)
348  return (float)sd->max_label;
349  if (ret < sd->min_label)
350  return (float)sd->min_label;
351  return ret;
352 }
uint64_t example_number
Definition: global_data.h:137
float min_label
Definition: global_data.h:150
float max_label
Definition: global_data.h:151

◆ foreach_feature() [1/6]

template<class R , void(*)(R &, float, uint64_t) T, class W >
void GD::foreach_feature ( W &  weights,
features & fs,
R &  dat,
uint64_t  offset = 0,
float  mult = 1. 
)
inline

Definition at line 15 of file gd_predict.h.

References f, and foreach_feature().

16 {
17  for (features::iterator& f : fs) T(dat, mult * f.value(), f.index() + offset);
18 }
iterator over values and indicies
float f
Definition: cache.cc:40

◆ foreach_feature() [2/6]

template<class R , void(*)(R &, const float, const float &) T, class W >
void GD::foreach_feature ( const W &  weights,
features & fs,
R &  dat,
uint64_t  offset = 0,
float  mult = 1. 
)
inline

Definition at line 29 of file gd_predict.h.

References f.

30 {
31  for (features::iterator& f : fs)
32  {
33  const weight& w = weights[(f.index() + offset)];
34  T(dat, mult * f.value(), w);
35  }
36 }
float weight
iterator over values and indicies
float f
Definition: cache.cc:40

◆ foreach_feature() [3/6]

template<class R , class S , void(*)(R &, float, S) T, class W >
void GD::foreach_feature ( W &  weights,
bool  ignore_some_linear,
std::array< bool, NUM_NAMESPACES > &  ignore_linear,
std::vector< std::string > &  interactions,
bool  permutations,
example_predict & ec,
R &  dat 
)
inline

Definition at line 56 of file gd_predict.h.

References example_predict::begin(), example_predict::end(), f, and example_predict::ft_offset.

58 {
59  uint64_t offset = ec.ft_offset;
60  if (ignore_some_linear)
61  for (example_predict::iterator i = ec.begin(); i != ec.end(); ++i)
62  {
63  if (!ignore_linear[i.index()])
64  {
65  features& f = *i;
66  foreach_feature<R, T, W>(weights, f, dat, offset);
67  }
68  }
69  else
70  for (features& f : ec) foreach_feature<R, T, W>(weights, f, dat, offset);
71 
72  generate_interactions<R, S, T, W>(interactions, permutations, ec, dat, weights);
73 }
the core definition of a set of features.
iterator begin()
float f
Definition: cache.cc:40

◆ foreach_feature() [4/6]

template<class R , typename T >
void GD::foreach_feature ( vw & all,
features & fs,
R &  dat,
uint64_t  offset = 0,
float  mult = 1. 
)
inline

Definition at line 66 of file gd.h.

References parameters::dense_weights, parameters::sparse, parameters::sparse_weights, and vw::weights.

Referenced by foreach_feature(), and get_pred_per_update().

67 {
68  if (all.weights.sparse)
69  foreach_feature(all.weights.sparse_weights, fs, dat, offset, mult);
70  else
71  foreach_feature(all.weights.dense_weights, fs, dat, offset, mult);
72 }
parameters weights
Definition: global_data.h:537
void foreach_feature(vw &all, example &ec, R &dat)
Definition: gd.h:87
dense_parameters dense_weights
sparse_parameters sparse_weights

◆ foreach_feature() [5/6]

template<class R , class S , void(*)(R &, float, S) T>
void GD::foreach_feature ( vw & all,
example & ec,
R &  dat 
)
inline

Definition at line 75 of file gd.h.

References parameters::dense_weights, vw::ignore_linear, vw::ignore_some_linear, example_predict::interactions, vw::permutations, parameters::sparse, parameters::sparse_weights, and vw::weights.

76 {
77  return all.weights.sparse
78  ? foreach_feature<R, S, T, sparse_parameters>(all.weights.sparse_weights, all.ignore_some_linear,
79  all.ignore_linear, *ec.interactions, all.permutations, ec, dat)
80  : foreach_feature<R, S, T, dense_parameters>(all.weights.dense_weights, all.ignore_some_linear, all.ignore_linear,
81  *ec.interactions, all.permutations, ec, dat);
82 }
bool ignore_some_linear
Definition: global_data.h:464
parameters weights
Definition: global_data.h:537
std::vector< std::string > * interactions
std::array< bool, NUM_NAMESPACES > ignore_linear
Definition: global_data.h:465
dense_parameters dense_weights
sparse_parameters sparse_weights
bool permutations
Definition: global_data.h:454

◆ foreach_feature() [6/6]

template<class R , void(*)(R &, float, float &) T>
void GD::foreach_feature ( vw & all,
example & ec,
R &  dat 
)
inline

Definition at line 87 of file gd.h.

References foreach_feature().

88 {
89  foreach_feature<R, float&, T>(all, ec, dat);
90 }

◆ generate_interactions()

template<class R , class S , void(*)(R &, float, S) T, class W >
void GD::generate_interactions ( std::vector< std::string > &  interactions,
bool  permutations,
example_predict & ec,
R &  dat,
W &  weights 
)
inline

Definition at line 45 of file gd_predict.h.

49 {
50  INTERACTIONS::generate_interactions<R, S, T, false, dummy_func<R>, W>(interactions, permutations, ec, dat, weights);
51 }

◆ get_pred_per_update()

template<bool sqrt_rate, bool feature_mask_off, bool adax, size_t adaptive, size_t normalized, size_t spare, bool stateless>
float GD::get_pred_per_update ( gd & g,
example & ec 
)

Definition at line 541 of file gd.cc.

References GD::gd::all, foreach_feature(), loss_function::getSquareGrad(), example::l, label_data::label, vw::loss, GD::gd::neg_norm_power, GD::gd::neg_power_t, GD::norm_data::norm_x, vw::normalized_sum_norm_x, example::pred, GD::norm_data::pred_per_update, polyprediction::scalar, polylabel::simple, GD::gd::total_weight, GD::gd::update_multiplier, and example::weight.

542 {
543  // We must traverse the features in _precisely_ the same order as during training.
544  label_data& ld = ec.l.simple;
545  vw& all = *g.all;
546 
547  float grad_squared = ec.weight;
548  if (!adax)
549  grad_squared *= all.loss->getSquareGrad(ec.pred.scalar, ld.label);
550 
551  if (grad_squared == 0 && !stateless)
552  return 1.;
553 
554  norm_data nd = {grad_squared, 0., 0., {g.neg_power_t, g.neg_norm_power}, {0}};
555  foreach_feature<norm_data,
556  pred_per_update_feature<sqrt_rate, feature_mask_off, adaptive, normalized, spare, stateless> >(all, ec, nd);
557  if (normalized)
558  {
559  if (!stateless)
560  {
561  g.all->normalized_sum_norm_x += ((double)ec.weight) * nd.norm_x;
562  g.total_weight += ec.weight;
563  g.update_multiplier = average_update<sqrt_rate, adaptive, normalized>(
564  (float)g.total_weight, (float)g.all->normalized_sum_norm_x, g.neg_norm_power);
565  }
566  else
567  {
568  float nsnx = ((float)g.all->normalized_sum_norm_x) + ec.weight * nd.norm_x;
569  float tw = (float)g.total_weight + ec.weight;
570  g.update_multiplier = average_update<sqrt_rate, adaptive, normalized>(tw, nsnx, g.neg_norm_power);
571  }
572  nd.pred_per_update *= g.update_multiplier;
573  }
574  return nd.pred_per_update;
575 }
loss_function * loss
Definition: global_data.h:523
float scalar
Definition: example.h:45
float label
Definition: simple_label.h:14
label_data simple
Definition: example.h:28
polylabel l
Definition: example.h:57
void foreach_feature(vw &all, features &fs, R &dat, uint64_t offset=0, float mult=1.)
Definition: gd.h:66
polyprediction pred
Definition: example.h:60
float weight
Definition: example.h:62
virtual float getSquareGrad(float prediction, float label)=0

◆ get_scale()

template<size_t adaptive>
float GD::get_scale ( gd & g,
example & ,
float  weight 
)

Definition at line 588 of file gd.cc.

References GD::gd::all, vw::eta, GD::gd::neg_power_t, ldamath::powf(), vw::sd, shared_data::t, shared_data::weighted_holdout_examples, and shared_data::weighted_unlabeled_examples.

589 {
590  float update_scale = g.all->eta * weight;
591  if (!adaptive)
592  {
593  float t =
594  (float)(g.all->sd->t + weight - g.all->sd->weighted_holdout_examples - g.all->sd->weighted_unlabeled_examples);
595  update_scale *= powf(t, g.neg_power_t);
596  }
597  return update_scale;
598 }
T powf(T, T)
Definition: lda_core.cc:428
float weight

◆ inline_predict() [1/2]

template<class W >
float GD::inline_predict ( W &  weights,
bool  ignore_some_linear,
std::array< bool, NUM_NAMESPACES > &  ignore_linear,
std::vector< std::string > &  interactions,
bool  permutations,
example_predict & ec,
float  initial = 0.f 
)
inline

Definition at line 78 of file gd_predict.h.

80 {
81  foreach_feature<float, const float&, vec_add, W>(
82  weights, ignore_some_linear, ignore_linear, interactions, permutations, ec, initial);
83  return initial;
84 }

◆ inline_predict() [2/2]

float GD::inline_predict ( vw & all,
example & ec 
)
inline

Definition at line 98 of file gd.h.

References parameters::dense_weights, vw::ignore_linear, vw::ignore_some_linear, label_data::initial, example_predict::interactions, example::l, vw::permutations, polylabel::simple, parameters::sparse, parameters::sparse_weights, and vw::weights.

Referenced by bfgs_predict(), predict(), and predict().

99 {
100  return all.weights.sparse ? inline_predict<sparse_parameters>(all.weights.sparse_weights, all.ignore_some_linear,
101  all.ignore_linear, *ec.interactions, all.permutations, ec, ec.l.simple.initial)
102  : inline_predict<dense_parameters>(all.weights.dense_weights, all.ignore_some_linear,
103  all.ignore_linear, *ec.interactions, all.permutations, ec, ec.l.simple.initial);
104 }
bool ignore_some_linear
Definition: global_data.h:464
parameters weights
Definition: global_data.h:537
std::vector< std::string > * interactions
label_data simple
Definition: example.h:28
std::array< bool, NUM_NAMESPACES > ignore_linear
Definition: global_data.h:465
dense_parameters dense_weights
float initial
Definition: simple_label.h:16
polylabel l
Definition: example.h:57
sparse_parameters sparse_weights
bool permutations
Definition: global_data.h:454

◆ InvSqrt()

static float GD::InvSqrt ( float  x)
inlinestatic

Definition at line 80 of file gd.cc.

References quake_InvSqrt().

Referenced by compute_rate_decay().

81 {
82 #if !defined(VW_NO_INLINE_SIMD)
83 #if defined(__ARM_NEON__)
84  // Propagate into vector
85  float32x2_t v1 = vdup_n_f32(x);
86  // Estimate
87  float32x2_t e1 = vrsqrte_f32(v1);
88  // N-R iteration 1
89  float32x2_t e2 = vmul_f32(e1, vrsqrts_f32(v1, vmul_f32(e1, e1)));
90  // N-R iteration 2
91  float32x2_t e3 = vmul_f32(e2, vrsqrts_f32(v1, vmul_f32(e2, e2)));
92  // Extract result
93  return vget_lane_f32(e3, 0);
94 #elif defined(__SSE2__)
95  __m128 eta = _mm_load_ss(&x);
96  eta = _mm_rsqrt_ss(eta);
97  _mm_store_ss(&x, eta);
98 #else
99  x = quake_InvSqrt(x);
100 #endif
101 #else
102  x = quake_InvSqrt(x);
103 #endif
104 
105  return x;
106 }
float quake_InvSqrt(float x)
Definition: gd.cc:68

◆ learn()

template<bool sparse_l2, bool invariant, bool sqrt_rate, bool feature_mask_off, bool adax, size_t adaptive, size_t normalized, size_t spare>
void GD::learn ( gd & g,
base_learner & base,
example & ec 
)

Definition at line 661 of file gd.cc.

References example::in_use, example::l, label_data::label, GD::gd::predict, polylabel::simple, and example::weight.

662 {
663  // invariant: not a test label, importance weight > 0
664  assert(ec.in_use);
665  assert(ec.l.simple.label != FLT_MAX);
666  assert(ec.weight > 0.);
667  g.predict(g, base, ec);
668  update<sparse_l2, invariant, sqrt_rate, feature_mask_off, adax, adaptive, normalized, spare>(g, base, ec);
669 }
float label
Definition: simple_label.h:14
label_data simple
Definition: example.h:28
polylabel l
Definition: example.h:57
bool in_use
Definition: example.h:79
float weight
Definition: example.h:62

◆ multipredict()

template<bool l1, bool audit>
void GD::multipredict ( gd & g,
base_learner & ,
example & ec,
size_t  count,
size_t  step,
polyprediction * pred,
bool  finalize_predictions 
)

Definition at line 402 of file gd.cc.

References GD::gd::all, c, shared_data::contraction, parameters::dense_weights, finalize_prediction(), example_predict::ft_offset, shared_data::gravity, label_data::initial, example::l, example::pred, print_audit_features(), prediction_type::scalar, polyprediction::scalar, vw::sd, polylabel::simple, parameters::sparse, parameters::sparse_weights, vec_add_multipredict(), vec_add_trunc_multipredict(), and vw::weights.

404 {
405  vw& all = *g.all;
406  for (size_t c = 0; c < count; c++) pred[c].scalar = ec.l.simple.initial;
407  if (g.all->weights.sparse)
408  {
409  multipredict_info<sparse_parameters> mp = {
410  count, step, pred, g.all->weights.sparse_weights, (float)all.sd->gravity};
411  if (l1)
412  foreach_feature<multipredict_info<sparse_parameters>, uint64_t, vec_add_trunc_multipredict>(all, ec, mp);
413  else
414  foreach_feature<multipredict_info<sparse_parameters>, uint64_t, vec_add_multipredict>(all, ec, mp);
415  }
416  else
417  {
418  multipredict_info<dense_parameters> mp = {count, step, pred, g.all->weights.dense_weights, (float)all.sd->gravity};
419  if (l1)
420  foreach_feature<multipredict_info<dense_parameters>, uint64_t, vec_add_trunc_multipredict>(all, ec, mp);
421  else
422  foreach_feature<multipredict_info<dense_parameters>, uint64_t, vec_add_multipredict>(all, ec, mp);
423  }
424  if (all.sd->contraction != 1.)
425  for (size_t c = 0; c < count; c++) pred[c].scalar *= (float)all.sd->contraction;
426  if (finalize_predictions)
427  for (size_t c = 0; c < count; c++) pred[c].scalar = finalize_prediction(all.sd, pred[c].scalar);
428  if (audit)
429  {
430  for (size_t c = 0; c < count; c++)
431  {
432  ec.pred.scalar = pred[c].scalar;
433  print_audit_features(all, ec);
434  ec.ft_offset += (uint64_t)step;
435  }
436  ec.ft_offset -= (uint64_t)(step * count);
437  }
438 }
float finalize_prediction(shared_data *sd, float ret)
Definition: gd.cc:339
void print_audit_features(vw &all, example &ec)
Definition: gd.cc:331
float scalar
Definition: example.h:45
void vec_add_trunc_multipredict(multipredict_info< T > &mp, const float fx, uint64_t fi)
Definition: gd.cc:394
void vec_add_multipredict(multipredict_info< T > &mp, const float fx, uint64_t fi)
Definition: gd.h:40
double contraction
Definition: global_data.h:149
label_data simple
Definition: example.h:28
shared_data * sd
Definition: global_data.h:375
float initial
Definition: simple_label.h:16
polylabel l
Definition: example.h:57
double gravity
Definition: global_data.h:148
polyprediction pred
Definition: example.h:60
constexpr uint64_t c
Definition: rand48.cc:12

◆ operator<()

bool GD::operator< ( const string_value first,
const string_value second 
)

Definition at line 197 of file gd.cc.

References GD::string_value::v.

197 { return fabsf(first.v) > fabsf(second.v); }

◆ pred_per_update_feature()

template<bool sqrt_rate, bool feature_mask_off, size_t adaptive, size_t normalized, size_t spare, bool stateless>
void GD::pred_per_update_feature ( norm_data nd,
float  x,
float &  fw 
)
inline

Definition at line 488 of file gd.cc.

References GD::norm_data::extra_state, GD::norm_data::grad_squared, GD::power_data::neg_norm_power, GD::norm_data::norm_x, GD::norm_data::pd, ldamath::powf(), GD::norm_data::pred_per_update, THROW, and x2_min.

489 {
490  if (feature_mask_off || fw != 0.)
491  {
492  weight* w = &fw;
493  float x2 = x * x;
494  if (x2 < x2_min)
495  {
496  x = (x > 0) ? x_min : -x_min;
497  x2 = x2_min;
498  }
499  if (x2 > x2_max)
500  THROW("your features have too much magnitude");
501  if (stateless) // we must not modify the parameter state so introduce a shadow version.
502  {
503  nd.extra_state[0] = w[0];
504  nd.extra_state[adaptive] = w[adaptive];
505  nd.extra_state[normalized] = w[normalized];
506  w = nd.extra_state;
507  }
508  if (adaptive)
509  w[adaptive] += nd.grad_squared * x2;
510  if (normalized)
511  {
512  float x_abs = fabsf(x);
513  if (x_abs > w[normalized]) // new scale discovered
514  {
515  if (w[normalized] >
516  0.) // If the normalizer is > 0 then rescale the weight so it's as if the new scale was the old scale.
517  {
518  if (sqrt_rate)
519  {
520  float rescale = w[normalized] / x_abs;
521  w[0] *= (adaptive ? rescale : rescale * rescale);
522  }
523  else
524  {
525  float rescale = x_abs / w[normalized];
526  w[0] *= powf(rescale * rescale, nd.pd.neg_norm_power);
527  }
528  }
529  w[normalized] = x_abs;
530  }
531  nd.norm_x += x2 / (w[normalized] * w[normalized]);
532  }
533  w[spare] = compute_rate_decay<sqrt_rate, adaptive, normalized>(nd.pd, w[0]);
534  nd.pred_per_update += x2 * w[spare];
535  }
536 }
T powf(T, T)
Definition: lda_core.cc:428
constexpr float x2_min
Definition: gd.cc:484
float weight
constexpr float x_min
Definition: gd.cc:483
#define THROW(args)
Definition: vw_exception.h:181
constexpr float x2_max
Definition: gd.cc:485

◆ predict()

template<bool l1, bool audit>
void GD::predict ( gd g,
base_learner ,
example ec 
)

Definition at line 379 of file gd.cc.

References GD::gd::all, shared_data::contraction, finalize_prediction(), shared_data::gravity, inline_predict(), example::partial_prediction, example::pred, print_audit_features(), polyprediction::scalar, vw::sd, and trunc_predict().

380 {
381  vw& all = *g.all;
382  if (l1)
383  ec.partial_prediction = trunc_predict(all, ec, all.sd->gravity);
384  else
385  ec.partial_prediction = inline_predict(all, ec);
386 
387  ec.partial_prediction *= (float)all.sd->contraction;
389  if (audit)
390  print_audit_features(all, ec);
391 }
float finalize_prediction(shared_data *sd, float ret)
Definition: gd.cc:339
void print_audit_features(vw &all, example &ec)
Definition: gd.cc:331
float scalar
Definition: example.h:45
float partial_prediction
Definition: example.h:68
double contraction
Definition: global_data.h:149
float trunc_predict(vw &all, example &ec, double gravity)
Definition: gd.cc:365
float inline_predict(vw &all, example &ec)
Definition: gd.h:98
shared_data * sd
Definition: global_data.h:375
double gravity
Definition: global_data.h:148
polyprediction pred
Definition: example.h:60

◆ print_audit_features()

void GD::print_audit_features ( vw all,
example ec 
)

Definition at line 331 of file gd.cc.

References vw::audit, example::pred, print_features(), print_result(), polyprediction::scalar, vw::stdout_fileno, and example::tag.

Referenced by learn_batch(), multipredict(), multipredict(), predict(), and predict().

332 {
333  if (all.audit)
334  print_result(all.stdout_fileno, ec.pred.scalar, -1, ec.tag);
335  fflush(stdout);
336  print_features(all, ec);
337 }
v_array< char > tag
Definition: example.h:63
void print_features(vw &all, example &ec)
Definition: gd.cc:298
float scalar
Definition: example.h:45
int stdout_fileno
Definition: global_data.h:434
void print_result(int f, float res, v_array< char > tag, float lb, float ub)
Definition: bs.cc:136
bool audit
Definition: global_data.h:486
polyprediction pred
Definition: example.h:60

◆ print_features()

void GD::print_features ( vw all,
example ec 
)

Definition at line 298 of file gd.cc.

References audit_feature(), audit_interaction(), f, example_predict::ft_offset, vw::lda, print_lda_features(), GD::audit_results::results, and GD::string_value::s.

Referenced by print_audit_features().

299 {
300  if (all.lda > 0)
301  print_lda_features(all, ec);
302  else
303  {
304  audit_results dat(all, ec.ft_offset);
305 
306  for (features& fs : ec)
307  {
308  if (fs.space_names.size() > 0)
309  for (features::iterator_all& f : fs.values_indices_audit())
310  {
311  audit_interaction(dat, f.audit().get());
312  audit_feature(dat, f.value(), f.index() + ec.ft_offset);
313  audit_interaction(dat, NULL);
314  }
315  else
316  for (features::iterator& f : fs) audit_feature(dat, f.value(), f.index() + ec.ft_offset);
317  }
318 
319  INTERACTIONS::generate_interactions<audit_results, const uint64_t, audit_feature, true, audit_interaction>(
320  all, ec, dat);
321 
322  stable_sort(dat.results.begin(), dat.results.end());
323  if (all.audit)
324  {
325  for (string_value& sv : dat.results) std::cout << '\t' << sv.s;
326  std::cout << std::endl;
327  }
328  }
329 }
the core definition of a set of features.
void print_lda_features(vw &all, example &ec)
Definition: gd.cc:280
void audit_feature(audit_results &dat, const float ft_weight, const uint64_t ft_idx)
Definition: gd.cc:241
uint32_t lda
Definition: global_data.h:508
void audit_interaction(audit_results &dat, const audit_strings *f)
Definition: gd.cc:208
iterator over values, indices and audit space names
iterator over values and indices
float f
Definition: cache.cc:40

◆ print_lda_features()

void GD::print_lda_features ( vw all,
example ec 
)

Definition at line 280 of file gd.cc.

References f, vw::lda, vw::parse_mask, stride_shift(), parameters::stride_shift(), and vw::weights.

Referenced by print_features().

281 {
282  parameters& weights = all.weights;
283  uint32_t stride_shift = weights.stride_shift();
284  size_t count = 0;
285  for (features& fs : ec) count += fs.size();
286  for (features& fs : ec)
287  {
288  for (features::iterator_all& f : fs.values_indices_audit())
289  {
290  std::cout << '\t' << f.audit().get()->first << '^' << f.audit().get()->second << ':'
291  << ((f.index() >> stride_shift) & all.parse_mask) << ':' << f.value();
292  for (size_t k = 0; k < all.lda; k++) std::cout << ':' << (&weights[f.index()])[k];
293  }
294  }
295  std::cout << " total of " << count << " features." << std::endl;
296 }
parameters weights
Definition: global_data.h:537
uint64_t stride_shift(const stagewise_poly &poly, uint64_t idx)
the core definition of a set of features.
uint32_t lda
Definition: global_data.h:508
iterator over values, indices and audit space names
uint64_t parse_mask
Definition: global_data.h:453
uint32_t stride_shift()
float f
Definition: cache.cc:40

◆ quake_InvSqrt()

float GD::quake_InvSqrt ( float  x)
inline

Definition at line 68 of file gd.cc.

Referenced by InvSqrt().

69 {
70  // Carmack/Quake/SGI fast method:
71  float xhalf = 0.5f * x;
72  static_assert(sizeof(int) == sizeof(float), "Floats and ints are converted between, they must be the same size.");
73  int i = reinterpret_cast<int&>(x); // store floating-point bits in integer
74  i = 0x5f3759d5 - (i >> 1); // initial guess for Newton's method
75  x = reinterpret_cast<float&>(i); // convert new bits into float
76  x = x * (1.5f - xhalf * x * x); // One round of Newton's method
77  return x;
78 }

◆ save_load()

void GD::save_load ( gd g,
io_buf model_file,
bool  read,
bool  text 
)

Definition at line 992 of file gd.cc.

References parameters::adaptive, GD::gd::all, bin_text_read_write_fixed(), constant, parameters::dense_weights, io_buf::files, GD::gd::initial_constant, vw::initial_t, vw::initial_weight, initialize_regressor(), vw::model_file_ver, save_load_online_state(), save_load_regressor(), vw::save_resume, dense_parameters::set_default(), sparse_parameters::set_default(), VW::set_weight(), v_array< T >::size(), parameters::sparse, parameters::sparse_weights, sync_weights(), GD::gd::total_weight, vw::trace_message, vw::training, VERSION_SAVE_RESUME_FIX, and vw::weights.

Referenced by setup().

993 {
994  vw& all = *g.all;
995  if (read)
996  {
998 
999  if (all.weights.adaptive && all.initial_t > 0)
1000  {
1001  float init_weight = all.initial_weight;
1002  std::pair<float, float> p = std::make_pair(init_weight, all.initial_t);
1003  if (all.weights.sparse)
1004  all.weights.sparse_weights.set_default<std::pair<float, float>, set_initial_gd_wrapper<sparse_parameters> >(p);
1005  else
1006  all.weights.dense_weights.set_default<std::pair<float, float>, set_initial_gd_wrapper<dense_parameters> >(p);
1007  // for adaptive update, we interpret initial_t as previously seeing initial_t fake datapoints, all with squared
1008  // gradient=1 NOTE: this is not invariant to the scaling of the data (i.e. when combined with normalized). Since
1009  // scaling the data scales the gradient, this should ideally be feature_range*initial_t, or something like that.
1010  // We could potentially fix this by just adding this base quantity times the current range to the sum of gradients
1011  // stored in memory at each update, and always start sum of gradients to 0, at the price of additional additions
1012  // and multiplications during the update...
1013  }
1014  if (g.initial_constant != 0.0)
1015  VW::set_weight(all, constant, 0, g.initial_constant);
1016  }
1017 
1018  if (model_file.files.size() > 0)
1019  {
1020  bool resume = all.save_resume;
1021  std::stringstream msg;
1022  msg << ":" << resume << "\n";
1023  bin_text_read_write_fixed(model_file, (char*)&resume, sizeof(resume), "", read, msg, text);
1024  if (resume)
1025  {
1026  if (read && all.model_file_ver < VERSION_SAVE_RESUME_FIX)
1027  all.trace_message
1028  << std::endl
1029  << "WARNING: --save_resume functionality is known to have inaccuracy in model files version less than "
1030  << VERSION_SAVE_RESUME_FIX << std::endl
1031  << std::endl;
1032  save_load_online_state(all, model_file, read, text, g.total_weight, &g);
1033  }
1034  else
1035  save_load_regressor(all, model_file, read, text);
1036  }
1037  if (!all.training) // If the regressor was saved as --save_resume, then when testing we want to materialize the
1038  // weights.
1039  sync_weights(all);
1040 }
parameters weights
Definition: global_data.h:537
void sync_weights(vw &all)
Definition: gd.cc:671
void initialize_regressor(vw &all, T &weights)
float initial_t
Definition: global_data.h:530
float initial_weight
Definition: global_data.h:409
void set_default(R &info)
void save_load_online_state(vw &all, io_buf &model_file, bool read, bool text, double &total_weight, gd *g, uint32_t ftrl_size)
Definition: gd.cc:881
bool training
Definition: global_data.h:488
size_t size() const
Definition: v_array.h:68
void save_load_regressor(vw &all, io_buf &model_file, bool read, bool text)
Definition: gd.cc:767
VW::version_struct model_file_ver
Definition: global_data.h:419
v_array< int > files
Definition: io_buf.h:64
vw_ostream trace_message
Definition: global_data.h:424
constexpr uint64_t constant
Definition: constant.h:11
void set_weight(vw &all, uint32_t index, uint32_t offset, float value)
Definition: vw.h:182
dense_parameters dense_weights
sparse_parameters sparse_weights
bool save_resume
Definition: global_data.h:415
#define VERSION_SAVE_RESUME_FIX
Definition: gd.cc:33
size_t bin_text_read_write_fixed(io_buf &io, char *data, size_t len, const char *read_message, bool read, std::stringstream &msg, bool text)
Definition: io_buf.h:326

◆ save_load_online_state() [1/2]

template<class T >
void GD::save_load_online_state ( vw all,
io_buf model_file,
bool  read,
bool  text,
gd g,
std::stringstream &  msg,
uint32_t  ftrl_size,
T &  weights 
)

Definition at line 776 of file gd.cc.

References parameters::adaptive, GD::gd::adaptive_input, io_buf::bin_read_fixed(), bin_text_write_fixed(), parameters::normalized, GD::gd::normalized_input, vw::num_bits, THROW, vw::weights, and write_index().

Referenced by SVRG::save_load(), and save_load().

778 {
779  uint64_t length = (uint64_t)1 << all.num_bits;
780 
781  uint64_t i = 0;
782  uint32_t old_i = 0;
783  size_t brw = 1;
784 
785  if (read)
786  do
787  {
788  brw = 1;
789  if (all.num_bits < 31) // backwards compatible
790  {
791  brw = model_file.bin_read_fixed((char*)&old_i, sizeof(old_i), "");
792  i = old_i;
793  }
794  else
795  brw = model_file.bin_read_fixed((char*)&i, sizeof(i), "");
796  if (brw > 0)
797  {
798  if (i >= length)
799  THROW("Model content is corrupted, weight vector index " << i << " must be less than total vector length "
800  << length);
801  weight buff[8] = {0, 0, 0, 0, 0, 0, 0, 0};
802  if (ftrl_size > 0)
803  brw += model_file.bin_read_fixed((char*)buff, sizeof(buff[0]) * ftrl_size, "");
804  else if (g == NULL || (!g->adaptive_input && !g->normalized_input))
805  brw += model_file.bin_read_fixed((char*)buff, sizeof(buff[0]), "");
806  else if ((g->adaptive_input && !g->normalized_input) || (!g->adaptive_input && g->normalized_input))
807  brw += model_file.bin_read_fixed((char*)buff, sizeof(buff[0]) * 2, "");
808  else // adaptive and normalized
809  brw += model_file.bin_read_fixed((char*)buff, sizeof(buff[0]) * 3, "");
810  uint32_t stride = 1 << weights.stride_shift();
811  weight* v = &weights.strided_index(i);
812  for (size_t i = 0; i < stride; i++) v[i] = buff[i];
813  }
814  } while (brw > 0);
815  else // write binary or text
816  for (typename T::iterator v = weights.begin(); v != weights.end(); ++v)
817  {
818  i = v.index() >> weights.stride_shift();
819 
820  if (ftrl_size == 3)
821  {
822  if (*v != 0. || (&(*v))[1] != 0. || (&(*v))[2] != 0.)
823  {
824  brw = write_index(model_file, msg, text, all.num_bits, i);
825  msg << ":" << *v << " " << (&(*v))[1] << " " << (&(*v))[2] << "\n";
826  brw += bin_text_write_fixed(model_file, (char*)&(*v), 3 * sizeof(*v), msg, text);
827  }
828  }
829  else if (ftrl_size == 4)
830  {
831  if (*v != 0. || (&(*v))[1] != 0. || (&(*v))[2] != 0. || (&(*v))[3] != 0.)
832  {
833  brw = write_index(model_file, msg, text, all.num_bits, i);
834  msg << ":" << *v << " " << (&(*v))[1] << " " << (&(*v))[2] << " " << (&(*v))[3] << "\n";
835  brw += bin_text_write_fixed(model_file, (char*)&(*v), 4 * sizeof(*v), msg, text);
836  }
837  }
838  else if (ftrl_size == 6)
839  {
840  if (*v != 0. || (&(*v))[1] != 0. || (&(*v))[2] != 0. || (&(*v))[3] != 0. || (&(*v))[4] != 0. ||
841  (&(*v))[5] != 0.)
842  {
843  brw = write_index(model_file, msg, text, all.num_bits, i);
844  msg << ":" << *v << " " << (&(*v))[1] << " " << (&(*v))[2] << " " << (&(*v))[3] << " " << (&(*v))[4] << " "
845  << (&(*v))[5] << "\n";
846  brw += bin_text_write_fixed(model_file, (char*)&(*v), 6 * sizeof(*v), msg, text);
847  }
848  }
849  else if (g == nullptr || (!all.weights.adaptive && !all.weights.normalized))
850  {
851  if (*v != 0.)
852  {
853  brw = write_index(model_file, msg, text, all.num_bits, i);
854  msg << ":" << *v << "\n";
855  brw += bin_text_write_fixed(model_file, (char*)&(*v), sizeof(*v), msg, text);
856  }
857  }
858  else if ((all.weights.adaptive && !all.weights.normalized) || (!all.weights.adaptive && all.weights.normalized))
859  {
860  // either adaptive or normalized
861  if (*v != 0. || (&(*v))[1] != 0.)
862  {
863  brw = write_index(model_file, msg, text, all.num_bits, i);
864  msg << ":" << *v << " " << (&(*v))[1] << "\n";
865  brw += bin_text_write_fixed(model_file, (char*)&(*v), 2 * sizeof(*v), msg, text);
866  }
867  }
868  else
869  {
870  // adaptive and normalized
871  if (*v != 0. || (&(*v))[1] != 0. || (&(*v))[2] != 0.)
872  {
873  brw = write_index(model_file, msg, text, all.num_bits, i);
874  msg << ":" << *v << " " << (&(*v))[1] << " " << (&(*v))[2] << "\n";
875  brw += bin_text_write_fixed(model_file, (char*)&(*v), 3 * sizeof(*v), msg, text);
876  }
877  }
878  }
879 }
size_t write_index(io_buf &model_file, std::stringstream &msg, bool text, uint32_t num_bits, uint64_t i)
Definition: gd.cc:688
parameters weights
Definition: global_data.h:537
uint32_t num_bits
Definition: global_data.h:398
size_t bin_text_write_fixed(io_buf &io, char *data, size_t len, std::stringstream &msg, bool text)
Definition: io_buf.h:313
size_t bin_read_fixed(char *data, size_t len, const char *read_message)
Definition: io_buf.h:230
float weight
#define THROW(args)
Definition: vw_exception.h:181

◆ save_load_online_state() [2/2]

void GD::save_load_online_state ( vw all,
io_buf model_file,
bool  read,
bool  text,
double &  total_weight,
gd g,
uint32_t  ftrl_size 
)

Definition at line 881 of file gd.cc.

References bin_text_read_write_fixed(), vw::current_pass, parameters::dense_weights, shared_data::dump_interval, shared_data::example_number, vw::initial_t, shared_data::max_label, shared_data::min_label, vw::model_file_ver, vw::normalized_sum_norm_x, shared_data::old_weighted_labeled_examples, vw::preserve_performance_counters, vw::sd, parameters::sparse, parameters::sparse_weights, shared_data::sum_loss, shared_data::sum_loss_since_last_dump, shared_data::t, shared_data::total_features, vw::training, VERSION_PASS_UINT64, VERSION_SAVE_RESUME_FIX, shared_data::weighted_labeled_examples, shared_data::weighted_labels, shared_data::weighted_unlabeled_examples, and vw::weights.

Referenced by save_load().

883 {
884  // vw& all = *g.all;
885  std::stringstream msg;
886 
887  msg << "initial_t " << all.initial_t << "\n";
888  bin_text_read_write_fixed(model_file, (char*)&all.initial_t, sizeof(all.initial_t), "", read, msg, text);
889 
890  msg << "norm normalizer " << all.normalized_sum_norm_x << "\n";
892  model_file, (char*)&all.normalized_sum_norm_x, sizeof(all.normalized_sum_norm_x), "", read, msg, text);
893 
894  msg << "t " << all.sd->t << "\n";
895  bin_text_read_write_fixed(model_file, (char*)&all.sd->t, sizeof(all.sd->t), "", read, msg, text);
896 
897  msg << "sum_loss " << all.sd->sum_loss << "\n";
898  bin_text_read_write_fixed(model_file, (char*)&all.sd->sum_loss, sizeof(all.sd->sum_loss), "", read, msg, text);
899 
900  msg << "sum_loss_since_last_dump " << all.sd->sum_loss_since_last_dump << "\n";
901  bin_text_read_write_fixed(model_file, (char*)&all.sd->sum_loss_since_last_dump,
902  sizeof(all.sd->sum_loss_since_last_dump), "", read, msg, text);
903 
904  float dump_interval = all.sd->dump_interval;
905  msg << "dump_interval " << dump_interval << "\n";
906  bin_text_read_write_fixed(model_file, (char*)&dump_interval, sizeof(dump_interval), "", read, msg, text);
907  if (!read || (all.training && all.preserve_performance_counters)) // update dump_interval from input model
908  all.sd->dump_interval = dump_interval;
909 
910  msg << "min_label " << all.sd->min_label << "\n";
911  bin_text_read_write_fixed(model_file, (char*)&all.sd->min_label, sizeof(all.sd->min_label), "", read, msg, text);
912 
913  msg << "max_label " << all.sd->max_label << "\n";
914  bin_text_read_write_fixed(model_file, (char*)&all.sd->max_label, sizeof(all.sd->max_label), "", read, msg, text);
915 
916  msg << "weighted_labeled_examples " << all.sd->weighted_labeled_examples << "\n";
917  bin_text_read_write_fixed(model_file, (char*)&all.sd->weighted_labeled_examples,
918  sizeof(all.sd->weighted_labeled_examples), "", read, msg, text);
919 
920  msg << "weighted_labels " << all.sd->weighted_labels << "\n";
922  model_file, (char*)&all.sd->weighted_labels, sizeof(all.sd->weighted_labels), "", read, msg, text);
923 
924  msg << "weighted_unlabeled_examples " << all.sd->weighted_unlabeled_examples << "\n";
926  sizeof(all.sd->weighted_unlabeled_examples), "", read, msg, text);
927 
928  msg << "example_number " << all.sd->example_number << "\n";
930  model_file, (char*)&all.sd->example_number, sizeof(all.sd->example_number), "", read, msg, text);
931 
932  msg << "total_features " << all.sd->total_features << "\n";
934  model_file, (char*)&all.sd->total_features, sizeof(all.sd->total_features), "", read, msg, text);
935 
936  if (!read || all.model_file_ver >= VERSION_SAVE_RESUME_FIX)
937  {
 938  // restore some data to allow --save_resume to work more accurately
939 
940  // fix average loss
941  msg << "total_weight " << total_weight << "\n";
942  bin_text_read_write_fixed(model_file, (char*)&total_weight, sizeof(total_weight), "", read, msg, text);
943 
944  // fix "loss since last" for first printed out example details
945  msg << "sd::oec.weighted_labeled_examples " << all.sd->old_weighted_labeled_examples << "\n";
947  sizeof(all.sd->old_weighted_labeled_examples), "", read, msg, text);
948 
949  // fix "number of examples per pass"
950  msg << "current_pass " << all.current_pass << "\n";
952  bin_text_read_write_fixed(model_file, (char*)&all.current_pass, sizeof(all.current_pass), "", read, msg, text);
 953  else // backwards compatibility.
954  {
955  size_t temp_pass = (size_t)all.current_pass;
956  bin_text_read_write_fixed(model_file, (char*)&temp_pass, sizeof(temp_pass), "", read, msg, text);
957  all.current_pass = temp_pass;
958  }
959  }
960 
961  if (read &&
962  (!all.training ||
963  !all.preserve_performance_counters)) // reset various things so that we report test set performance properly
964  {
965  all.sd->sum_loss = 0;
966  all.sd->sum_loss_since_last_dump = 0;
967  all.sd->weighted_labeled_examples = 0.;
968  all.sd->weighted_labels = 0.;
971  all.sd->example_number = 0;
972  all.sd->total_features = 0;
973  all.current_pass = 0;
974  }
975  if (all.weights.sparse)
976  save_load_online_state(all, model_file, read, text, g, msg, ftrl_size, all.weights.sparse_weights);
977  else
978  save_load_online_state(all, model_file, read, text, g, msg, ftrl_size, all.weights.dense_weights);
979 }
double sum_loss
Definition: global_data.h:145
parameters weights
Definition: global_data.h:537
float initial_t
Definition: global_data.h:530
double weighted_unlabeled_examples
Definition: global_data.h:143
void save_load_online_state(vw &all, io_buf &model_file, bool read, bool text, double &total_weight, gd *g, uint32_t ftrl_size)
Definition: gd.cc:881
bool training
Definition: global_data.h:488
double sum_loss_since_last_dump
Definition: global_data.h:146
shared_data * sd
Definition: global_data.h:375
VW::version_struct model_file_ver
Definition: global_data.h:419
double old_weighted_labeled_examples
Definition: global_data.h:142
double weighted_labels
Definition: global_data.h:144
dense_parameters dense_weights
uint64_t current_pass
Definition: global_data.h:396
uint64_t example_number
Definition: global_data.h:137
float min_label
Definition: global_data.h:150
sparse_parameters sparse_weights
bool preserve_performance_counters
Definition: global_data.h:416
float max_label
Definition: global_data.h:151
double weighted_labeled_examples
Definition: global_data.h:141
#define VERSION_SAVE_RESUME_FIX
Definition: gd.cc:33
#define VERSION_PASS_UINT64
Definition: gd.cc:34
float dump_interval
Definition: global_data.h:147
size_t bin_text_read_write_fixed(io_buf &io, char *data, size_t len, const char *read_message, bool read, std::stringstream &msg, bool text)
Definition: io_buf.h:326
uint64_t total_features
Definition: global_data.h:138
double normalized_sum_norm_x
Definition: global_data.h:420

◆ save_load_regressor() [1/2]

template<class T >
void GD::save_load_regressor ( vw all,
io_buf model_file,
bool  read,
bool  text,
T &  weights 
)

Definition at line 707 of file gd.cc.

References io_buf::bin_read_fixed(), bin_text_write_fixed(), vw::name_index_map, vw::num_bits, vw::print_invert, THROW, and write_index().

Referenced by SVRG::save_load(), and save_load().

708 {
709  size_t brw = 1;
710 
711  if (all.print_invert) // write readable model with feature names
712  {
713  std::stringstream msg;
714  typedef std::map<std::string, size_t> str_int_map;
715 
716  for (str_int_map::iterator it = all.name_index_map.begin(); it != all.name_index_map.end(); ++it)
717  {
718  weight* v = &weights.strided_index(it->second);
719  if (*v != 0.)
720  {
721  msg << it->first;
722  brw = bin_text_write_fixed(model_file, (char*)it->first.c_str(), sizeof(*it->first.c_str()), msg, true);
723 
724  msg << ":" << it->second << ":" << *v << "\n";
725  bin_text_write_fixed(model_file, (char*)&(*v), sizeof(*v), msg, true);
726  }
727  }
728  return;
729  }
730 
731  uint64_t i = 0;
732  uint32_t old_i = 0;
733  uint64_t length = (uint64_t)1 << all.num_bits;
734  if (read)
735  do
736  {
737  brw = 1;
738  if (all.num_bits < 31) // backwards compatible
739  {
740  brw = model_file.bin_read_fixed((char*)&old_i, sizeof(old_i), "");
741  i = old_i;
742  }
743  else
744  brw = model_file.bin_read_fixed((char*)&i, sizeof(i), "");
745  if (brw > 0)
746  {
747  if (i >= length)
748  THROW("Model content is corrupted, weight vector index " << i << " must be less than total vector length "
749  << length);
750  weight* v = &weights.strided_index(i);
751  brw += model_file.bin_read_fixed((char*)&(*v), sizeof(*v), "");
752  }
753  } while (brw > 0);
754  else // write
755  for (typename T::iterator v = weights.begin(); v != weights.end(); ++v)
756  if (*v != 0.)
757  {
758  i = v.index() >> weights.stride_shift();
759  std::stringstream msg;
760 
761  brw = write_index(model_file, msg, text, all.num_bits, i);
762  msg << ":" << *v << "\n";
763  brw += bin_text_write_fixed(model_file, (char*)&(*v), sizeof(*v), msg, text);
764  }
765 }
size_t write_index(io_buf &model_file, std::stringstream &msg, bool text, uint32_t num_bits, uint64_t i)
Definition: gd.cc:688
uint32_t num_bits
Definition: global_data.h:398
size_t bin_text_write_fixed(io_buf &io, char *data, size_t len, std::stringstream &msg, bool text)
Definition: io_buf.h:313
size_t bin_read_fixed(char *data, size_t len, const char *read_message)
Definition: io_buf.h:230
bool print_invert
Definition: global_data.h:542
float weight
std::map< std::string, size_t > name_index_map
Definition: global_data.h:548
#define THROW(args)
Definition: vw_exception.h:181

◆ save_load_regressor() [2/2]

void GD::save_load_regressor ( vw all,
io_buf model_file,
bool  read,
bool  text 
)

Definition at line 767 of file gd.cc.

References parameters::dense_weights, parameters::sparse, parameters::sparse_weights, and vw::weights.

Referenced by save_load().

768 {
769  if (all.weights.sparse)
770  save_load_regressor(all, model_file, read, text, all.weights.sparse_weights);
771  else
772  save_load_regressor(all, model_file, read, text, all.weights.dense_weights);
773 }
parameters weights
Definition: global_data.h:537
void save_load_regressor(vw &all, io_buf &model_file, bool read, bool text)
Definition: gd.cc:767
dense_parameters dense_weights
sparse_parameters sparse_weights

◆ sensitivity() [1/2]

template<bool sqrt_rate, bool feature_mask_off, bool adax, size_t adaptive, size_t normalized, size_t spare, bool stateless>
float GD::sensitivity ( gd g,
example ec 
)

Definition at line 579 of file gd.cc.

References example::total_sum_feat_sq.

580 {
581  if (adaptive || normalized)
582  return get_pred_per_update<sqrt_rate, feature_mask_off, adax, adaptive, normalized, spare, stateless>(g, ec);
583  else
584  return ec.total_sum_feat_sq;
585 }
float total_sum_feat_sq
Definition: example.h:71

◆ sensitivity() [2/2]

template<bool sqrt_rate, bool feature_mask_off, bool adax, size_t adaptive, size_t normalized, size_t spare>
float GD::sensitivity ( gd g,
base_learner ,
example ec 
)

Definition at line 601 of file gd.cc.

602 {
603  return get_scale<adaptive>(g, ec, 1.) *
604  sensitivity<sqrt_rate, feature_mask_off, adax, adaptive, normalized, spare, true>(g, ec);
605 }

◆ set_learn() [1/6]

template<bool sparse_l2, bool invariant, bool sqrt_rate, bool feature_mask_off, uint64_t adaptive, uint64_t normalized, uint64_t spare, uint64_t next>
uint64_t GD::set_learn ( vw all,
gd g 
)

Definition at line 1044 of file gd.cc.

References GD::gd::adax, GD::gd::learn, vw::normalized_idx, GD::gd::sensitivity, and GD::gd::update.

1045 {
1046  all.normalized_idx = normalized;
1047  if (g.adax)
1048  {
1049  g.learn = learn<sparse_l2, invariant, sqrt_rate, feature_mask_off, true, adaptive, normalized, spare>;
1050  g.update = update<sparse_l2, invariant, sqrt_rate, feature_mask_off, true, adaptive, normalized, spare>;
1051  g.sensitivity = sensitivity<sqrt_rate, feature_mask_off, true, adaptive, normalized, spare>;
1052  return next;
1053  }
1054  else
1055  {
1056  g.learn = learn<sparse_l2, invariant, sqrt_rate, feature_mask_off, false, adaptive, normalized, spare>;
1057  g.update = update<sparse_l2, invariant, sqrt_rate, feature_mask_off, false, adaptive, normalized, spare>;
1058  g.sensitivity = sensitivity<sqrt_rate, feature_mask_off, false, adaptive, normalized, spare>;
1059  return next;
1060  }
1061 }
size_t normalized_idx
Definition: global_data.h:506

◆ set_learn() [2/6]

template<bool sparse_l2, bool invariant, bool sqrt_rate, uint64_t adaptive, uint64_t normalized, uint64_t spare, uint64_t next>
uint64_t GD::set_learn ( vw all,
bool  feature_mask_off,
gd g 
)

Definition at line 1065 of file gd.cc.

References vw::normalized_idx.

1066 {
1067  all.normalized_idx = normalized;
1068  if (feature_mask_off)
1069  return set_learn<sparse_l2, invariant, sqrt_rate, true, adaptive, normalized, spare, next>(all, g);
1070  else
1071  return set_learn<sparse_l2, invariant, sqrt_rate, false, adaptive, normalized, spare, next>(all, g);
1072 }
size_t normalized_idx
Definition: global_data.h:506

◆ set_learn() [3/6]

template<bool invariant, bool sqrt_rate, uint64_t adaptive, uint64_t normalized, uint64_t spare, uint64_t next>
uint64_t GD::set_learn ( vw all,
bool  feature_mask_off,
gd g 
)

Definition at line 1075 of file gd.cc.

References GD::gd::sparse_l2.

1076 {
1077  if (g.sparse_l2 > 0.f)
1078  return set_learn<true, invariant, sqrt_rate, adaptive, normalized, spare, next>(all, feature_mask_off, g);
1079  else
1080  return set_learn<false, invariant, sqrt_rate, adaptive, normalized, spare, next>(all, feature_mask_off, g);
1081 }

◆ set_learn() [4/6]

template<bool sqrt_rate, uint64_t adaptive, uint64_t normalized, uint64_t spare, uint64_t next>
uint64_t GD::set_learn ( vw all,
bool  feature_mask_off,
gd g 
)

Definition at line 1084 of file gd.cc.

References vw::invariant_updates.

1085 {
1086  if (all.invariant_updates)
1087  return set_learn<true, sqrt_rate, adaptive, normalized, spare, next>(all, feature_mask_off, g);
1088  else
1089  return set_learn<false, sqrt_rate, adaptive, normalized, spare, next>(all, feature_mask_off, g);
1090 }
bool invariant_updates
Definition: global_data.h:490

◆ set_learn() [5/6]

template<bool sqrt_rate, uint64_t adaptive, uint64_t spare>
uint64_t GD::set_learn ( vw all,
bool  feature_mask_off,
gd g 
)

Definition at line 1093 of file gd.cc.

References parameters::normalized, and vw::weights.

1094 {
1095  // select the appropriate learn function based on adaptive, normalization, and feature mask
1096  if (all.weights.normalized)
1097  return set_learn<sqrt_rate, adaptive, adaptive + 1, adaptive + 2, adaptive + 3>(all, feature_mask_off, g);
1098  else
1099  return set_learn<sqrt_rate, adaptive, 0, spare, spare + 1>(all, feature_mask_off, g);
1100 }
parameters weights
Definition: global_data.h:537

◆ set_learn() [6/6]

template<bool sqrt_rate>
uint64_t GD::set_learn ( vw &  all,
bool  feature_mask_off,
gd &  g 
)

Definition at line 1103 of file gd.cc.

References parameters::adaptive, and vw::weights.

1104 {
1105  if (all.weights.adaptive)
1106  return set_learn<sqrt_rate, 1, 2>(all, feature_mask_off, g);
1107  else
1108  return set_learn<sqrt_rate, 0, 0>(all, feature_mask_off, g);
1109 }
parameters weights
Definition: global_data.h:537

◆ setup()

LEARNER::base_learner * GD::setup ( options_i &  options,
vw &  all 
)

Definition at line 1119 of file gd.cc.

References parameters::adaptive, VW::config::option_group_definition::add(), add(), VW::config::options_i::add_and_parse(), vw::audit, ceil_log_2(), shared_data::contraction, LEARNER::end_pass(), vw::eta, vw::eta_decay_rate, f, VW::config::options_i::get_typed_option(), shared_data::gravity, vw::hash_inv, shared_data::holdout_best_loss, vw::holdout_set_off, LEARNER::init_learner(), vw::initial_constant, vw::initial_t, vw::invariant_updates, LEARNER::make_base(), VW::config::make_option(), GD::gd::multipredict, parameters::normalized, vw::normalized_sum_norm_x, vw::numpasses, vw::power_t, ldamath::powf(), GD::gd::predict, vw::reg_mode, save_load(), vw::save_resume, vw::sd, GD::gd::sensitivity, LEARNER::learner< T, E >::set_end_pass(), LEARNER::learner< T, E >::set_multipredict(), LEARNER::learner< T, E >::set_save_load(), LEARNER::learner< T, E >::set_sensitivity(), LEARNER::learner< T, E >::set_update(), shared_data::t, THROW, vw::trace_message, vw::training, GD::gd::update, VW::config::options_i::was_supplied(), and vw::weights.

Referenced by parse_reductions().

1120 {
1121  auto g = scoped_calloc_or_throw<gd>();
1122 
1123  bool sgd = false;
1124  bool adaptive = false;
1125  bool adax = false;
1126  bool invariant = false;
1127  bool normalized = false;
1128 
1129  option_group_definition new_options("Gradient Descent options");
1130  new_options.add(make_option("sgd", sgd).help("use regular stochastic gradient descent update.").keep(all.save_resume))
1131  .add(make_option("adaptive", adaptive).help("use adaptive, individual learning rates.").keep(all.save_resume))
1132  .add(make_option("adax", adax).help("use adaptive learning rates with x^2 instead of g^2x^2"))
1133  .add(make_option("invariant", invariant).help("use safe/importance aware updates.").keep(all.save_resume))
1134  .add(make_option("normalized", normalized).help("use per feature normalized updates").keep(all.save_resume))
1135  .add(make_option("sparse_l2", g->sparse_l2).default_value(0.f).help("use per feature normalized updates"))
1136  .add(make_option("l1_state", all.sd->gravity)
1137  .keep(all.save_resume)
1138  .default_value(0.)
1139  .help("use per feature normalized updates"))
1140  .add(make_option("l2_state", all.sd->contraction)
1141  .keep(all.save_resume)
1142  .default_value(1.)
1143  .help("use per feature normalized updates"));
1144  options.add_and_parse(new_options);
1145 
1146  g->all = &all;
1147  g->all->normalized_sum_norm_x = 0;
1148  g->no_win_counter = 0;
1149  g->total_weight = 0.;
1150  all.weights.adaptive = true;
1151  all.weights.normalized = true;
1152  g->neg_norm_power = (all.weights.adaptive ? (all.power_t - 1.f) : -1.f);
1153  g->neg_power_t = -all.power_t;
1154 
1155  if (all.initial_t > 0) // for the normalized update: if initial_t is bigger than 1 we interpret this as if we had
1156  // seen (all.initial_t) previous fake datapoints all with norm 1
1157  {
1158  g->all->normalized_sum_norm_x = all.initial_t;
1159  g->total_weight = all.initial_t;
1160  }
1161 
1162  bool feature_mask_off = true;
1163  if (options.was_supplied("feature_mask"))
1164  feature_mask_off = false;
1165 
1166  if (!all.holdout_set_off)
1167  {
1168  all.sd->holdout_best_loss = FLT_MAX;
1169  g->early_stop_thres = options.get_typed_option<size_t>("early_terminate").value();
1170  }
1171 
1172  g->initial_constant = all.initial_constant;
1173 
1174  if (sgd || adaptive || invariant || normalized)
1175  {
1176  // nondefault
1177  all.weights.adaptive = adaptive;
1178  all.invariant_updates = all.training && invariant;
1179  all.weights.normalized = normalized;
1180 
1181  if (!options.was_supplied("learning_rate") && !options.was_supplied("l") &&
1182  !(all.weights.adaptive && all.weights.normalized))
1183  all.eta = 10; // default learning rate to 10 for non default update rule
1184 
1185  // if not using normalized or adaptive, default initial_t to 1 instead of 0
1186  if (!all.weights.adaptive && !all.weights.normalized)
1187  {
1188  if (!options.was_supplied("initial_t"))
1189  {
1190  all.sd->t = 1.f;
1191  all.initial_t = 1.f;
1192  }
1193  all.eta *= powf((float)(all.sd->t), all.power_t);
1194  }
1195  }
1196  else
1197  {
1198  all.invariant_updates = all.training;
1199  }
1200  g->adaptive_input = all.weights.adaptive;
1201  g->normalized_input = all.weights.normalized;
1202 
1203  all.weights.adaptive = all.weights.adaptive && all.training;
1204  all.weights.normalized = all.weights.normalized && all.training;
1205 
1206  if (adax)
1207  g->adax = all.training && adax;
1208 
1209  if (g->adax && !all.weights.adaptive)
1210  THROW("Cannot use adax without adaptive");
1211 
1212  if (pow((double)all.eta_decay_rate, (double)all.numpasses) < 0.0001)
1213  all.trace_message << "Warning: the learning rate for the last pass is multiplied by: "
1214  << pow((double)all.eta_decay_rate, (double)all.numpasses)
1215  << " adjust --decay_learning_rate larger to avoid this." << std::endl;
1216 
1217  if (all.reg_mode % 2)
1218  if (all.audit || all.hash_inv)
1219  {
1220  g->predict = predict<true, true>;
1221  g->multipredict = multipredict<true, true>;
1222  }
1223  else
1224  {
1225  g->predict = predict<true, false>;
1226  g->multipredict = multipredict<true, false>;
1227  }
1228  else if (all.audit || all.hash_inv)
1229  {
1230  g->predict = predict<false, true>;
1231  g->multipredict = multipredict<false, true>;
1232  }
1233  else
1234  {
1235  g->predict = predict<false, false>;
1236  g->multipredict = multipredict<false, false>;
1237  }
1238 
1239  uint64_t stride;
1240  if (all.power_t == 0.5)
1241  stride = set_learn<true>(all, feature_mask_off, *g.get());
1242  else
1243  stride = set_learn<false>(all, feature_mask_off, *g.get());
1244 
1245  all.weights.stride_shift((uint32_t)ceil_log_2(stride - 1));
1246 
1247  gd* bare = g.get();
1248  learner<gd, example>& ret = init_learner(g, g->learn, bare->predict, ((uint64_t)1 << all.weights.stride_shift()));
1249  ret.set_sensitivity(bare->sensitivity);
1250  ret.set_multipredict(bare->multipredict);
1251  ret.set_update(bare->update);
1252  ret.set_save_load(save_load);
1253  ret.set_end_pass(end_pass);
1254  return make_base(ret);
1255 }
void set_multipredict(void(*u)(T &, L &, E &, size_t, size_t, polyprediction *, bool))
Definition: learner.h:217
void set_update(void(*u)(T &data, L &base, E &))
Definition: learner.h:231
parameters weights
Definition: global_data.h:537
float initial_t
Definition: global_data.h:530
bool hash_inv
Definition: global_data.h:541
float power_t
Definition: global_data.h:447
double holdout_best_loss
Definition: global_data.h:161
base_learner * make_base(learner< T, E > &base)
Definition: learner.h:462
double contraction
Definition: global_data.h:149
virtual void add_and_parse(const option_group_definition &group)=0
void set_save_load(void(*sl)(T &, io_buf &, bool, bool))
Definition: learner.h:257
bool holdout_set_off
Definition: global_data.h:499
bool training
Definition: global_data.h:488
learner< T, E > & init_learner(free_ptr< T > &dat, L *base, void(*learn)(T &, L &, E &), void(*predict)(T &, L &, E &), size_t ws, prediction_type::prediction_type_t pred_type)
Definition: learner.h:369
float initial_constant
Definition: global_data.h:410
shared_data * sd
Definition: global_data.h:375
typed_option< T > & get_typed_option(const std::string &key)
Definition: options.h:120
T powf(T, T)
Definition: lda_core.cc:428
vw_ostream trace_message
Definition: global_data.h:424
virtual bool was_supplied(const std::string &key)=0
size_t numpasses
Definition: global_data.h:451
float eta
Definition: global_data.h:531
int add(svm_params &params, svm_example *fec)
Definition: kernel_svm.cc:546
typed_option< T > make_option(std::string name, T &location)
Definition: options.h:80
void set_sensitivity(float(*u)(T &data, base_learner &base, example &))
Definition: learner.h:237
void set_end_pass(void(*f)(T &))
Definition: learner.h:286
double gravity
Definition: global_data.h:148
bool save_resume
Definition: global_data.h:415
void save_load(gd &g, io_buf &model_file, bool read, bool text)
Definition: gd.cc:992
bool audit
Definition: global_data.h:486
bool invariant_updates
Definition: global_data.h:490
#define THROW(args)
Definition: vw_exception.h:181
uint64_t ceil_log_2(uint64_t v)
Definition: gd.cc:1111
void end_pass(gd &g)
Definition: gd.cc:148
float f
Definition: cache.cc:40
int reg_mode
Definition: global_data.h:448
double normalized_sum_norm_x
Definition: global_data.h:420
float eta_decay_rate
Definition: global_data.h:532

◆ sign()

float GD::sign ( float  w)
inline

Definition at line 106 of file gd.h.

Referenced by trunc_weight().

107 {
108  if (w < 0.)
109  return -1.;
110  else
111  return 1.;
112 }

◆ sync_weights()

void GD::sync_weights ( vw &  all)

Definition at line 671 of file gd.cc.

References shared_data::contraction, parameters::dense_weights, shared_data::gravity, vw::sd, parameters::sparse, parameters::sparse_weights, trunc_weight(), and vw::weights.

Referenced by end_pass(), save_load(), and update().

672 {
673  // todo, fix length dependence
674  if (all.sd->gravity == 0. && all.sd->contraction == 1.) // to avoid unnecessary weight synchronization
675  return;
676 
677  if (all.weights.sparse)
678  for (weight& w : all.weights.sparse_weights)
679  w = trunc_weight(w, (float)all.sd->gravity) * (float)all.sd->contraction;
680  else
681  for (weight& w : all.weights.dense_weights)
682  w = trunc_weight(w, (float)all.sd->gravity) * (float)all.sd->contraction;
683 
684  all.sd->gravity = 0.;
685  all.sd->contraction = 1.;
686 }
parameters weights
Definition: global_data.h:537
double contraction
Definition: global_data.h:149
float trunc_weight(const float w, const float gravity)
Definition: gd.h:114
shared_data * sd
Definition: global_data.h:375
dense_parameters dense_weights
float weight
sparse_parameters sparse_weights
double gravity
Definition: global_data.h:148

◆ train()

template<bool sqrt_rate, bool feature_mask_off, size_t adaptive, size_t normalized, size_t spare>
void GD::train ( gd &  g,
example &  ec,
float  update 
)

Definition at line 141 of file gd.cc.

References GD::gd::all, update(), and GD::gd::update_multiplier.

142 {
143  if (normalized)
144  update *= g.update_multiplier;
145  foreach_feature<float, update_feature<sqrt_rate, feature_mask_off, adaptive, normalized, spare> >(*g.all, ec, update);
146 }
void update(gd &g, base_learner &, example &ec)
Definition: gd.cc:647

◆ trunc_predict()

float GD::trunc_predict ( vw &  all,
example &  ec,
double  gravity 
)
inline

Definition at line 365 of file gd.cc.

References label_data::initial, example::l, GD::trunc_data::prediction, and polylabel::simple.

Referenced by predict().

366 {
367  trunc_data temp = {ec.l.simple.initial, (float)gravity};
368  foreach_feature<trunc_data, vec_add_trunc>(all, ec, temp);
369  return temp.prediction;
370 }
label_data simple
Definition: example.h:28
float initial
Definition: simple_label.h:16
polylabel l
Definition: example.h:57

◆ trunc_weight()

float GD::trunc_weight ( const float  w,
const float  gravity 
)
inline

Definition at line 114 of file gd.h.

References GD::multipredict_info< T >::gravity, and sign().

Referenced by audit_feature(), sync_weights(), vec_add_trunc(), and vec_add_trunc_multipredict().

115 {
116  return (gravity < fabsf(w)) ? w - sign(w) * gravity : 0.f;
117 }
float sign(float w)
Definition: gd.h:106

◆ update()

template<bool sparse_l2, bool invariant, bool sqrt_rate, bool feature_mask_off, bool adax, size_t adaptive, size_t normalized, size_t spare>
void GD::update ( gd &  g,
base_learner & ,
example &  ec 
)

Definition at line 647 of file gd.cc.

References sync_weights().

Referenced by compute_update(), mf_train(), and train().

648 {
649  // invariant: not a test label, importance weight > 0
650  float update;
651  if ((update = compute_update<sparse_l2, invariant, sqrt_rate, feature_mask_off, adax, adaptive, normalized, spare>(
652  g, ec)) != 0.)
653  train<sqrt_rate, feature_mask_off, adaptive, normalized, spare>(g, ec, update);
654 
655  if (g.all->sd->contraction < 1e-9 || g.all->sd->gravity > 1e3) // updating weights now to avoid numerical instability
656  sync_weights(*g.all);
657 }
void sync_weights(vw &all)
Definition: gd.cc:671
void update(gd &g, base_learner &, example &ec)
Definition: gd.cc:647

◆ update_feature()

template<bool sqrt_rate, bool feature_mask_off, size_t adaptive, size_t normalized, size_t spare>
void GD::update_feature ( float &  update,
float  x,
float &  fw 
)
inline

Definition at line 109 of file gd.cc.

110 {
111  weight* w = &fw;
112  if (feature_mask_off || fw != 0.)
113  {
114  if (spare != 0)
115  x *= w[spare];
116  w[0] += update * x;
117  }
118 }
float weight
void update(gd &g, base_learner &, example &ec)
Definition: gd.cc:647

◆ vec_add()

void GD::vec_add ( float &  p,
const float  fx,
const float &  fw 
)
inline

Definition at line 75 of file gd_predict.h.

75 { p += fw * fx; }

◆ vec_add_multipredict()

template<class T >
void GD::vec_add_multipredict ( multipredict_info< T > &  mp,
const float  fx,
uint64_t  fi 
)
inline

Definition at line 40 of file gd.h.

References c, GD::multipredict_info< T >::count, GD::multipredict_info< T >::pred, polyprediction::scalar, GD::multipredict_info< T >::step, and GD::multipredict_info< T >::weights.

Referenced by multipredict(), and multipredict().

41 {
42  if ((-1e-10 < fx) && (fx < 1e-10))
43  return;
44  uint64_t mask = mp.weights.mask();
45  polyprediction* p = mp.pred;
46  fi &= mask;
47  uint64_t top = fi + (uint64_t)((mp.count - 1) * mp.step);
48  uint64_t i = 0;
49  if (top <= mask)
50  {
51  i += fi;
52  for (; i <= top; i += mp.step, ++p)
53  p->scalar +=
54  fx * mp.weights[i]; // TODO: figure out how to use weight_parameters::iterator (not using change_begin())
55  }
56  else // TODO: this could be faster by unrolling into two loops
57  for (size_t c = 0; c < mp.count; ++c, fi += (uint64_t)mp.step, ++p)
58  {
59  fi &= mask;
60  p->scalar += fx * mp.weights[fi];
61  }
62 }
float scalar
Definition: example.h:45
constexpr uint64_t c
Definition: rand48.cc:12

◆ vec_add_print()

void GD::vec_add_print ( float &  p,
const float  fx,
float &  fw 
)
inline

Definition at line 372 of file gd.cc.

373 {
374  p += fw * fx;
375  std::cerr << " + " << fw << "*" << fx;
376 }

◆ vec_add_trunc()

void GD::vec_add_trunc ( trunc_data &  p,
const float  fx,
float &  fw 
)
inline

Definition at line 360 of file gd.cc.

References GD::trunc_data::gravity, GD::trunc_data::prediction, and trunc_weight().

361 {
362  p.prediction += trunc_weight(fw, p.gravity) * fx;
363 }
float trunc_weight(const float w, const float gravity)
Definition: gd.h:114

◆ vec_add_trunc_multipredict()

template<class T >
void GD::vec_add_trunc_multipredict ( multipredict_info< T > &  mp,
const float  fx,
uint64_t  fi 
)
inline

Definition at line 394 of file gd.cc.

References c, GD::multipredict_info< T >::count, GD::multipredict_info< T >::gravity, GD::multipredict_info< T >::pred, polyprediction::scalar, GD::multipredict_info< T >::step, trunc_weight(), and GD::multipredict_info< T >::weights.

Referenced by multipredict().

395 {
396  size_t index = fi;
397  for (size_t c = 0; c < mp.count; c++, index += mp.step)
398  mp.pred[c].scalar += fx * trunc_weight(mp.weights[index], mp.gravity);
399 }
float trunc_weight(const float w, const float gravity)
Definition: gd.h:114
constexpr uint64_t c
Definition: rand48.cc:12

◆ write_index()

size_t GD::write_index ( io_buf &  model_file,
std::stringstream &  msg,
bool  text,
uint32_t  num_bits,
uint64_t  i 
)

Definition at line 688 of file gd.cc.

References bin_text_write_fixed().

Referenced by save_load_online_state(), and save_load_regressor().

689 {
690  size_t brw;
691  uint32_t old_i = 0;
692 
693  msg << i;
694 
695  if (num_bits < 31)
696  {
697  old_i = (uint32_t)i;
698  brw = bin_text_write_fixed(model_file, (char*)&old_i, sizeof(old_i), msg, text);
699  }
700  else
701  brw = bin_text_write_fixed(model_file, (char*)&i, sizeof(i), msg, text);
702 
703  return brw;
704 }
size_t bin_text_write_fixed(io_buf &io, char *data, size_t len, std::stringstream &msg, bool text)
Definition: io_buf.h:313

Variable Documentation

◆ global_print_features

bool GD::global_print_features = false

Definition at line 538 of file gd.cc.

◆ x2_max

constexpr float GD::x2_max = FLT_MAX

Definition at line 485 of file gd.cc.

◆ x2_min

constexpr float GD::x2_min = x_min * x_min

Definition at line 484 of file gd.cc.

Referenced by pred_per_update_feature().

◆ x_min

constexpr float GD::x_min = 1.084202e-19f

Definition at line 483 of file gd.cc.