Vowpal Wabbit
Classes | Functions | Variables
GD Namespace Reference

Classes

struct  audit_results
 
struct  gd
 
struct  multipredict_info
 
struct  norm_data
 
struct  power_data
 
class  set_initial_gd_wrapper
 
struct  string_value
 
struct  trunc_data
 

Functions

void sync_weights (vw &all)
 
float quake_InvSqrt (float x)
 
static float InvSqrt (float x)
 
template<bool sqrt_rate, bool feature_mask_off, size_t adaptive, size_t normalized, size_t spare>
void update_feature (float &update, float x, float &fw)
 
template<bool sqrt_rate, size_t adaptive, size_t normalized>
float average_update (float total_weight, float normalized_sum_norm_x, float neg_norm_power)
 
template<bool sqrt_rate, bool feature_mask_off, size_t adaptive, size_t normalized, size_t spare>
void train (gd &g, example &ec, float update)
 
void end_pass (gd &g)
 
bool operator< (const string_value &first, const string_value &second)
 
void audit_interaction (audit_results &dat, const audit_strings *f)
 
void audit_feature (audit_results &dat, const float ft_weight, const uint64_t ft_idx)
 
void print_lda_features (vw &all, example &ec)
 
void print_features (vw &all, example &ec)
 
void print_audit_features (vw &all, example &ec)
 
float finalize_prediction (shared_data *sd, float ret)
 
void vec_add_trunc (trunc_data &p, const float fx, float &fw)
 
float trunc_predict (vw &all, example &ec, double gravity)
 
void vec_add_print (float &p, const float fx, float &fw)
 
template<bool l1, bool audit>
void predict (gd &g, base_learner &, example &ec)
 
template<class T >
void vec_add_trunc_multipredict (multipredict_info< T > &mp, const float fx, uint64_t fi)
 
template<bool l1, bool audit>
void multipredict (gd &g, base_learner &, example &ec, size_t count, size_t step, polyprediction *pred, bool finalize_predictions)
 
template<bool sqrt_rate, size_t adaptive, size_t normalized>
float compute_rate_decay (power_data &s, float &fw)
 
template<bool sqrt_rate, bool feature_mask_off, size_t adaptive, size_t normalized, size_t spare, bool stateless>
void pred_per_update_feature (norm_data &nd, float x, float &fw)
 
template<bool sqrt_rate, bool feature_mask_off, bool adax, size_t adaptive, size_t normalized, size_t spare, bool stateless>
float get_pred_per_update (gd &g, example &ec)
 
template<bool sqrt_rate, bool feature_mask_off, bool adax, size_t adaptive, size_t normalized, size_t spare, bool stateless>
float sensitivity (gd &g, example &ec)
 
template<size_t adaptive>
float get_scale (gd &g, example &, float weight)
 
template<bool sqrt_rate, bool feature_mask_off, bool adax, size_t adaptive, size_t normalized, size_t spare>
float sensitivity (gd &g, base_learner &, example &ec)
 
template<bool sparse_l2, bool invariant, bool sqrt_rate, bool feature_mask_off, bool adax, size_t adaptive, size_t normalized, size_t spare>
float compute_update (gd &g, example &ec)
 
template<bool sparse_l2, bool invariant, bool sqrt_rate, bool feature_mask_off, bool adax, size_t adaptive, size_t normalized, size_t spare>
void update (gd &g, base_learner &, example &ec)
 
template<bool sparse_l2, bool invariant, bool sqrt_rate, bool feature_mask_off, bool adax, size_t adaptive, size_t normalized, size_t spare>
void learn (gd &g, base_learner &base, example &ec)
 
size_t write_index (io_buf &model_file, std::stringstream &msg, bool text, uint32_t num_bits, uint64_t i)
 
template<class T >
void save_load_regressor (vw &all, io_buf &model_file, bool read, bool text, T &weights)
 
void save_load_regressor (vw &all, io_buf &model_file, bool read, bool text)
 
template<class T >
void save_load_online_state (vw &all, io_buf &model_file, bool read, bool text, gd *g, std::stringstream &msg, uint32_t ftrl_size, T &weights)
 
void save_load_online_state (vw &all, io_buf &model_file, bool read, bool text, double &total_weight, gd *g, uint32_t ftrl_size)
 
void save_load (gd &g, io_buf &model_file, bool read, bool text)
 
template<bool sparse_l2, bool invariant, bool sqrt_rate, bool feature_mask_off, uint64_t adaptive, uint64_t normalized, uint64_t spare, uint64_t next>
uint64_t set_learn (vw &all, gd &g)
 
template<bool sparse_l2, bool invariant, bool sqrt_rate, uint64_t adaptive, uint64_t normalized, uint64_t spare, uint64_t next>
uint64_t set_learn (vw &all, bool feature_mask_off, gd &g)
 
template<bool invariant, bool sqrt_rate, uint64_t adaptive, uint64_t normalized, uint64_t spare, uint64_t next>
uint64_t set_learn (vw &all, bool feature_mask_off, gd &g)
 
template<bool sqrt_rate, uint64_t adaptive, uint64_t normalized, uint64_t spare, uint64_t next>
uint64_t set_learn (vw &all, bool feature_mask_off, gd &g)
 
template<bool sqrt_rate, uint64_t adaptive, uint64_t spare>
uint64_t set_learn (vw &all, bool feature_mask_off, gd &g)
 
template<bool sqrt_rate>
uint64_t set_learn (vw &all, bool feature_mask_off, gd &g)
 
uint64_t ceil_log_2 (uint64_t v)
 
base_learner *setup (options_i &options, vw &all)
 
template<class T >
void vec_add_multipredict (multipredict_info< T > &mp, const float fx, uint64_t fi)
 
template<class R , typename T >
void foreach_feature (vw &all, features &fs, R &dat, uint64_t offset=0, float mult=1.)
 
template<class R , class S , void(*)(R &, float, S) T>
void foreach_feature (vw &all, example &ec, R &dat)
 
template<class R , void(*)(R &, float, float &) T>
void foreach_feature (vw &all, example &ec, R &dat)
 
float inline_predict (vw &all, example &ec)
 
float sign (float w)
 
float trunc_weight (const float w, const float gravity)
 
template<class R , void(*)(R &, float, uint64_t) T, class W >
void foreach_feature (W &, features &fs, R &dat, uint64_t offset=0, float mult=1.)
 
template<class R , void(*)(R &, const float, const float &) T, class W >
void foreach_feature (const W &weights, features &fs, R &dat, uint64_t offset=0, float mult=1.)
 
template<class R >
void dummy_func (R &, const audit_strings *)
 
template<class R , class S , void(*)(R &, float, S) T, class W >
void generate_interactions (std::vector< std::string > &interactions, bool permutations, example_predict &ec, R &dat, W &weights)
 
template<class R , class S , void(*)(R &, float, S) T, class W >
void foreach_feature (W &weights, bool ignore_some_linear, std::array< bool, NUM_NAMESPACES > &ignore_linear, std::vector< std::string > &interactions, bool permutations, example_predict &ec, R &dat)
 
void vec_add (float &p, const float fx, const float &fw)
 
template<class W >
float inline_predict (W &weights, bool ignore_some_linear, std::array< bool, NUM_NAMESPACES > &ignore_linear, std::vector< std::string > &interactions, bool permutations, example_predict &ec, float initial=0.f)
 

Variables

constexpr float x_min = 1.084202e-19f
 
constexpr float x2_min = x_min * x_min
 
constexpr float x2_max = FLT_MAX
 
bool global_print_features = false
 

Function Documentation

◆ audit_feature()

void GD::audit_feature ( audit_results & dat,
const float  ft_weight,
const uint64_t  ft_idx 
)
inline

Definition at line 241 of file gd.cc.

References parameters::adaptive, GD::audit_results::all, vw::audit, shared_data::contraction, vw::current_pass, shared_data::gravity, vw::hash_inv, parameters::mask(), vw::name_index_map, GD::audit_results::ns_pre, GD::audit_results::offset, GD::audit_results::results, vw::sd, stride_shift(), parameters::stride_shift(), vw::training, trunc_weight(), and vw::weights.

Referenced by print_features().

242 {
243  parameters& weights = dat.all.weights;
244  uint64_t index = ft_idx & weights.mask();
245  size_t stride_shift = weights.stride_shift();
246 
247  std::string ns_pre;
248  for (std::string& s : dat.ns_pre) ns_pre += s;
249 
250  if (dat.all.audit)
251  {
252  std::ostringstream tempstream;
253  tempstream << ':' << (index >> stride_shift) << ':' << ft_weight << ':'
254  << trunc_weight(weights[index], (float)dat.all.sd->gravity) * (float)dat.all.sd->contraction;
255 
256  if (weights.adaptive) // adaptive
257  tempstream << '@' << (&weights[index])[1];
258 
259  string_value sv = {weights[index] * ft_weight, ns_pre + tempstream.str()};
260  dat.results.push_back(sv);
261  }
262 
263  if ((dat.all.current_pass == 0 || dat.all.training == false) && dat.all.hash_inv)
264  {
265  // for invert_hash
266 
267  if (dat.offset != 0)
268  {
269  // otherwise --oaa output no features for class > 0.
270  std::ostringstream tempstream;
271  tempstream << '[' << (dat.offset >> stride_shift) << ']';
272  ns_pre += tempstream.str();
273  }
274 
275  if (!dat.all.name_index_map.count(ns_pre))
276  dat.all.name_index_map.insert(std::map<std::string, size_t>::value_type(ns_pre, index >> stride_shift));
277  }
278 }
uint64_t stride_shift(const stagewise_poly &poly, uint64_t idx)
float trunc_weight(const float w, const float gravity)
Definition: gd.h:114
uint32_t stride_shift()
uint64_t mask()

◆ audit_interaction()

void GD::audit_interaction ( audit_results & dat,
const audit_strings * f 
)
inline

Definition at line 208 of file gd.cc.

References GD::audit_results::ns_pre.

Referenced by print_features().

209 {
210  if (f == nullptr)
211  {
212  if (!dat.ns_pre.empty())
213  {
214  dat.ns_pre.pop_back();
215  }
216 
217  return;
218  }
219 
220  std::string ns_pre;
221  if (!dat.ns_pre.empty())
222  ns_pre += '*';
223 
224  if (f->first != "" && ((f->first) != " "))
225  {
226  ns_pre.append(f->first);
227  ns_pre += '^';
228  }
229 
230  if (f->second != "")
231  {
232  ns_pre.append(f->second);
233  }
234 
235  if (!ns_pre.empty())
236  {
237  dat.ns_pre.push_back(ns_pre);
238  }
239 }
float f
Definition: cache.cc:40

◆ average_update()

template<bool sqrt_rate, size_t adaptive, size_t normalized>
float GD::average_update ( float  total_weight,
float  normalized_sum_norm_x,
float  neg_norm_power 
)

Definition at line 122 of file gd.cc.

References ldamath::powf().

123 {
124  if (normalized)
125  {
126  if (sqrt_rate)
127  {
128  float avg_norm = (float)(total_weight / normalized_sum_norm_x);
129  if (adaptive)
130  return std::sqrt(avg_norm);
131  else
132  return avg_norm;
133  }
134  else
135  return powf((float)(normalized_sum_norm_x / total_weight), neg_norm_power);
136  }
137  return 1.f;
138 }
T powf(T, T)
Definition: lda_core.cc:428

◆ ceil_log_2()

uint64_t GD::ceil_log_2 ( uint64_t  v)

Definition at line 1111 of file gd.cc.

Referenced by setup().

1112 {
1113  if (v == 0)
1114  return 0;
1115  else
1116  return 1 + ceil_log_2(v >> 1);
1117 }
uint64_t ceil_log_2(uint64_t v)
Definition: gd.cc:1111

◆ compute_rate_decay()

template<bool sqrt_rate, size_t adaptive, size_t normalized>
float GD::compute_rate_decay ( power_data & s,
float &  fw 
)
inline

Definition at line 447 of file gd.cc.

References InvSqrt(), GD::power_data::minus_power_t, GD::power_data::neg_norm_power, and ldamath::powf().

448 {
449  weight* w = &fw;
450  float rate_decay = 1.f;
451  if (adaptive)
452  {
453  if (sqrt_rate)
454  rate_decay = InvSqrt(w[adaptive]);
455  else
456  rate_decay = powf(w[adaptive], s.minus_power_t);
457  }
458  if (normalized)
459  {
460  if (sqrt_rate)
461  {
462  float inv_norm = 1.f / w[normalized];
463  if (adaptive)
464  rate_decay *= inv_norm;
465  else
466  rate_decay *= inv_norm * inv_norm;
467  }
468  else
469  rate_decay *= powf(w[normalized] * w[normalized], s.neg_norm_power);
470  }
471  return rate_decay;
472 }
static float InvSqrt(float x)
Definition: gd.cc:80
T powf(T, T)
Definition: lda_core.cc:428
float weight

◆ compute_update()

template<bool sparse_l2, bool invariant, bool sqrt_rate, bool feature_mask_off, bool adax, size_t adaptive, size_t normalized, size_t spare>
float GD::compute_update ( gd & g,
example & ec 
)

Definition at line 609 of file gd.cc.

References GD::gd::all, shared_data::contraction, loss_function::first_derivative(), loss_function::getLoss(), loss_function::getUnsafeUpdate(), loss_function::getUpdate(), shared_data::gravity, example::l, vw::l1_lambda, vw::l2_lambda, label_data::label, vw::loss, example::pred, vw::reg_mode, polyprediction::scalar, vw::sd, polylabel::simple, GD::gd::sparse_l2, update(), example::updated_prediction, and example::weight.

610 {
611  // invariant: not a test label, importance weight > 0
612  label_data& ld = ec.l.simple;
613  vw& all = *g.all;
614 
615  float update = 0.;
616  ec.updated_prediction = ec.pred.scalar;
617  if (all.loss->getLoss(all.sd, ec.pred.scalar, ld.label) > 0.)
618  {
619  float pred_per_update = sensitivity<sqrt_rate, feature_mask_off, adax, adaptive, normalized, spare, false>(g, ec);
620  float update_scale = get_scale<adaptive>(g, ec, ec.weight);
621  if (invariant)
622  update = all.loss->getUpdate(ec.pred.scalar, ld.label, update_scale, pred_per_update);
623  else
624  update = all.loss->getUnsafeUpdate(ec.pred.scalar, ld.label, update_scale);
625  // changed from ec.partial_prediction to ld.prediction
626  ec.updated_prediction += pred_per_update * update;
627 
628  if (all.reg_mode && fabs(update) > 1e-8)
629  {
630  double dev1 = all.loss->first_derivative(all.sd, ec.pred.scalar, ld.label);
631  double eta_bar = (fabs(dev1) > 1e-8) ? (-update / dev1) : 0.0;
632  if (fabs(dev1) > 1e-8)
633  all.sd->contraction *= (1. - all.l2_lambda * eta_bar);
634  update /= (float)all.sd->contraction;
635  all.sd->gravity += eta_bar * all.l1_lambda;
636  }
637  }
638 
639  if (sparse_l2)
640  update -= g.sparse_l2 * ec.pred.scalar;
641 
642  return update;
643 }
loss_function * loss
Definition: global_data.h:523
virtual float getUpdate(float prediction, float label, float update_scale, float pred_per_update)=0
float scalar
Definition: example.h:45
double contraction
Definition: global_data.h:149
float label
Definition: simple_label.h:14
float updated_prediction
Definition: example.h:69
label_data simple
Definition: example.h:28
virtual float first_derivative(shared_data *, float prediction, float label)=0
virtual float getLoss(shared_data *, float prediction, float label)=0
shared_data * sd
Definition: global_data.h:375
float l2_lambda
Definition: global_data.h:445
polylabel l
Definition: example.h:57
virtual float getUnsafeUpdate(float prediction, float label, float eta_t)=0
double gravity
Definition: global_data.h:148
void update(gd &g, base_learner &, example &ec)
Definition: gd.cc:647
polyprediction pred
Definition: example.h:60
float weight
Definition: example.h:62
float l1_lambda
Definition: global_data.h:444
int reg_mode
Definition: global_data.h:448

◆ dummy_func()

template<class R >
void GD::dummy_func ( R &  ,
const audit_strings *  
)
inline

Definition at line 39 of file gd_predict.h.

40 {
41 } // should never be called due to call_audit overload

◆ end_pass()

void GD::end_pass ( gd & g)

Definition at line 148 of file gd.cc.

References accumulate_avg(), accumulate_weighted_avg(), parameters::adaptive, GD::gd::all, vw::all_reduce, vw::check_holdout_every_n_passes, shared_data::contraction, vw::current_pass, GD::gd::early_stop_thres, vw::eta, vw::eta_decay_rate, vw::final_regressor_name, finalize_regressor(), VW::config::options_i::get_typed_option(), shared_data::gravity, vw::holdout_set_off, GD::gd::no_win_counter, vw::options, VW::config::options_i::replace(), vw::save_per_pass, save_predictor(), vw::save_resume, vw::sd, set_done(), summarize_holdout_set(), sync_weights(), prediction_type::to_string(), and vw::weights.

149 {
150  vw& all = *g.all;
151  if (all.save_resume)
152  {
153  // TODO work out a better system to update state that will be saved in the model.
154  if (all.sd->gravity != 0.)
155  {
156  g.all->options->replace("l1_state", std::to_string(all.sd->gravity));
157  g.all->options->get_typed_option<double>("l1_state").value(all.sd->gravity);
158  }
159  if (all.sd->contraction != 1.)
160  {
161  g.all->options->replace("l2_state", std::to_string(all.sd->contraction));
162  g.all->options->get_typed_option<double>("l2_state").value(all.sd->contraction);
163  }
164  }
165  else
166  sync_weights(all);
167  if (all.all_reduce != nullptr)
168  {
169  if (all.weights.adaptive)
170  accumulate_weighted_avg(all, all.weights);
171  else
172  accumulate_avg(all, all.weights, 0);
173  }
174  all.eta *= all.eta_decay_rate;
175  if (all.save_per_pass)
176  save_predictor(all, all.final_regressor_name, all.current_pass);
177 
178  if (!all.holdout_set_off)
179  {
180  if (summarize_holdout_set(all, g.no_win_counter))
181  finalize_regressor(all, all.final_regressor_name);
182  if ((g.early_stop_thres == g.no_win_counter) &&
183  ((all.check_holdout_every_n_passes <= 1) || ((all.current_pass % all.check_holdout_every_n_passes) == 0)))
184  set_done(all);
185  }
186 }
void accumulate_weighted_avg(vw &all, parameters &weights)
Definition: accumulate.cc:117
void set_done(vw &all)
Definition: parser.cc:578
parameters weights
Definition: global_data.h:537
void sync_weights(vw &all)
Definition: gd.cc:671
VW::config::options_i * options
Definition: global_data.h:428
virtual void replace(const std::string &key, const std::string &value)=0
double contraction
Definition: global_data.h:149
void finalize_regressor(vw &all, std::string reg_name)
bool holdout_set_off
Definition: global_data.h:499
size_t check_holdout_every_n_passes
Definition: global_data.h:503
bool summarize_holdout_set(vw &all, size_t &no_win_counter)
void save_predictor(vw &all, std::string reg_name, size_t current_pass)
AllReduce * all_reduce
Definition: global_data.h:381
shared_data * sd
Definition: global_data.h:375
void accumulate_avg(vw &all, parameters &weights, size_t offset)
Definition: accumulate.cc:51
uint64_t current_pass
Definition: global_data.h:396
float eta
Definition: global_data.h:531
bool save_per_pass
Definition: global_data.h:408
double gravity
Definition: global_data.h:148
bool save_resume
Definition: global_data.h:415
std::string final_regressor_name
Definition: global_data.h:535
const char * to_string(prediction_type_t prediction_type)
Definition: learner.cc:12
float eta_decay_rate
Definition: global_data.h:532

◆ finalize_prediction()

float GD::finalize_prediction ( shared_data * sd,
float  ret 
)

Definition at line 339 of file gd.cc.

References shared_data::example_number, shared_data::max_label, and shared_data::min_label.

Referenced by bfgs_predict(), mf_predict(), multipredict(), multipredict(), SVRG::predict(), predict(), predict(), predict_or_learn_multi(), SVRG::predict_stable(), update_state_and_predict_cb(), and update_state_and_predict_pistol().

340 {
341  if (std::isnan(ret))
342  {
343  ret = 0.;
344  std::cerr << "NAN prediction in example " << sd->example_number + 1 << ", forcing " << ret << std::endl;
345  return ret;
346  }
347  if (ret > sd->max_label)
348  return (float)sd->max_label;
349  if (ret < sd->min_label)
350  return (float)sd->min_label;
351  return ret;
352 }
uint64_t example_number
Definition: global_data.h:137
float min_label
Definition: global_data.h:150
float max_label
Definition: global_data.h:151

◆ foreach_feature() [1/6]

template<class R , void(*)(R &, float, uint64_t) T, class W >
void GD::foreach_feature ( W &  weights,
features & fs,
R &  dat,
uint64_t  offset = 0,
float  mult = 1. 
)
inline

Definition at line 15 of file gd_predict.h.

References f, and foreach_feature().

16 {
17  for (features::iterator& f : fs) T(dat, mult * f.value(), f.index() + offset);
18 }
iterator over values and indicies
float f
Definition: cache.cc:40

◆ foreach_feature() [2/6]

template<class R , void(*)(R &, const float, const float &) T, class W >
void GD::foreach_feature ( const W &  weights,
features & fs,
R &  dat,
uint64_t  offset = 0,
float  mult = 1. 
)
inline

Definition at line 29 of file gd_predict.h.

References f.

30 {
31  for (features::iterator& f : fs)
32  {
33  const weight& w = weights[(f.index() + offset)];
34  T(dat, mult * f.value(), w);
35  }
36 }
float weight
iterator over values and indicies
float f
Definition: cache.cc:40

◆ foreach_feature() [3/6]

template<class R , class S , void(*)(R &, float, S) T, class W >
void GD::foreach_feature ( W &  weights,
bool  ignore_some_linear,
std::array< bool, NUM_NAMESPACES > &  ignore_linear,
std::vector< std::string > &  interactions,
bool  permutations,
example_predict & ec,
R &  dat 
)
inline

Definition at line 56 of file gd_predict.h.

References example_predict::begin(), example_predict::end(), f, and example_predict::ft_offset.

58 {
59  uint64_t offset = ec.ft_offset;
60  if (ignore_some_linear)
61  for (example_predict::iterator i = ec.begin(); i != ec.end(); ++i)
62  {
63  if (!ignore_linear[i.index()])
64  {
65  features& f = *i;
66  foreach_feature<R, T, W>(weights, f, dat, offset);
67  }
68  }
69  else
70  for (features& f : ec) foreach_feature<R, T, W>(weights, f, dat, offset);
71 
72  generate_interactions<R, S, T, W>(interactions, permutations, ec, dat, weights);
73 }
the core definition of a set of features.
iterator begin()
float f
Definition: cache.cc:40

◆ foreach_feature() [4/6]

template<class R , typename T >
void GD::foreach_feature ( vw & all,
features & fs,
R &  dat,
uint64_t  offset = 0,
float  mult = 1. 
)
inline

Definition at line 66 of file gd.h.

References parameters::dense_weights, parameters::sparse, parameters::sparse_weights, and vw::weights.

Referenced by foreach_feature(), and get_pred_per_update().

67 {
68  if (all.weights.sparse)
69  foreach_feature(all.weights.sparse_weights, fs, dat, offset, mult);
70  else
71  foreach_feature(all.weights.dense_weights, fs, dat, offset, mult);
72 }
parameters weights
Definition: global_data.h:537
void foreach_feature(vw &all, example &ec, R &dat)
Definition: gd.h:87
dense_parameters dense_weights
sparse_parameters sparse_weights

◆ foreach_feature() [5/6]

template<class R , class S , void(*)(R &, float, S) T>
void GD::foreach_feature ( vw & all,
example & ec,
R &  dat 
)
inline

Definition at line 75 of file gd.h.

References parameters::dense_weights, vw::ignore_linear, vw::ignore_some_linear, example_predict::interactions, vw::permutations, parameters::sparse, parameters::sparse_weights, and vw::weights.

76 {
77  return all.weights.sparse
78  ? foreach_feature<R, S, T, sparse_parameters>(all.weights.sparse_weights, all.ignore_some_linear,
79  all.ignore_linear, *ec.interactions, all.permutations, ec, dat)
80  : foreach_feature<R, S, T, dense_parameters>(all.weights.dense_weights, all.ignore_some_linear, all.ignore_linear,
81  *ec.interactions, all.permutations, ec, dat);
82 }
bool ignore_some_linear
Definition: global_data.h:464
parameters weights
Definition: global_data.h:537
std::vector< std::string > * interactions
std::array< bool, NUM_NAMESPACES > ignore_linear
Definition: global_data.h:465
dense_parameters dense_weights
sparse_parameters sparse_weights
bool permutations
Definition: global_data.h:454

◆ foreach_feature() [6/6]

template<class R , void(*)(R &, float, float &) T>
void GD::foreach_feature ( vw & all,
example & ec,
R &  dat 
)
inline

Definition at line 87 of file gd.h.

References foreach_feature().

88 {
89  foreach_feature<R, float&, T>(all, ec, dat);
90 }

◆ generate_interactions()

template<class R , class S , void(*)(R &, float, S) T, class W >
void GD::generate_interactions ( std::vector< std::string > &  interactions,
bool  permutations,
example_predict & ec,
R &  dat,
W &  weights 
)
inline

Definition at line 45 of file gd_predict.h.

49 {
50  INTERACTIONS::generate_interactions<R, S, T, false, dummy_func<R>, W>(interactions, permutations, ec, dat, weights);
51 }

◆ get_pred_per_update()

template<bool sqrt_rate, bool feature_mask_off, bool adax, size_t adaptive, size_t normalized, size_t spare, bool stateless>
float GD::get_pred_per_update ( gd & g,
example & ec 
)

Definition at line 541 of file gd.cc.

References GD::gd::all, foreach_feature(), loss_function::getSquareGrad(), example::l, label_data::label, vw::loss, GD::gd::neg_norm_power, GD::gd::neg_power_t, GD::norm_data::norm_x, vw::normalized_sum_norm_x, example::pred, GD::norm_data::pred_per_update, polyprediction::scalar, polylabel::simple, GD::gd::total_weight, GD::gd::update_multiplier, and example::weight.

542 {
543  // We must traverse the features in _precisely_ the same order as during training.
544  label_data& ld = ec.l.simple;
545  vw& all = *g.all;
546 
547  float grad_squared = ec.weight;
548  if (!adax)
549  grad_squared *= all.loss->getSquareGrad(ec.pred.scalar, ld.label);
550 
551  if (grad_squared == 0 && !stateless)
552  return 1.;
553 
554  norm_data nd = {grad_squared, 0., 0., {g.neg_power_t, g.neg_norm_power}, {0}};
555  foreach_feature<norm_data,
556  pred_per_update_feature<sqrt_rate, feature_mask_off, adaptive, normalized, spare, stateless> >(all, ec, nd);
557  if (normalized)
558  {
559  if (!stateless)
560  {
561  g.all->normalized_sum_norm_x += ((double)ec.weight) * nd.norm_x;
562  g.total_weight += ec.weight;
563  g.update_multiplier = average_update<sqrt_rate, adaptive, normalized>(
564  (float)g.total_weight, (float)g.all->normalized_sum_norm_x, g.neg_norm_power);
565  }
566  else
567  {
568  float nsnx = ((float)g.all->normalized_sum_norm_x) + ec.weight * nd.norm_x;
569  float tw = (float)g.total_weight + ec.weight;
570  g.update_multiplier = average_update<sqrt_rate, adaptive, normalized>(tw, nsnx, g.neg_norm_power);
571  }
572  nd.pred_per_update *= g.update_multiplier;
573  }
574  return nd.pred_per_update;
575 }
loss_function * loss
Definition: global_data.h:523
float scalar
Definition: example.h:45
float label
Definition: simple_label.h:14
label_data simple
Definition: example.h:28
polylabel l
Definition: example.h:57
void foreach_feature(vw &all, features &fs, R &dat, uint64_t offset=0, float mult=1.)
Definition: gd.h:66
polyprediction pred
Definition: example.h:60
float weight
Definition: example.h:62
virtual float getSquareGrad(float prediction, float label)=0

◆ get_scale()

template<size_t adaptive>
float GD::get_scale ( gd & g,
example & ,
float  weight 
)

Definition at line 588 of file gd.cc.

References GD::gd::all, vw::eta, GD::gd::neg_power_t, ldamath::powf(), vw::sd, shared_data::t, shared_data::weighted_holdout_examples, and shared_data::weighted_unlabeled_examples.

589 {
590  float update_scale = g.all->eta * weight;
591  if (!adaptive)
592  {
593  float t =
594  (float)(g.all->sd->t + weight - g.all->sd->weighted_holdout_examples - g.all->sd->weighted_unlabeled_examples);
595  update_scale *= powf(t, g.neg_power_t);
596  }
597  return update_scale;
598 }
T powf(T, T)
Definition: lda_core.cc:428
float weight

◆ inline_predict() [1/2]

template<class W >
float GD::inline_predict ( W &  weights,
bool  ignore_some_linear,
std::array< bool, NUM_NAMESPACES > &  ignore_linear,
std::vector< std::string > &  interactions,
bool  permutations,
example_predict & ec,
float  initial = 0.f 
)
inline

Definition at line 78 of file gd_predict.h.

80 {
81  foreach_feature<float, const float&, vec_add, W>(
82  weights, ignore_some_linear, ignore_linear, interactions, permutations, ec, initial);
83  return initial;
84 }

◆ inline_predict() [2/2]

float GD::inline_predict ( vw & all,
example & ec 
)
inline

Definition at line 98 of file gd.h.

References parameters::dense_weights, vw::ignore_linear, vw::ignore_some_linear, label_data::initial, example_predict::interactions, example::l, vw::permutations, polylabel::simple, parameters::sparse, parameters::sparse_weights, and vw::weights.

Referenced by bfgs_predict(), predict(), and predict().

99 {
100  return all.weights.sparse ? inline_predict<sparse_parameters>(all.weights.sparse_weights, all.ignore_some_linear,
101  all.ignore_linear, *ec.interactions, all.permutations, ec, ec.l.simple.initial)
102  : inline_predict<dense_parameters>(all.weights.dense_weights, all.ignore_some_linear,
103  all.ignore_linear, *ec.interactions, all.permutations, ec, ec.l.simple.initial);
104 }
bool ignore_some_linear
Definition: global_data.h:464
parameters weights
Definition: global_data.h:537
std::vector< std::string > * interactions
label_data simple
Definition: example.h:28
std::array< bool, NUM_NAMESPACES > ignore_linear
Definition: global_data.h:465
dense_parameters dense_weights
float initial
Definition: simple_label.h:16
polylabel l
Definition: example.h:57
sparse_parameters sparse_weights
bool permutations
Definition: global_data.h:454

◆ InvSqrt()

static float GD::InvSqrt ( float  x)
inlinestatic

Definition at line 80 of file gd.cc.

References quake_InvSqrt().

Referenced by compute_rate_decay().

81 {
82 #if !defined(VW_NO_INLINE_SIMD)
83 #if defined(__ARM_NEON__)
84  // Propagate into vector
85  float32x2_t v1 = vdup_n_f32(x);
86  // Estimate
87  float32x2_t e1 = vrsqrte_f32(v1);
88  // N-R iteration 1
89  float32x2_t e2 = vmul_f32(e1, vrsqrts_f32(v1, vmul_f32(e1, e1)));
90  // N-R iteration 2
91  float32x2_t e3 = vmul_f32(e2, vrsqrts_f32(v1, vmul_f32(e2, e2)));
92  // Extract result
93  return vget_lane_f32(e3, 0);
94 #elif defined(__SSE2__)
95  __m128 eta = _mm_load_ss(&x);
96  eta = _mm_rsqrt_ss(eta);
97  _mm_store_ss(&x, eta);
98 #else
99  x = quake_InvSqrt(x);
100 #endif
101 #else
102  x = quake_InvSqrt(x);
103 #endif
104 
105  return x;
106 }
float quake_InvSqrt(float x)
Definition: gd.cc:68

◆ learn()

template<bool sparse_l2, bool invariant, bool sqrt_rate, bool feature_mask_off, bool adax, size_t adaptive, size_t normalized, size_t spare>
void GD::learn ( gd & g,
base_learner & base,
example & ec 
)

Definition at line 661 of file gd.cc.

References example::in_use, example::l, label_data::label, GD::gd::predict, polylabel::simple, and example::weight.

662 {
663  // invariant: not a test label, importance weight > 0
664  assert(ec.in_use);
665  assert(ec.l.simple.label != FLT_MAX);
666  assert(ec.weight > 0.);
667  g.predict(g, base, ec);
668  update<sparse_l2, invariant, sqrt_rate, feature_mask_off, adax, adaptive, normalized, spare>(g, base, ec);
669 }
float label
Definition: simple_label.h:14
label_data simple
Definition: example.h:28
polylabel l
Definition: example.h:57
bool in_use
Definition: example.h:79
float weight
Definition: example.h:62

◆ multipredict()

template<bool l1, bool audit>
void GD::multipredict ( gd & g,
base_learner & ,
example & ec,
size_t  count,
size_t  step,
polyprediction * pred,
bool  finalize_predictions 
)

Definition at line 402 of file gd.cc.

References GD::gd::all, c, shared_data::contraction, parameters::dense_weights, finalize_prediction(), example_predict::ft_offset, shared_data::gravity, label_data::initial, example::l, example::pred, print_audit_features(), prediction_type::scalar, polyprediction::scalar, vw::sd, polylabel::simple, parameters::sparse, parameters::sparse_weights, vec_add_multipredict(), vec_add_trunc_multipredict(), and vw::weights.

404 {
405  vw& all = *g.all;
406  for (size_t c = 0; c < count; c++) pred[c].scalar = ec.l.simple.initial;
407  if (g.all->weights.sparse)
408  {
409  multipredict_info<sparse_parameters> mp = {
410  count, step, pred, g.all->weights.sparse_weights, (float)all.sd->gravity};
411  if (l1)
412  foreach_feature<multipredict_info<sparse_parameters>, uint64_t, vec_add_trunc_multipredict>(all, ec, mp);
413  else
414  foreach_feature<multipredict_info<sparse_parameters>, uint64_t, vec_add_multipredict>(all, ec, mp);
415  }
416  else
417  {
418  multipredict_info<dense_parameters> mp = {count, step, pred, g.all->weights.dense_weights, (float)all.sd->gravity};
419  if (l1)
420  foreach_feature<multipredict_info<dense_parameters>, uint64_t, vec_add_trunc_multipredict>(all, ec, mp);
421  else
422  foreach_feature<multipredict_info<dense_parameters>, uint64_t, vec_add_multipredict>(all, ec, mp);
423  }
424  if (all.sd->contraction != 1.)
425  for (size_t c = 0; c < count; c++) pred[c].scalar *= (float)all.sd->contraction;
426  if (finalize_predictions)
427  for (size_t c = 0; c < count; c++) pred[c].scalar = finalize_prediction(all.sd, pred[c].scalar);
428  if (audit)
429  {
430  for (size_t c = 0; c < count; c++)
431  {
432  ec.pred.scalar = pred[c].scalar;
433  print_audit_features(all, ec);
434  ec.ft_offset += (uint64_t)step;
435  }
436  ec.ft_offset -= (uint64_t)(step * count);
437  }
438 }
float finalize_prediction(shared_data *sd, float ret)
Definition: gd.cc:339
void print_audit_features(vw &all, example &ec)
Definition: gd.cc:331
float scalar
Definition: example.h:45
void vec_add_trunc_multipredict(multipredict_info< T > &mp, const float fx, uint64_t fi)
Definition: gd.cc:394
void vec_add_multipredict(multipredict_info< T > &mp, const float fx, uint64_t fi)
Definition: gd.h:40
double contraction
Definition: global_data.h:149
label_data simple
Definition: example.h:28
shared_data * sd
Definition: global_data.h:375
float initial
Definition: simple_label.h:16
polylabel l
Definition: example.h:57
double gravity
Definition: global_data.h:148
polyprediction pred
Definition: example.h:60
constexpr uint64_t c
Definition: rand48.cc:12

◆ operator<()

bool GD::operator< ( const string_value first,
const string_value second 
)

Definition at line 197 of file gd.cc.

References GD::string_value::v.

197 { return fabsf(first.v) > fabsf(second.v); }

◆ pred_per_update_feature()

template<bool sqrt_rate, bool feature_mask_off, size_t adaptive, size_t normalized, size_t spare, bool stateless>
void GD::pred_per_update_feature ( norm_data nd,
float  x,
float &  fw 
)
inline

Definition at line 488 of file gd.cc.

References GD::norm_data::extra_state, GD::norm_data::grad_squared, GD::power_data::neg_norm_power, GD::norm_data::norm_x, GD::norm_data::pd, ldamath::powf(), GD::norm_data::pred_per_update, THROW, and x2_min.

489 {
490  if (feature_mask_off || fw != 0.)
491  {
492  weight* w = &fw;
493  float x2 = x * x;
494  if (x2 < x2_min)
495  {
496  x = (x > 0) ? x_min : -x_min;
497  x2 = x2_min;
498  }
499  if (x2 > x2_max)
500  THROW("your features have too much magnitude");
501  if (stateless) // we must not modify the parameter state so introduce a shadow version.
502  {
503  nd.extra_state[0] = w[0];
504  nd.extra_state[adaptive] = w[adaptive];
505  nd.extra_state[normalized] = w[normalized];
506  w = nd.extra_state;
507  }
508  if (adaptive)
509  w[adaptive] += nd.grad_squared * x2;
510  if (normalized)
511  {
512  float x_abs = fabsf(x);
513  if (x_abs > w[normalized]) // new scale discovered
514  {
515  if (w[normalized] >
516  0.) // If the normalizer is > 0 then rescale the weight so it's as if the new scale was the old scale.
517  {
518  if (sqrt_rate)
519  {
520  float rescale = w[normalized] / x_abs;
521  w[0] *= (adaptive ? rescale : rescale * rescale);
522  }
523  else
524  {
525  float rescale = x_abs / w[normalized];
526  w[0] *= powf(rescale * rescale, nd.pd.neg_norm_power);
527  }
528  }
529  w[normalized] = x_abs;
530  }
531  nd.norm_x += x2 / (w[normalized] * w[normalized]);
532  }
533  w[spare] = compute_rate_decay<sqrt_rate, adaptive, normalized>(nd.pd, w[0]);
534  nd.pred_per_update += x2 * w[spare];
535  }
536 }
T powf(T, T)
Definition: lda_core.cc:428
constexpr float x2_min
Definition: gd.cc:484
float weight
constexpr float x_min
Definition: gd.cc:483
#define THROW(args)
Definition: vw_exception.h:181
constexpr float x2_max
Definition: gd.cc:485

◆ predict()

template<bool l1, bool audit>
void GD::predict ( gd g,
base_learner ,
example ec 
)

Definition at line 379 of file gd.cc.

References GD::gd::all, shared_data::contraction, finalize_prediction(), shared_data::gravity, inline_predict(), example::partial_prediction, example::pred, print_audit_features(), polyprediction::scalar, vw::sd, and trunc_predict().

380 {
381  vw& all = *g.all;
382  if (l1)
383  ec.partial_prediction = trunc_predict(all, ec, all.sd->gravity);
384  else
385  ec.partial_prediction = inline_predict(all, ec);
386 
387  ec.partial_prediction *= (float)all.sd->contraction;
389  if (audit)
390  print_audit_features(all, ec);
391 }
float finalize_prediction(shared_data *sd, float ret)
Definition: gd.cc:339
void print_audit_features(vw &all, example &ec)
Definition: gd.cc:331
float scalar
Definition: example.h:45
float partial_prediction
Definition: example.h:68
double contraction
Definition: global_data.h:149
float trunc_predict(vw &all, example &ec, double gravity)
Definition: gd.cc:365
float inline_predict(vw &all, example &ec)
Definition: gd.h:98
shared_data * sd
Definition: global_data.h:375
double gravity
Definition: global_data.h:148
polyprediction pred
Definition: example.h:60

◆ print_audit_features()

void GD::print_audit_features ( vw all,
example ec 
)

Definition at line 331 of file gd.cc.

References vw::audit, example::pred, print_features(), print_result(), polyprediction::scalar, vw::stdout_fileno, and example::tag.

Referenced by learn_batch(), multipredict(), multipredict(), predict(), and predict().

332 {
333  if (all.audit)
334  print_result(all.stdout_fileno, ec.pred.scalar, -1, ec.tag);
335  fflush(stdout);
336  print_features(all, ec);
337 }
v_array< char > tag
Definition: example.h:63
void print_features(vw &all, example &ec)
Definition: gd.cc:298
float scalar
Definition: example.h:45
int stdout_fileno
Definition: global_data.h:434
void print_result(int f, float res, v_array< char > tag, float lb, float ub)
Definition: bs.cc:136
bool audit
Definition: global_data.h:486
polyprediction pred
Definition: example.h:60

◆ print_features()

void GD::print_features ( vw all,
example ec 
)

Definition at line 298 of file gd.cc.

References audit_feature(), audit_interaction(), f, example_predict::ft_offset, vw::lda, print_lda_features(), GD::audit_results::results, and GD::string_value::s.

Referenced by print_audit_features().

299 {
300  if (all.lda > 0)
301  print_lda_features(all, ec);
302  else
303  {
304  audit_results dat(all, ec.ft_offset);
305 
306  for (features& fs : ec)
307  {
308  if (fs.space_names.size() > 0)
309  for (features::iterator_all& f : fs.values_indices_audit())
310  {
311  audit_interaction(dat, f.audit().get());
312  audit_feature(dat, f.value(), f.index() + ec.ft_offset);
313  audit_interaction(dat, NULL);
314  }
315  else
316  for (features::iterator& f : fs) audit_feature(dat, f.value(), f.index() + ec.ft_offset);
317  }
318 
319  INTERACTIONS::generate_interactions<audit_results, const uint64_t, audit_feature, true, audit_interaction>(
320  all, ec, dat);
321 
322  stable_sort(dat.results.begin(), dat.results.end());
323  if (all.audit)
324  {
325  for (string_value& sv : dat.results) std::cout << '\t' << sv.s;
326  std::cout << std::endl;
327  }
328  }
329 }
the core definition of a set of features.
void print_lda_features(vw &all, example &ec)
Definition: gd.cc:280
void audit_feature(audit_results &dat, const float ft_weight, const uint64_t ft_idx)
Definition: gd.cc:241
uint32_t lda
Definition: global_data.h:508
void audit_interaction(audit_results &dat, const audit_strings *f)
Definition: gd.cc:208
iterator over values, indices and audit space names
iterator over values and indices
float f
Definition: cache.cc:40

◆ print_lda_features()

void GD::print_lda_features ( vw all,
example ec 
)

Definition at line 280 of file gd.cc.

References f, vw::lda, vw::parse_mask, stride_shift(), parameters::stride_shift(), and vw::weights.

Referenced by print_features().

281 {
282  parameters& weights = all.weights;
283  uint32_t stride_shift = weights.stride_shift();
284  size_t count = 0;
285  for (features& fs : ec) count += fs.size();
286  for (features& fs : ec)
287  {
288  for (features::iterator_all& f : fs.values_indices_audit())
289  {
290  std::cout << '\t' << f.audit().get()->first << '^' << f.audit().get()->second << ':'
291  << ((f.index() >> stride_shift) & all.parse_mask) << ':' << f.value();
292  for (size_t k = 0; k < all.lda; k++) std::cout << ':' << (&weights[f.index()])[k];
293  }
294  }
295  std::cout << " total of " << count << " features." << std::endl;
296 }
parameters weights
Definition: global_data.h:537
uint64_t stride_shift(const stagewise_poly &poly, uint64_t idx)
the core definition of a set of features.
uint32_t lda
Definition: global_data.h:508
iterator over values, indices and audit space names
uint64_t parse_mask
Definition: global_data.h:453
uint32_t stride_shift()
float f
Definition: cache.cc:40

◆ quake_InvSqrt()

float GD::quake_InvSqrt ( float  x)
inline

Definition at line 68 of file gd.cc.

Referenced by InvSqrt().

69 {
70  // Carmack/Quake/SGI fast method:
71  float xhalf = 0.5f * x;
72  static_assert(sizeof(int) == sizeof(float), "Floats and ints are converted between, they must be the same size.");
73  int i = reinterpret_cast<int&>(x); // store floating-point bits in integer
74  i = 0x5f3759d5 - (i >> 1); // initial guess for Newton's method
75  x = reinterpret_cast<float&>(i); // convert new bits into float
76  x = x * (1.5f - xhalf * x * x); // One round of Newton's method
77  return x;
78 }

◆ save_load()

void GD::save_load ( gd g,
io_buf model_file,
bool  read,
bool  text 
)

Definition at line 992 of file gd.cc.

References parameters::adaptive, GD::gd::all, bin_text_read_write_fixed(), constant, parameters::dense_weights, io_buf::files, GD::gd::initial_constant, vw::initial_t, vw::initial_weight, initialize_regressor(), vw::model_file_ver, save_load_online_state(), save_load_regressor(), vw::save_resume, dense_parameters::set_default(), sparse_parameters::set_default(), VW::set_weight(), v_array< T >::size(), parameters::sparse, parameters::sparse_weights, sync_weights(), GD::gd::total_weight, vw::trace_message, vw::training, VERSION_SAVE_RESUME_FIX, and vw::weights.

Referenced by setup().

993 {
994  vw& all = *g.all;
995  if (read)
996  {
998 
999  if (all.weights.adaptive && all.initial_t > 0)
1000  {
1001  float init_weight = all.initial_weight;
1002  std::pair<float, float> p = std::make_pair(init_weight, all.initial_t);
1003  if (all.weights.sparse)
1004  all.weights.sparse_weights.set_default<std::pair<float, float>, set_initial_gd_wrapper<sparse_parameters> >(p);
1005  else
1006  all.weights.dense_weights.set_default<std::pair<float, float>, set_initial_gd_wrapper<dense_parameters> >(p);
1007  // for adaptive update, we interpret initial_t as previously seeing initial_t fake datapoints, all with squared
1008  // gradient=1 NOTE: this is not invariant to the scaling of the data (i.e. when combined with normalized). Since
1009  // scaling the data scales the gradient, this should ideally be feature_range*initial_t, or something like that.
1010  // We could potentially fix this by just adding this base quantity times the current range to the sum of gradients
1011  // stored in memory at each update, and always start sum of gradients to 0, at the price of additional additions
1012  // and multiplications during the update...
1013  }
1014  if (g.initial_constant != 0.0)
1015  VW::set_weight(all, constant, 0, g.initial_constant);
1016  }
1017 
1018  if (model_file.files.size() > 0)
1019  {
1020  bool resume = all.save_resume;
1021  std::stringstream msg;
1022  msg << ":" << resume << "\n";
1023  bin_text_read_write_fixed(model_file, (char*)&resume, sizeof(resume), "", read, msg, text);
1024  if (resume)
1025  {
1026  if (read && all.model_file_ver < VERSION_SAVE_RESUME_FIX)
1027  all.trace_message
1028  << std::endl
1029  << "WARNING: --save_resume functionality is known to have inaccuracy in model files version less than "
1030  << VERSION_SAVE_RESUME_FIX << std::endl
1031  << std::endl;
1032  save_load_online_state(all, model_file, read, text, g.total_weight, &g);
1033  }
1034  else
1035  save_load_regressor(all, model_file, read, text);
1036  }
1037  if (!all.training) // If the regressor was saved as --save_resume, then when testing we want to materialize the
1038  // weights.
1039  sync_weights(all);
1040 }
parameters weights
Definition: global_data.h:537
void sync_weights(vw &all)
Definition: gd.cc:671
void initialize_regressor(vw &all, T &weights)
float initial_t
Definition: global_data.h:530
float initial_weight
Definition: global_data.h:409
void set_default(R &info)
void save_load_online_state(vw &all, io_buf &model_file, bool read, bool text, double &total_weight, gd *g, uint32_t ftrl_size)
Definition: gd.cc:881
bool training
Definition: global_data.h:488
size_t size() const
Definition: v_array.h:68
void save_load_regressor(vw &all, io_buf &model_file, bool read, bool text)
Definition: gd.cc:767
VW::version_struct model_file_ver
Definition: global_data.h:419
v_array< int > files
Definition: io_buf.h:64
vw_ostream trace_message
Definition: global_data.h:424
constexpr uint64_t constant
Definition: constant.h:11
void set_weight(vw &all, uint32_t index, uint32_t offset, float value)
Definition: vw.h:182
dense_parameters dense_weights
sparse_parameters sparse_weights
bool save_resume
Definition: global_data.h:415
#define VERSION_SAVE_RESUME_FIX
Definition: gd.cc:33
size_t bin_text_read_write_fixed(io_buf &io, char *data, size_t len, const char *read_message, bool read, std::stringstream &msg, bool text)
Definition: io_buf.h:326

◆ save_load_online_state() [1/2]

template<class T >
void GD::save_load_online_state ( vw all,
io_buf model_file,
bool  read,
bool  text,
gd g,
std::stringstream &  msg,
uint32_t  ftrl_size,
T &  weights 
)

Definition at line 776 of file gd.cc.

References parameters::adaptive, GD::gd::adaptive_input, io_buf::bin_read_fixed(), bin_text_write_fixed(), parameters::normalized, GD::gd::normalized_input, vw::num_bits, THROW, vw::weights, and write_index().

Referenced by SVRG::save_load(), and save_load().

778 {
779  uint64_t length = (uint64_t)1 << all.num_bits;
780 
781  uint64_t i = 0;
782  uint32_t old_i = 0;
783  size_t brw = 1;
784 
785  if (read)
786  do
787  {
788  brw = 1;
789  if (all.num_bits < 31) // backwards compatible
790  {
791  brw = model_file.bin_read_fixed((char*)&old_i, sizeof(old_i), "");
792  i = old_i;
793  }
794  else
795  brw = model_file.bin_read_fixed((char*)&i, sizeof(i), "");
796  if (brw > 0)
797  {
798  if (i >= length)
799  THROW("Model content is corrupted, weight vector index " << i << " must be less than total vector length "
800  << length);
801  weight buff[8] = {0, 0, 0, 0, 0, 0, 0, 0};
802  if (ftrl_size > 0)
803  brw += model_file.bin_read_fixed((char*)buff, sizeof(buff[0]) * ftrl_size, "");
804  else if (g == NULL || (!g->adaptive_input && !g->normalized_input))
805  brw += model_file.bin_read_fixed((char*)buff, sizeof(buff[0]), "");
806  else if ((g->adaptive_input && !g->normalized_input) || (!g->adaptive_input && g->normalized_input))
807  brw += model_file.bin_read_fixed((char*)buff, sizeof(buff[0]) * 2, "");
808  else // adaptive and normalized
809  brw += model_file.bin_read_fixed((char*)buff, sizeof(buff[0]) * 3, "");
810  uint32_t stride = 1 << weights.stride_shift();
811  weight* v = &weights.strided_index(i);
812  for (size_t i = 0; i < stride; i++) v[i] = buff[i];
813  }
814  } while (brw > 0);
815  else // write binary or text
816  for (typename T::iterator v = weights.begin(); v != weights.end(); ++v)
817  {
818  i = v.index() >> weights.stride_shift();
819 
820  if (ftrl_size == 3)
821  {
822  if (*v != 0. || (&(*v))[1] != 0. || (&(*v))[2] != 0.)
823  {
824  brw = write_index(model_file, msg, text, all.num_bits, i);
825  msg << ":" << *v << " " << (&(*v))[1] << " " << (&(*v))[2] << "\n";
826  brw += bin_text_write_fixed(model_file, (char*)&(*v), 3 * sizeof(*v), msg, text);
827  }
828  }
829  else if (ftrl_size == 4)
830  {
831  if (*v != 0. || (&(*v))[1] != 0. || (&(*v))[2] != 0. || (&(*v))[3] != 0.)
832  {
833  brw = write_index(model_file, msg, text, all.num_bits, i);
834  msg << ":" << *v << " " << (&(*v))[1] << " " << (&(*v))[2] << " " << (&(*v))[3] << "\n";
835  brw += bin_text_write_fixed(model_file, (char*)&(*v), 4 * sizeof(*v), msg, text);
836  }
837  }
838  else if (ftrl_size == 6)
839  {
840  if (*v != 0. || (&(*v))[1] != 0. || (&(*v))[2] != 0. || (&(*v))[3] != 0. || (&(*v))[4] != 0. ||
841  (&(*v))[5] != 0.)
842  {
843  brw = write_index(model_file, msg, text, all.num_bits, i);
844  msg << ":" << *v << " " << (&(*v))[1] << " " << (&(*v))[2] << " " << (&(*v))[3] << " " << (&(*v))[4] << " "
845  << (&(*v))[5] << "\n";
846  brw += bin_text_write_fixed(model_file, (char*)&(*v), 6 * sizeof(*v), msg, text);
847  }
848  }
849  else if (g == nullptr || (!all.weights.adaptive && !all.weights.normalized))
850  {
851  if (*v != 0.)
852  {
853  brw = write_index(model_file, msg, text, all.num_bits, i);
854  msg << ":" << *v << "\n";
855  brw += bin_text_write_fixed(model_file, (char*)&(*v), sizeof(*v), msg, text);
856  }
857  }
858  else if ((all.weights.adaptive && !all.weights.normalized) || (!all.weights.adaptive && all.weights.normalized))
859  {
860  // either adaptive or normalized
861  if (*v != 0. || (&(*v))[1] != 0.)
862  {
863  brw = write_index(model_file, msg, text, all.num_bits, i);
864  msg << ":" << *v << " " << (&(*v))[1] << "\n";
865  brw += bin_text_write_fixed(model_file, (char*)&(*v), 2 * sizeof(*v), msg, text);
866  }
867  }
868  else
869  {
870  // adaptive and normalized
871  if (*v != 0. || (&(*v))[1] != 0. || (&(*v))[2] != 0.)
872  {
873  brw = write_index(model_file, msg, text, all.num_bits, i);
874  msg << ":" << *v << " " << (&(*v))[1] << " " << (&(*v))[2] << "\n";
875  brw += bin_text_write_fixed(model_file, (char*)&(*v), 3 * sizeof(*v), msg, text);
876  }
877  }
878  }
879 }
size_t write_index(io_buf &model_file, std::stringstream &msg, bool text, uint32_t num_bits, uint64_t i)
Definition: gd.cc:688
parameters weights
Definition: global_data.h:537
uint32_t num_bits
Definition: global_data.h:398
size_t bin_text_write_fixed(io_buf &io, char *data, size_t len, std::stringstream &msg, bool text)
Definition: io_buf.h:313
size_t bin_read_fixed(char *data, size_t len, const char *read_message)
Definition: io_buf.h:230
float weight
#define THROW(args)
Definition: vw_exception.h:181

◆ save_load_online_state() [2/2]

void GD::save_load_online_state ( vw all,
io_buf model_file,
bool  read,
bool  text,
double &  total_weight,
gd g,
uint32_t  ftrl_size 
)

Definition at line 881 of file gd.cc.

References bin_text_read_write_fixed(), vw::current_pass, parameters::dense_weights, shared_data::dump_interval, shared_data::example_number, vw::initial_t, shared_data::max_label, shared_data::min_label, vw::model_file_ver, vw::normalized_sum_norm_x, shared_data::old_weighted_labeled_examples, vw::preserve_performance_counters, vw::sd, parameters::sparse, parameters::sparse_weights, shared_data::sum_loss, shared_data::sum_loss_since_last_dump, shared_data::t, shared_data::total_features, vw::training, VERSION_PASS_UINT64, VERSION_SAVE_RESUME_FIX, shared_data::weighted_labeled_examples, shared_data::weighted_labels, shared_data::weighted_unlabeled_examples, and vw::weights.

Referenced by save_load().

883 {
884  // vw& all = *g.all;
885  std::stringstream msg;
886 
887  msg << "initial_t " << all.initial_t << "\n";
888  bin_text_read_write_fixed(model_file, (char*)&all.initial_t, sizeof(all.initial_t), "", read, msg, text);
889 
890  msg << "norm normalizer " << all.normalized_sum_norm_x << "\n";
892  model_file, (char*)&all.normalized_sum_norm_x, sizeof(all.normalized_sum_norm_x), "", read, msg, text);
893 
894  msg << "t " << all.sd->t << "\n";
895  bin_text_read_write_fixed(model_file, (char*)&all.sd->t, sizeof(all.sd->t), "", read, msg, text);
896 
897  msg << "sum_loss " << all.sd->sum_loss << "\n";
898  bin_text_read_write_fixed(model_file, (char*)&all.sd->sum_loss, sizeof(all.sd->sum_loss), "", read, msg, text);
899 
900  msg << "sum_loss_since_last_dump " << all.sd->sum_loss_since_last_dump << "\n";
901  bin_text_read_write_fixed(model_file, (char*)&all.sd->sum_loss_since_last_dump,
902  sizeof(all.sd->sum_loss_since_last_dump), "", read, msg, text);
903 
904  float dump_interval = all.sd->dump_interval;
905  msg << "dump_interval " << dump_interval << "\n";
906  bin_text_read_write_fixed(model_file, (char*)&dump_interval, sizeof(dump_interval), "", read, msg, text);
907  if (!read || (all.training && all.preserve_performance_counters)) // update dump_interval from input model
908  all.sd->dump_interval = dump_interval;
909 
910  msg << "min_label " << all.sd->min_label << "\n";
911  bin_text_read_write_fixed(model_file, (char*)&all.sd->min_label, sizeof(all.sd->min_label), "", read, msg, text);
912 
913  msg << "max_label " << all.sd->max_label << "\n";
914  bin_text_read_write_fixed(model_file, (char*)&all.sd->max_label, sizeof(all.sd->max_label), "", read, msg, text);
915 
916  msg << "weighted_labeled_examples " << all.sd->weighted_labeled_examples << "\n";
917  bin_text_read_write_fixed(model_file, (char*)&all.sd->weighted_labeled_examples,
918  sizeof(all.sd->weighted_labeled_examples), "", read, msg, text);
919 
920  msg << "weighted_labels " << all.sd->weighted_labels << "\n";
922  model_file, (char*)&all.sd->weighted_labels, sizeof(all.sd->weighted_labels), "", read, msg, text);
923 
924  msg << "weighted_unlabeled_examples " << all.sd->weighted_unlabeled_examples << "\n";
926  sizeof(all.sd->weighted_unlabeled_examples), "", read, msg, text);
927 
928  msg << "example_number " << all.sd->example_number << "\n";
930  model_file, (char*)&all.sd->example_number, sizeof(all.sd->example_number), "", read, msg, text);
931 
932  msg << "total_features " << all.sd->total_features << "\n";
934  model_file, (char*)&all.sd->total_features, sizeof(all.sd->total_features), "", read, msg, text);
935 
936  if (!read || all.model_file_ver >= VERSION_SAVE_RESUME_FIX)
937  {
 938  // restore some data to allow --save_resume to work more accurately
939 
940  // fix average loss
941  msg << "total_weight " << total_weight << "\n";
942  bin_text_read_write_fixed(model_file, (char*)&total_weight, sizeof(total_weight), "", read, msg, text);
943 
944  // fix "loss since last" for first printed out example details
945  msg << "sd::oec.weighted_labeled_examples " << all.sd->old_weighted_labeled_examples << "\n";
947  sizeof(all.sd->old_weighted_labeled_examples), "", read, msg, text);
948 
949  // fix "number of examples per pass"
950  msg << "current_pass " << all.current_pass << "\n";
952  bin_text_read_write_fixed(model_file, (char*)&all.current_pass, sizeof(all.current_pass), "", read, msg, text);
 953  else // backwards compatibility.
954  {
955  size_t temp_pass = (size_t)all.current_pass;
956  bin_text_read_write_fixed(model_file, (char*)&temp_pass, sizeof(temp_pass), "", read, msg, text);
957  all.current_pass = temp_pass;
958  }
959  }
960 
961  if (read &&
962  (!all.training ||
963  !all.preserve_performance_counters)) // reset various things so that we report test set performance properly
964  {
965  all.sd->sum_loss = 0;
966  all.sd->sum_loss_since_last_dump = 0;
967  all.sd->weighted_labeled_examples = 0.;
968  all.sd->weighted_labels = 0.;
971  all.sd->example_number = 0;
972  all.sd->total_features = 0;
973  all.current_pass = 0;
974  }
975  if (all.weights.sparse)
976  save_load_online_state(all, model_file, read, text, g, msg, ftrl_size, all.weights.sparse_weights);
977  else
978  save_load_online_state(all, model_file, read, text, g, msg, ftrl_size, all.weights.dense_weights);
979 }
double sum_loss
Definition: global_data.h:145
parameters weights
Definition: global_data.h:537
float initial_t
Definition: global_data.h:530
double weighted_unlabeled_examples
Definition: global_data.h:143
void save_load_online_state(vw &all, io_buf &model_file, bool read, bool text, double &total_weight, gd *g, uint32_t ftrl_size)
Definition: gd.cc:881
bool training
Definition: global_data.h:488
double sum_loss_since_last_dump
Definition: global_data.h:146
shared_data * sd
Definition: global_data.h:375
VW::version_struct model_file_ver
Definition: global_data.h:419
double old_weighted_labeled_examples
Definition: global_data.h:142
double weighted_labels
Definition: global_data.h:144
dense_parameters dense_weights
uint64_t current_pass
Definition: global_data.h:396
uint64_t example_number
Definition: global_data.h:137
float min_label
Definition: global_data.h:150
sparse_parameters sparse_weights
bool preserve_performance_counters
Definition: global_data.h:416
float max_label
Definition: global_data.h:151
double weighted_labeled_examples
Definition: global_data.h:141
#define VERSION_SAVE_RESUME_FIX
Definition: gd.cc:33
#define VERSION_PASS_UINT64
Definition: gd.cc:34
float dump_interval
Definition: global_data.h:147
size_t bin_text_read_write_fixed(io_buf &io, char *data, size_t len, const char *read_message, bool read, std::stringstream &msg, bool text)
Definition: io_buf.h:326
uint64_t total_features
Definition: global_data.h:138
double normalized_sum_norm_x
Definition: global_data.h:420

◆ save_load_regressor() [1/2]

template<class T >
void GD::save_load_regressor ( vw all,
io_buf model_file,
bool  read,
bool  text,
T &  weights 
)

Definition at line 707 of file gd.cc.

References io_buf::bin_read_fixed(), bin_text_write_fixed(), vw::name_index_map, vw::num_bits, vw::print_invert, THROW, and write_index().

Referenced by SVRG::save_load(), and save_load().

708 {
709  size_t brw = 1;
710 
711  if (all.print_invert) // write readable model with feature names
712  {
713  std::stringstream msg;
714  typedef std::map<std::string, size_t> str_int_map;
715 
716  for (str_int_map::iterator it = all.name_index_map.begin(); it != all.name_index_map.end(); ++it)
717  {
718  weight* v = &weights.strided_index(it->second);
719  if (*v != 0.)
720  {
721  msg << it->first;
722  brw = bin_text_write_fixed(model_file, (char*)it->first.c_str(), sizeof(*it->first.c_str()), msg, true);
723 
724  msg << ":" << it->second << ":" << *v << "\n";
725  bin_text_write_fixed(model_file, (char*)&(*v), sizeof(*v), msg, true);
726  }
727  }
728  return;
729  }
730 
731  uint64_t i = 0;
732  uint32_t old_i = 0;
733  uint64_t length = (uint64_t)1 << all.num_bits;
734  if (read)
735  do
736  {
737  brw = 1;
738  if (all.num_bits < 31) // backwards compatible
739  {
740  brw = model_file.bin_read_fixed((char*)&old_i, sizeof(old_i), "");
741  i = old_i;
742  }
743  else
744  brw = model_file.bin_read_fixed((char*)&i, sizeof(i), "");
745  if (brw > 0)
746  {
747  if (i >= length)
748  THROW("Model content is corrupted, weight vector index " << i << " must be less than total vector length "
749  << length);
750  weight* v = &weights.strided_index(i);
751  brw += model_file.bin_read_fixed((char*)&(*v), sizeof(*v), "");
752  }
753  } while (brw > 0);
754  else // write
755  for (typename T::iterator v = weights.begin(); v != weights.end(); ++v)
756  if (*v != 0.)
757  {
758  i = v.index() >> weights.stride_shift();
759  std::stringstream msg;
760 
761  brw = write_index(model_file, msg, text, all.num_bits, i);
762  msg << ":" << *v << "\n";
763  brw += bin_text_write_fixed(model_file, (char*)&(*v), sizeof(*v), msg, text);
764  }
765 }
size_t write_index(io_buf &model_file, std::stringstream &msg, bool text, uint32_t num_bits, uint64_t i)
Definition: gd.cc:688
uint32_t num_bits
Definition: global_data.h:398
size_t bin_text_write_fixed(io_buf &io, char *data, size_t len, std::stringstream &msg, bool text)
Definition: io_buf.h:313
size_t bin_read_fixed(char *data, size_t len, const char *read_message)
Definition: io_buf.h:230
bool print_invert
Definition: global_data.h:542
float weight
std::map< std::string, size_t > name_index_map
Definition: global_data.h:548
#define THROW(args)
Definition: vw_exception.h:181

◆ save_load_regressor() [2/2]

void GD::save_load_regressor ( vw all,
io_buf model_file,
bool  read,
bool  text 
)

Definition at line 767 of file gd.cc.

References parameters::dense_weights, parameters::sparse, parameters::sparse_weights, and vw::weights.

Referenced by save_load().

768 {
769  if (all.weights.sparse)
770  save_load_regressor(all, model_file, read, text, all.weights.sparse_weights);
771  else
772  save_load_regressor(all, model_file, read, text, all.weights.dense_weights);
773 }
parameters weights
Definition: global_data.h:537
void save_load_regressor(vw &all, io_buf &model_file, bool read, bool text)
Definition: gd.cc:767
dense_parameters dense_weights
sparse_parameters sparse_weights

◆ sensitivity() [1/2]

template<bool sqrt_rate, bool feature_mask_off, bool adax, size_t adaptive, size_t normalized, size_t spare, bool stateless>
float GD::sensitivity ( gd g,
example ec 
)

Definition at line 579 of file gd.cc.

References example::total_sum_feat_sq.

580 {
581  if (adaptive || normalized)
582  return get_pred_per_update<sqrt_rate, feature_mask_off, adax, adaptive, normalized, spare, stateless>(g, ec);
583  else
584  return ec.total_sum_feat_sq;
585 }
float total_sum_feat_sq
Definition: example.h:71

◆ sensitivity() [2/2]

template<bool sqrt_rate, bool feature_mask_off, bool adax, size_t adaptive, size_t normalized, size_t spare>
float GD::sensitivity ( gd g,
base_learner ,
example ec 
)

Definition at line 601 of file gd.cc.

602 {
603  return get_scale<adaptive>(g, ec, 1.) *
604  sensitivity<sqrt_rate, feature_mask_off, adax, adaptive, normalized, spare, true>(g, ec);
605 }

◆ set_learn() [1/6]

template<bool sparse_l2, bool invariant, bool sqrt_rate, bool feature_mask_off, uint64_t adaptive, uint64_t normalized, uint64_t spare, uint64_t next>
uint64_t GD::set_learn ( vw all,
gd g 
)

Definition at line 1044 of file gd.cc.

References GD::gd::adax, GD::gd::learn, vw::normalized_idx, GD::gd::sensitivity, and GD::gd::update.

1045 {
1046  all.normalized_idx = normalized;
1047  if (g.adax)
1048  {
1049  g.learn = learn<sparse_l2, invariant, sqrt_rate, feature_mask_off, true, adaptive, normalized, spare>;
1050  g.update = update<sparse_l2, invariant, sqrt_rate, feature_mask_off, true, adaptive, normalized, spare>;
1051  g.sensitivity = sensitivity<sqrt_rate, feature_mask_off, true, adaptive, normalized, spare>;
1052  return next;
1053  }
1054  else
1055  {
1056  g.learn = learn<sparse_l2, invariant, sqrt_rate, feature_mask_off, false, adaptive, normalized, spare>;
1057  g.update = update<sparse_l2, invariant, sqrt_rate, feature_mask_off, false, adaptive, normalized, spare>;
1058  g.sensitivity = sensitivity<sqrt_rate, feature_mask_off, false, adaptive, normalized, spare>;
1059  return next;
1060  }
1061 }
size_t normalized_idx
Definition: global_data.h:506

◆ set_learn() [2/6]

template<bool sparse_l2, bool invariant, bool sqrt_rate, uint64_t adaptive, uint64_t normalized, uint64_t spare, uint64_t next>
uint64_t GD::set_learn ( vw all,
bool  feature_mask_off,
gd g 
)

Definition at line 1065 of file gd.cc.

References vw::normalized_idx.

1066 {
1067  all.normalized_idx = normalized;
1068  if (feature_mask_off)
1069  return set_learn<sparse_l2, invariant, sqrt_rate, true, adaptive, normalized, spare, next>(all, g);
1070  else
1071  return set_learn<sparse_l2, invariant, sqrt_rate, false, adaptive, normalized, spare, next>(all, g);
1072 }
size_t normalized_idx
Definition: global_data.h:506

◆ set_learn() [3/6]

template<bool invariant, bool sqrt_rate, uint64_t adaptive, uint64_t normalized, uint64_t spare, uint64_t next>
uint64_t GD::set_learn ( vw all,
bool  feature_mask_off,
gd g 
)

Definition at line 1075 of file gd.cc.

References GD::gd::sparse_l2.

1076 {
1077  if (g.sparse_l2 > 0.f)
1078  return set_learn<true, invariant, sqrt_rate, adaptive, normalized, spare, next>(all, feature_mask_off, g);
1079  else
1080  return set_learn<false, invariant, sqrt_rate, adaptive, normalized, spare, next>(all, feature_mask_off, g);
1081 }

◆ set_learn() [4/6]

template<bool sqrt_rate, uint64_t adaptive, uint64_t normalized, uint64_t spare, uint64_t next>
uint64_t GD::set_learn ( vw all,
bool  feature_mask_off,
gd g 
)

Definition at line 1084 of file gd.cc.

References vw::invariant_updates.

1085 {
1086  if (all.invariant_updates)
1087  return set_learn<true, sqrt_rate, adaptive, normalized, spare, next>(all, feature_mask_off, g);
1088  else
1089  return set_learn<false, sqrt_rate, adaptive, normalized, spare, next>(all, feature_mask_off, g);
1090 }
bool invariant_updates
Definition: global_data.h:490

◆ set_learn() [5/6]

template<bool sqrt_rate, uint64_t adaptive, uint64_t spare>
uint64_t GD::set_learn ( vw all,
bool  feature_mask_off,
gd g 
)

Definition at line 1093 of file gd.cc.

References parameters::normalized, and vw::weights.

1094 {
1095  // select the appropriate learn function based on adaptive, normalization, and feature mask
1096  if (all.weights.normalized)
1097  return set_learn<sqrt_rate, adaptive, adaptive + 1, adaptive + 2, adaptive + 3>(all, feature_mask_off, g);
1098  else
1099  return set_learn<sqrt_rate, adaptive, 0, spare, spare + 1>(all, feature_mask_off, g);
1100 }
parameters weights
Definition: global_data.h:537

◆ set_learn() [6/6]

template<bool sqrt_rate>
uint64_t GD::set_learn ( vw &  all,
bool  feature_mask_off,
gd &  g 
)

Definition at line 1103 of file gd.cc.

References parameters::adaptive, and vw::weights.

1104 {
1105  if (all.weights.adaptive)
1106  return set_learn<sqrt_rate, 1, 2>(all, feature_mask_off, g);
1107  else
1108  return set_learn<sqrt_rate, 0, 0>(all, feature_mask_off, g);
1109 }
parameters weights
Definition: global_data.h:537

◆ setup()

LEARNER::base_learner * GD::setup ( options_i &  options,
vw &  all 
)

Definition at line 1119 of file gd.cc.

References parameters::adaptive, VW::config::option_group_definition::add(), add(), VW::config::options_i::add_and_parse(), vw::audit, ceil_log_2(), shared_data::contraction, LEARNER::end_pass(), vw::eta, vw::eta_decay_rate, f, VW::config::options_i::get_typed_option(), shared_data::gravity, vw::hash_inv, shared_data::holdout_best_loss, vw::holdout_set_off, LEARNER::init_learner(), vw::initial_constant, vw::initial_t, vw::invariant_updates, LEARNER::make_base(), VW::config::make_option(), GD::gd::multipredict, parameters::normalized, vw::normalized_sum_norm_x, vw::numpasses, vw::power_t, ldamath::powf(), GD::gd::predict, vw::reg_mode, save_load(), vw::save_resume, vw::sd, GD::gd::sensitivity, LEARNER::learner< T, E >::set_end_pass(), LEARNER::learner< T, E >::set_multipredict(), LEARNER::learner< T, E >::set_save_load(), LEARNER::learner< T, E >::set_sensitivity(), LEARNER::learner< T, E >::set_update(), shared_data::t, THROW, vw::trace_message, vw::training, GD::gd::update, VW::config::options_i::was_supplied(), and vw::weights.

Referenced by parse_reductions().

1120 {
1121  auto g = scoped_calloc_or_throw<gd>();
1122 
1123  bool sgd = false;
1124  bool adaptive = false;
1125  bool adax = false;
1126  bool invariant = false;
1127  bool normalized = false;
1128 
1129  option_group_definition new_options("Gradient Descent options");
1130  new_options.add(make_option("sgd", sgd).help("use regular stochastic gradient descent update.").keep(all.save_resume))
1131  .add(make_option("adaptive", adaptive).help("use adaptive, individual learning rates.").keep(all.save_resume))
1132  .add(make_option("adax", adax).help("use adaptive learning rates with x^2 instead of g^2x^2"))
1133  .add(make_option("invariant", invariant).help("use safe/importance aware updates.").keep(all.save_resume))
1134  .add(make_option("normalized", normalized).help("use per feature normalized updates").keep(all.save_resume))
1135  .add(make_option("sparse_l2", g->sparse_l2).default_value(0.f).help("use per feature normalized updates"))
1136  .add(make_option("l1_state", all.sd->gravity)
1137  .keep(all.save_resume)
1138  .default_value(0.)
1139  .help("use per feature normalized updates"))
1140  .add(make_option("l2_state", all.sd->contraction)
1141  .keep(all.save_resume)
1142  .default_value(1.)
1143  .help("use per feature normalized updates"));
1144  options.add_and_parse(new_options);
1145 
1146  g->all = &all;
1147  g->all->normalized_sum_norm_x = 0;
1148  g->no_win_counter = 0;
1149  g->total_weight = 0.;
1150  all.weights.adaptive = true;
1151  all.weights.normalized = true;
1152  g->neg_norm_power = (all.weights.adaptive ? (all.power_t - 1.f) : -1.f);
1153  g->neg_power_t = -all.power_t;
1154 
1155  if (all.initial_t > 0) // for the normalized update: if initial_t is bigger than 1 we interpret this as if we had
1156  // seen (all.initial_t) previous fake datapoints all with norm 1
1157  {
1158  g->all->normalized_sum_norm_x = all.initial_t;
1159  g->total_weight = all.initial_t;
1160  }
1161 
1162  bool feature_mask_off = true;
1163  if (options.was_supplied("feature_mask"))
1164  feature_mask_off = false;
1165 
1166  if (!all.holdout_set_off)
1167  {
1168  all.sd->holdout_best_loss = FLT_MAX;
1169  g->early_stop_thres = options.get_typed_option<size_t>("early_terminate").value();
1170  }
1171 
1172  g->initial_constant = all.initial_constant;
1173 
1174  if (sgd || adaptive || invariant || normalized)
1175  {
1176  // nondefault
1177  all.weights.adaptive = adaptive;
1178  all.invariant_updates = all.training && invariant;
1179  all.weights.normalized = normalized;
1180 
1181  if (!options.was_supplied("learning_rate") && !options.was_supplied("l") &&
1182  !(all.weights.adaptive && all.weights.normalized))
1183  all.eta = 10; // default learning rate to 10 for non default update rule
1184 
1185  // if not using normalized or adaptive, default initial_t to 1 instead of 0
1186  if (!all.weights.adaptive && !all.weights.normalized)
1187  {
1188  if (!options.was_supplied("initial_t"))
1189  {
1190  all.sd->t = 1.f;
1191  all.initial_t = 1.f;
1192  }
1193  all.eta *= powf((float)(all.sd->t), all.power_t);
1194  }
1195  }
1196  else
1197  {
1198  all.invariant_updates = all.training;
1199  }
1200  g->adaptive_input = all.weights.adaptive;
1201  g->normalized_input = all.weights.normalized;
1202 
1203  all.weights.adaptive = all.weights.adaptive && all.training;
1204  all.weights.normalized = all.weights.normalized && all.training;
1205 
1206  if (adax)
1207  g->adax = all.training && adax;
1208 
1209  if (g->adax && !all.weights.adaptive)
1210  THROW("Cannot use adax without adaptive");
1211 
1212  if (pow((double)all.eta_decay_rate, (double)all.numpasses) < 0.0001)
1213  all.trace_message << "Warning: the learning rate for the last pass is multiplied by: "
1214  << pow((double)all.eta_decay_rate, (double)all.numpasses)
1215  << " adjust --decay_learning_rate larger to avoid this." << std::endl;
1216 
1217  if (all.reg_mode % 2)
1218  if (all.audit || all.hash_inv)
1219  {
1220  g->predict = predict<true, true>;
1221  g->multipredict = multipredict<true, true>;
1222  }
1223  else
1224  {
1225  g->predict = predict<true, false>;
1226  g->multipredict = multipredict<true, false>;
1227  }
1228  else if (all.audit || all.hash_inv)
1229  {
1230  g->predict = predict<false, true>;
1231  g->multipredict = multipredict<false, true>;
1232  }
1233  else
1234  {
1235  g->predict = predict<false, false>;
1236  g->multipredict = multipredict<false, false>;
1237  }
1238 
1239  uint64_t stride;
1240  if (all.power_t == 0.5)
1241  stride = set_learn<true>(all, feature_mask_off, *g.get());
1242  else
1243  stride = set_learn<false>(all, feature_mask_off, *g.get());
1244 
1245  all.weights.stride_shift((uint32_t)ceil_log_2(stride - 1));
1246 
1247  gd* bare = g.get();
1248  learner<gd, example>& ret = init_learner(g, g->learn, bare->predict, ((uint64_t)1 << all.weights.stride_shift()));
1249  ret.set_sensitivity(bare->sensitivity);
1250  ret.set_multipredict(bare->multipredict);
1251  ret.set_update(bare->update);
1252  ret.set_save_load(save_load);
1253  ret.set_end_pass(end_pass);
1254  return make_base(ret);
1255 }
void set_multipredict(void(*u)(T &, L &, E &, size_t, size_t, polyprediction *, bool))
Definition: learner.h:217
void set_update(void(*u)(T &data, L &base, E &))
Definition: learner.h:231
parameters weights
Definition: global_data.h:537
float initial_t
Definition: global_data.h:530
bool hash_inv
Definition: global_data.h:541
float power_t
Definition: global_data.h:447
double holdout_best_loss
Definition: global_data.h:161
base_learner * make_base(learner< T, E > &base)
Definition: learner.h:462
double contraction
Definition: global_data.h:149
virtual void add_and_parse(const option_group_definition &group)=0
void set_save_load(void(*sl)(T &, io_buf &, bool, bool))
Definition: learner.h:257
bool holdout_set_off
Definition: global_data.h:499
bool training
Definition: global_data.h:488
learner< T, E > & init_learner(free_ptr< T > &dat, L *base, void(*learn)(T &, L &, E &), void(*predict)(T &, L &, E &), size_t ws, prediction_type::prediction_type_t pred_type)
Definition: learner.h:369
float initial_constant
Definition: global_data.h:410
shared_data * sd
Definition: global_data.h:375
typed_option< T > & get_typed_option(const std::string &key)
Definition: options.h:120
T powf(T, T)
Definition: lda_core.cc:428
vw_ostream trace_message
Definition: global_data.h:424
virtual bool was_supplied(const std::string &key)=0
size_t numpasses
Definition: global_data.h:451
float eta
Definition: global_data.h:531
int add(svm_params &params, svm_example *fec)
Definition: kernel_svm.cc:546
typed_option< T > make_option(std::string name, T &location)
Definition: options.h:80
void set_sensitivity(float(*u)(T &data, base_learner &base, example &))
Definition: learner.h:237
void set_end_pass(void(*f)(T &))
Definition: learner.h:286
double gravity
Definition: global_data.h:148
bool save_resume
Definition: global_data.h:415
void save_load(gd &g, io_buf &model_file, bool read, bool text)
Definition: gd.cc:992
bool audit
Definition: global_data.h:486
bool invariant_updates
Definition: global_data.h:490
#define THROW(args)
Definition: vw_exception.h:181
uint64_t ceil_log_2(uint64_t v)
Definition: gd.cc:1111
void end_pass(gd &g)
Definition: gd.cc:148
float f
Definition: cache.cc:40
int reg_mode
Definition: global_data.h:448
double normalized_sum_norm_x
Definition: global_data.h:420
float eta_decay_rate
Definition: global_data.h:532

◆ sign()

float GD::sign ( float  w)
inline

Definition at line 106 of file gd.h.

Referenced by trunc_weight().

107 {
108  if (w < 0.)
109  return -1.;
110  else
111  return 1.;
112 }

◆ sync_weights()

void GD::sync_weights ( vw &  all)

Definition at line 671 of file gd.cc.

References shared_data::contraction, parameters::dense_weights, shared_data::gravity, vw::sd, parameters::sparse, parameters::sparse_weights, trunc_weight(), and vw::weights.

Referenced by end_pass(), save_load(), and update().

672 {
673  // todo, fix length dependence
674  if (all.sd->gravity == 0. && all.sd->contraction == 1.) // to avoid unnecessary weight synchronization
675  return;
676 
677  if (all.weights.sparse)
678  for (weight& w : all.weights.sparse_weights)
679  w = trunc_weight(w, (float)all.sd->gravity) * (float)all.sd->contraction;
680  else
681  for (weight& w : all.weights.dense_weights)
682  w = trunc_weight(w, (float)all.sd->gravity) * (float)all.sd->contraction;
683 
684  all.sd->gravity = 0.;
685  all.sd->contraction = 1.;
686 }
parameters weights
Definition: global_data.h:537
double contraction
Definition: global_data.h:149
float trunc_weight(const float w, const float gravity)
Definition: gd.h:114
shared_data * sd
Definition: global_data.h:375
dense_parameters dense_weights
float weight
sparse_parameters sparse_weights
double gravity
Definition: global_data.h:148

◆ train()

template<bool sqrt_rate, bool feature_mask_off, size_t adaptive, size_t normalized, size_t spare>
void GD::train ( gd &  g,
example &  ec,
float  update 
)

Definition at line 141 of file gd.cc.

References GD::gd::all, update(), and GD::gd::update_multiplier.

142 {
143  if (normalized)
144  update *= g.update_multiplier;
145  foreach_feature<float, update_feature<sqrt_rate, feature_mask_off, adaptive, normalized, spare> >(*g.all, ec, update);
146 }
void update(gd &g, base_learner &, example &ec)
Definition: gd.cc:647

◆ trunc_predict()

float GD::trunc_predict ( vw &  all,
example &  ec,
double  gravity 
)
inline

Definition at line 365 of file gd.cc.

References label_data::initial, example::l, GD::trunc_data::prediction, and polylabel::simple.

Referenced by predict().

366 {
367  trunc_data temp = {ec.l.simple.initial, (float)gravity};
368  foreach_feature<trunc_data, vec_add_trunc>(all, ec, temp);
369  return temp.prediction;
370 }
label_data simple
Definition: example.h:28
float initial
Definition: simple_label.h:16
polylabel l
Definition: example.h:57

◆ trunc_weight()

float GD::trunc_weight ( const float  w,
const float  gravity 
)
inline

Definition at line 114 of file gd.h.

References GD::multipredict_info< T >::gravity, and sign().

Referenced by audit_feature(), sync_weights(), vec_add_trunc(), and vec_add_trunc_multipredict().

115 {
116  return (gravity < fabsf(w)) ? w - sign(w) * gravity : 0.f;
117 }
float sign(float w)
Definition: gd.h:106

◆ update()

template<bool sparse_l2, bool invariant, bool sqrt_rate, bool feature_mask_off, bool adax, size_t adaptive, size_t normalized, size_t spare>
void GD::update ( gd &  g,
base_learner & ,
example &  ec 
)

Definition at line 647 of file gd.cc.

References sync_weights().

Referenced by compute_update(), mf_train(), and train().

648 {
649  // invariant: not a test label, importance weight > 0
650  float update;
651  if ((update = compute_update<sparse_l2, invariant, sqrt_rate, feature_mask_off, adax, adaptive, normalized, spare>(
652  g, ec)) != 0.)
653  train<sqrt_rate, feature_mask_off, adaptive, normalized, spare>(g, ec, update);
654 
655  if (g.all->sd->contraction < 1e-9 || g.all->sd->gravity > 1e3) // updating weights now to avoid numerical instability
656  sync_weights(*g.all);
657 }
void sync_weights(vw &all)
Definition: gd.cc:671
void update(gd &g, base_learner &, example &ec)
Definition: gd.cc:647

◆ update_feature()

template<bool sqrt_rate, bool feature_mask_off, size_t adaptive, size_t normalized, size_t spare>
void GD::update_feature ( float &  update,
float  x,
float &  fw 
)
inline

Definition at line 109 of file gd.cc.

110 {
111  weight* w = &fw;
112  if (feature_mask_off || fw != 0.)
113  {
114  if (spare != 0)
115  x *= w[spare];
116  w[0] += update * x;
117  }
118 }
float weight
void update(gd &g, base_learner &, example &ec)
Definition: gd.cc:647

◆ vec_add()

void GD::vec_add ( float &  p,
const float  fx,
const float &  fw 
)
inline

Definition at line 75 of file gd_predict.h.

75 { p += fw * fx; }

◆ vec_add_multipredict()

template<class T >
void GD::vec_add_multipredict ( multipredict_info< T > &  mp,
const float  fx,
uint64_t  fi 
)
inline

Definition at line 40 of file gd.h.

References c, GD::multipredict_info< T >::count, GD::multipredict_info< T >::pred, polyprediction::scalar, GD::multipredict_info< T >::step, and GD::multipredict_info< T >::weights.

Referenced by multipredict(), and multipredict().

41 {
42  if ((-1e-10 < fx) && (fx < 1e-10))
43  return;
44  uint64_t mask = mp.weights.mask();
45  polyprediction* p = mp.pred;
46  fi &= mask;
47  uint64_t top = fi + (uint64_t)((mp.count - 1) * mp.step);
48  uint64_t i = 0;
49  if (top <= mask)
50  {
51  i += fi;
52  for (; i <= top; i += mp.step, ++p)
53  p->scalar +=
54  fx * mp.weights[i]; // TODO: figure out how to use weight_parameters::iterator (not using change_begin())
55  }
56  else // TODO: this could be faster by unrolling into two loops
57  for (size_t c = 0; c < mp.count; ++c, fi += (uint64_t)mp.step, ++p)
58  {
59  fi &= mask;
60  p->scalar += fx * mp.weights[fi];
61  }
62 }
float scalar
Definition: example.h:45
constexpr uint64_t c
Definition: rand48.cc:12

◆ vec_add_print()

void GD::vec_add_print ( float &  p,
const float  fx,
float &  fw 
)
inline

Definition at line 372 of file gd.cc.

373 {
374  p += fw * fx;
375  std::cerr << " + " << fw << "*" << fx;
376 }

◆ vec_add_trunc()

void GD::vec_add_trunc ( trunc_data &  p,
const float  fx,
float &  fw 
)
inline

Definition at line 360 of file gd.cc.

References GD::trunc_data::gravity, GD::trunc_data::prediction, and trunc_weight().

361 {
362  p.prediction += trunc_weight(fw, p.gravity) * fx;
363 }
float trunc_weight(const float w, const float gravity)
Definition: gd.h:114

◆ vec_add_trunc_multipredict()

template<class T >
void GD::vec_add_trunc_multipredict ( multipredict_info< T > &  mp,
const float  fx,
uint64_t  fi 
)
inline

Definition at line 394 of file gd.cc.

References c, GD::multipredict_info< T >::count, GD::multipredict_info< T >::gravity, GD::multipredict_info< T >::pred, polyprediction::scalar, GD::multipredict_info< T >::step, trunc_weight(), and GD::multipredict_info< T >::weights.

Referenced by multipredict().

395 {
396  size_t index = fi;
397  for (size_t c = 0; c < mp.count; c++, index += mp.step)
398  mp.pred[c].scalar += fx * trunc_weight(mp.weights[index], mp.gravity);
399 }
float trunc_weight(const float w, const float gravity)
Definition: gd.h:114
constexpr uint64_t c
Definition: rand48.cc:12

◆ write_index()

size_t GD::write_index ( io_buf &  model_file,
std::stringstream &  msg,
bool  text,
uint32_t  num_bits,
uint64_t  i 
)

Definition at line 688 of file gd.cc.

References bin_text_write_fixed().

Referenced by save_load_online_state(), and save_load_regressor().

689 {
690  size_t brw;
691  uint32_t old_i = 0;
692 
693  msg << i;
694 
695  if (num_bits < 31)
696  {
697  old_i = (uint32_t)i;
698  brw = bin_text_write_fixed(model_file, (char*)&old_i, sizeof(old_i), msg, text);
699  }
700  else
701  brw = bin_text_write_fixed(model_file, (char*)&i, sizeof(i), msg, text);
702 
703  return brw;
704 }
size_t bin_text_write_fixed(io_buf &io, char *data, size_t len, std::stringstream &msg, bool text)
Definition: io_buf.h:313

Variable Documentation

◆ global_print_features

bool GD::global_print_features = false

Definition at line 538 of file gd.cc.

◆ x2_max

constexpr float GD::x2_max = FLT_MAX

Definition at line 485 of file gd.cc.

◆ x2_min

constexpr float GD::x2_min = x_min * x_min

Definition at line 484 of file gd.cc.

Referenced by pred_per_update_feature().

◆ x_min

constexpr float GD::x_min = 1.084202e-19f

Definition at line 483 of file gd.cc.