Vowpal Wabbit
gd.cc
/*
Copyright (c) by respective owners including Yahoo!, Microsoft, and
individual contributors. All rights reserved. Released under a BSD (revised)
license as described in the file LICENSE.
 */
#include "crossplat_compat.h"

#include <float.h>
#ifdef _WIN32
#define NOMINMAX
#include <WinSock2.h>
#else
#include <netdb.h>
#endif

#if !defined(VW_NO_INLINE_SIMD)
#if !defined(__SSE2__) && (defined(_M_AMD64) || defined(_M_X64))
#define __SSE2__
#endif

#if defined(__ARM_NEON__)
#include <arm_neon.h>
#elif defined(__SSE2__)
#include <xmmintrin.h>
#endif
#endif

#include "gd.h"
#include "accumulate.h"
#include "reductions.h"
#include "vw.h"

#define VERSION_SAVE_RESUME_FIX "7.10.1"
#define VERSION_PASS_UINT64 "8.3.3"

using namespace LEARNER;
using namespace VW::config;
// todo:
// 4. Factor various state out of vw&
namespace GD
{
struct gd
{
  //  double normalized_sum_norm_x;
  double total_weight;
  size_t no_win_counter;
  size_t early_stop_thres;
  float initial_constant;
  float neg_norm_power;
  float neg_power_t;
  float sparse_l2;
  float update_multiplier;
  void (*predict)(gd&, base_learner&, example&);
  void (*learn)(gd&, base_learner&, example&);
  void (*update)(gd&, base_learner&, example&);
  float (*sensitivity)(gd&, base_learner&, example&);
  void (*multipredict)(gd&, base_learner&, example&, size_t, size_t, polyprediction*, bool);
  bool adaptive_input;
  bool normalized_input;
  bool adax;

  vw* all;  // parallel, features, parameters
};

void sync_weights(vw& all);

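// Editorial note on the bit trick below: reinterpreting the float's bits as an int,
// (i >> 1) approximately halves the exponent, and subtracting from the magic constant
// both negates it and corrects the mantissa, giving a rough first guess at x^(-1/2).
// The last line is one Newton-Raphson step for f(y) = 1/y^2 - x:
//   y' = y * (1.5f - 0.5f * x * y * y)
// which brings the result to roughly 0.2% relative error after a single iteration.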
inline float quake_InvSqrt(float x)
{
  // Carmack/Quake/SGI fast method:
  float xhalf = 0.5f * x;
  static_assert(sizeof(int) == sizeof(float), "Floats and ints are converted between, they must be the same size.");
  int i = reinterpret_cast<int&>(x);  // store floating-point bits in integer
  i = 0x5f3759d5 - (i >> 1);          // initial guess for Newton's method
  x = reinterpret_cast<float&>(i);    // convert new bits into float
  x = x * (1.5f - xhalf * x * x);     // One round of Newton's method
  return x;
}

static inline float InvSqrt(float x)
{
#if !defined(VW_NO_INLINE_SIMD)
#if defined(__ARM_NEON__)
  // Propagate into vector
  float32x2_t v1 = vdup_n_f32(x);
  // Estimate
  float32x2_t e1 = vrsqrte_f32(v1);
  // N-R iteration 1
  float32x2_t e2 = vmul_f32(e1, vrsqrts_f32(v1, vmul_f32(e1, e1)));
  // N-R iteration 2
  float32x2_t e3 = vmul_f32(e2, vrsqrts_f32(v1, vmul_f32(e2, e2)));
  // Extract result
  return vget_lane_f32(e3, 0);
#elif defined(__SSE2__)
  __m128 eta = _mm_load_ss(&x);
  eta = _mm_rsqrt_ss(eta);
  _mm_store_ss(&x, eta);
#else
  x = quake_InvSqrt(x);
#endif
#else
  x = quake_InvSqrt(x);
#endif

  return x;
}

template <bool sqrt_rate, bool feature_mask_off, size_t adaptive, size_t normalized, size_t spare>
inline void update_feature(float& update, float x, float& fw)
{
  weight* w = &fw;
  if (feature_mask_off || fw != 0.)
  {
    if (spare != 0)
      x *= w[spare];
    w[0] += update * x;
  }
}
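
// Note: for adaptive/normalized configurations, w[spare] caches the per-feature rate
// decay computed in pred_per_update_feature, so the step above is effectively
//   w_i += update * x_i * rate_decay_i
// while plain SGD (spare == 0) applies the raw update * x_i.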

// This adjusts the update so it behaves sensibly whether an example has few nonzero
// features or all features nonzero.
template <bool sqrt_rate, size_t adaptive, size_t normalized>
float average_update(float total_weight, float normalized_sum_norm_x, float neg_norm_power)
{
  if (normalized)
  {
    if (sqrt_rate)
    {
      float avg_norm = (float)(total_weight / normalized_sum_norm_x);
      if (adaptive)
        return std::sqrt(avg_norm);
      else
        return avg_norm;
    }
    else
      return powf((float)(normalized_sum_norm_x / total_weight), neg_norm_power);
  }
  return 1.f;
}

template <bool sqrt_rate, bool feature_mask_off, size_t adaptive, size_t normalized, size_t spare>
void train(gd& g, example& ec, float update)
{
  if (normalized)
    update *= g.update_multiplier;
  foreach_feature<float, update_feature<sqrt_rate, feature_mask_off, adaptive, normalized, spare> >(*g.all, ec, update);
}

void end_pass(gd& g)
{
  vw& all = *g.all;
  if (all.save_resume)
  {
    // TODO work out a better system to update state that will be saved in the model.
    if (all.sd->gravity != 0.)
    {
      g.all->options->replace("l1_state", std::to_string(all.sd->gravity));
      g.all->options->get_typed_option<double>("l1_state").value(all.sd->gravity);
    }
    if (all.sd->contraction != 1.)
    {
      g.all->options->replace("l2_state", std::to_string(all.sd->contraction));
      g.all->options->get_typed_option<double>("l2_state").value(all.sd->contraction);
    }
  }
  else
    sync_weights(all);
  if (all.all_reduce != nullptr)
  {
    if (all.weights.adaptive)
      accumulate_weighted_avg(all, all.weights);
    else
      accumulate_avg(all, all.weights, 0);
  }
  all.eta *= all.eta_decay_rate;
  if (all.save_per_pass)
    save_predictor(all, all.final_regressor_name, all.current_pass);

  if (!all.holdout_set_off)
  {
    if (summarize_holdout_set(all, g.no_win_counter))
      finalize_regressor(all, all.final_regressor_name);
    if ((g.early_stop_thres == g.no_win_counter) &&
        ((all.check_holdout_every_n_passes <= 1) || ((all.current_pass % all.check_holdout_every_n_passes) == 0)))
      set_done(all);
  }
}

#include <algorithm>

struct string_value
{
  float v;
  std::string s;
  friend bool operator<(const string_value& first, const string_value& second);
};

bool operator<(const string_value& first, const string_value& second) { return fabsf(first.v) > fabsf(second.v); }

struct audit_results
{
  vw& all;
  const uint64_t offset;
  std::vector<std::string> ns_pre;
  std::vector<string_value> results;
  audit_results(vw& p_all, const size_t p_offset) : all(p_all), offset(p_offset) {}
};

inline void audit_interaction(audit_results& dat, const audit_strings* f)
{
  if (f == nullptr)
  {
    if (!dat.ns_pre.empty())
    {
      dat.ns_pre.pop_back();
    }

    return;
  }

  std::string ns_pre;
  if (!dat.ns_pre.empty())
    ns_pre += '*';

  if (f->first != "" && ((f->first) != " "))
  {
    ns_pre.append(f->first);
    ns_pre += '^';
  }

  if (f->second != "")
  {
    ns_pre.append(f->second);
  }

  if (!ns_pre.empty())
  {
    dat.ns_pre.push_back(ns_pre);
  }
}

inline void audit_feature(audit_results& dat, const float ft_weight, const uint64_t ft_idx)
{
  parameters& weights = dat.all.weights;
  uint64_t index = ft_idx & weights.mask();
  size_t stride_shift = weights.stride_shift();

  std::string ns_pre;
  for (std::string& s : dat.ns_pre) ns_pre += s;

  if (dat.all.audit)
  {
    std::ostringstream tempstream;
    tempstream << ':' << (index >> stride_shift) << ':' << ft_weight << ':'
               << trunc_weight(weights[index], (float)dat.all.sd->gravity) * (float)dat.all.sd->contraction;

    if (weights.adaptive)  // adaptive
      tempstream << '@' << (&weights[index])[1];

    string_value sv = {weights[index] * ft_weight, ns_pre + tempstream.str()};
    dat.results.push_back(sv);
  }

  if ((dat.all.current_pass == 0 || dat.all.training == false) && dat.all.hash_inv)
  {
    // for invert_hash

    if (dat.offset != 0)
    {
      // otherwise --oaa outputs no features for class > 0.
      std::ostringstream tempstream;
      tempstream << '[' << (dat.offset >> stride_shift) << ']';
      ns_pre += tempstream.str();
    }

    if (!dat.all.name_index_map.count(ns_pre))
      dat.all.name_index_map.insert(std::map<std::string, size_t>::value_type(ns_pre, index >> stride_shift));
  }
}

void print_lda_features(vw& all, example& ec)
{
  parameters& weights = all.weights;
  uint32_t stride_shift = weights.stride_shift();
  size_t count = 0;
  for (features& fs : ec) count += fs.size();
  for (features& fs : ec)
  {
    for (features::iterator_all& f : fs.values_indices_audit())
    {
      std::cout << '\t' << f.audit().get()->first << '^' << f.audit().get()->second << ':'
                << ((f.index() >> stride_shift) & all.parse_mask) << ':' << f.value();
      for (size_t k = 0; k < all.lda; k++) std::cout << ':' << (&weights[f.index()])[k];
    }
  }
  std::cout << " total of " << count << " features." << std::endl;
}

void print_features(vw& all, example& ec)
{
  if (all.lda > 0)
    print_lda_features(all, ec);
  else
  {
    audit_results dat(all, ec.ft_offset);

    for (features& fs : ec)
    {
      if (fs.space_names.size() > 0)
        for (features::iterator_all& f : fs.values_indices_audit())
        {
          audit_interaction(dat, f.audit().get());
          audit_feature(dat, f.value(), f.index() + ec.ft_offset);
          audit_interaction(dat, NULL);
        }
      else
        for (features::iterator& f : fs) audit_feature(dat, f.value(), f.index() + ec.ft_offset);
    }

    INTERACTIONS::generate_interactions<audit_results, const uint64_t, audit_feature, true, audit_interaction>(
        all, ec, dat);

    stable_sort(dat.results.begin(), dat.results.end());
    if (all.audit)
    {
      for (string_value& sv : dat.results) std::cout << '\t' << sv.s;
      std::cout << std::endl;
    }
  }
}

void print_audit_features(vw& all, example& ec)
{
  if (all.audit)
    print_result(all.stdout_fileno, ec.pred.scalar, -1, ec.tag);
  fflush(stdout);
  print_features(all, ec);
}

float finalize_prediction(shared_data* sd, float ret)
{
  if (std::isnan(ret))
  {
    ret = 0.;
    std::cerr << "NAN prediction in example " << sd->example_number + 1 << ", forcing " << ret << std::endl;
    return ret;
  }
  if (ret > sd->max_label)
    return (float)sd->max_label;
  if (ret < sd->min_label)
    return (float)sd->min_label;
  return ret;
}

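// L1 is applied lazily ("truncated gradient"): the accumulated shrinkage lives in
// sd->gravity and is applied at prediction time by trunc_weight, which moves each
// weight toward zero by gravity and clips it to zero if it would cross.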
struct trunc_data
{
  float prediction;
  float gravity;
};

inline void vec_add_trunc(trunc_data& p, const float fx, float& fw)
{
  p.prediction += trunc_weight(fw, p.gravity) * fx;
}

inline float trunc_predict(vw& all, example& ec, double gravity)
{
  trunc_data temp = {ec.l.simple.initial, (float)gravity};
  foreach_feature<trunc_data, vec_add_trunc>(all, ec, temp);
  return temp.prediction;
}

inline void vec_add_print(float& p, const float fx, float& fw)
{
  p += fw * fx;
  std::cerr << " + " << fw << "*" << fx;
}

template <bool l1, bool audit>
void predict(gd& g, base_learner&, example& ec)
{
  vw& all = *g.all;
  if (l1)
    ec.partial_prediction = trunc_predict(all, ec, all.sd->gravity);
  else
    ec.partial_prediction = inline_predict(all, ec);

  ec.partial_prediction *= (float)all.sd->contraction;
  ec.pred.scalar = finalize_prediction(all.sd, ec.partial_prediction);
  if (audit)
    print_audit_features(all, ec);
}

template <class T>
inline void vec_add_trunc_multipredict(multipredict_info<T>& mp, const float fx, uint64_t fi)
{
  size_t index = fi;
  for (size_t c = 0; c < mp.count; c++, index += mp.step)
    mp.pred[c].scalar += fx * trunc_weight(mp.weights[index], mp.gravity);
}

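// multipredict scores one example at `count` consecutive feature offsets (`step` apart,
// e.g. one slot per class in --oaa) in a single traversal of the features, which is
// cheaper than calling predict() count times.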
template <bool l1, bool audit>
void multipredict(
    gd& g, base_learner&, example& ec, size_t count, size_t step, polyprediction* pred, bool finalize_predictions)
{
  vw& all = *g.all;
  for (size_t c = 0; c < count; c++) pred[c].scalar = ec.l.simple.initial;
  if (g.all->weights.sparse)
  {
    multipredict_info<sparse_parameters> mp = {
        count, step, pred, g.all->weights.sparse_weights, (float)all.sd->gravity};
    if (l1)
      foreach_feature<multipredict_info<sparse_parameters>, uint64_t, vec_add_trunc_multipredict>(all, ec, mp);
    else
      foreach_feature<multipredict_info<sparse_parameters>, uint64_t, vec_add_multipredict>(all, ec, mp);
  }
  else
  {
    multipredict_info<dense_parameters> mp = {count, step, pred, g.all->weights.dense_weights, (float)all.sd->gravity};
    if (l1)
      foreach_feature<multipredict_info<dense_parameters>, uint64_t, vec_add_trunc_multipredict>(all, ec, mp);
    else
      foreach_feature<multipredict_info<dense_parameters>, uint64_t, vec_add_multipredict>(all, ec, mp);
  }
  if (all.sd->contraction != 1.)
    for (size_t c = 0; c < count; c++) pred[c].scalar *= (float)all.sd->contraction;
  if (finalize_predictions)
    for (size_t c = 0; c < count; c++) pred[c].scalar = finalize_prediction(all.sd, pred[c].scalar);
  if (audit)
  {
    for (size_t c = 0; c < count; c++)
    {
      ec.pred.scalar = pred[c].scalar;
      print_audit_features(all, ec);
      ec.ft_offset += (uint64_t)step;
    }
    ec.ft_offset -= (uint64_t)(step * count);
  }
}

struct power_data
{
  float minus_power_t;
  float neg_norm_power;
};

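// Per-feature learning-rate decay.  w[adaptive] holds the running sum of squared
// gradients and w[normalized] the largest |x| seen for this feature; the returned
// factor is roughly w[adaptive]^(-power_t) times the matching power of the normalizer,
// with sqrt_rate selecting the cheap power_t == 0.5 path (InvSqrt).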
template <bool sqrt_rate, size_t adaptive, size_t normalized>
inline float compute_rate_decay(power_data& s, float& fw)
{
  weight* w = &fw;
  float rate_decay = 1.f;
  if (adaptive)
  {
    if (sqrt_rate)
      rate_decay = InvSqrt(w[adaptive]);
    else
      rate_decay = powf(w[adaptive], s.minus_power_t);
  }
  if (normalized)
  {
    if (sqrt_rate)
    {
      float inv_norm = 1.f / w[normalized];
      if (adaptive)
        rate_decay *= inv_norm;
      else
        rate_decay *= inv_norm * inv_norm;
    }
    else
      rate_decay *= powf(w[normalized] * w[normalized], s.neg_norm_power);
  }
  return rate_decay;
}

struct norm_data
{
  float grad_squared;
  float pred_per_update;
  float norm_x;
  power_data pd;
  float extra_state[4];
};

constexpr float x_min = 1.084202e-19f;
constexpr float x2_min = x_min * x_min;
constexpr float x2_max = FLT_MAX;

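// Accumulates, per feature, the state the update needs: the adagrad-style sum of
// squared gradients, the running max |x| for the normalized update (rescaling the
// weight when a larger scale appears), and the rate decay cached in w[spare].  With
// `stateless` set (sensitivity queries) the real parameters must stay untouched, so
// the writes are staged in nd.extra_state instead.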
template <bool sqrt_rate, bool feature_mask_off, size_t adaptive, size_t normalized, size_t spare, bool stateless>
inline void pred_per_update_feature(norm_data& nd, float x, float& fw)
{
  if (feature_mask_off || fw != 0.)
  {
    weight* w = &fw;
    float x2 = x * x;
    if (x2 < x2_min)
    {
      x = (x > 0) ? x_min : -x_min;
      x2 = x2_min;
    }
    if (x2 > x2_max)
      THROW("your features have too much magnitude");
    if (stateless)  // we must not modify the parameter state so introduce a shadow version.
    {
      nd.extra_state[0] = w[0];
      nd.extra_state[adaptive] = w[adaptive];
      nd.extra_state[normalized] = w[normalized];
      w = nd.extra_state;
    }
    if (adaptive)
      w[adaptive] += nd.grad_squared * x2;
    if (normalized)
    {
      float x_abs = fabsf(x);
      if (x_abs > w[normalized])  // new scale discovered
      {
        if (w[normalized] >
            0.)  // If the normalizer is > 0 then rescale the weight so it's as if the new scale was the old scale.
        {
          if (sqrt_rate)
          {
            float rescale = w[normalized] / x_abs;
            w[0] *= (adaptive ? rescale : rescale * rescale);
          }
          else
          {
            float rescale = x_abs / w[normalized];
            w[0] *= powf(rescale * rescale, nd.pd.neg_norm_power);
          }
        }
        w[normalized] = x_abs;
      }
      nd.norm_x += x2 / (w[normalized] * w[normalized]);
    }
    w[spare] = compute_rate_decay<sqrt_rate, adaptive, normalized>(nd.pd, w[0]);
    nd.pred_per_update += x2 * w[spare];
  }
}

bool global_print_features = false;
template <bool sqrt_rate, bool feature_mask_off, bool adax, size_t adaptive, size_t normalized, size_t spare,
    bool stateless>
float get_pred_per_update(gd& g, example& ec)
{
  // We must traverse the features in _precisely_ the same order as during training.
  label_data& ld = ec.l.simple;
  vw& all = *g.all;

  float grad_squared = ec.weight;
  if (!adax)
    grad_squared *= all.loss->getSquareGrad(ec.pred.scalar, ld.label);

  if (grad_squared == 0 && !stateless)
    return 1.;

  norm_data nd = {grad_squared, 0., 0., {g.neg_power_t, g.neg_norm_power}, {0}};
  foreach_feature<norm_data,
      pred_per_update_feature<sqrt_rate, feature_mask_off, adaptive, normalized, spare, stateless> >(all, ec, nd);
  if (normalized)
  {
    if (!stateless)
    {
      g.all->normalized_sum_norm_x += ((double)ec.weight) * nd.norm_x;
      g.total_weight += ec.weight;
      g.update_multiplier = average_update<sqrt_rate, adaptive, normalized>(
          (float)g.total_weight, (float)g.all->normalized_sum_norm_x, g.neg_norm_power);
    }
    else
    {
      float nsnx = ((float)g.all->normalized_sum_norm_x) + ec.weight * nd.norm_x;
      float tw = (float)g.total_weight + ec.weight;
      g.update_multiplier = average_update<sqrt_rate, adaptive, normalized>(tw, nsnx, g.neg_norm_power);
    }
  }
  return nd.pred_per_update;
}

template <bool sqrt_rate, bool feature_mask_off, bool adax, size_t adaptive, size_t normalized, size_t spare,
    bool stateless>
float sensitivity(gd& g, example& ec)
{
  if (adaptive || normalized)
    return get_pred_per_update<sqrt_rate, feature_mask_off, adax, adaptive, normalized, spare, stateless>(g, ec);
  else
    return ec.total_sum_feat_sq;
}

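// Global learning-rate scale: eta * weight, and for the non-adaptive rule additionally
//   t^(-power_t)
// where t counts the labeled training weight seen so far (holdout and unlabeled
// examples excluded).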
template <size_t adaptive>
float get_scale(gd& g, example& /* ec */, float weight)
{
  float update_scale = g.all->eta * weight;
  if (!adaptive)
  {
    float t =
        (float)(g.all->sd->t + weight - g.all->sd->weighted_holdout_examples - g.all->sd->weighted_unlabeled_examples);
    update_scale *= powf(t, g.neg_power_t);
  }
  return update_scale;
}

template <bool sqrt_rate, bool feature_mask_off, bool adax, size_t adaptive, size_t normalized, size_t spare>
float sensitivity(gd& g, base_learner& /* base */, example& ec)
{
  return get_scale<adaptive>(g, ec, 1.) *
      sensitivity<sqrt_rate, feature_mask_off, adax, adaptive, normalized, spare, true>(g, ec);
}

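// Computes the scalar update for one example.  With `invariant` this is the
// importance-aware update (in the style of Karampatziakis & Langford), safe under
// large importance weights; otherwise the plain gradient step.  L1/L2 regularization
// is not applied weight-by-weight here; the global gravity/contraction accumulators
// are advanced instead and applied lazily at prediction or sync_weights time.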
template <bool sparse_l2, bool invariant, bool sqrt_rate, bool feature_mask_off, bool adax, size_t adaptive,
    size_t normalized, size_t spare>
float compute_update(gd& g, example& ec)
{
  // invariant: not a test label, importance weight > 0
  label_data& ld = ec.l.simple;
  vw& all = *g.all;

  float update = 0.;
  if (all.loss->getLoss(all.sd, ec.pred.scalar, ld.label) > 0.)
  {
    float pred_per_update = sensitivity<sqrt_rate, feature_mask_off, adax, adaptive, normalized, spare, false>(g, ec);
    float update_scale = get_scale<adaptive>(g, ec, ec.weight);
    if (invariant)
      update = all.loss->getUpdate(ec.pred.scalar, ld.label, update_scale, pred_per_update);
    else
      update = all.loss->getUnsafeUpdate(ec.pred.scalar, ld.label, update_scale);
    // changed from ec.partial_prediction to ld.prediction
    ec.updated_prediction += pred_per_update * update;

    if (all.reg_mode && fabs(update) > 1e-8)
    {
      double dev1 = all.loss->first_derivative(all.sd, ec.pred.scalar, ld.label);
      double eta_bar = (fabs(dev1) > 1e-8) ? (-update / dev1) : 0.0;
      if (fabs(dev1) > 1e-8)
        all.sd->contraction *= (1. - all.l2_lambda * eta_bar);
      update /= (float)all.sd->contraction;
      all.sd->gravity += eta_bar * all.l1_lambda;
    }
  }

  if (sparse_l2)
    update -= g.sparse_l2 * ec.pred.scalar;

  return update;
}

template <bool sparse_l2, bool invariant, bool sqrt_rate, bool feature_mask_off, bool adax, size_t adaptive,
    size_t normalized, size_t spare>
void update(gd& g, base_learner&, example& ec)
{
  // invariant: not a test label, importance weight > 0
  float update;
  if ((update = compute_update<sparse_l2, invariant, sqrt_rate, feature_mask_off, adax, adaptive, normalized, spare>(
           g, ec)) != 0.)
    train<sqrt_rate, feature_mask_off, adaptive, normalized, spare>(g, ec, update);

  if (g.all->sd->contraction < 1e-9 || g.all->sd->gravity > 1e3)  // updating weights now to avoid numerical instability
    sync_weights(*g.all);
}

template <bool sparse_l2, bool invariant, bool sqrt_rate, bool feature_mask_off, bool adax, size_t adaptive,
    size_t normalized, size_t spare>
void learn(gd& g, base_learner& base, example& ec)
{
  // invariant: not a test label, importance weight > 0
  assert(ec.in_use);
  assert(ec.l.simple.label != FLT_MAX);
  assert(ec.weight > 0.);
  g.predict(g, base, ec);
  update<sparse_l2, invariant, sqrt_rate, feature_mask_off, adax, adaptive, normalized, spare>(g, base, ec);
}

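// Materializes the lazily tracked regularizer state: every weight is truncated by the
// accumulated L1 gravity and scaled by the accumulated L2 contraction, after which
// both are reset to their neutral values (0 and 1).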
void sync_weights(vw& all)
{
  // todo, fix length dependence
  if (all.sd->gravity == 0. && all.sd->contraction == 1.)  // to avoid unnecessary weight synchronization
    return;

  if (all.weights.sparse)
    for (weight& w : all.weights.sparse_weights)
      w = trunc_weight(w, (float)all.sd->gravity) * (float)all.sd->contraction;
  else
    for (weight& w : all.weights.dense_weights)
      w = trunc_weight(w, (float)all.sd->gravity) * (float)all.sd->contraction;

  all.sd->gravity = 0.;
  all.sd->contraction = 1.;
}

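// Writes a weight-vector index to the model: as a 32-bit value when num_bits < 31 (for
// backward compatibility with older models), otherwise as a full 64-bit value.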
size_t write_index(io_buf& model_file, std::stringstream& msg, bool text, uint32_t num_bits, uint64_t i)
{
  size_t brw;
  uint32_t old_i = 0;

  msg << i;

  if (num_bits < 31)
  {
    old_i = (uint32_t)i;
    brw = bin_text_write_fixed(model_file, (char*)&old_i, sizeof(old_i), msg, text);
  }
  else
    brw = bin_text_write_fixed(model_file, (char*)&i, sizeof(i), msg, text);

  return brw;
}

template <class T>
void save_load_regressor(vw& all, io_buf& model_file, bool read, bool text, T& weights)
{
  size_t brw = 1;

  if (all.print_invert)  // write readable model with feature names
  {
    std::stringstream msg;
    typedef std::map<std::string, size_t> str_int_map;

    for (str_int_map::iterator it = all.name_index_map.begin(); it != all.name_index_map.end(); ++it)
    {
      weight* v = &weights.strided_index(it->second);
      if (*v != 0.)
      {
        msg << it->first;
        brw = bin_text_write_fixed(model_file, (char*)it->first.c_str(), sizeof(*it->first.c_str()), msg, true);

        msg << ":" << it->second << ":" << *v << "\n";
        bin_text_write_fixed(model_file, (char*)&(*v), sizeof(*v), msg, true);
      }
    }
    return;
  }

  uint64_t i = 0;
  uint32_t old_i = 0;
  uint64_t length = (uint64_t)1 << all.num_bits;
  if (read)
    do
    {
      brw = 1;
      if (all.num_bits < 31)  // backwards compatible
      {
        brw = model_file.bin_read_fixed((char*)&old_i, sizeof(old_i), "");
        i = old_i;
      }
      else
        brw = model_file.bin_read_fixed((char*)&i, sizeof(i), "");
      if (brw > 0)
      {
        if (i >= length)
          THROW("Model content is corrupted, weight vector index " << i << " must be less than total vector length "
                                                                   << length);
        weight* v = &weights.strided_index(i);
        brw += model_file.bin_read_fixed((char*)&(*v), sizeof(*v), "");
      }
    } while (brw > 0);
  else  // write
    for (typename T::iterator v = weights.begin(); v != weights.end(); ++v)
      if (*v != 0.)
      {
        i = v.index() >> weights.stride_shift();
        std::stringstream msg;

        brw = write_index(model_file, msg, text, all.num_bits, i);
        msg << ":" << *v << "\n";
        brw += bin_text_write_fixed(model_file, (char*)&(*v), sizeof(*v), msg, text);
      }
}

void save_load_regressor(vw& all, io_buf& model_file, bool read, bool text)
{
  if (all.weights.sparse)
    save_load_regressor(all, model_file, read, text, all.weights.sparse_weights);
  else
    save_load_regressor(all, model_file, read, text, all.weights.dense_weights);
}

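// Per-weight online state I/O.  Each stored index is followed by 1-3 values for
// plain / adaptive-or-normalized / adaptive-and-normalized GD, or by ftrl_size values
// when an FTRL-style reduction owns the extra slots; the layout mirrors the in-memory
// stride.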
template <class T>
void save_load_online_state(
    vw& all, io_buf& model_file, bool read, bool text, gd* g, std::stringstream& msg, uint32_t ftrl_size, T& weights)
{
  uint64_t length = (uint64_t)1 << all.num_bits;

  uint64_t i = 0;
  uint32_t old_i = 0;
  size_t brw = 1;

  if (read)
    do
    {
      brw = 1;
      if (all.num_bits < 31)  // backwards compatible
      {
        brw = model_file.bin_read_fixed((char*)&old_i, sizeof(old_i), "");
        i = old_i;
      }
      else
        brw = model_file.bin_read_fixed((char*)&i, sizeof(i), "");
      if (brw > 0)
      {
        if (i >= length)
          THROW("Model content is corrupted, weight vector index " << i << " must be less than total vector length "
                                                                   << length);
        weight buff[8] = {0, 0, 0, 0, 0, 0, 0, 0};
        if (ftrl_size > 0)
          brw += model_file.bin_read_fixed((char*)buff, sizeof(buff[0]) * ftrl_size, "");
        else if (g == NULL || (!g->adaptive_input && !g->normalized_input))
          brw += model_file.bin_read_fixed((char*)buff, sizeof(buff[0]), "");
        else if ((g->adaptive_input && !g->normalized_input) || (!g->adaptive_input && g->normalized_input))
          brw += model_file.bin_read_fixed((char*)buff, sizeof(buff[0]) * 2, "");
        else  // adaptive and normalized
          brw += model_file.bin_read_fixed((char*)buff, sizeof(buff[0]) * 3, "");
        uint32_t stride = 1 << weights.stride_shift();
        weight* v = &weights.strided_index(i);
        for (size_t i = 0; i < stride; i++) v[i] = buff[i];
      }
    } while (brw > 0);
  else  // write binary or text
    for (typename T::iterator v = weights.begin(); v != weights.end(); ++v)
    {
      i = v.index() >> weights.stride_shift();

      if (ftrl_size == 3)
      {
        if (*v != 0. || (&(*v))[1] != 0. || (&(*v))[2] != 0.)
        {
          brw = write_index(model_file, msg, text, all.num_bits, i);
          msg << ":" << *v << " " << (&(*v))[1] << " " << (&(*v))[2] << "\n";
          brw += bin_text_write_fixed(model_file, (char*)&(*v), 3 * sizeof(*v), msg, text);
        }
      }
      else if (ftrl_size == 4)
      {
        if (*v != 0. || (&(*v))[1] != 0. || (&(*v))[2] != 0. || (&(*v))[3] != 0.)
        {
          brw = write_index(model_file, msg, text, all.num_bits, i);
          msg << ":" << *v << " " << (&(*v))[1] << " " << (&(*v))[2] << " " << (&(*v))[3] << "\n";
          brw += bin_text_write_fixed(model_file, (char*)&(*v), 4 * sizeof(*v), msg, text);
        }
      }
      else if (ftrl_size == 6)
      {
        if (*v != 0. || (&(*v))[1] != 0. || (&(*v))[2] != 0. || (&(*v))[3] != 0. || (&(*v))[4] != 0. ||
            (&(*v))[5] != 0.)
        {
          brw = write_index(model_file, msg, text, all.num_bits, i);
          msg << ":" << *v << " " << (&(*v))[1] << " " << (&(*v))[2] << " " << (&(*v))[3] << " " << (&(*v))[4] << " "
              << (&(*v))[5] << "\n";
          brw += bin_text_write_fixed(model_file, (char*)&(*v), 6 * sizeof(*v), msg, text);
        }
      }
      else if (g == nullptr || (!all.weights.adaptive && !all.weights.normalized))
      {
        if (*v != 0.)
        {
          brw = write_index(model_file, msg, text, all.num_bits, i);
          msg << ":" << *v << "\n";
          brw += bin_text_write_fixed(model_file, (char*)&(*v), sizeof(*v), msg, text);
        }
      }
      else if ((all.weights.adaptive && !all.weights.normalized) || (!all.weights.adaptive && all.weights.normalized))
      {
        // either adaptive or normalized
        if (*v != 0. || (&(*v))[1] != 0.)
        {
          brw = write_index(model_file, msg, text, all.num_bits, i);
          msg << ":" << *v << " " << (&(*v))[1] << "\n";
          brw += bin_text_write_fixed(model_file, (char*)&(*v), 2 * sizeof(*v), msg, text);
        }
      }
      else
      {
        // adaptive and normalized
        if (*v != 0. || (&(*v))[1] != 0. || (&(*v))[2] != 0.)
        {
          brw = write_index(model_file, msg, text, all.num_bits, i);
          msg << ":" << *v << " " << (&(*v))[1] << " " << (&(*v))[2] << "\n";
          brw += bin_text_write_fixed(model_file, (char*)&(*v), 3 * sizeof(*v), msg, text);
        }
      }
    }
}

void save_load_online_state(
    vw& all, io_buf& model_file, bool read, bool text, double& total_weight, gd* g, uint32_t ftrl_size)
{
  // vw& all = *g.all;
  std::stringstream msg;

  msg << "initial_t " << all.initial_t << "\n";
  bin_text_read_write_fixed(model_file, (char*)&all.initial_t, sizeof(all.initial_t), "", read, msg, text);

  msg << "norm normalizer " << all.normalized_sum_norm_x << "\n";
  bin_text_read_write_fixed(
      model_file, (char*)&all.normalized_sum_norm_x, sizeof(all.normalized_sum_norm_x), "", read, msg, text);

  msg << "t " << all.sd->t << "\n";
  bin_text_read_write_fixed(model_file, (char*)&all.sd->t, sizeof(all.sd->t), "", read, msg, text);

  msg << "sum_loss " << all.sd->sum_loss << "\n";
  bin_text_read_write_fixed(model_file, (char*)&all.sd->sum_loss, sizeof(all.sd->sum_loss), "", read, msg, text);

  msg << "sum_loss_since_last_dump " << all.sd->sum_loss_since_last_dump << "\n";
  bin_text_read_write_fixed(model_file, (char*)&all.sd->sum_loss_since_last_dump,
      sizeof(all.sd->sum_loss_since_last_dump), "", read, msg, text);

  float dump_interval = all.sd->dump_interval;
  msg << "dump_interval " << dump_interval << "\n";
  bin_text_read_write_fixed(model_file, (char*)&dump_interval, sizeof(dump_interval), "", read, msg, text);
  if (!read || (all.training && all.preserve_performance_counters))  // update dump_interval from input model
    all.sd->dump_interval = dump_interval;

  msg << "min_label " << all.sd->min_label << "\n";
  bin_text_read_write_fixed(model_file, (char*)&all.sd->min_label, sizeof(all.sd->min_label), "", read, msg, text);

  msg << "max_label " << all.sd->max_label << "\n";
  bin_text_read_write_fixed(model_file, (char*)&all.sd->max_label, sizeof(all.sd->max_label), "", read, msg, text);

  msg << "weighted_labeled_examples " << all.sd->weighted_labeled_examples << "\n";
  bin_text_read_write_fixed(model_file, (char*)&all.sd->weighted_labeled_examples,
      sizeof(all.sd->weighted_labeled_examples), "", read, msg, text);

  msg << "weighted_labels " << all.sd->weighted_labels << "\n";
  bin_text_read_write_fixed(
      model_file, (char*)&all.sd->weighted_labels, sizeof(all.sd->weighted_labels), "", read, msg, text);

  msg << "weighted_unlabeled_examples " << all.sd->weighted_unlabeled_examples << "\n";
  bin_text_read_write_fixed(model_file, (char*)&all.sd->weighted_unlabeled_examples,
      sizeof(all.sd->weighted_unlabeled_examples), "", read, msg, text);

  msg << "example_number " << all.sd->example_number << "\n";
  bin_text_read_write_fixed(
      model_file, (char*)&all.sd->example_number, sizeof(all.sd->example_number), "", read, msg, text);

  msg << "total_features " << all.sd->total_features << "\n";
  bin_text_read_write_fixed(
      model_file, (char*)&all.sd->total_features, sizeof(all.sd->total_features), "", read, msg, text);

  if (!read || all.model_file_ver >= VERSION_SAVE_RESUME_FIX)
  {
    // restore some data to allow --save_resume to work more accurately

    // fix average loss
    msg << "total_weight " << total_weight << "\n";
    bin_text_read_write_fixed(model_file, (char*)&total_weight, sizeof(total_weight), "", read, msg, text);

    // fix "loss since last" for first printed out example details
    msg << "sd::oec.weighted_labeled_examples " << all.sd->old_weighted_labeled_examples << "\n";
    bin_text_read_write_fixed(model_file, (char*)&all.sd->old_weighted_labeled_examples,
        sizeof(all.sd->old_weighted_labeled_examples), "", read, msg, text);

    // fix "number of examples per pass"
    msg << "current_pass " << all.current_pass << "\n";
    if (all.model_file_ver >= VERSION_PASS_UINT64)
      bin_text_read_write_fixed(model_file, (char*)&all.current_pass, sizeof(all.current_pass), "", read, msg, text);
    else  // backwards compatibility.
    {
      size_t temp_pass = (size_t)all.current_pass;
      bin_text_read_write_fixed(model_file, (char*)&temp_pass, sizeof(temp_pass), "", read, msg, text);
      all.current_pass = temp_pass;
    }
  }

  if (read &&
      (!all.training ||
          !all.preserve_performance_counters))  // reset various things so that we report test set performance properly
  {
    all.sd->sum_loss = 0;
    all.sd->sum_loss_since_last_dump = 0;
    all.sd->weighted_labeled_examples = 0.;
    all.sd->weighted_labels = 0.;
    all.sd->weighted_unlabeled_examples = 0.;
    all.sd->old_weighted_labeled_examples = 0.;
    all.sd->example_number = 0;
    all.sd->total_features = 0;
    all.current_pass = 0;
  }
  if (all.weights.sparse)
    save_load_online_state(all, model_file, read, text, g, msg, ftrl_size, all.weights.sparse_weights);
  else
    save_load_online_state(all, model_file, read, text, g, msg, ftrl_size, all.weights.dense_weights);
}

template <class T>
class set_initial_gd_wrapper
{
 public:
  static void func(weight& w, std::pair<float, float>& initial, uint64_t /* index */)
  {
    w = initial.first;
    (&w)[1] = initial.second;
  }
};

void save_load(gd& g, io_buf& model_file, bool read, bool text)
{
  vw& all = *g.all;
  if (read)
  {
    initialize_regressor(all);

    if (all.weights.adaptive && all.initial_t > 0)
    {
      float init_weight = all.initial_weight;
      std::pair<float, float> p = std::make_pair(init_weight, all.initial_t);
      if (all.weights.sparse)
        all.weights.sparse_weights.set_default<std::pair<float, float>, set_initial_gd_wrapper<sparse_parameters> >(p);
      else
        all.weights.dense_weights.set_default<std::pair<float, float>, set_initial_gd_wrapper<dense_parameters> >(p);
      // for adaptive update, we interpret initial_t as previously seeing initial_t fake datapoints, all with squared
      // gradient=1 NOTE: this is not invariant to the scaling of the data (i.e. when combined with normalized). Since
      // scaling the data scales the gradient, this should ideally be feature_range*initial_t, or something like that.
      // We could potentially fix this by just adding this base quantity times the current range to the sum of gradients
      // stored in memory at each update, and always starting the sum of gradients at 0, at the price of additional
      // additions and multiplications during the update...
    }
    if (g.initial_constant != 0.0)
      VW::set_weight(all, constant, 0, g.initial_constant);
  }

  if (model_file.files.size() > 0)
  {
    bool resume = all.save_resume;
    std::stringstream msg;
    msg << ":" << resume << "\n";
    bin_text_read_write_fixed(model_file, (char*)&resume, sizeof(resume), "", read, msg, text);
    if (resume)
    {
      if (read && all.model_file_ver < VERSION_SAVE_RESUME_FIX)
        all.trace_message
            << std::endl
            << "WARNING: --save_resume functionality is known to have inaccuracy in model files version less than "
            << VERSION_SAVE_RESUME_FIX << std::endl
            << std::endl;
      save_load_online_state(all, model_file, read, text, g.total_weight, &g);
    }
    else
      save_load_regressor(all, model_file, read, text);
  }
  if (!all.training)  // If the regressor was saved as --save_resume, then when testing we want to materialize the
                      // weights.
    sync_weights(all);
}

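// The set_learn overload chain below peels off one runtime choice at a time (adax,
// feature mask, sparse_l2, invariant, power_t, normalized, adaptive) and bakes each
// into a template parameter, fully specializing the inner loops.  The integer
// parameters double as slot offsets within each weight's stride (adaptive, normalized,
// spare), and the returned value is the stride size that configuration needs.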
template <bool sparse_l2, bool invariant, bool sqrt_rate, bool feature_mask_off, uint64_t adaptive, uint64_t normalized,
    uint64_t spare, uint64_t next>
uint64_t set_learn(vw& all, gd& g)
{
  all.normalized_idx = normalized;
  if (g.adax)
  {
    g.learn = learn<sparse_l2, invariant, sqrt_rate, feature_mask_off, true, adaptive, normalized, spare>;
    g.update = update<sparse_l2, invariant, sqrt_rate, feature_mask_off, true, adaptive, normalized, spare>;
    g.sensitivity = sensitivity<sqrt_rate, feature_mask_off, true, adaptive, normalized, spare>;
    return next;
  }
  else
  {
    g.learn = learn<sparse_l2, invariant, sqrt_rate, feature_mask_off, false, adaptive, normalized, spare>;
    g.update = update<sparse_l2, invariant, sqrt_rate, feature_mask_off, false, adaptive, normalized, spare>;
    g.sensitivity = sensitivity<sqrt_rate, feature_mask_off, false, adaptive, normalized, spare>;
    return next;
  }
}

template <bool sparse_l2, bool invariant, bool sqrt_rate, uint64_t adaptive, uint64_t normalized, uint64_t spare,
    uint64_t next>
uint64_t set_learn(vw& all, bool feature_mask_off, gd& g)
{
  all.normalized_idx = normalized;
  if (feature_mask_off)
    return set_learn<sparse_l2, invariant, sqrt_rate, true, adaptive, normalized, spare, next>(all, g);
  else
    return set_learn<sparse_l2, invariant, sqrt_rate, false, adaptive, normalized, spare, next>(all, g);
}

template <bool invariant, bool sqrt_rate, uint64_t adaptive, uint64_t normalized, uint64_t spare, uint64_t next>
uint64_t set_learn(vw& all, bool feature_mask_off, gd& g)
{
  if (g.sparse_l2 > 0.f)
    return set_learn<true, invariant, sqrt_rate, adaptive, normalized, spare, next>(all, feature_mask_off, g);
  else
    return set_learn<false, invariant, sqrt_rate, adaptive, normalized, spare, next>(all, feature_mask_off, g);
}

template <bool sqrt_rate, uint64_t adaptive, uint64_t normalized, uint64_t spare, uint64_t next>
uint64_t set_learn(vw& all, bool feature_mask_off, gd& g)
{
  if (all.invariant_updates)
    return set_learn<true, sqrt_rate, adaptive, normalized, spare, next>(all, feature_mask_off, g);
  else
    return set_learn<false, sqrt_rate, adaptive, normalized, spare, next>(all, feature_mask_off, g);
}

template <bool sqrt_rate, uint64_t adaptive, uint64_t spare>
uint64_t set_learn(vw& all, bool feature_mask_off, gd& g)
{
  // select the appropriate learn function based on adaptive, normalization, and feature mask
  if (all.weights.normalized)
    return set_learn<sqrt_rate, adaptive, adaptive + 1, adaptive + 2, adaptive + 3>(all, feature_mask_off, g);
  else
    return set_learn<sqrt_rate, adaptive, 0, spare, spare + 1>(all, feature_mask_off, g);
}

template <bool sqrt_rate>
uint64_t set_learn(vw& all, bool feature_mask_off, gd& g)
{
  if (all.weights.adaptive)
    return set_learn<sqrt_rate, 1, 2>(all, feature_mask_off, g);
  else
    return set_learn<sqrt_rate, 0, 0>(all, feature_mask_off, g);
}

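// Bit length of v: 0 for v == 0, otherwise floor(log2(v)) + 1.  Called below with
// stride - 1, which gives the shift needed to address `stride` slots per weight
// (e.g. stride 4 -> ceil_log_2(3) == 2).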
uint64_t ceil_log_2(uint64_t v)
{
  if (v == 0)
    return 0;
  else
    return 1 + ceil_log_2(v >> 1);
}

base_learner* setup(options_i& options, vw& all)
{
  auto g = scoped_calloc_or_throw<gd>();

  bool sgd = false;
  bool adaptive = false;
  bool adax = false;
  bool invariant = false;
  bool normalized = false;

  option_group_definition new_options("Gradient Descent options");
  new_options.add(make_option("sgd", sgd).help("use regular stochastic gradient descent update.").keep(all.save_resume))
      .add(make_option("adaptive", adaptive).help("use adaptive, individual learning rates.").keep(all.save_resume))
      .add(make_option("adax", adax).help("use adaptive learning rates with x^2 instead of g^2x^2"))
      .add(make_option("invariant", invariant).help("use safe/importance aware updates.").keep(all.save_resume))
      .add(make_option("normalized", normalized).help("use per feature normalized updates").keep(all.save_resume))
      .add(make_option("sparse_l2", g->sparse_l2)
               .default_value(0.f)
               .help("degree of l2 regularization applied to activated sparse parameters"))
      .add(make_option("l1_state", all.sd->gravity)
               .keep(all.save_resume)
               .default_value(0.)
               .help("amount of accumulated implicit l1 regularization"))
      .add(make_option("l2_state", all.sd->contraction)
               .keep(all.save_resume)
               .default_value(1.)
               .help("amount of accumulated implicit l2 regularization"));
  options.add_and_parse(new_options);

  g->all = &all;
  g->all->normalized_sum_norm_x = 0;
  g->no_win_counter = 0;
  g->total_weight = 0.;
  all.weights.adaptive = true;
  all.weights.normalized = true;
  g->neg_norm_power = (all.weights.adaptive ? (all.power_t - 1.f) : -1.f);
  g->neg_power_t = -all.power_t;

  if (all.initial_t > 0)  // for the normalized update: if initial_t is bigger than 1 we interpret this as if we had
                          // seen (all.initial_t) previous fake datapoints all with norm 1
  {
    g->all->normalized_sum_norm_x = all.initial_t;
    g->total_weight = all.initial_t;
  }

  bool feature_mask_off = true;
  if (options.was_supplied("feature_mask"))
    feature_mask_off = false;

  if (!all.holdout_set_off)
  {
    all.sd->holdout_best_loss = FLT_MAX;
    g->early_stop_thres = options.get_typed_option<size_t>("early_terminate").value();
  }

  g->initial_constant = all.initial_constant;

  if (sgd || adaptive || invariant || normalized)
  {
    // nondefault
    all.weights.adaptive = adaptive;
    all.invariant_updates = all.training && invariant;
    all.weights.normalized = normalized;

    if (!options.was_supplied("learning_rate") && !options.was_supplied("l") &&
        !(all.weights.adaptive && all.weights.normalized))
      all.eta = 10;  // default learning rate to 10 for non default update rule

    // if not using normalized or adaptive, default initial_t to 1 instead of 0
    if (!all.weights.adaptive && !all.weights.normalized)
    {
      if (!options.was_supplied("initial_t"))
      {
        all.sd->t = 1.f;
        all.initial_t = 1.f;
      }
      all.eta *= powf((float)(all.sd->t), all.power_t);
    }
  }
  else
  {
    all.invariant_updates = all.training;
  }
  g->adaptive_input = all.weights.adaptive;
  g->normalized_input = all.weights.normalized;

  all.weights.adaptive = all.weights.adaptive && all.training;
  all.weights.normalized = all.weights.normalized && all.training;

  if (adax)
    g->adax = all.training && adax;

  if (g->adax && !all.weights.adaptive)
    THROW("Cannot use adax without adaptive");

  if (pow((double)all.eta_decay_rate, (double)all.numpasses) < 0.0001)
    all.trace_message << "Warning: the learning rate for the last pass is multiplied by: "
                      << pow((double)all.eta_decay_rate, (double)all.numpasses)
                      << " adjust --decay_learning_rate larger to avoid this." << std::endl;

  if (all.reg_mode % 2)
    if (all.audit || all.hash_inv)
    {
      g->predict = predict<true, true>;
      g->multipredict = multipredict<true, true>;
    }
    else
    {
      g->predict = predict<true, false>;
      g->multipredict = multipredict<true, false>;
    }
  else if (all.audit || all.hash_inv)
  {
    g->predict = predict<false, true>;
    g->multipredict = multipredict<false, true>;
  }
  else
  {
    g->predict = predict<false, false>;
    g->multipredict = multipredict<false, false>;
  }

  uint64_t stride;
  if (all.power_t == 0.5)
    stride = set_learn<true>(all, feature_mask_off, *g.get());
  else
    stride = set_learn<false>(all, feature_mask_off, *g.get());

  all.weights.stride_shift((uint32_t)ceil_log_2(stride - 1));

  gd* bare = g.get();
  learner<gd, example>& ret = init_learner(g, g->learn, bare->predict, ((uint64_t)1 << all.weights.stride_shift()));
  ret.set_sensitivity(bare->sensitivity);
  ret.set_multipredict(bare->multipredict);
  ret.set_update(bare->update);
  ret.set_save_load(save_load);
  ret.set_end_pass(end_pass);
  return make_base(ret);
}

}  // namespace GD