Vowpal Wabbit
nn.cc
Go to the documentation of this file.
1 /*
2 Copyright (c) by respective owners including Yahoo!, Microsoft, and
3 individual contributors. All rights reserved. Released under a BSD (revised)
4 license as described in the file LICENSE.
5  */
6 #include <cfloat>
7 #include <cmath>
8 #include <cstdio>
9 #include <sstream>
10 #include <memory>
11 
12 #include "reductions.h"
13 #include "rand48.h"
14 #include "gd.h"
15 #include "vw.h"
16 
17 using namespace LEARNER;
18 using namespace VW::config;
19 
// NOTE(review): hidden_min/max_activation are not referenced anywhere in the
// visible code — presumably they clamp hidden-unit activations in a line this
// extraction omits; confirm against the full source.
20 constexpr float hidden_min_activation = -3;
21 constexpr float hidden_max_activation = 3;
// Arbitrary base index for the synthetic output-layer features; shifted by
// the weight-table stride in finish_setup before use.
22 constexpr uint64_t nn_constant = 533357803;
23 
// Reduction state for the single-hidden-layer sigmoidal feedforward network
// (--nn).  One instance is allocated per learner in nn_setup.
// NOTE(review): this doxygen extraction omits several member declarations
// (original lines 27-30, 37, 43-44) — squared_loss, output_layer, hiddenbias,
// outputweight, finished_setup, hidden_units_pred and hiddenbias_pred are
// referenced in the destructor and elsewhere but are not visible here.
24 struct nn
25 {
26  uint32_t k;  // number of hidden units (--nn <k>)
31  float prediction;  // final scalar prediction of the net for the current example
32  size_t increment;  // weight-index stride of the base learner (spaces output-layer features)
33  bool dropout;  // train/test the hidden layer with dropout
34  uint64_t xsubi;  // RNG state used to draw dropout masks (merand48)
35  uint64_t save_xsubi;  // seed snapshot, restored by end_pass under BFGS
36  bool inpass;  // pass the input features through to the output layer
38  bool multitask;  // share the hidden layer across reduced tasks (forces ft_offset = 0)
39 
40  float* hidden_units;  // scratch buffer, size k
41  bool* dropped_out;  // scratch dropout mask, size k
42 
45 
46  vw* all; // many things
47  std::shared_ptr<rand_state> _random_state;
48 
 // Releases scratch buffers and the synthetic examples.  free() matches the
 // calloc_or_throw allocations done in nn_setup; dealloc_example releases the
 // feature storage owned by the constructed examples.
49  ~nn()
50  {
51  delete squared_loss;
52  free(hidden_units);
53  free(dropped_out);
54  free(hidden_units_pred);
55  free(hiddenbias_pred);
56  VW::dealloc_example(nullptr, output_layer);
57  VW::dealloc_example(nullptr, hiddenbias);
58  VW::dealloc_example(nullptr, outputweight);
59  }
60 };
61 
// Shorthand used by fastpow2 to build a float's bit pattern from an integer
// expression.
62 #define cast_uint32_t static_cast<uint32_t>
63 
// Fast approximate 2^p (Paul Mineiro's "fastapprox" fastpow2).
//
// Splits p into integer and fractional parts, approximates 2^frac with a
// small rational correction term, and assembles the result directly in the
// IEEE-754 exponent/mantissa bits.  Inputs below -126 are clamped so the
// synthesized exponent stays in the normal range.
//
// @param p exponent
// @return approximation of 2^p (relative error well under 1%)
static inline float fastpow2(float p)
{
  // For negative p the truncation toward zero over-estimates the integer
  // part; the +1 offset folded into z compensates.
  float offset = (p < 0) ? 1.0f : 0.0f;
  float clipp = (p < -126) ? -126.0f : p;
  int w = static_cast<int>(clipp);
  float z = clipp - w + offset;

  // Build the result's bit pattern: (1 << 23) scales into the exponent
  // field; the magic constants are the fastapprox rational fit for the
  // mantissa.  std::memcpy is the well-defined way to reinterpret the bits
  // in C++ (the original read through a union, which is technically UB).
  uint32_t bits = static_cast<uint32_t>(
      (1 << 23) * (clipp + 121.2740575f + 27.7280233f / (4.84252568f - z) - 1.49012907f * z));
  float result;
  std::memcpy(&result, &bits, sizeof(result));
  return result;
}
78 
79 static inline float fastexp(float p) { return fastpow2(1.442695040f * p); }
80 
81 static inline float fasttanh(float p) { return -1.0f + 2.0f / (1.0f + fastexp(-2.0f * p)); }
82 
// One-time lazy initialization, run on the first example: builds the three
// synthetic examples the reduction reuses on every call —
//   * output_layer: one feature per hidden unit (plus a constant unless
//     --inpass) feeding the final linear output,
//   * hiddenbias: a lone constant feature used to nudge hidden units off the
//     saddle point at 0,
//   * outputweight: a one-feature probe for reading/updating a single
//     output-layer weight.
// NOTE(review): this extraction omits several lines (88-89, 92, 110-111,
// 117-119, 124, 129-132, 137-138) — in particular the declarations of `fs`
// and `outfs`, which appear to alias feature groups of n.output_layer;
// verify against the full source before editing.
83 void finish_setup(nn& n, vw& all)
84 {
85  // TODO: output_layer audit
86 
87  memset(&n.output_layer, 0, sizeof(n.output_layer));
// Base index for the synthetic features, spaced by the base learner's
// increment so each hidden unit maps to its own weight slot.
90  uint64_t nn_index = nn_constant << all.weights.stride_shift();
91 
93  for (unsigned int i = 0; i < n.k; ++i)
94  {
95  fs.push_back(1., nn_index);
96  if (all.audit || all.hash_inv)
97  {
98  std::stringstream ss;
99  ss << "OutputLayer" << i;
100  fs.space_names.push_back(audit_strings_ptr(new audit_strings("", ss.str())));
101  }
102  nn_index += (uint64_t)n.increment;
103  }
104  n.output_layer.num_features += n.k;
105 
// Without input passthrough the output layer gets its own constant feature.
106  if (!n.inpass)
107  {
108  fs.push_back(1., nn_index);
109  if (all.audit || all.hash_inv)
110  fs.space_names.push_back(audit_strings_ptr(new audit_strings("", "OutputLayerConst")));
112  }
113 
114  n.output_layer.in_use = true;
115 
116  // TODO: not correct if --noconstant
117  memset(&n.hiddenbias, 0, sizeof(n.hiddenbias));
// Single constant feature; label FLT_MAX marks the example as unlabeled
// until predict_or_learn_multi temporarily sets a training label.
120  n.hiddenbias.feature_space[constant_namespace].push_back(1, (uint64_t)constant);
121  if (all.audit || all.hash_inv)
122  n.hiddenbias.feature_space[constant_namespace].space_names.push_back(
123  audit_strings_ptr(new audit_strings("", "HiddenBias")));
125  n.hiddenbias.l.simple.label = FLT_MAX;
126  n.hiddenbias.weight = 1;
127  n.hiddenbias.in_use = true;
128 
// Probe example seeded from the first output-layer feature; its index is
// rewritten per hidden unit when querying output weights.
129  memset(&n.outputweight, 0, sizeof(n.outputweight));
133  n.outputweight.feature_space[nn_output_namespace].push_back(outfs.values[0], outfs.indicies[0]);
134  if (all.audit || all.hash_inv)
135  n.outputweight.feature_space[nn_output_namespace].space_names.push_back(
136  audit_strings_ptr(new audit_strings("", "OutputWeight")));
139  n.outputweight.l.simple.label = FLT_MAX;
140  n.outputweight.weight = 1;
141  n.outputweight.in_use = true;
142 
143  n.finished_setup = true;
144 }
145 
146 void end_pass(nn& n)
147 {
148  if (n.all->bfgs)
149  n.xsubi = n.save_xsubi;
150 }
151 
// Core forward/backward driver for the NN reduction (original signature per
// the cross-reference index:
//   void predict_or_learn_multi(nn& n, single_learner& base, example& ec)).
// is_learn selects training vs. pure prediction; recompute_hidden says
// whether the hidden-unit activations must be (re)computed for this example
// or can be reused from a previous call (see multipredict).
// NOTE(review): this doxygen extraction drops a number of original lines
// (153, 179, 181, 188-189, 237-241, 252, 282-285, 290-292, 299, 302, 310,
// 327, 329, 341-342) — the annotations below describe only what is visible;
// check the full source before editing.
152 template <bool is_learn, bool recompute_hidden>
154 {
155  bool shouldOutput = n.all->raw_prediction > 0;
156  if (!n.finished_setup)
157  finish_setup(n, *(n.all));
// Work on a stack copy of shared_data so the label-range tweaks below do not
// leak into global state (restored at the bottom of the function).
158  shared_data sd;
159  memcpy(&sd, n.all->sd, sizeof(shared_data));
160  shared_data* save_sd = n.all->sd;
161  n.all->sd = &sd;
162 
163  label_data ld = ec.l.simple;
164  void (*save_set_minmax)(shared_data*, float) = n.all->set_minmax;
165  float save_min_label;
166  float save_max_label;
// With dropout, surviving activations are scaled by 2 to keep the expected
// output-layer input magnitude unchanged.
167  float dropscale = n.dropout ? 2.0f : 1.0f;
168  loss_function* save_loss = n.all->loss;
169 
170  polyprediction* hidden_units = n.hidden_units_pred;
171  polyprediction* hiddenbias_pred = n.hiddenbias_pred;
172  bool* dropped_out = n.dropped_out;
173 
174  std::ostringstream outputStringStream;
175 
// Hidden layer is driven with squared loss and no min/max label clamping.
176  n.all->set_minmax = noop_mm;
177  n.all->loss = n.squared_loss;
178  save_min_label = n.all->sd->min_label;
180  save_max_label = n.all->sd->max_label;
182 
183  uint64_t save_ft_offset = ec.ft_offset;
184 
// Multitask sharing: all tasks address the same hidden-layer weights.
185  if (n.multitask)
186  ec.ft_offset = 0;
187 
189 
// Compute hidden activations and draw a fresh dropout mask; skipped when the
// caller (multipredict) can reuse the previous example's values.
190  if (recompute_hidden)
191  {
192  base.multipredict(n.hiddenbias, 0, n.k, hiddenbias_pred, true);
193 
// A hidden unit whose bias prediction is exactly 0 sits on a saddle point;
// kick it with a small random label update.
194  for (unsigned int i = 0; i < n.k; ++i)
195  // avoid saddle point at 0
196  if (hiddenbias_pred[i].scalar == 0)
197  {
198  n.hiddenbias.l.simple.label = (float)(n._random_state->get_and_update_random() - 0.5);
199  base.learn(n.hiddenbias, i);
200  n.hiddenbias.l.simple.label = FLT_MAX;
201  }
202 
203  base.multipredict(ec, 0, n.k, hidden_units, true);
204 
205  for (unsigned int i = 0; i < n.k; ++i) dropped_out[i] = (n.dropout && merand48(n.xsubi) < 0.5);
206 
207  if (ec.passthrough)
208  for (unsigned int i = 0; i < n.k; ++i)
209  {
210  add_passthrough_feature(ec, i * 2, hiddenbias_pred[i].scalar);
211  add_passthrough_feature(ec, i * 2 + 1, hidden_units[i].scalar);
212  }
213  }
214 
// Raw output: "i:raw,tanh(raw)" per hidden unit.
215  if (shouldOutput)
216  for (unsigned int i = 0; i < n.k; ++i)
217  {
218  if (i > 0)
219  outputStringStream << ' ';
220  outputStringStream << i << ':' << hidden_units[i].scalar << ','
221  << fasttanh(hidden_units[i].scalar); // TODO: huh, what was going on here?
222  }
223 
224  n.all->loss = save_loss;
225  n.all->set_minmax = save_set_minmax;
226  n.all->sd->min_label = save_min_label;
227  n.all->sd->max_label = save_max_label;
228  ec.ft_offset = save_ft_offset;
229 
230  bool converse = false;
231  float save_partial_prediction = 0;
232  float save_final_prediction = 0;
233  float save_ec_loss = 0;
234 
// With dropout the whole output-layer pass below runs a second time with the
// complemented mask (see the goto at the bottom); only the first (converse ==
// false) run's prediction and loss are reported to the caller.
235 CONVERSE: // That's right, I'm using goto. So sue me.
236 
239 
241 
// Output-layer weights are trained with squared loss on labels in [-1, 1].
242  n.all->set_minmax = noop_mm;
243  n.all->loss = n.squared_loss;
244  save_min_label = n.all->sd->min_label;
245  n.all->sd->min_label = -1;
246  save_max_label = n.all->sd->max_label;
247  n.all->sd->max_label = 1;
248 
// Fill the output-layer example with sigma(h) values (0 for dropped units),
// and randomly initialize any exactly-zero output weight off its saddle.
249  for (unsigned int i = 0; i < n.k; ++i)
250  {
251  float sigmah = (dropped_out[i]) ? 0.0f : dropscale * fasttanh(hidden_units[i].scalar);
253  out_fs.values[i] = sigmah;
254 
255  n.output_layer.total_sum_feat_sq += sigmah * sigmah;
256  out_fs.sum_feat_sq += sigmah * sigmah;
257 
// Point the probe example at hidden unit i's output weight and read it.
258  n.outputweight.feature_space[nn_output_namespace].indicies[0] = out_fs.indicies[i];
259  base.predict(n.outputweight, n.k);
260  float wf = n.outputweight.pred.scalar;
261 
262  // avoid saddle point at 0
263  if (wf == 0)
264  {
265  float sqrtk = std::sqrt((float)n.k);
266  n.outputweight.l.simple.label = (float)(n._random_state->get_and_update_random() - 0.5) / sqrtk;
267  base.update(n.outputweight, n.k);
268  n.outputweight.l.simple.label = FLT_MAX;
269  }
270  }
271 
272  n.all->loss = save_loss;
273  n.all->set_minmax = save_set_minmax;
274  n.all->sd->min_label = save_min_label;
275  n.all->sd->max_label = save_max_label;
276 
// Final output pass: either splice the hidden outputs into the original
// example (--inpass) or run the stand-alone output_layer example.
277  if (n.inpass)
278  {
279  // TODO: this is not correct if there is something in the
280  // nn_output_namespace but at least it will not leak memory
281  // in that case
283  features save_nn_output_namespace = ec.feature_space[nn_output_namespace];
286  if (is_learn)
287  base.learn(ec, n.k);
288  else
289  base.predict(ec, n.k);
291  n.output_layer.loss = ec.loss;
293  ec.feature_space[nn_output_namespace].sum_feat_sq = 0;
294  ec.feature_space[nn_output_namespace] = save_nn_output_namespace;
295  ec.indices.pop();
296  }
297  else
298  {
300  n.output_layer.l = ec.l;
301  n.output_layer.weight = ec.weight;
303  if (is_learn)
304  base.learn(n.output_layer, n.k);
305  else
306  base.predict(n.output_layer, n.k);
307  ec.l = n.output_layer.l;
308  }
309 
311 
312  if (shouldOutput)
313  {
314  outputStringStream << ' ' << n.output_layer.partial_prediction;
315  n.all->print_text(n.all->raw_prediction, outputStringStream.str(), ec.tag);
316  }
317 
// Backpropagation: push the output-loss gradient through each surviving
// hidden unit via a squared-loss update with a synthesized target label.
318  if (is_learn && n.all->training && ld.label != FLT_MAX)
319  {
320  float gradient = n.all->loss->first_derivative(n.all->sd, n.prediction, ld.label);
321 
322  if (fabs(gradient) > 0)
323  {
324  n.all->loss = n.squared_loss;
325  n.all->set_minmax = noop_mm;
326  save_min_label = n.all->sd->min_label;
328  save_max_label = n.all->sd->max_label;
330  save_ft_offset = ec.ft_offset;
331 
332  if (n.multitask)
333  ec.ft_offset = 0;
334 
335  for (unsigned int i = 0; i < n.k; ++i)
336  {
337  if (!dropped_out[i])
338  {
// sigmahprime is the tanh derivative (1 - tanh^2) with dropout rescaling.
339  float sigmah = n.output_layer.feature_space[nn_output_namespace].values[i] / dropscale;
340  float sigmahprime = dropscale * (1.0f - sigmah * sigmah);
343  base.predict(n.outputweight, n.k);
344  float nu = n.outputweight.pred.scalar;
345  float gradhw = 0.5f * nu * gradient * sigmahprime;
346 
// Target = current activation minus the gradient step; skip the update when
// the clamped target equals the current activation (no-op).
347  ec.l.simple.label = GD::finalize_prediction(n.all->sd, hidden_units[i].scalar - gradhw);
348  ec.pred.scalar = hidden_units[i].scalar;
349  if (ec.l.simple.label != hidden_units[i].scalar)
350  base.update(ec, i);
351  }
352  }
353 
354  n.all->loss = save_loss;
355  n.all->set_minmax = save_set_minmax;
356  n.all->sd->min_label = save_min_label;
357  n.all->sd->max_label = save_max_label;
358  ec.ft_offset = save_ft_offset;
359  }
360  }
361 
362  ec.l.simple.label = ld.label;
363 
// Only the first (non-converse) run's results are handed back to the caller.
364  if (!converse)
365  {
366  save_partial_prediction = n.output_layer.partial_prediction;
367  save_final_prediction = n.prediction;
368  save_ec_loss = n.output_layer.loss;
369  }
370 
// Dropout: flip the mask and do one more full pass with the complement.
371  if (n.dropout && !converse)
372  {
373  for (unsigned int i = 0; i < n.k; ++i)
374  {
375  dropped_out[i] = !dropped_out[i];
376  }
377 
378  converse = true;
379  goto CONVERSE;
380  }
381 
382  ec.partial_prediction = save_partial_prediction;
383  ec.pred.scalar = save_final_prediction;
384  ec.loss = save_ec_loss;
385 
// Restore the real shared_data and replay the observed label range into it.
386  n.all->sd = save_sd;
387  n.all->set_minmax(n.all->sd, sd.min_label);
388  n.all->set_minmax(n.all->sd, sd.max_label);
389 }
390 
391 void multipredict(nn& n, single_learner& base, example& ec, size_t count, size_t step, polyprediction* pred,
392  bool finalize_predictions)
393 {
394  for (size_t c = 0; c < count; c++)
395  {
396  if (c == 0)
397  predict_or_learn_multi<false, true>(n, base, ec);
398  else
399  predict_or_learn_multi<false, false>(n, base, ec);
400  if (finalize_predictions)
401  pred[c] = ec.pred;
402  else
403  pred[c].scalar = ec.partial_prediction;
404  ec.ft_offset += (uint64_t)step;
405  }
406  ec.ft_offset -= (uint64_t)(step * count);
407 }
408 
409 void finish_example(vw& all, nn&, example& ec)
410 {
411  int save_raw_prediction = all.raw_prediction;
412  all.raw_prediction = -1;
413  return_simple_example(all, nullptr, ec);
414  all.raw_prediction = save_raw_prediction;
415 }
416 
// Reduction setup for --nn (original signature per the cross-reference
// index: base_learner* nn_setup(options_i& options, vw& all)).  Parses the
// options, allocates the per-hidden-unit scratch buffers, and stacks this
// reduction on the base learner with k + 1 weight slots (k hidden units plus
// the output layer).
// NOTE(review): this extraction omits original lines 417, 468 and 471-473 —
// the function signature, the learner declaration receiving init_learner's
// result (the `l` returned below), and the
// set_multipredict/set_finish_example/set_end_pass wiring; check the full
// source before editing.
418 {
419  auto n = scoped_calloc_or_throw<nn>();
420  bool meanfield = false;
421  option_group_definition new_options("Neural Network");
422  new_options.add(make_option("nn", n->k).keep().help("Sigmoidal feedforward network with <k> hidden units"))
423  .add(make_option("inpass", n->inpass)
424  .keep()
425  .help("Train or test sigmoidal feedforward network with input passthrough."))
426  .add(make_option("multitask", n->multitask).keep().help("Share hidden layer across all reduced tasks."))
427  .add(make_option("dropout", n->dropout).keep().help("Train or test sigmoidal feedforward network using dropout."))
428  .add(make_option("meanfield", meanfield).help("Train or test sigmoidal feedforward network using mean field."));
429  options.add_and_parse(new_options);
430 
// Not our reduction: hand back nullptr so the stack skips it.
431  if (!options.was_supplied("nn"))
432  return nullptr;
433 
434  n->all = &all;
435  n->_random_state = all.get_random_state();
436 
437  if (n->multitask && !all.quiet)
438  std::cerr << "using multitask sharing for neural network " << (all.training ? "training" : "testing") << std::endl;
439 
// Mean field overrides dropout: activations are used at their expectation.
440  if (options.was_supplied("meanfield"))
441  {
442  n->dropout = false;
443  if (!all.quiet)
444  std::cerr << "using mean field for neural network " << (all.training ? "training" : "testing") << std::endl;
445  }
446 
447  if (n->dropout && !all.quiet)
448  std::cerr << "using dropout for neural network " << (all.training ? "training" : "testing") << std::endl;
449 
450  if (n->inpass && !all.quiet)
451  std::cerr << "using input passthrough for neural network " << (all.training ? "training" : "testing") << std::endl;
452 
// Synthetic examples are built lazily on the first example (finish_setup).
453  n->finished_setup = false;
454  n->squared_loss = getLossFunction(all, "squared", 0);
455 
// Seed the dropout RNG and snapshot it so end_pass can rewind under BFGS.
456  n->xsubi = all.random_seed;
457 
458  n->save_xsubi = n->xsubi;
459 
// Scratch buffers, one slot per hidden unit (freed in ~nn with free()).
460  n->hidden_units = calloc_or_throw<float>(n->k);
461  n->dropped_out = calloc_or_throw<bool>(n->k);
462  n->hidden_units_pred = calloc_or_throw<polyprediction>(n->k);
463  n->hiddenbias_pred = calloc_or_throw<polyprediction>(n->k);
464 
465  auto base = as_singleline(setup_base(options, all));
466  n->increment = base->increment; // Indexing of output layer is odd.
467  nn& nv = *n.get();
469  init_learner(n, base, predict_or_learn_multi<true, true>, predict_or_learn_multi<false, true>, n->k + 1);
470  if (nv.multitask)
474 
475  return make_base(l);
476 }
477 
478 /*
479 
480  train: ./vw -k -c -d mnist8v9.gz --passes 24 -b 25 --nn 64 -l 0.1 --invariant --adaptive --holdout_off --random_seed
481 19 --nnmultipredict -f mnist64 predict: ./vw -t -d mnist8v9.gz -i mnist64 --nnmultipredict
482 
483  default multipredict
484  nn 64 train 9.1s 8.1s
485  predict 0.57s 0.52s
486  nn 128 train 16.5s 13.8s
487  predict 0.76s 0.69s
488 
489 with oaa:
490 
491  train: ./vw --oaa 10 -b 25 --adaptive --invariant --holdout_off -l 0.1 --nn 64 --passes 24 -k -c -d mnist-all.gz
492 --random_seed 19 --nnmultipredict -f mnist-all64 predict: ./vw -t -d mnist-all.gz -i mnist-all64 --nnmultipredict
493 
494 */
void set_multipredict(void(*u)(T &, L &, E &, size_t, size_t, polyprediction *, bool))
Definition: learner.h:217
v_array< char > tag
Definition: example.h:63
static float fastexp(float p)
Definition: nn.cc:79
float finalize_prediction(shared_data *sd, float ret)
Definition: gd.cc:339
int raw_prediction
Definition: global_data.h:519
v_array< namespace_index > indices
parameters weights
Definition: global_data.h:537
loss_function * loss
Definition: global_data.h:523
void predict(E &ec, size_t i=0)
Definition: learner.h:169
example output_layer
Definition: nn.cc:28
T pop()
Definition: v_array.h:58
void push_back(feature_value v, feature_index i)
float scalar
Definition: example.h:45
vw * all
Definition: nn.cc:46
std::shared_ptr< audit_strings > audit_strings_ptr
Definition: feature_group.h:23
bool hash_inv
Definition: global_data.h:541
v_array< feature_index > indicies
std::vector< std::string > * interactions
void finish_setup(nn &n, vw &all)
Definition: nn.cc:83
void dealloc_example(void(*delete_label)(void *), example &ec, void(*delete_prediction)(void *))
Definition: example.cc:219
uint64_t random_seed
Definition: global_data.h:491
std::shared_ptr< rand_state > _random_state
Definition: nn.cc:47
uint64_t xsubi
Definition: nn.cc:34
constexpr unsigned char nn_output_namespace
Definition: constant.h:23
the core definition of a set of features.
base_learner * make_base(learner< T, E > &base)
Definition: learner.h:462
float * hidden_units
Definition: nn.cc:40
float partial_prediction
Definition: example.h:68
bool quiet
Definition: global_data.h:487
v_array< feature_value > values
virtual void add_and_parse(const option_group_definition &group)=0
float label
Definition: simple_label.h:14
float merand48(uint64_t &initial)
Definition: rand48.cc:16
label_data simple
Definition: example.h:28
base_learner * nn_setup(options_i &options, vw &all)
Definition: nn.cc:417
uint32_t k
Definition: nn.cc:26
#define add_passthrough_feature(ec, i, x)
Definition: example.h:119
bool * dropped_out
Definition: nn.cc:41
void predict_or_learn_multi(nn &n, single_learner &base, example &ec)
Definition: nn.cc:153
constexpr uint64_t nn_constant
Definition: nn.cc:22
bool training
Definition: global_data.h:488
bool inpass
Definition: nn.cc:36
virtual float first_derivative(shared_data *, float prediction, float label)=0
constexpr float hidden_max_activation
Definition: nn.cc:21
polyprediction * hidden_units_pred
Definition: nn.cc:43
float prediction
Definition: nn.cc:31
std::shared_ptr< rand_state > get_random_state()
Definition: global_data.h:553
std::array< features, NUM_NAMESPACES > feature_space
single_learner * as_singleline(learner< T, E > *l)
Definition: learner.h:476
bool dropout
Definition: nn.cc:33
void(* set_minmax)(shared_data *sd, float label)
Definition: global_data.h:394
void noop_mm(shared_data *, float)
Definition: global_data.cc:135
void set_finish_example(void(*f)(vw &all, T &, E &))
Definition: learner.h:307
learner< T, E > & init_learner(free_ptr< T > &dat, L *base, void(*learn)(T &, L &, E &), void(*predict)(T &, L &, E &), size_t ws, prediction_type::prediction_type_t pred_type)
Definition: learner.h:369
void push_back(const T &new_ele)
Definition: v_array.h:107
shared_data * sd
Definition: global_data.h:375
bool multitask
Definition: nn.cc:38
void end_pass(example &ec, vw &all)
Definition: learner.cc:44
uint64_t save_xsubi
Definition: nn.cc:35
void multipredict(nn &n, single_learner &base, example &ec, size_t count, size_t step, polyprediction *pred, bool finalize_predictions)
Definition: nn.cc:391
bool bfgs
Definition: global_data.h:412
size_t num_features
Definition: example.h:67
virtual bool was_supplied(const std::string &key)=0
Definition: nn.cc:24
constexpr uint64_t constant
Definition: constant.h:11
void(* print_text)(int, std::string, v_array< char >)
Definition: global_data.h:522
example hiddenbias
Definition: nn.cc:29
float loss
Definition: example.h:70
option_group_definition & add(T &&op)
Definition: options.h:90
int add(svm_params &params, svm_example *fec)
Definition: kernel_svm.cc:546
constexpr float hidden_min_activation
Definition: nn.cc:20
v_array< audit_strings_ptr > space_names
#define cast_uint32_t
Definition: nn.cc:62
polylabel l
Definition: example.h:57
bool in_use
Definition: example.h:79
typed_option< T > make_option(std::string name, T &location)
Definition: options.h:80
float total_sum_feat_sq
Definition: example.h:71
features * passthrough
Definition: example.h:74
float sum_feat_sq
float min_label
Definition: global_data.h:150
void set_end_pass(void(*f)(T &))
Definition: learner.h:286
size_t increment
Definition: nn.cc:32
void finish_example(vw &all, nn &, example &ec)
Definition: nn.cc:409
std::vector< std::string > interactions
Definition: global_data.h:457
float max_label
Definition: global_data.h:151
uint32_t stride_shift()
void multipredict(E &ec, size_t lo, size_t count, polyprediction *pred, bool finalize_predictions)
Definition: learner.h:178
bool finished_setup
Definition: nn.cc:37
static float fastpow2(float p)
Definition: nn.cc:64
~nn()
Definition: nn.cc:49
loss_function * squared_loss
Definition: nn.cc:27
bool audit
Definition: global_data.h:486
LEARNER::base_learner * setup_base(options_i &options, vw &all)
Definition: parse_args.cc:1222
polyprediction pred
Definition: example.h:60
void update(E &ec, size_t i=0)
Definition: learner.h:222
void learn(E &ec, size_t i=0)
Definition: learner.h:160
static float fasttanh(float p)
Definition: nn.cc:81
constexpr unsigned char constant_namespace
Definition: constant.h:22
float weight
Definition: example.h:62
loss_function * getLossFunction(vw &all, std::string funcName, float function_parameter)
constexpr uint64_t c
Definition: rand48.cc:12
float f
Definition: cache.cc:40
example outputweight
Definition: nn.cc:30
polyprediction * hiddenbias_pred
Definition: nn.cc:44
std::pair< std::string, std::string > audit_strings
Definition: feature_group.h:22
void return_simple_example(vw &all, void *, example &ec)