Vowpal Wabbit
interactions_predict.h
Go to the documentation of this file.
1 /*
2 Copyright (c) by respective owners including Yahoo!, Microsoft, and
3 individual contributors. All rights reserved. Released under a BSD
4 license as described in the file LICENSE.
5 */
6 #pragma once
7 
8 #include <stdint.h>
9 #include "constant.h"
10 #include "feature_group.h"
11 #include <vector>
12 #include <string>
13 
14 namespace INTERACTIONS
15 {
16 /*
17  * By default include interactions of feature with itself.
18  * This approach produces slightly more interactions but it's safier
19  * for some cases, as discussed in issues/698
20  * Previous behaviour was: include interactions of feature with itself only if its value != value^2.
21  *
22  */
23 constexpr bool feature_self_interactions = true;
24 // must return logical expression
25 /*old: ft_value != 1.0 && feature_self_interactions_for_value_other_than_1*/
26 #define PROCESS_SELF_INTERACTIONS(ft_value) feature_self_interactions
27 
28 // 3 template functions to pass T() proper argument (feature idx in regressor, or its coefficient)
29 
30 template <class R, void (*T)(R&, const float, float&), class W>
31 inline void call_T(R& dat, W& weights, const float ft_value, const uint64_t ft_idx)
32 {
33  T(dat, ft_value, weights[ft_idx]);
34 }
35 
36 template <class R, void (*T)(R&, const float, const float&), class W>
37 inline void call_T(R& dat, const W& weights, const float ft_value, const uint64_t ft_idx)
38 {
39  T(dat, ft_value, weights[ft_idx]);
40 }
41 
42 template <class R, void (*T)(R&, float, uint64_t), class W>
43 inline void call_T(R& dat, W& /*weights*/, const float ft_value, const uint64_t ft_idx)
44 {
45  T(dat, ft_value, ft_idx);
46 }
47 
48 // state data used in non-recursive feature generation algorithm
49 // contains N feature_gen_data records (where N is length of interaction)
51 {
52  size_t loop_idx; // current feature id in namespace
53  uint64_t hash; // hash of feature interactions of previous namespaces in the list
54  float x; // value of feature interactions of previous namespaces in the list
55  size_t loop_end; // last feature id. May be less than number of features if namespace involved in interaction more
56  // than once calculated at preprocessing together with same_ns
57  size_t self_interaction; // namespace interacting with itself
59  // feature_gen_data(): loop_idx(0), x(1.), loop_end(0), self_interaction(false) {}
60 };
61 
62 // The inline function below may be adjusted to change the way
63 // synthetic (interaction) features' values are calculated, e.g.,
64 // fabs(value1-value2) or even value1>value2?1.0:-1.0
65 // Beware - its result must be non-zero.
66 inline float INTERACTION_VALUE(float value1, float value2) { return value1 * value2; }
67 
68 // uncomment line below to disable usage of inner 'for' loops for pair and triple interactions
69 // end switch to usage of non-recursive feature generation algorithm for interactions of any length
70 
71 // #define GEN_INTER_LOOP
72 
73 template <class R, class S, void (*T)(R&, float, S), bool audit, void (*audit_func)(R&, const audit_strings*), class W>
74 inline void inner_kernel(R& dat, features::iterator_all& begin, features::iterator_all& end, const uint64_t offset,
75  W& weights, feature_value ft_value, feature_index halfhash)
76 {
77  if (audit)
78  {
79  for (; begin != end; ++begin)
80  {
81  audit_func(dat, begin.audit().get());
82  call_T<R, T>(dat, weights, INTERACTION_VALUE(ft_value, begin.value()), (begin.index() ^ halfhash) + offset);
83  audit_func(dat, nullptr);
84  }
85  }
86  else
87  {
88  for (; begin != end; ++begin)
89  call_T<R, T>(dat, weights, INTERACTION_VALUE(ft_value, begin.value()), (begin.index() ^ halfhash) + offset);
90  }
91 }
92 
93 // this templated function generates new features for given example and set of interactions
94 // and passes each of them to given function T()
95 // it must be in header file to avoid compilation problems
96 template <class R, class S, void (*T)(R&, float, S), bool audit, void (*audit_func)(R&, const audit_strings*),
97  class W> // nullptr func can't be used as template param in old compilers
98 inline void generate_interactions(std::vector<std::string>& interactions, bool permutations, example_predict& ec,
99  R& dat,
100  W& weights) // default value removed to eliminate ambiguity in old complers
101 {
102  features* features_data = ec.feature_space.data();
103 
104  // often used values
105  const uint64_t offset = ec.ft_offset;
106  // const uint64_t stride_shift = all.stride_shift; // it seems we don't need stride shift in FTRL-like hash
107 
108  // statedata for generic non-recursive iteration
109  v_array<feature_gen_data> state_data = v_init<feature_gen_data>();
110 
111  feature_gen_data empty_ns_data; // micro-optimization. don't want to call its constructor each time in loop.
112  empty_ns_data.loop_idx = 0;
113  empty_ns_data.x = 1.;
114  empty_ns_data.loop_end = 0;
115  empty_ns_data.self_interaction = false;
116 
117  // loop throw the set of possible interactions
118  for (auto& ns : interactions)
119  { // current list of namespaces to interact.
120 
121 #ifndef GEN_INTER_LOOP
122 
123  // unless GEN_INTER_LOOP is defined we use nested 'for' loops for interactions length 2 (pairs) and 3 (triples)
124  // and generic non-recursive algorythm for all other cases.
125  // nested 'for' loops approach is faster, but can't be used for interation of any length.
126 
127  const size_t len = ns.size();
128 
129  if (len == 2) // special case of pairs
130  {
131  features& first = features_data[(uint8_t)ns[0]];
132  if (first.nonempty())
133  {
134  features& second = features_data[(uint8_t)ns[1]];
135  if (second.nonempty())
136  {
137  const bool same_namespace = (!permutations && (ns[0] == ns[1]));
138 
139  for (size_t i = 0; i < first.indicies.size(); ++i)
140  {
141  feature_index halfhash = FNV_prime * (uint64_t)first.indicies[i];
142  if (audit)
143  audit_func(dat, first.space_names[i].get());
144  // next index differs for permutations and simple combinations
145  feature_value ft_value = first.values[i];
147  features::iterator_all begin = range.begin();
148  if (same_namespace)
149  begin += (PROCESS_SELF_INTERACTIONS(ft_value)) ? i : i + 1;
150 
151  features::iterator_all end = range.end();
152  inner_kernel<R, S, T, audit, audit_func>(dat, begin, end, offset, weights, ft_value, halfhash);
153 
154  if (audit)
155  audit_func(dat, nullptr);
156  } // end for(fst)
157  } // end if (data[snd] size > 0)
158  } // end if (data[fst] size > 0)
159  }
160  else if (len == 3) // special case for triples
161  {
162  features& first = features_data[(uint8_t)ns[0]];
163  if (first.nonempty())
164  {
165  features& second = features_data[(uint8_t)ns[1]];
166  if (second.nonempty())
167  {
168  features& third = features_data[(uint8_t)ns[2]];
169  if (third.nonempty())
170  { // don't compare 1 and 3 as interaction is sorted
171  const bool same_namespace1 = (!permutations && (ns[0] == ns[1]));
172  const bool same_namespace2 = (!permutations && (ns[1] == ns[2]));
173 
174  for (size_t i = 0; i < first.indicies.size(); ++i)
175  {
176  if (audit)
177  audit_func(dat, first.space_names[i].get());
178  const uint64_t halfhash1 = FNV_prime * (uint64_t)first.indicies[i];
179  const float& first_ft_value = first.values[i];
180  size_t j = 0;
181  if (same_namespace1) // next index differs for permutations and simple combinations
182  j = (PROCESS_SELF_INTERACTIONS(first_ft_value)) ? i : i + 1;
183 
184  for (; j < second.indicies.size(); ++j)
185  { // f3 x k*(f2 x k*f1)
186  if (audit)
187  audit_func(dat, second.space_names[j].get());
188  feature_index halfhash = FNV_prime * (halfhash1 ^ (uint64_t)second.indicies[j]);
189  feature_value ft_value = INTERACTION_VALUE(first_ft_value, second.values[j]);
190 
192  features::iterator_all begin = range.begin();
193  if (same_namespace2) // next index differs for permutations and simple combinations
194  begin += (PROCESS_SELF_INTERACTIONS(ft_value)) ? j : j + 1;
195 
196  features::iterator_all end = range.end();
197  inner_kernel<R, S, T, audit, audit_func>(dat, begin, end, offset, weights, ft_value, halfhash);
198  if (audit)
199  audit_func(dat, nullptr);
200  } // end for (snd)
201  if (audit)
202  audit_func(dat, nullptr);
203  } // end for (fst)
204 
205  } // end if (data[thr] size > 0)
206  } // end if (data[snd] size > 0)
207  } // end if (data[fst] size > 0)
208  }
209  else // generic case: quatriples, etc.
210 
211 #endif
212  {
213  bool must_skip_interaction = false;
214  // preparing state data
215  feature_gen_data* fgd = state_data.begin();
216  feature_gen_data* fgd2; // for further use
217  for (namespace_index n : ns)
218  {
219  features& ft = features_data[(int32_t)n];
220  const size_t ft_cnt = ft.indicies.size();
221 
222  if (ft_cnt == 0)
223  {
224  must_skip_interaction = true;
225  break;
226  }
227 
228  if (fgd == state_data.end())
229  {
230  state_data.push_back(empty_ns_data);
231  fgd = state_data.end() - 1; // reassign as memory could be realloced
232  }
233 
234  fgd->loop_end = ft_cnt - 1; // saving number of features for each namespace
235  fgd->ft_arr = &ft;
236  ++fgd;
237  }
238 
239  // if any of interacting namespace has 0 features - whole interaction is skipped
240  if (must_skip_interaction)
241  continue; // no_data_to_interact
242 
243  if (!permutations) // adjust state_data for simple combinations
244  { // if permutations mode is disabeled then namespaces in ns are already sorted and thus grouped
245  // (in fact, currently they are sorted even for enabled permutations mode)
246  // let's go throw the list and calculate number of features to skip in namespaces which
247  // repeated more than once to generate only simple combinations of features
248 
249  size_t margin = 0; // number of features to ignore if namespace has been seen before
250 
251  // iterate list backward as margin grows in this order
252 
253  for (fgd = state_data.end() - 1; fgd > state_data.begin(); --fgd)
254  {
255  fgd2 = fgd - 1;
256  fgd->self_interaction = (fgd->ft_arr == fgd2->ft_arr); // state_data.begin().self_interaction is always false
257  if (fgd->self_interaction)
258  {
259  size_t& loop_end = fgd2->loop_end;
260 
261  if (!PROCESS_SELF_INTERACTIONS((*fgd2->ft_arr).values[loop_end - margin]))
262  {
263  ++margin; // otherwise margin can't be increased
264  must_skip_interaction = loop_end < margin;
265  if (must_skip_interaction)
266  break;
267  }
268 
269  if (margin != 0)
270  loop_end -= margin; // skip some features and increase margin
271  }
272  else if (margin != 0)
273  margin = 0;
274  }
275 
276  // if impossible_without_permutations == true then we faced with case like interaction 'aaaa'
277  // where namespace 'a' contains less than 4 unique features. It's impossible to make simple
278  // combination of length 4 without repetitions from 3 or less elements.
279  if (must_skip_interaction)
280  continue; // impossible_without_permutations
281  } // end of state_data adjustment
282 
283  fgd = state_data.begin(); // always equal to first ns
284  fgd2 = state_data.end() - 1; // always equal to last ns
285  fgd->loop_idx = 0; // loop_idx contains current feature id for curently processed namespace.
286 
287  // beware: micro-optimization.
288  /* start & end are always point to features in last namespace of interaction.
289  for 'all.permutations == true' they are constant.*/
290  size_t start_i = 0;
291 
292  feature_gen_data* cur_data = fgd;
293  // end of micro-optimization block
294 
295  // generic feature generation cycle for interactions of any length
296  bool do_it = true;
297  while (do_it)
298  {
299  if (cur_data < fgd2) // can go further threw the list of namespaces in interaction
300  {
301  feature_gen_data* next_data = cur_data + 1;
302  size_t feature = cur_data->loop_idx;
303  features& fs = *(cur_data->ft_arr);
304 
305  if (next_data->self_interaction)
306  { // if next namespace is same, we should start with loop_idx + 1 to avoid feature interaction with itself
307  // unless feature has value x and x != x*x. E.g. x != 0 and x != 1. Features with x == 0 are already
308  // filtered out in parce_args.cc::maybeFeature().
309 
310  next_data->loop_idx =
311  (PROCESS_SELF_INTERACTIONS(fs.values[feature])) ? cur_data->loop_idx : cur_data->loop_idx + 1;
312  }
313  else
314  next_data->loop_idx = 0;
315 
316  if (audit)
317  audit_func(dat, fs.space_names[feature].get());
318 
319  if (cur_data == fgd) // first namespace
320  {
321  next_data->hash = FNV_prime * (uint64_t)fs.indicies[feature];
322  next_data->x = fs.values[feature]; // data->x == 1.
323  }
324  else
325  { // feature2 xor (16777619*feature1)
326  next_data->hash = FNV_prime * (cur_data->hash ^ (uint64_t)fs.indicies[feature]);
327  next_data->x = INTERACTION_VALUE(fs.values[feature], cur_data->x);
328  }
329 
330  ++cur_data;
331  }
332  else
333  { // last namespace - iterate its features and go back
334  if (!permutations) // start value is not a constant in this case
335  start_i = fgd2->loop_idx;
336 
337  features& fs = *(fgd2->ft_arr);
338 
339  feature_value ft_value = fgd2->x;
340  feature_index halfhash = fgd2->hash;
341 
343  features::iterator_all begin = range.begin();
344  begin += start_i;
345  features::iterator_all end = range.begin();
346  end += fgd2->loop_end + 1;
347  inner_kernel<R, S, T, audit, audit_func, W>(dat, begin, end, offset, weights, ft_value, halfhash);
348 
349  // trying to go back increasing loop_idx of each namespace by the way
350 
351  bool go_further = true;
352 
353  do
354  {
355  --cur_data;
356  go_further = (++cur_data->loop_idx > cur_data->loop_end); // increment loop_idx
357  if (audit)
358  audit_func(dat, nullptr);
359  } while (go_further && cur_data != fgd);
360 
361  do_it = !(cur_data == fgd && go_further);
362  // if do_it==false - we've reached 0 namespace but its 'cur_data.loop_idx > cur_data.loop_end' -> exit the
363  // while loop
364  } // if last namespace
365  } // while do_it
366  }
367  } // foreach interaction in all.interactions
368 
369  state_data.delete_v();
370 }
371 } // namespace INTERACTIONS
v_array< feature_index > indicies
features_value_index_audit_range values_indices_audit()
the core definition of a set of features.
float feature_value
Definition: feature_group.h:20
v_array< feature_value > values
constexpr bool feature_self_interactions
T *& begin()
Definition: v_array.h:42
size_t size() const
Definition: v_array.h:68
std::array< features, NUM_NAMESPACES > feature_space
void push_back(const T &new_ele)
Definition: v_array.h:107
defines a "range" usable by C++ 11 for loops
bool nonempty() const
iterator over values, indicies and audit space names
unsigned char namespace_index
uint64_t feature_index
Definition: feature_group.h:21
void generate_interactions(vw &all, example_predict &ec, R &dat)
Definition: interactions.h:45
#define PROCESS_SELF_INTERACTIONS(ft_value)
T *& end()
Definition: v_array.h:43
v_array< audit_strings_ptr > space_names
constexpr uint32_t FNV_prime
Definition: constant.h:17
float INTERACTION_VALUE(float value1, float value2)
void delete_v()
Definition: v_array.h:98
void call_T(R &dat, W &weights, const float ft_value, const uint64_t ft_idx)
void inner_kernel(R &dat, features::iterator_all &begin, features::iterator_all &end, const uint64_t offset, W &weights, feature_value ft_value, feature_index halfhash)
feature_value & value()
Definition: feature_group.h:71
std::pair< std::string, std::string > audit_strings
Definition: feature_group.h:22