Vowpal Wabbit
interactions.cc
Go to the documentation of this file.
1 #include "interactions.h"
2 
3 #include "vw_exception.h"
4 #include <algorithm>
5 
6 namespace INTERACTIONS
7 {
8 /*
9  * Interactions preprocessing
10  */
11 
12 // expand namespace interactions if contain wildcards
13 // recursive function used internally in this module
15  std::string const& ns, std::vector<std::string>& res, std::string& val, size_t pos)
16 {
17  assert(pos <= ns.size());
18 
19  if (pos == ns.size())
20  {
21  // we're at the end of interaction
22 
23  // and store it in res
24  res.push_back(val);
25  // don't free s memory as it's data will be used later
26  }
27  else
28  {
29  // we're at the middle of interaction
30  if (ns[pos] != ':')
31  {
32  // not a wildcard
33  val.push_back(ns[pos]);
34  expand_namespaces_with_recursion(ns, res, val, pos + 1);
35  val.pop_back(); // i don't need value itself
36  }
37  else
38  {
39  for (unsigned char j = printable_start; j <= printable_end; ++j)
40  {
41  if (valid_ns(j))
42  {
43  val.push_back(j);
44  expand_namespaces_with_recursion(ns, res, val, pos + 1);
45  val.pop_back(); // i don't need value itself
46  }
47  }
48  }
49  }
50 }
51 
52 // expand namespace interactions if contain wildcards
53 // called from parse_args.cc
54 // process all interactions in a vector
55 
56 std::vector<std::string> expand_interactions(
57  const std::vector<std::string>& vec, const size_t required_length, const std::string& err_msg)
58 {
59  std::vector<std::string> res;
60 
61  for (std::string const& i : vec)
62  {
63  const size_t len = i.length();
64  if (required_length > 0 && len != required_length)
65  // got strict requirement of interaction length and it was failed.
66  {
67  THROW(err_msg);
68  }
69  else if (len < 2)
70  // regardles of required_length value this check is always performed
71  THROW("error, feature interactions must involve at least two namespaces" << err_msg);
72 
73  std::string temp;
74  expand_namespaces_with_recursion(i, res, temp, 0);
75  }
76  return res;
77 }
78 
79 /*
80  * Sorting and filtering duplicate interactions
81  */
82 
83 // returns true if iteraction contains one or more duplicated namespaces
84 // with one exeption - returns false if interaction made of one namespace
85 // like 'aaa' as it has no sense to sort such things.
86 
87 inline bool must_be_left_sorted(const std::string& oi)
88 {
89  if (oi.size() <= 1)
90  return true; // one letter in std::string - no need to sort
91 
92  bool diff_ns_found = false;
93  bool pair_found = false;
94 
95  for (auto i = std::begin(oi); i != std::end(oi) - 1; ++i)
96  if (*i == *(i + 1)) // pair found
97  {
98  if (diff_ns_found)
99  return true; // case 'abb'
100  pair_found = true;
101  }
102  else
103  {
104  if (pair_found)
105  return true; // case 'aab'
106  diff_ns_found = true;
107  }
108 
109  return false; // 'aaa' or 'abc'
110 }
111 
112 // used from parse_args.cc
113 // filter duplicate namespaces treating them as unordered sets of namespaces.
114 // also sort namespaces in interactions containing duplicate namespaces to make sure they are grouped together.
115 
117  std::vector<std::string>& vec, bool filter_duplicates, size_t& removed_cnt, size_t& sorted_cnt)
118 {
119  // 2 out parameters
120  removed_cnt = 0;
121  sorted_cnt = 0;
122 
123  // interaction value sort + original position
124  std::vector<std::pair<std::string, size_t>> vec_sorted;
125  for (size_t i = 0; i < vec.size(); ++i)
126  {
127  std::string sorted_i(vec[i]);
128  std::stable_sort(std::begin(sorted_i), std::end(sorted_i));
129  vec_sorted.push_back(std::make_pair(sorted_i, i));
130  }
131 
132  if (filter_duplicates)
133  {
134  // remove duplicates
135  std::stable_sort(vec_sorted.begin(), vec_sorted.end(),
136  [](std::pair<std::string, size_t> const& a, std::pair<std::string, size_t> const& b) {
137  return a.first < b.first;
138  });
139  auto last = unique(vec_sorted.begin(), vec_sorted.end(),
140  [](std::pair<std::string, size_t> const& a, std::pair<std::string, size_t> const& b) {
141  return a.first == b.first;
142  });
143  vec_sorted.erase(last, vec_sorted.end());
144 
145  // report number of removed interactions
146  removed_cnt = vec.size() - vec_sorted.size();
147 
148  // restore original order
149  std::stable_sort(vec_sorted.begin(), vec_sorted.end(),
150  [](std::pair<std::string, size_t> const& a, std::pair<std::string, size_t> const& b) {
151  return a.second < b.second;
152  });
153  }
154 
155  // we have original vector and vector with duplicates removed + corresponding indexes in original vector
156  // plus second vector's data is sorted. We can reuse it if we need interaction to be left sorted.
157  // let's make a new vector from these two sources - without dulicates and with sorted data whenever it's needed.
158  std::vector<std::string> res;
159  for (auto& i : vec_sorted)
160  {
161  if (must_be_left_sorted(i.first))
162  {
163  // if so - copy sorted data to result
164  res.push_back(i.first);
165  ++sorted_cnt;
166  }
167  else // else - move unsorted data to result
168  res.push_back(vec[i.second]);
169  }
170 
171  vec = res;
172 }
173 
174 /*
175  * Estimation of generated features properties
176  */
177 
178 // the code under DEBUG_EVAL_COUNT_OF_GEN_FT below is an alternative way of implementation of
179 // eval_count_of_generated_ft() it just calls generate_interactions() with small function which counts generated
180 // features and sums their squared values it's replaced with more fast (?) analytic solution but keeps just in case and
181 // for doublecheck.
182 
183 //#define DEBUG_EVAL_COUNT_OF_GEN_FT
184 #ifdef DEBUG_EVAL_COUNT_OF_GEN_FT
185 struct eval_gen_data
186 {
187  size_t& new_features_cnt;
188  float& new_features_value;
189  eval_gen_data(size_t& features_cnt, float& features_value)
190  : new_features_cnt(features_cnt), new_features_value(features_value)
191  {
192  }
193 };
194 
195 void ft_cnt(eval_gen_data& dat, const float fx, const uint64_t)
196 {
197  ++dat.new_features_cnt;
198  dat.new_features_value += fx * fx;
199 }
200 #endif
201 
202 // lookup table of factorials up tu 21!
203 constexpr int64_t fast_factorial[] = {1, 1, 2, 6, 24, 120, 720, 5040, 40320, 362880, 3628800, 39916800, 479001600,
204  6227020800, 87178291200, 1307674368000, 20922789888000, 355687428096000, 6402373705728000, 121645100408832000,
205  2432902008176640000};
206 constexpr size_t size_fast_factorial = sizeof(fast_factorial) / sizeof(*fast_factorial);
207 
208 // helper factorial function that allows to perform:
209 // n!/(n-k)! = (n-k+1)*(n-k+2)..*(n-1)*n
210 // by specifying n-k as second argument
211 // that helps to avoid size_t overflow for big n
212 // leave second argument = 1 to get regular factorial function
213 
214 inline size_t factor(const size_t n, const size_t start_from = 1)
215 {
216  if (n <= 0)
217  return 1;
218  if (start_from == 1 && n < size_fast_factorial)
219  return (size_t)fast_factorial[n];
220 
221  size_t res = 1;
222  for (size_t i = start_from + 1; i <= n; ++i) res *= i;
223  return res;
224 }
225 
226 // returns number of new features that will be generated for example and sum of their squared values
227 
228 void eval_count_of_generated_ft(vw& all, example& ec, size_t& new_features_cnt, float& new_features_value)
229 {
230  new_features_cnt = 0;
231  new_features_value = 0.;
232 
233  v_array<float> results = v_init<float>();
234 
235  if (all.permutations)
236  {
237  // just multiply precomputed values for all namespaces
238  for (std::string& inter : *ec.interactions)
239  {
240  size_t num_features_in_inter = 1;
241  float sum_feat_sq_in_inter = 1.;
242 
243  for (namespace_index ns : inter)
244  {
245  num_features_in_inter *= ec.feature_space[ns].size();
246  sum_feat_sq_in_inter *= ec.feature_space[ns].sum_feat_sq;
247  if (num_features_in_inter == 0)
248  break;
249  }
250 
251  if (num_features_in_inter == 0)
252  continue;
253 
254  new_features_cnt += num_features_in_inter;
255  new_features_value += sum_feat_sq_in_inter;
256  }
257  }
258  else // case of simple combinations
259  {
260 #ifdef DEBUG_EVAL_COUNT_OF_GEN_FT
261  size_t correct_features_cnt = 0;
262  float correct_features_value = 0.;
263  eval_gen_data dat(correct_features_cnt, correct_features_value);
264  generate_interactions<eval_gen_data, uint64_t, ft_cnt>(all, ec, dat);
265 #endif
266 
267  for (std::string& inter : *ec.interactions)
268  {
269  size_t num_features_in_inter = 1;
270  float sum_feat_sq_in_inter = 1.;
271 
272  for (auto ns = inter.begin(); ns != inter.end(); ++ns)
273  {
274  if ((ns == inter.end() - 1) || (*ns != *(ns + 1))) // neighbour namespaces are different
275  {
276  // just multiply precomputed values
277  const int nsc = *ns;
278  num_features_in_inter *= ec.feature_space[nsc].size();
279  sum_feat_sq_in_inter *= ec.feature_space[nsc].sum_feat_sq;
280  if (num_features_in_inter == 0)
281  break; // one of namespaces has no features - go to next interaction
282  }
283  else // we are at beginning of a block made of same namespace (interaction is preliminary sorted)
284  {
285  // let's find out real length of this block
286  size_t order_of_inter = 2; // alredy compared ns == ns+1
287 
288  for (auto ns_end = ns + 2; ns_end < inter.end(); ++ns_end)
289  if (*ns == *ns_end)
290  ++order_of_inter;
291 
292  // namespace is same for whole block
293  features& fs = ec.feature_space[(const int)*ns];
294 
295  // count number of features with value != 1.;
296  size_t cnt_ft_value_non_1 = 0;
297 
298  // in this block we shall calculate number of generated features and sum of their values
299  // keeping in mind rules applicable for simple combinations instead of permutations
300 
301  // let's calculate sum of their squared value for whole block
302 
303  // ensure results as big as order_of_inter and empty.
304  for (size_t i = 0; i < results.size(); ++i) results[i] = 0.;
305  while (results.size() < order_of_inter) results.push_back(0.);
306 
307  // recurrent value calculations
308  for (size_t i = 0; i < fs.size(); ++i)
309  {
310  const float x = fs.values[i] * fs.values[i];
311 
312  if (!PROCESS_SELF_INTERACTIONS(fs.values[i]))
313  {
314  for (size_t i = order_of_inter - 1; i > 0; --i) results[i] += results[i - 1] * x;
315 
316  results[0] += x;
317  }
318  else
319  {
320  results[0] += x;
321 
322  for (size_t i = 1; i < order_of_inter; ++i) results[i] += results[i - 1] * x;
323 
324  ++cnt_ft_value_non_1;
325  }
326  }
327 
328  sum_feat_sq_in_inter *= results[order_of_inter - 1]; // will be explained in http://bit.ly/1Hk9JX1
329 
330  // let's calculate the number of a new features
331 
332  // if number of features is less than order of interaction then go to the next interaction
333  // as you can't make simple combination of interaction 'aaa' if a contains < 3 features.
334  // unless one of them has value != 1. and we are counting them.
335  const size_t ft_size = fs.size();
336  if (cnt_ft_value_non_1 == 0 && ft_size < order_of_inter)
337  {
338  num_features_in_inter = 0;
339  break;
340  }
341 
342  size_t n;
343  if (cnt_ft_value_non_1 == 0) // number of generated simple combinations is C(n,k)
344  {
345  n = (size_t)choose((int64_t)ft_size, (int64_t)order_of_inter);
346  }
347  else
348  {
349  n = 0;
350  for (size_t l = 0; l <= order_of_inter; ++l)
351  {
352  // C(l+m-1, l) * C(n-m, k-l)
353  size_t num = (l == 0) ? 1 : (size_t)choose(l + cnt_ft_value_non_1 - 1, l);
354 
355  if (ft_size - cnt_ft_value_non_1 >= order_of_inter - l)
356  num *= (size_t)choose(ft_size - cnt_ft_value_non_1, order_of_inter - l);
357  else
358  num = 0;
359 
360  n += num;
361  }
362 
363  } // details on http://bit.ly/1Hk9JX1
364 
365  num_features_in_inter *= n;
366 
367  ns += order_of_inter - 1; // jump over whole block
368  }
369  }
370 
371  if (num_features_in_inter == 0)
372  continue; // signal that values should be ignored (as default value is 1)
373 
374  new_features_cnt += num_features_in_inter;
375  new_features_value += sum_feat_sq_in_inter;
376  }
377 
378 #ifdef DEBUG_EVAL_COUNT_OF_GEN_FT
379  if (correct_features_cnt != new_features_cnt)
380  all.trace_message << "Incorrect new features count " << new_features_cnt << " must be " << correct_features_cnt
381  << std::endl;
382  if (fabs(correct_features_value - new_features_value) > 1e-5)
383  all.trace_message << "Incorrect new features value " << new_features_value << " must be "
384  << correct_features_value << std::endl;
385 #endif
386  }
387 
388  results.delete_v();
389 }
390 
391 } // namespace INTERACTIONS
constexpr unsigned char printable_end
Definition: interactions.h:17
void eval_count_of_generated_ft(vw &all, example &ec, size_t &new_features_cnt, float &new_features_value)
std::vector< std::string > * interactions
the core definition of a set of features.
v_array< feature_value > values
int64_t choose(int64_t n, int64_t k)
Definition: interactions.h:69
size_t size() const
Definition: v_array.h:68
std::array< features, NUM_NAMESPACES > feature_space
size_t size() const
void push_back(const T &new_ele)
Definition: v_array.h:107
bool must_be_left_sorted(const std::string &oi)
Definition: interactions.cc:87
unsigned char namespace_index
constexpr unsigned char printable_start
Definition: interactions.h:16
std::vector< std::string > expand_interactions(const std::vector< std::string > &vec, const size_t required_length, const std::string &err_msg)
Definition: interactions.cc:56
constexpr size_t size_fast_factorial
#define PROCESS_SELF_INTERACTIONS(ft_value)
bool valid_ns(char c)
Definition: example.h:111
constexpr uint64_t a
Definition: rand48.cc:11
constexpr int64_t fast_factorial[]
bool permutations
Definition: global_data.h:454
size_t factor(const size_t n, const size_t start_from=1)
void delete_v()
Definition: v_array.h:98
void expand_namespaces_with_recursion(std::string const &ns, std::vector< std::string > &res, std::string &val, size_t pos)
Definition: interactions.cc:14
void sort_and_filter_duplicate_interactions(std::vector< std::string > &vec, bool filter_duplicates, size_t &removed_cnt, size_t &sorted_cnt)
#define THROW(args)
Definition: vw_exception.h:181