15 std::string
const& ns, std::vector<std::string>& res, std::string& val,
size_t pos)
17 assert(pos <= ns.size());
33 val.push_back(ns[pos]);
57 const std::vector<std::string>& vec,
const size_t required_length,
const std::string& err_msg)
59 std::vector<std::string> res;
61 for (std::string
const& i : vec)
63 const size_t len = i.length();
64 if (required_length > 0 && len != required_length)
71 THROW(
"error, feature interactions must involve at least two namespaces" << err_msg);
92 bool diff_ns_found =
false;
93 bool pair_found =
false;
95 for (
auto i = std::begin(oi); i != std::end(oi) - 1; ++i)
106 diff_ns_found =
true;
117 std::vector<std::string>& vec,
bool filter_duplicates,
size_t& removed_cnt,
size_t& sorted_cnt)
124 std::vector<std::pair<std::string, size_t>> vec_sorted;
125 for (
size_t i = 0; i < vec.size(); ++i)
127 std::string sorted_i(vec[i]);
128 std::stable_sort(std::begin(sorted_i), std::end(sorted_i));
129 vec_sorted.push_back(std::make_pair(sorted_i, i));
132 if (filter_duplicates)
135 std::stable_sort(vec_sorted.begin(), vec_sorted.end(),
136 [](std::pair<std::string, size_t>
const&
a, std::pair<std::string, size_t>
const& b) {
137 return a.first < b.first;
139 auto last = unique(vec_sorted.begin(), vec_sorted.end(),
140 [](std::pair<std::string, size_t>
const&
a, std::pair<std::string, size_t>
const& b) {
141 return a.first == b.first;
143 vec_sorted.erase(last, vec_sorted.end());
146 removed_cnt = vec.size() - vec_sorted.size();
149 std::stable_sort(vec_sorted.begin(), vec_sorted.end(),
150 [](std::pair<std::string, size_t>
const&
a, std::pair<std::string, size_t>
const& b) {
151 return a.second < b.second;
158 std::vector<std::string> res;
159 for (
auto& i : vec_sorted)
164 res.push_back(i.first);
168 res.push_back(vec[i.second]);
184 #ifdef DEBUG_EVAL_COUNT_OF_GEN_FT 187 size_t& new_features_cnt;
188 float& new_features_value;
189 eval_gen_data(
size_t& features_cnt,
float& features_value)
190 : new_features_cnt(features_cnt), new_features_value(features_value)
195 void ft_cnt(eval_gen_data& dat,
const float fx,
const uint64_t)
197 ++dat.new_features_cnt;
198 dat.new_features_value += fx * fx;
203 constexpr int64_t
fast_factorial[] = {1, 1, 2, 6, 24, 120, 720, 5040, 40320, 362880, 3628800, 39916800, 479001600,
204 6227020800, 87178291200, 1307674368000, 20922789888000, 355687428096000, 6402373705728000, 121645100408832000,
205 2432902008176640000};
214 inline size_t factor(
const size_t n,
const size_t start_from = 1)
218 if (start_from == 1 && n < size_fast_factorial)
219 return (
size_t)fast_factorial[n];
222 for (
size_t i = start_from + 1; i <= n; ++i) res *= i;
230 new_features_cnt = 0;
231 new_features_value = 0.;
240 size_t num_features_in_inter = 1;
241 float sum_feat_sq_in_inter = 1.;
247 if (num_features_in_inter == 0)
251 if (num_features_in_inter == 0)
254 new_features_cnt += num_features_in_inter;
255 new_features_value += sum_feat_sq_in_inter;
260 #ifdef DEBUG_EVAL_COUNT_OF_GEN_FT 261 size_t correct_features_cnt = 0;
262 float correct_features_value = 0.;
263 eval_gen_data dat(correct_features_cnt, correct_features_value);
264 generate_interactions<eval_gen_data, uint64_t, ft_cnt>(all, ec, dat);
267 for (std::string& inter : *ec.interactions)
269 size_t num_features_in_inter = 1;
270 float sum_feat_sq_in_inter = 1.;
272 for (
auto ns = inter.begin(); ns != inter.end(); ++ns)
274 if ((ns == inter.end() - 1) || (*ns != *(ns + 1)))
278 num_features_in_inter *= ec.feature_space[nsc].size();
279 sum_feat_sq_in_inter *= ec.feature_space[nsc].sum_feat_sq;
280 if (num_features_in_inter == 0)
286 size_t order_of_inter = 2;
288 for (
auto ns_end = ns + 2; ns_end < inter.end(); ++ns_end)
293 features& fs = ec.feature_space[(
const int)*ns];
296 size_t cnt_ft_value_non_1 = 0;
304 for (
size_t i = 0; i < results.
size(); ++i) results[i] = 0.;
305 while (results.
size() < order_of_inter) results.
push_back(0.);
308 for (
size_t i = 0; i < fs.
size(); ++i)
314 for (
size_t i = order_of_inter - 1; i > 0; --i) results[i] += results[i - 1] * x;
322 for (
size_t i = 1; i < order_of_inter; ++i) results[i] += results[i - 1] * x;
324 ++cnt_ft_value_non_1;
328 sum_feat_sq_in_inter *= results[order_of_inter - 1];
335 const size_t ft_size = fs.
size();
336 if (cnt_ft_value_non_1 == 0 && ft_size < order_of_inter)
338 num_features_in_inter = 0;
343 if (cnt_ft_value_non_1 == 0)
345 n = (size_t)
choose((int64_t)ft_size, (int64_t)order_of_inter);
350 for (
size_t l = 0; l <= order_of_inter; ++l)
353 size_t num = (l == 0) ? 1 : (
size_t)
choose(l + cnt_ft_value_non_1 - 1, l);
355 if (ft_size - cnt_ft_value_non_1 >= order_of_inter - l)
356 num *= (size_t)
choose(ft_size - cnt_ft_value_non_1, order_of_inter - l);
365 num_features_in_inter *= n;
367 ns += order_of_inter - 1;
371 if (num_features_in_inter == 0)
374 new_features_cnt += num_features_in_inter;
375 new_features_value += sum_feat_sq_in_inter;
378 #ifdef DEBUG_EVAL_COUNT_OF_GEN_FT 379 if (correct_features_cnt != new_features_cnt)
380 all.trace_message <<
"Incorrect new features count " << new_features_cnt <<
" must be " << correct_features_cnt
382 if (fabs(correct_features_value - new_features_value) > 1e-5)
383 all.trace_message <<
"Incorrect new features value " << new_features_value <<
" must be " 384 << correct_features_value << std::endl;
constexpr unsigned char printable_end
void eval_count_of_generated_ft(vw &all, example &ec, size_t &new_features_cnt, float &new_features_value)
std::vector< std::string > * interactions
the core definition of a set of features.
v_array< feature_value > values
int64_t choose(int64_t n, int64_t k)
std::array< features, NUM_NAMESPACES > feature_space
void push_back(const T &new_ele)
bool must_be_left_sorted(const std::string &oi)
unsigned char namespace_index
constexpr unsigned char printable_start
std::vector< std::string > expand_interactions(const std::vector< std::string > &vec, const size_t required_length, const std::string &err_msg)
constexpr size_t size_fast_factorial
#define PROCESS_SELF_INTERACTIONS(ft_value)
constexpr int64_t fast_factorial[]
size_t factor(const size_t n, const size_t start_from=1)
void expand_namespaces_with_recursion(std::string const &ns, std::vector< std::string > &res, std::string &val, size_t pos)
void sort_and_filter_duplicate_interactions(std::vector< std::string > &vec, bool filter_duplicates, size_t &removed_cnt, size_t &sorted_cnt)