#if !defined(VW_NO_INLINE_SIMD)
#if !defined(__SSE2__) && (defined(_M_AMD64) || defined(_M_X64))
#define __SSE2__
#endif

#if defined(__ARM_NEON__)
#include <arm_neon.h>
#elif defined(__SSE2__)
#include <xmmintrin.h>
#endif
#endif

#define VERSION_SAVE_RESUME_FIX "7.10.1"
#define VERSION_PASS_UINT64 "8.3.3"

// Quake-style fast inverse square root: a bit-level initial guess refined by
// one round of Newton's method.
static inline float quake_InvSqrt(float x)
{
  float xhalf = 0.5f * x;
  static_assert(sizeof(int) == sizeof(float), "Floats and ints are converted between, they must be the same size.");
  int i = reinterpret_cast<int&>(x);  // store floating-point bits in integer
  i = 0x5f3759d5 - (i >> 1);          // initial guess for Newton's method
  x = reinterpret_cast<float&>(i);    // convert new bits into float
  x = x * (1.5f - xhalf * x * x);     // one round of Newton's method
  return x;
}
static inline float InvSqrt(float x)
{
#if !defined(VW_NO_INLINE_SIMD)
#if defined(__ARM_NEON__)
  // Propagate into vector
  float32x2_t v1 = vdup_n_f32(x);
  // Estimate
  float32x2_t e1 = vrsqrte_f32(v1);
  // Newton-Raphson iteration 1
  float32x2_t e2 = vmul_f32(e1, vrsqrts_f32(v1, vmul_f32(e1, e1)));
  // Newton-Raphson iteration 2
  float32x2_t e3 = vmul_f32(e2, vrsqrts_f32(v1, vmul_f32(e2, e2)));
  return vget_lane_f32(e3, 0);
#elif defined(__SSE2__)
  __m128 eta = _mm_load_ss(&x);
  eta = _mm_rsqrt_ss(eta);
  _mm_store_ss(&x, eta);
#else
  x = quake_InvSqrt(x);
#endif
#else
  x = quake_InvSqrt(x);
#endif
  return x;
}
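// A minimal self-test sketch (not part of the original file): both the NEON
// and SSE paths are hardware reciprocal-square-root estimates, so exact
// equality with 1/sqrt(x) is not expected. The ~0.2% relative-error bound and
// the INVSQRT_SELF_TEST guard below are assumptions for illustration only.
#ifdef INVSQRT_SELF_TEST
#include <cassert>
#include <cmath>
#include <initializer_list>
static void invsqrt_self_test()
{
  for (float x : {0.25f, 1.f, 2.f, 100.f, 1e6f})
  {
    float exact = 1.f / std::sqrt(x);
    assert(std::fabs(InvSqrt(x) - exact) <= 2e-3f * exact);
  }
}
#endif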
template <bool sqrt_rate, bool feature_mask_off, size_t adaptive, size_t normalized, size_t spare>
inline void update_feature(float& update, float x, float& fw)
{
  weight* w = &fw;
  if (feature_mask_off || fw != 0.)
  {
    if (spare != 0) x *= w[spare];  // the spare slot caches the per-feature rate decay
    w[0] += update * x;
  }
}
// this deals with the few-nonzero-features case (defaults to a multiplier of 1.)
template <bool sqrt_rate, size_t adaptive, size_t normalized>
float average_update(float total_weight, float normalized_sum_norm_x, float neg_norm_power)
{
  if (normalized)
  {
    if (sqrt_rate)
    {
      float avg_norm = (float)(total_weight / normalized_sum_norm_x);
      if (adaptive)
        return std::sqrt(avg_norm);
      else
        return avg_norm;
    }
    else
      return powf((float)(normalized_sum_norm_x / total_weight), neg_norm_power);
  }
  return 1.f;
}
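// Worked example (illustration, not original code): with sqrt_rate, adaptive
// and normalized all on, total_weight = 100 of accumulated importance and
// normalized_sum_norm_x = 400 of accumulated squared norms give
// avg_norm = 100/400 = 0.25 and a multiplier of sqrt(0.25) = 0.5, shrinking
// updates when feature norms run large relative to the example weight seen.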
template <bool sqrt_rate, bool feature_mask_off, size_t adaptive, size_t normalized, size_t spare>
void train(gd& g, example& ec, float update)
{
  if (normalized) update *= g.update_multiplier;
  foreach_feature<float, update_feature<sqrt_rate, feature_mask_off, adaptive, normalized, spare> >(
      *g.all, ec, update);
}
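// A minimal sketch (assumed semantics, simplified types; everything named here
// is hypothetical) of the foreach_feature pattern used throughout this file:
// the second template argument is a per-feature callback invoked with shared
// state, the feature value x, and a mutable reference into the weight vector.
template <class StateT, void (*FuncT)(StateT&, float, float&)>
void foreach_feature_sketch(float* values, float* weights, size_t n, StateT& state)
{
  // each pair (values[i], weights[i]) models (feature value x, weight slot fw)
  for (size_t i = 0; i < n; i++) FuncT(state, values[i], weights[i]);
}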
void audit_interaction(audit_results& dat, const audit_strings* f)
{
  if (f == nullptr)  // end of an interaction: pop the last namespace prefix
  {
    if (!dat.ns_pre.empty()) dat.ns_pre.pop_back();
    return;
  }

  std::string ns_pre;
  if (!dat.ns_pre.empty()) ns_pre += '*';

  if (f->first != "" && ((f->first) != " "))
  {
    ns_pre.append(f->first);
    ns_pre += '^';
  }
  if (f->second != "") ns_pre.append(f->second);
  if (!ns_pre.empty()) dat.ns_pre.push_back(ns_pre);
}
void audit_feature(audit_results& dat, const float ft_weight, const uint64_t ft_idx)
{
  parameters& weights = dat.all.weights;
  uint64_t index = ft_idx & weights.mask();
  size_t stride_shift = weights.stride_shift();

  std::string ns_pre;
  for (std::string& s : dat.ns_pre) ns_pre += s;

  if (dat.all.audit)
  {
    std::ostringstream tempstream;
    tempstream << ':' << (index >> stride_shift) << ':' << ft_weight << ':'
               << trunc_weight(weights[index], (float)dat.all.sd->gravity) * (float)dat.all.sd->contraction;
    if (dat.all.adaptive) tempstream << '@' << (&weights[index])[1];
    string_value sv = {weights[index] * ft_weight, ns_pre + tempstream.str()};
    dat.results.push_back(sv);
  }

  if ((dat.all.current_pass == 0 || dat.all.training == false) && dat.all.hash_inv)
  {
    if (dat.offset != 0)  // otherwise --oaa output no features for class > 0.
    {
      std::ostringstream tempstream;
      tempstream << '[' << (dat.offset >> stride_shift) << ']';
      ns_pre += tempstream.str();
    }
    if (!dat.all.name_index_map.count(ns_pre))
      dat.all.name_index_map.insert(std::map<std::string, size_t>::value_type(ns_pre, index >> stride_shift));
  }
}
void print_lda_features(vw& all, example& ec)
{
  parameters& weights = all.weights;
  uint32_t stride_shift = weights.stride_shift();
  size_t count = 0;
  for (features& fs : ec) count += fs.size();
  for (features& fs : ec)
    for (features::iterator_all& f : fs.values_indices_audit())
    {
      std::cout << '\t' << f.audit().get()->first << '^' << f.audit().get()->second << ':'
                << ((f.index() >> stride_shift) & all.parse_mask);
      for (size_t k = 0; k < all.lda; k++) std::cout << ':' << (&weights[f.index()])[k];
    }
  std::cout << " total of " << count << " features." << std::endl;
}

void print_features(vw& all, example& ec)
{
  if (all.lda > 0)
    print_lda_features(all, ec);
  else
  {
    audit_results dat(all, ec.ft_offset);
    for (features& fs : ec)
    {
      if (fs.space_names.size() > 0)
        for (features::iterator_all& f : fs.values_indices_audit())
        {
          audit_interaction(dat, f.audit().get());
          audit_feature(dat, f.value(), f.index() + ec.ft_offset);
          audit_interaction(dat, nullptr);
        }
      else
        for (features::iterator& f : fs) audit_feature(dat, f.value(), f.index() + ec.ft_offset);
    }
    INTERACTIONS::generate_interactions<audit_results, const uint64_t, audit_feature, true, audit_interaction>(
        all, ec, dat);
    stable_sort(dat.results.begin(), dat.results.end());
    if (all.audit)
    {
      for (string_value& sv : dat.results) std::cout << '\t' << sv.s;
      std::cout << std::endl;
    }
  }
}
float finalize_prediction(shared_data* sd, float ret)
{
  if (std::isnan(ret))
  {
    ret = 0.;
    std::cerr << "NAN prediction in example " << sd->example_number + 1 << ", forcing " << ret << std::endl;
    return ret;
  }
  if (ret > sd->max_label) return (float)sd->max_label;
  if (ret < sd->min_label) return (float)sd->min_label;
  return ret;
}
inline float trunc_predict(vw& all, example& ec, double gravity)
{
  trunc_data temp = {ec.l.simple.initial, (float)gravity};
  foreach_feature<trunc_data, vec_add_trunc>(all, ec, temp);
  return temp.prediction;
}

inline void vec_add_print(float& p, const float fx, float& fw)
{
  p += fw * fx;
  std::cerr << " + " << fw << "*" << fx;
}
template <bool l1, bool audit>
void predict(gd& g, base_learner&, example& ec)
{
  vw& all = *g.all;
  if (l1)
    ec.partial_prediction = trunc_predict(all, ec, all.sd->gravity);
  else
    ec.partial_prediction = inline_predict(all, ec);
  ec.pred.scalar = finalize_prediction(all.sd, ec.partial_prediction * (float)all.sd->contraction);
  if (audit) print_audit_features(all, ec);
}
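// Background sketch (a reading of the code above, illustration only): L1
// regularization is applied lazily at prediction time. The accumulated
// "gravity" soft-thresholds each weight before it enters the dot product,
// which is what trunc_weight/vec_add_trunc implement for the l1 path of
// predict.
static inline float trunc_weight_sketch(float w, float gravity)
{
  if (w > gravity) return w - gravity;   // shrink positive weights down
  if (w < -gravity) return w + gravity;  // shrink negative weights up
  return 0.f;                            // magnitude below gravity: treated as zero
}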
template <bool l1, bool audit>
void multipredict(
    gd& g, base_learner&, example& ec, size_t count, size_t step, polyprediction* pred, bool finalize_predictions)
{
  vw& all = *g.all;
  for (size_t c = 0; c < count; c++) pred[c].scalar = ec.l.simple.initial;
  if (g.all->weights.sparse)
  {
    multipredict_info<sparse_parameters> mp = {
        count, step, pred, g.all->weights.sparse_weights, (float)all.sd->gravity};
    foreach_feature<multipredict_info<sparse_parameters>, uint64_t, vec_add_multipredict>(all, ec, mp);
  }
  else
  {
    multipredict_info<dense_parameters> mp = {count, step, pred, g.all->weights.dense_weights, (float)all.sd->gravity};
    foreach_feature<multipredict_info<dense_parameters>, uint64_t, vec_add_multipredict>(all, ec, mp);
  }
  if (all.sd->contraction != 1.)
    for (size_t c = 0; c < count; c++) pred[c].scalar *= (float)all.sd->contraction;
  if (finalize_predictions)
    for (size_t c = 0; c < count; c++) pred[c].scalar = finalize_prediction(all.sd, pred[c].scalar);
  if (audit)
  {
    for (size_t c = 0; c < count; c++)
    {
      ec.pred.scalar = pred[c].scalar;
      print_audit_features(all, ec);
      ec.ft_offset += (uint64_t)step;
    }
    ec.ft_offset -= (uint64_t)(step * count);
  }
}
template <bool sqrt_rate, size_t adaptive, size_t normalized>
inline float compute_rate_decay(power_data& s, float& fw)
{
  weight* w = &fw;
  float rate_decay = 1.f;
  if (adaptive)
  {
    if (sqrt_rate)
      rate_decay = InvSqrt(w[adaptive]);
    else
      rate_decay = powf(w[adaptive], s.minus_power_t);
  }
  if (normalized)
  {
    if (sqrt_rate)
    {
      float inv_norm = 1.f / w[normalized];
      if (adaptive)
        rate_decay *= inv_norm;
      else
        rate_decay *= inv_norm * inv_norm;
    }
    else
      rate_decay *= powf(w[normalized] * w[normalized], s.neg_norm_power);
  }
  return rate_decay;
}
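// A toy version (assumed slot layout, illustration only) of the sqrt_rate
// case above: with both adaptive and normalized on, a feature's slots are
// {weight, accumulated g^2 * x^2, max |x| seen}, and the decay is an
// AdaGrad-style 1/sqrt(sum of squared gradients) times 1/(largest magnitude).
#include <cmath>  // harmless if already included above
static inline float toy_rate_decay(const float slots[3])
{
  return (1.f / std::sqrt(slots[1])) * (1.f / slots[2]);
}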
struct norm_data
{
  float grad_squared;
  float pred_per_update;
  float norm_x;
  power_data pd;
  float extra_state[4];
};

constexpr float x_min = 1.084202e-19f;
constexpr float x2_max = FLT_MAX;

template <bool sqrt_rate, bool feature_mask_off, size_t adaptive, size_t normalized, size_t spare, bool stateless>
inline void pred_per_update_feature(norm_data& nd, float x, float& fw)
{
  if (feature_mask_off || fw != 0.)
  {
    weight* w = &fw;
    float x2 = x * x;
    if (x2 < x_min * x_min)
    {
      x = (x > 0) ? x_min : -x_min;
      x2 = x_min * x_min;
    }
    if (x2 > x2_max) THROW("your features have too much magnitude");
    if (stateless)  // we must not modify the parameter state, so work on a shadow copy
    {
      nd.extra_state[0] = w[0];
      nd.extra_state[adaptive] = w[adaptive];
      nd.extra_state[normalized] = w[normalized];
      w = nd.extra_state;
    }
    if (adaptive) w[adaptive] += nd.grad_squared * x2;
    if (normalized)
    {
      float x_abs = fabsf(x);
      if (x_abs > w[normalized])  // new scale discovered
      {
        // If the normalizer is > 0, rescale the weight so it's as if the new
        // scale had always been the scale.
        if (w[normalized] > 0.)
        {
          if (sqrt_rate)
          {
            float rescale = w[normalized] / x_abs;
            w[0] *= (adaptive ? rescale : rescale * rescale);
          }
          else
          {
            float rescale = x_abs / w[normalized];
            w[0] *= powf(rescale * rescale, nd.pd.neg_norm_power);
          }
        }
        w[normalized] = x_abs;
      }
      nd.norm_x += x2 / (w[normalized] * w[normalized]);
    }
    w[spare] = compute_rate_decay<sqrt_rate, adaptive, normalized>(nd.pd, w[0]);
    nd.pred_per_update += x2 * w[spare];
  }
}
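// Worked example of the rescaling above (illustration): suppose sqrt_rate and
// adaptive are on, and a feature whose largest magnitude seen so far is
// w[normalized] = 2 receives a value x = 8. Then rescale = 2/8 = 0.25 and the
// weight is scaled by 0.25, exactly as if past updates had been made against
// the new, larger scale, before w[normalized] is raised to 8.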
template <bool sqrt_rate, bool feature_mask_off, bool adax, size_t adaptive, size_t normalized, size_t spare,
    bool stateless>
float get_pred_per_update(gd& g, example& ec)
{
  // We must traverse the features in _precisely_ the same order as during training.
  label_data& ld = ec.l.simple;
  vw& all = *g.all;

  float grad_squared = ec.weight;
  if (!adax) grad_squared *= all.loss->getSquareGrad(ec.pred.scalar, ld.label);
  if (grad_squared == 0 && !stateless) return 1.;

  norm_data nd = {grad_squared, 0., 0., {g.neg_power_t, g.neg_norm_power}, {0}};
  foreach_feature<norm_data,
      pred_per_update_feature<sqrt_rate, feature_mask_off, adaptive, normalized, spare, stateless> >(all, ec, nd);
  if (normalized)
  {
    if (!stateless)
    {
      g.all->normalized_sum_norm_x += ((double)ec.weight) * nd.norm_x;
      g.total_weight += ec.weight;
      g.update_multiplier = average_update<sqrt_rate, adaptive, normalized>(
          (float)g.total_weight, (float)g.all->normalized_sum_norm_x, g.neg_norm_power);
    }
    else
    {
      float nsnx = ((float)g.all->normalized_sum_norm_x) + ec.weight * nd.norm_x;
      float tw = (float)g.total_weight + ec.weight;
      g.update_multiplier = average_update<sqrt_rate, adaptive, normalized>(tw, nsnx, g.neg_norm_power);
    }
    nd.pred_per_update *= g.update_multiplier;
  }
  return nd.pred_per_update;
}
template <bool sqrt_rate, bool feature_mask_off, bool adax, size_t adaptive, size_t normalized, size_t spare,
    bool stateless>
float sensitivity(gd& g, example& ec)
{
  if (adaptive || normalized)
    return get_pred_per_update<sqrt_rate, feature_mask_off, adax, adaptive, normalized, spare, stateless>(g, ec);
  else
    return ec.total_sum_feat_sq;
}
template <size_t adaptive>
float get_scale(gd& g, example& /* ec */, float weight)
{
  float update_scale = g.all->eta * weight;
  if (!adaptive)
  {
    float t = (float)(g.all->sd->t + weight - g.all->sd->weighted_holdout_examples);
    update_scale *= powf(t, g.neg_power_t);
  }
  return update_scale;
}
template <bool sqrt_rate, bool feature_mask_off, bool adax, size_t adaptive, size_t normalized, size_t spare>
float sensitivity(gd& g, base_learner& /* base */, example& ec)
{
  return get_scale<adaptive>(g, ec, 1.) *
      sensitivity<sqrt_rate, feature_mask_off, adax, adaptive, normalized, spare, true>(g, ec);
}
template <bool sparse_l2, bool invariant, bool sqrt_rate, bool feature_mask_off, bool adax, size_t adaptive,
    size_t normalized, size_t spare>
float compute_update(gd& g, example& ec)
{
  // invariant: not a test label, importance weight > 0
  label_data& ld = ec.l.simple;
  vw& all = *g.all;

  float update = 0.;
  ec.updated_prediction = ec.pred.scalar;
  if (all.loss->getLoss(all.sd, ec.pred.scalar, ld.label) > 0.)
  {
    float pred_per_update = sensitivity<sqrt_rate, feature_mask_off, adax, adaptive, normalized, spare, false>(g, ec);
    float update_scale = get_scale<adaptive>(g, ec, ec.weight);
    if (invariant)
      update = all.loss->getUpdate(ec.pred.scalar, ld.label, update_scale, pred_per_update);
    else
      update = all.loss->getUnsafeUpdate(ec.pred.scalar, ld.label, update_scale);
    ec.updated_prediction += pred_per_update * update;

    if (all.reg_mode && fabs(update) > 1e-8)
    {
      double dev1 = all.loss->first_derivative(all.sd, ec.pred.scalar, ld.label);
      double eta_bar = (fabs(dev1) > 1e-8) ? (-update / dev1) : 0.0;
      if (fabs(dev1) > 1e-8) all.sd->contraction *= (1. - all.l2_lambda * eta_bar);
      update /= (float)all.sd->contraction;
      all.sd->gravity += eta_bar * all.l1_lambda;
    }
  }

  if (sparse_l2) update -= g.sparse_l2 * ec.pred.scalar;
  return update;
}
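// Small numeric sketch (illustration; the values are made up): the reg_mode
// branch above implements lazy global regularization. If the loss derivative
// dev1 = -2.0 and update = 0.5, then eta_bar = -update/dev1 = 0.25; the
// contraction shrinks by a factor (1 - l2_lambda * 0.25) for L2, and gravity
// grows by 0.25 * l1_lambda for L1, both applied to all weights implicitly at
// prediction time rather than by touching every weight now.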
template <bool sparse_l2, bool invariant, bool sqrt_rate, bool feature_mask_off, bool adax, size_t adaptive,
    size_t normalized, size_t spare>
void update(gd& g, base_learner&, example& ec)
{
  // invariant: not a test label, importance weight > 0
  float update;
  if ((update = compute_update<sparse_l2, invariant, sqrt_rate, feature_mask_off, adax, adaptive, normalized, spare>(
           g, ec)) != 0.)
    train<sqrt_rate, feature_mask_off, adaptive, normalized, spare>(g, ec, update);

  if (g.all->sd->contraction < 1e-9 || g.all->sd->gravity > 1e3)  // fold in regularization now to avoid numerical instability
    sync_weights(*g.all);
}
template <bool sparse_l2, bool invariant, bool sqrt_rate, bool feature_mask_off, bool adax, size_t adaptive,
    size_t normalized, size_t spare>
void learn(gd& g, base_learner& base, example& ec)
{
  // invariant: not a test label, importance weight > 0
  assert(ec.in_use);
  assert(ec.l.simple.label != FLT_MAX);
  assert(ec.weight > 0.);
  g.predict(g, base, ec);
  update<sparse_l2, invariant, sqrt_rate, feature_mask_off, adax, adaptive, normalized, spare>(g, base, ec);
}
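// A minimal driver sketch (assumed surrounding API, simplified; hypothetical
// loop, not original code): online learning here is predict-then-update, so
// progressive validation loss is measured on a prediction made before the
// example's own gradient step:
//
//   for each example ec:
//     g.predict(g, base, ec);           // uses current weights
//     accumulate loss(ec.pred.scalar);  // progressive validation
//     update<...>(g, base, ec);         // then take the gradient step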
size_t write_index(io_buf& model_file, std::stringstream& msg, bool text, uint32_t num_bits, uint64_t i)
{
  size_t brw;
  uint32_t old_i = 0;
  msg << i;
  if (num_bits < 31)
  {
    old_i = (uint32_t)i;
    brw = bin_text_write_fixed(model_file, (char*)&old_i, sizeof(old_i), msg, text);
  }
  else
    brw = bin_text_write_fixed(model_file, (char*)&i, sizeof(i), msg, text);
  return brw;
}

template <class T>
void save_load_regressor(vw& all, io_buf& model_file, bool read, bool text, T& weights)
{
  size_t brw = 1;

  if (all.print_invert)  // write a readable model with feature names
  {
    std::stringstream msg;
    typedef std::map<std::string, size_t> str_int_map;
    for (str_int_map::iterator it = all.name_index_map.begin(); it != all.name_index_map.end(); ++it)
    {
      weight* v = &weights.strided_index(it->second);
      if (*v != 0.)
      {
        msg << it->first;
        brw = bin_text_write_fixed(model_file, (char*)it->first.c_str(), sizeof(*it->first.c_str()), msg, true);
        msg << ":" << it->second << ":" << *v << "\n";
        brw += bin_text_write_fixed(model_file, (char*)v, sizeof(*v), msg, true);
      }
    }
    return;
  }
  uint64_t i = 0;
  uint32_t old_i = 0;
  uint64_t length = (uint64_t)1 << all.num_bits;
  if (read)
    do
    {
      brw = 1;
      if (all.num_bits < 31)  // backwards compatible
      {
        brw = model_file.bin_read_fixed((char*)&old_i, sizeof(old_i), "");
        i = old_i;
      }
      else
        brw = model_file.bin_read_fixed((char*)&i, sizeof(i), "");
      if (brw > 0)
      {
        if (i >= length)
          THROW("Model content is corrupted, weight vector index " << i << " must be less than total vector length "
                                                                   << length);
        weight* v = &weights.strided_index(i);
        brw += model_file.bin_read_fixed((char*)&(*v), sizeof(*v), "");
      }
    } while (brw > 0);
  else  // write: one "index:value" record per nonzero weight
    for (typename T::iterator v = weights.begin(); v != weights.end(); ++v)
      if (*v != 0.)
      {
        i = v.index() >> weights.stride_shift();
        std::stringstream msg;
        msg << i;
        brw = write_index(model_file, msg, text, all.num_bits, i);
        msg << ":" << *v << "\n";
        brw += bin_text_write_fixed(model_file, (char*)&(*v), sizeof(*v), msg, text);
      }
}
template <class T>
void save_load_online_state(
    vw& all, io_buf& model_file, bool read, bool text, gd* g, std::stringstream& msg, uint32_t ftrl_size, T& weights)
{
  uint64_t length = (uint64_t)1 << all.num_bits;

  uint64_t i = 0;
  uint32_t old_i = 0;
  size_t brw = 1;

  if (read)
    do
    {
      brw = 1;
      if (all.num_bits < 31)  // backwards compatible
      {
        brw = model_file.bin_read_fixed((char*)&old_i, sizeof(old_i), "");
        i = old_i;
      }
      else
        brw = model_file.bin_read_fixed((char*)&i, sizeof(i), "");
      if (brw > 0)
      {
        if (i >= length)
          THROW("Model content is corrupted, weight vector index " << i << " must be less than total vector length "
                                                                   << length);
        weight buff[8] = {0, 0, 0, 0, 0, 0, 0, 0};
        if (ftrl_size > 0)
          brw += model_file.bin_read_fixed((char*)buff, sizeof(buff[0]) * ftrl_size, "");
        else if (g == nullptr || (!g->adaptive && !g->normalized))
          brw += model_file.bin_read_fixed((char*)buff, sizeof(buff[0]), "");
        else if ((g->adaptive && !g->normalized) || (!g->adaptive && g->normalized))
          brw += model_file.bin_read_fixed((char*)buff, sizeof(buff[0]) * 2, "");
        else  // adaptive and normalized
          brw += model_file.bin_read_fixed((char*)buff, sizeof(buff[0]) * 3, "");
        uint32_t stride = 1 << weights.stride_shift();
        weight* v = &weights.strided_index(i);
        for (size_t j = 0; j < stride; j++) v[j] = buff[j];
      }
    } while (brw > 0);
  else  // write binary or text; the slot count per weight depends on the reduction:
        // ftrl_size 3/4/6 for FTRL variants, 1 for plain GD, 2 for adaptive-or-
        // normalized, 3 for adaptive-and-normalized.
    for (typename T::iterator v = weights.begin(); v != weights.end(); ++v)
    {
      i = v.index() >> weights.stride_shift();
      if (ftrl_size == 3)
      {
        if (*v != 0. || (&(*v))[1] != 0. || (&(*v))[2] != 0.)
        {
          std::stringstream msg;
          msg << i;
          brw = write_index(model_file, msg, text, all.num_bits, i);
          msg << ":" << *v << " " << (&(*v))[1] << " " << (&(*v))[2] << "\n";
          brw += bin_text_write_fixed(model_file, (char*)&(*v), 3 * sizeof(*v), msg, text);
        }
      }
      else if (ftrl_size == 4)
      {
        if (*v != 0. || (&(*v))[1] != 0. || (&(*v))[2] != 0. || (&(*v))[3] != 0.)
        {
          std::stringstream msg;
          msg << i;
          brw = write_index(model_file, msg, text, all.num_bits, i);
          msg << ":" << *v << " " << (&(*v))[1] << " " << (&(*v))[2] << " " << (&(*v))[3] << "\n";
          brw += bin_text_write_fixed(model_file, (char*)&(*v), 4 * sizeof(*v), msg, text);
        }
      }
      else if (ftrl_size == 6)
      {
        if (*v != 0. || (&(*v))[1] != 0. || (&(*v))[2] != 0. || (&(*v))[3] != 0. || (&(*v))[4] != 0. ||
            (&(*v))[5] != 0.)
        {
          std::stringstream msg;
          msg << i;
          brw = write_index(model_file, msg, text, all.num_bits, i);
          msg << ":" << *v << " " << (&(*v))[1] << " " << (&(*v))[2] << " " << (&(*v))[3] << " " << (&(*v))[4] << " "
              << (&(*v))[5] << "\n";
          brw += bin_text_write_fixed(model_file, (char*)&(*v), 6 * sizeof(*v), msg, text);
        }
      }
      else if (g == nullptr || (!g->adaptive && !g->normalized))
      {
        if (*v != 0.)
        {
          std::stringstream msg;
          msg << i;
          brw = write_index(model_file, msg, text, all.num_bits, i);
          msg << ":" << *v << "\n";
          brw += bin_text_write_fixed(model_file, (char*)&(*v), sizeof(*v), msg, text);
        }
      }
      else if ((g->adaptive && !g->normalized) || (!g->adaptive && g->normalized))
      {
        if (*v != 0. || (&(*v))[1] != 0.)
        {
          std::stringstream msg;
          msg << i;
          brw = write_index(model_file, msg, text, all.num_bits, i);
          msg << ":" << *v << " " << (&(*v))[1] << "\n";
          brw += bin_text_write_fixed(model_file, (char*)&(*v), 2 * sizeof(*v), msg, text);
        }
      }
      else  // adaptive and normalized
      {
        if (*v != 0. || (&(*v))[1] != 0. || (&(*v))[2] != 0.)
        {
          std::stringstream msg;
          msg << i;
          brw = write_index(model_file, msg, text, all.num_bits, i);
          msg << ":" << *v << " " << (&(*v))[1] << " " << (&(*v))[2] << "\n";
          brw += bin_text_write_fixed(model_file, (char*)&(*v), 3 * sizeof(*v), msg, text);
        }
      }
    }
}
void save_load_online_state(
    vw& all, io_buf& model_file, bool read, bool text, double& total_weight, gd* g, uint32_t ftrl_size)
{
  std::stringstream msg;

  msg << "initial_t " << all.initial_t << "\n";
  bin_text_read_write_fixed(model_file, (char*)&all.initial_t, sizeof(all.initial_t), "", read, msg, text);

  msg << "norm normalizer " << all.normalized_sum_norm_x << "\n";
  bin_text_read_write_fixed(
      model_file, (char*)&all.normalized_sum_norm_x, sizeof(all.normalized_sum_norm_x), "", read, msg, text);

  msg << "t " << all.sd->t << "\n";
  bin_text_read_write_fixed(model_file, (char*)&all.sd->t, sizeof(all.sd->t), "", read, msg, text);

  msg << "sum_loss " << all.sd->sum_loss << "\n";
  bin_text_read_write_fixed(model_file, (char*)&all.sd->sum_loss, sizeof(all.sd->sum_loss), "", read, msg, text);

  msg << "dump_interval " << all.sd->dump_interval << "\n";
  bin_text_read_write_fixed(
      model_file, (char*)&all.sd->dump_interval, sizeof(all.sd->dump_interval), "", read, msg, text);

  // ... the remaining shared_data counters are serialized the same way ...

  msg << "total_weight " << total_weight << "\n";
  bin_text_read_write_fixed(model_file, (char*)&total_weight, sizeof(total_weight), "", read, msg, text);

  if (all.weights.sparse)
    save_load_online_state(all, model_file, read, text, g, msg, ftrl_size, all.weights.sparse_weights);
  else
    save_load_online_state(all, model_file, read, text, g, msg, ftrl_size, all.weights.dense_weights);
}
template <class T>
class set_initial_gd_wrapper
{
 public:
  static void func(weight& w, std::pair<float, float>& initial, uint64_t /* index */)
  {
    w = initial.first;
    (&w)[1] = initial.second;
  }
};

// At weight-initialization time the pair applied through the wrapper is
//   std::pair<float, float> p = std::make_pair(init_weight, all.initial_t);
// so every weight starts at init_weight with its adjacent slot seeded to initial_t.
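// A tiny illustration (sketch, not original code) of the slot aliasing used
// above: weights are stored as contiguous float blocks of size
// (1 << stride_shift), so (&w)[1] addresses the slot immediately after the
// weight within the same block.
static void seed_block_sketch(float* block, float init_weight, float initial_t)
{
  block[0] = init_weight;  // the model weight itself, w
  block[1] = initial_t;    // the adjacent accumulator slot, (&w)[1]
}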
void save_load(gd& g, io_buf& model_file, bool read, bool text)
{
  vw& all = *g.all;
  if (model_file.files.size() > 0)
  {
    bool resume = all.save_resume;
    std::stringstream msg;
    msg << ":" << resume << "\n";
    bin_text_read_write_fixed(model_file, (char*)&resume, sizeof(resume), "", read, msg, text);
    if (resume)
    {
      if (read && all.model_file_ver < VERSION_SAVE_RESUME_FIX)
        all.trace_message
            << "WARNING: --save_resume functionality is known to have inaccuracy in model files version less than "
            << VERSION_SAVE_RESUME_FIX << std::endl;
      save_load_online_state(all, model_file, read, text, g.total_weight, &g);
    }
    else
      save_load_regressor(all, model_file, read, text);
  }
}
template <bool sparse_l2, bool invariant, bool sqrt_rate, bool feature_mask_off, uint64_t adaptive,
    uint64_t normalized, uint64_t spare, uint64_t next>
uint64_t set_learn(vw& all, gd& g)
{
  all.normalized_idx = normalized;
  if (g.adax)
  {
    g.learn = learn<sparse_l2, invariant, sqrt_rate, feature_mask_off, true, adaptive, normalized, spare>;
    g.update = update<sparse_l2, invariant, sqrt_rate, feature_mask_off, true, adaptive, normalized, spare>;
    g.sensitivity = sensitivity<sqrt_rate, feature_mask_off, true, adaptive, normalized, spare>;
    return next;
  }
  else
  {
    g.learn = learn<sparse_l2, invariant, sqrt_rate, feature_mask_off, false, adaptive, normalized, spare>;
    g.update = update<sparse_l2, invariant, sqrt_rate, feature_mask_off, false, adaptive, normalized, spare>;
    g.sensitivity = sensitivity<sqrt_rate, feature_mask_off, false, adaptive, normalized, spare>;
    return next;
  }
}
template <bool sparse_l2, bool invariant, bool sqrt_rate, uint64_t adaptive, uint64_t normalized, uint64_t spare,
    uint64_t next>
uint64_t set_learn(vw& all, bool feature_mask_off, gd& g)
{
  if (feature_mask_off)
    return set_learn<sparse_l2, invariant, sqrt_rate, true, adaptive, normalized, spare, next>(all, g);
  else
    return set_learn<sparse_l2, invariant, sqrt_rate, false, adaptive, normalized, spare, next>(all, g);
}
template <bool invariant, bool sqrt_rate, uint64_t adaptive, uint64_t normalized, uint64_t spare, uint64_t next>
uint64_t set_learn(vw& all, bool feature_mask_off, gd& g)
{
  if (g.sparse_l2 > 0.f)
    return set_learn<true, invariant, sqrt_rate, adaptive, normalized, spare, next>(all, feature_mask_off, g);
  else
    return set_learn<false, invariant, sqrt_rate, adaptive, normalized, spare, next>(all, feature_mask_off, g);
}
template <bool sqrt_rate, uint64_t adaptive, uint64_t normalized, uint64_t spare, uint64_t next>
uint64_t set_learn(vw& all, bool feature_mask_off, gd& g)
{
  if (all.invariant_updates)
    return set_learn<true, sqrt_rate, adaptive, normalized, spare, next>(all, feature_mask_off, g);
  else
    return set_learn<false, sqrt_rate, adaptive, normalized, spare, next>(all, feature_mask_off, g);
}
template <bool sqrt_rate, uint64_t adaptive, uint64_t spare>
uint64_t set_learn(vw& all, bool feature_mask_off, gd& g)
{
  // select the slot layout based on whether normalized updates are in use
  if (all.normalized_updates)
    return set_learn<sqrt_rate, adaptive, adaptive + 1, adaptive + 2, adaptive + 3>(all, feature_mask_off, g);
  else
    return set_learn<sqrt_rate, adaptive, 0, spare, spare + 1>(all, feature_mask_off, g);
}
template <bool sqrt_rate>
uint64_t set_learn(vw& all, bool feature_mask_off, gd& g)
{
  if (all.adaptive)
    return set_learn<sqrt_rate, 1, 2>(all, feature_mask_off, g);
  else
    return set_learn<sqrt_rate, 0, 0>(all, feature_mask_off, g);
}
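// A standalone sketch (illustration, hypothetical names) of the dispatch
// technique used by the set_learn chain above: each layer tests one runtime
// flag and recurses with it burned in as a template argument, so the
// innermost code runs fully specialized with no per-feature branching.
template <bool adaptive, bool normalized>
static int toy_learn()
{
  return (adaptive ? 2 : 0) + (normalized ? 1 : 0);  // stands in for a specialized learner
}

template <bool adaptive>
static int toy_set_learn(bool normalized)
{
  if (normalized) return toy_learn<adaptive, true>();
  else return toy_learn<adaptive, false>();
}

static int toy_set_learn(bool adaptive, bool normalized)
{
  if (adaptive) return toy_set_learn<true>(normalized);
  else return toy_set_learn<false>(normalized);
}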
base_learner* setup(options_i& options, vw& all)
{
  auto g = scoped_calloc_or_throw<gd>();

  bool sgd = false;
  bool adaptive = false;
  bool adax = false;
  bool invariant = false;
  bool normalized = false;

  option_group_definition new_options("Gradient Descent options");
  new_options.add(make_option("sgd", sgd).help("use regular stochastic gradient descent update."))
      .add(make_option("adaptive", adaptive).help("use adaptive, individual learning rates."))
      .add(make_option("adax", adax).help("use adaptive learning rates with x^2 instead of g^2x^2"))
      .add(make_option("invariant", invariant).help("use safe/importance aware updates."))
      .add(make_option("normalized", normalized).help("use per feature normalized updates"))
      .add(make_option("sparse_l2", g->sparse_l2)
               .default_value(0.f)
               .help("degree of l2 regularization applied to activated sparse parameters"))
      .add(make_option("l1_state", all.sd->gravity)
               .default_value(0.)
               .help("amount of accumulated implicit l1 regularization"))
      .add(make_option("l2_state", all.sd->contraction)
               .default_value(1.)
               .help("amount of accumulated implicit l2 regularization"));
  options.add_and_parse(new_options);

  g->all = &all;
  g->all->normalized_sum_norm_x = 0;
  g->no_win_counter = 0;
  g->total_weight = 0.;
  g->neg_norm_power = (all.adaptive ? (all.power_t - 1.f) : -1.f);
  g->neg_power_t = -all.power_t;

  if (all.initial_t > 0)  // for the normalized update: an initial_t > 0 is interpreted as if we had
                          // seen (all.initial_t) previous fake datapoints, all with norm 1
  {
    g->all->normalized_sum_norm_x = all.initial_t;
    g->total_weight = all.initial_t;
  }

  bool feature_mask_off = true;
  if (options.was_supplied("feature_mask")) feature_mask_off = false;

  if (!all.holdout_set_off)
  {
    all.sd->holdout_best_loss = FLT_MAX;
    g->early_stop_thres = options.get_typed_option<size_t>("early_terminate").value();
  }

  if (sgd || adaptive || invariant || normalized)
  {
    // nondefault: enable exactly the update rules requested
    all.adaptive = all.training && adaptive;
    all.invariant_updates = all.training && invariant;
    all.normalized_updates = all.training && normalized;
  }

  if (adax && !all.adaptive) THROW("Cannot use adax without adaptive");

  if (pow((double)all.eta_decay_rate, (double)all.numpasses) < 0.0001)
    all.trace_message << "Warning: the learning rate for the last pass is multiplied by: "
                      << pow((double)all.eta_decay_rate, (double)all.numpasses)
                      << " adjust --decay_learning_rate larger to avoid this." << std::endl;

  if (all.reg_mode % 2)
  {
    if (all.audit || all.hash_inv)
    {
      g->predict = predict<true, true>;
      g->multipredict = multipredict<true, true>;
    }
    else
    {
      g->predict = predict<true, false>;
      g->multipredict = multipredict<true, false>;
    }
  }
  else if (all.audit || all.hash_inv)
  {
    g->predict = predict<false, true>;
    g->multipredict = multipredict<false, true>;
  }
  else
  {
    g->predict = predict<false, false>;
    g->multipredict = multipredict<false, false>;
  }

  uint64_t stride;
  if (all.power_t == 0.5)
    stride = set_learn<true>(all, feature_mask_off, *g.get());
  else
    stride = set_learn<false>(all, feature_mask_off, *g.get());

  all.weights.stride_shift((uint32_t)ceil_log_2(stride - 1));

  gd* bare = g.get();
  learner<gd, example>& ret =
      init_learner(g, g->learn, bare->predict, ((uint64_t)1 << all.weights.stride_shift()));
  ret.set_sensitivity(bare->sensitivity);
  ret.set_multipredict(bare->multipredict);
  ret.set_update(bare->update);
  ret.set_save_load(save_load);
  ret.set_end_pass(end_pass);
  return make_base(ret);
}
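// Worked example (illustration, not original code): with adaptive on and
// power_t == 0.5, the dispatch chain enters set_learn<true, 1, 2>; with
// normalized updates also on it bottoms out at adaptive=1, normalized=2,
// spare=3, next=4, so stride = 4 floats per logical weight. Then
// ceil_log_2(4 - 1) = 2, and indices are shifted left by 2 bits to address
// 4-float blocks.
static_assert((1u << 2) == 4, "a stride_shift of 2 addresses blocks of 4 floats");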