#include <sys/timeb.h>

struct timeb t_start, t_end;
struct timeb t_start_global, t_end_global;
117 "Zero or negative curvature detected.\n" 118 "To increase curvature you can increase regularization or rescale features.\n" 119 "It is also possible that you have reached numerical accuracy\n" 120 "and further decrease in the objective cannot be reliably detected.\n";
// Functors applied per feature by GD::foreach_feature: they accumulate the
// loss gradient into the W_GT slot, the curvature into W_COND, and the
// direction dot product from the W_DIR slot.
inline void add_grad(float& d, float f, float& fw) { (&fw)[W_GT] += d * f; }

GD::foreach_feature<float, add_grad>(all, ec, loss_grad);

GD::foreach_feature<float, add_precond>(all, ec, curvature);

inline void add_DIR(float& p, const float fx, float& fw) { p += (&fw)[W_DIR] * fx; }

GD::foreach_feature<float, add_DIR>(all, ec, temp);
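All three functors flow through the same feature traversal. A minimal standalone sketch of that pattern (feature, foreach_feature_sketch, and add_grad_sketch are illustrative names, not VW's actual API; the real foreach_feature walks an example's namespaces and strided weights):

#include <vector>

struct feature { float x; float* w; };  // feature value and its weight slot

// Analogue of GD::foreach_feature<float, F>: apply F to an accumulator, the
// feature value, and the weight entry, for every feature of an example.
template <void (*F)(float&, float, float&)>
void foreach_feature_sketch(std::vector<feature>& fs, float& acc)
{
  for (feature& f : fs) F(acc, f.x, *f.w);
}

// Same shape as add_grad above; here the gradient is accumulated directly
// into the weight slot rather than an adjacent W_GT slot.
inline void add_grad_sketch(float& d, float x, float& gw) { gw += d * x; }

int main()
{
  std::vector<float> weights(2, 0.f);
  std::vector<feature> fs = {{1.f, &weights[0]}, {2.f, &weights[1]}};
  float loss_grad = 0.5f;  // plays the role of the example's loss gradient
  foreach_feature_sketch<add_grad_sketch>(fs, loss_grad);  // weights become {0.5, 1.0}
  return 0;
}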
for (typename T::iterator iter = weights.begin(); iter != weights.end(); ++iter)
  ret += regularizer * (&(*iter))[W_DIR] * (&(*iter))[W_DIR];

for (typename T::iterator iter = weights.begin(); iter != weights.end(); ++iter)
  ret += ((double)b.regularizers[2 * (iter.index() >> weights.stride_shift())]) * (&(*iter))[W_DIR] *
      (&(*iter))[W_DIR];

if (regularizer == 0.)

for (typename T::iterator iter = weights.begin(); iter != weights.end(); ++iter)
  ret += ((double)(&(*iter))[W_DIR]) * (&(*iter))[W_DIR];
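In flat-array terms these loops compute sum_i d_i^2 and sum_i lambda_i d_i^2 over the W_DIR slots. A hedged sketch, with dir[] and reg[] standing in for the strided weight slots and b.regularizers:

#include <cstddef>

double direction_magnitude_sketch(const float* dir, size_t n)
{
  double ret = 0.;
  for (size_t i = 0; i < n; ++i) ret += ((double)dir[i]) * dir[i];
  return ret;
}

double regularized_direction_magnitude_sketch(const float* dir, const float* reg, size_t n)
{
  double ret = 0.;
  for (size_t i = 0; i < n; ++i) ret += ((double)reg[i]) * dir[i] * dir[i];
  return ret;
}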
void bfgs_iter_start(
    vw& all, bfgs& b, float* mem, int& lastj, double importance_weight_sum, int& origin, T& weights)

for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
{
  float* mem1 = mem + (w.index() >> weights.stride_shift()) * b.mem_stride;
  g1_Hg1 += ((double)(&(*w))[W_GT]) * ((&(*w))[W_GT]) * ((&(*w))[W_COND]);
  g1_g1 += ((double)(&(*w))[W_GT]) * ((&(*w))[W_GT]);
  (&(*w))[W_DIR] = -(&(*w))[W_COND] * ((&(*w))[W_GT]);
}

fprintf(stderr, "%-10.5f\t%-10.5f\t%-10s\t%-10s\t%-10s\t", g1_g1 / (importance_weight_sum * importance_weight_sum),
    g1_Hg1 / importance_weight_sum, "", "", "");
uint32_t length = 1 << all.num_bits;

for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
{
  mem = mem0 + (w.index() >> weights.stride_shift()) * b.mem_stride;
  g_Hy += ((double)(&(*w))[W_GT]) * ((&(*w))[W_COND]) * y;
}

float beta = (float)(g_Hy / g_Hg);

if (beta < 0.f || std::isnan(beta))

for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
{
  mem = mem0 + (w.index() >> weights.stride_shift()) * b.mem_stride;
  (&(*w))[W_DIR] *= beta;
}

fprintf(stderr, "%f\t", beta);

fprintf(stderr, "%-10s\t", "");
for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
  float* mem1 = mem + (w.index() >> weights.stride_shift()) * b.mem_stride;

if (y_s <= 0. || y_Hy <= 0.)

float gamma = (float)(y_s / y_Hy);

for (int j = 0; j < lastj; j++)
  alpha[j] = rho[j] * s_q;

for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
  mem = mem0 + (w.index() >> weights.stride_shift()) * b.mem_stride;

alpha[lastj] = rho[lastj] * s_q;

for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
  mem = mem0 + (w.index() >> weights.stride_shift()) * b.mem_stride;

for (int j = lastj; j > 0; j--)
  coef_j = alpha[j] - rho[j] * y_r;

for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
  mem = mem0 + (w.index() >> weights.stride_shift()) * b.mem_stride;

coef_j = alpha[0] - rho[0] * y_r;
for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
  mem = mem0 + (w.index() >> weights.stride_shift()) * b.mem_stride;

lastj = (lastj < b.m - 1) ? lastj + 1 : b.m - 1;

for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
  mem = mem0 + (w.index() >> weights.stride_shift()) * b.mem_stride;

for (int j = lastj; j > 0; j--) rho[j] = rho[j - 1];
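These strided fragments are an in-place form of the standard L-BFGS two-loop recursion: rho[j] = 1/(y_j . s_j), alpha[j] holds the first-loop coefficients, and lastj counts the live (s, y) pairs, capped at b.m - 1. For reference, the textbook version over plain vectors:

#include <cstddef>
#include <vector>

// Computes d = -H_k g using the stored (s, y) pairs and a diagonal H0.
std::vector<double> two_loop_sketch(std::vector<double> q,  // gradient g
                                    const std::vector<std::vector<double>>& s,
                                    const std::vector<std::vector<double>>& y,
                                    const std::vector<double>& rho,  // 1 / (y_j . s_j)
                                    const std::vector<double>& h0)   // diagonal H0, e.g. gamma * precond
{
  size_t m = s.size(), n = q.size();
  std::vector<double> alpha(m);
  for (size_t j = m; j-- > 0;)  // first loop: newest to oldest
  {
    double sq = 0.;
    for (size_t i = 0; i < n; ++i) sq += s[j][i] * q[i];
    alpha[j] = rho[j] * sq;
    for (size_t i = 0; i < n; ++i) q[i] -= alpha[j] * y[j][i];
  }
  for (size_t i = 0; i < n; ++i) q[i] *= h0[i];  // apply initial Hessian guess
  for (size_t j = 0; j < m; ++j)  // second loop: oldest to newest
  {
    double yr = 0.;
    for (size_t i = 0; i < n; ++i) yr += y[j][i] * q[i];
    double coef = alpha[j] - rho[j] * yr;
    for (size_t i = 0; i < n; ++i) q[i] += coef * s[j][i];
  }
  for (size_t i = 0; i < n; ++i) q[i] = -q[i];  // negate to get a descent direction
  return q;
}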
double wolfe_eval(vw& all, bfgs& b, float* mem, double loss_sum, double previous_loss_sum, double step_size,
    double importance_weight_sum, int& origin, double& wolfe1, T& weights)

for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
{
  float* mem1 = mem + (w.index() >> weights.stride_shift()) * b.mem_stride;
  g1_d += ((double)(&(*w))[W_GT]) * (&(*w))[W_DIR];
  g1_Hg1 += ((double)(&(*w))[W_GT]) * (&(*w))[W_GT] * ((&(*w))[W_COND]);
  g1_g1 += ((double)(&(*w))[W_GT]) * (&(*w))[W_GT];
}

wolfe1 = (loss_sum - previous_loss_sum) / (step_size * g0_d);
double wolfe2 = g1_d / g0_d;

fprintf(stderr, "%-10.5f\t%-10.5f\t%s%-10f\t%-10f\t", g1_g1 / (importance_weight_sum * importance_weight_sum),
    g1_Hg1 / importance_weight_sum, " ", wolfe1, wolfe2);
return 0.5 * step_size;
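The two diagnostics have their usual line-search reading: wolfe1 is the ratio of actual loss decrease to the first-order predicted decrease (the sufficient-decrease side), wolfe2 the ratio of directional derivatives (the curvature side), and the returned 0.5 * step_size proposes a halved step, consistent with the "(revise x ...)" line printed later. A hedged restatement:

struct wolfe_stats { double wolfe1, wolfe2; };

// g0_d and g1_d are the directional derivatives g . d at the old and new
// point; d is a descent direction, so g0_d < 0.
wolfe_stats wolfe_sketch(double loss_new, double loss_old, double step,
                         double g0_d, double g1_d)
{
  wolfe_stats ws;
  ws.wolfe1 = (loss_new - loss_old) / (step * g0_d);  // sufficient-decrease ratio
  ws.wolfe2 = g1_d / g0_d;                            // curvature ratio
  return ws;
}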
double wolfe_eval(vw& all, bfgs& b, float* mem, double loss_sum, double previous_loss_sum, double step_size,
    double importance_weight_sum, int& origin, double& wolfe1)

return wolfe_eval(all, b, mem, loss_sum, previous_loss_sum, step_size, importance_weight_sum, origin, wolfe1,
    all.weights.sparse_weights);
return wolfe_eval(all, b, mem, loss_sum, previous_loss_sum, step_size, importance_weight_sum, origin, wolfe1,
    all.weights.dense_weights);
for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
{
  (&(*w))[W_GT] += regularization * (*w);
  ret += 0.5 * regularization * (*w) * (*w);
}

for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
{
  uint64_t i = w.index() >> weights.stride_shift();
  ret += 0.5 * b.regularizers[2 * i] * delta_weight * delta_weight;
}

ret -= 0.5 * regularization * (weights.strided_index(constant)) * (weights.strided_index(constant));

uint64_t i = constant >> weights.stride_shift();
ret -= 0.5 * b.regularizers[2 * i] * delta_weight * delta_weight;
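The scalar-lambda branch adds lambda * w_i to each gradient slot and 0.5 * lambda * w_i^2 to the loss, then backs the constant (bias) term out again. A flat-array sketch; constant_idx and the matching gradient correction are illustrative assumptions:

#include <cstddef>

// Illustrative: w/g are flat weight and gradient arrays; constant_idx is a
// hypothetical index of the bias feature.
double add_l2_sketch(float* w, float* g, size_t n, size_t constant_idx, float lambda)
{
  double ret = 0.;
  for (size_t i = 0; i < n; ++i)
  {
    g[i] += lambda * w[i];
    ret += 0.5 * lambda * (double)w[i] * w[i];
  }
  // undo the regularization of the bias term
  g[constant_idx] -= lambda * w[constant_idx];
  ret -= 0.5 * lambda * (double)w[constant_idx] * w[constant_idx];
  return ret;
}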
float max_hessian = 0.f;

for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
{
  (&(*w))[W_COND] += regularization;
  if ((&(*w))[W_COND] > max_hessian)
    max_hessian = (&(*w))[W_COND];
}

for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
  if ((&(*w))[W_COND] > max_hessian)
    max_hessian = (&(*w))[W_COND];

for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
  if (std::isinf(*w) || *w > max_precond)
    (&(*w))[W_COND] = max_precond;
uint32_t length = 1 << all.num_bits;

THROW("Failed to allocate weight array: try decreasing -b <bits>");

for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
  uint64_t i = w.index() >> weights.stride_shift();

for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
  if ((&(*w))[W_COND] > 0.f)

for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
  b.regularizers[2 * (w.index() >> weights.stride_shift()) + 1] = *w;

for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
  uint64_t i = w.index() >> weights.stride_shift();

for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
  float* mem1 = mem + (w.index() >> weights.stride_shift()) * b.mem_stride;

for (typename T::iterator iter = w.begin(); iter != w.end(); ++iter)
  (&(*iter))[W_XT] += step_size * (&(*iter))[W_DIR];
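update_weight itself is just a step of length step_size along the stored direction; over flat arrays:

#include <cstddef>

void update_weight_sketch(float* x, const float* dir, float step_size, size_t n)
{
  for (size_t i = 0; i < n; ++i) x[i] += step_size * dir[i];  // x += step * d
}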
fprintf(stderr, "%-10s\t%-10.5f\t%-.5f\n", "", d_mag, b.step_size);

fprintf(stderr, "%2lu ", (long unsigned int)b.current_pass + 1);
fprintf(stderr, "h unknown ");
fprintf(stderr, "%2lu h%-10.5f\t", (long unsigned int)b.current_pass + 1,
    all.sd->holdout_sum_loss_since_last_pass / all.sd->weighted_holdout_examples_since_last_pass);
if (std::isnan((float)wolfe1))

fprintf(stderr, "\n");
fprintf(stdout, "Derivative 0 detected.\n");

fprintf(stderr, "%-10s\t%-10s\t(revise x %.1f)\t%-.5f\n", "", "", ratio, new_step);

"\nTermination condition reached in pass %ld: decrease in loss less than %.3f%%.\n"
"If you want to optimize further, decrease termination threshold.\n",

fprintf(stdout, "In bfgs_iter_middle: %s", curv_message);

fprintf(stderr, "%-10s\t%-10.5f\t%-.5f\n", "", d_mag, b.step_size);

fprintf(stdout, "Derivative 0 detected.\n");

b.all->trace_message << "If you want to optimize further, increase the number of passes\n";

b.all->trace_message << "Output feature regularizer file is created only when the convergence is reached. "
                        "Try increasing the number of passes for convergence\n";

b.all->trace_message << "Early termination reached w.r.t. holdout set error";
template <bool audit>

template <bool audit>
predict<audit>(b, base, ec);

uint32_t length = 2 * (1 << all.num_bits);

std::stringstream msg;
msg << ":" << *v << "\n";

} while ((!read && i < length) || (read && brw > 0));
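The do/while condition is the usual shape for a loop that writes a fixed-length array but, when reading, stops once the stream is exhausted (brw == 0). A simplified sketch using stdio in place of io_buf and bin_text_read_write_fixed (an assumption for illustration; the real code also handles indices and an optional text format):

#include <cstdint>
#include <cstdio>

// Illustrative only: FILE* replaces io_buf, and only raw values are moved.
void save_load_sketch(FILE* f, float* v, uint32_t length, bool read)
{
  uint32_t i = 0;
  size_t brw = 1;
  do
  {
    if (read) brw = fread(&v[i], sizeof(float), 1, f);  // brw == 0 at end of stream
    else      brw = fwrite(&v[i], sizeof(float), 1, f);
    i++;
  } while ((!read && i < length) || (read && brw > 0 && i < length));
}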
uint32_t length = 1 << all->num_bits;

THROW("Failed to allocate regularizers array: try decreasing -b <bits>");

b.rho = calloc_or_throw<double>(m);
b.alpha = calloc_or_throw<double>(m);

std::cerr << "m = " << m << std::endl
          << ((long unsigned int)all->length() * /* ... */)
          << "M for weights and mem" << std::endl;

ftime(&b.t_start_global);

const char* header_fmt = "%2s %-10s\t%-10s\t%-10s\t %-10s\t%-10s\t%-10s\t%-10s\t%-10s\t%-s\n";
fprintf(stderr, header_fmt, "##", "avg. loss", "der. mag.", "d. m. cond.", "wolfe1", "wolfe2", "mix fraction",
    "curvature", "dir. magnitude", "step size");
std::cerr.precision(5);

std::stringstream msg;
msg << ":" << reg_vector << "\n";
auto b = scoped_calloc_or_throw<bfgs>();
bool conjugate_gradient = false;
bool bfgs_option = false;

bfgs_outer_options.add(
    make_option("conjugate_gradient", conjugate_gradient).keep().help("use conjugate gradient based optimization"));

bfgs_inner_options.add(make_option("bfgs", bfgs_option).keep().help("use conjugate gradient based optimization"));

bfgs_inner_options.add(make_option("mem", b->m).default_value(15).help("memory in bfgs"));

bfgs_inner_options.add(
    make_option("termination", b->rel_threshold).default_value(0.001f).help("Termination threshold"));

if (!conjugate_gradient)

b->wolfe1_bound = 0.01;
b->first_hessian_on = true;
b->first_pass = true;
b->gradient_pass = true;
b->preconditioner_pass = true;
b->backstep_on = false;
b->no_win_counter = 0;

b->early_stop_thres = options.get_typed_option<size_t>("early_terminate").value();

b->all->trace_message << "enabling BFGS based optimization ";

b->all->trace_message << "enabling conjugate gradient optimization via BFGS ";

b->all->trace_message << "with curvature calculation" << std::endl;

b->all->trace_message << "**without** curvature calculation" << std::endl;

THROW("you must make at least 2 passes to use BFGS");

learn_ptr = learn<true>;
learn_ptr = learn<false>;
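A typical invocation exercising the options registered above (illustrative; --bfgs, --mem, and --termination are the flags added in bfgs_setup, at least 2 passes are required per the THROW above, and VW multi-pass runs need a cache, -c):

vw --bfgs --mem 15 --termination 0.001 --passes 20 -c -d train.dat -f model.bin

Symbols referenced in this file: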
float bfgs_predict(vw &all, example &ec)
float finalize_prediction(shared_data *sd, float ret)
void zero_preconditioner(vw &all)
void preconditioner_to_regularizer(vw &all, bfgs &b, float regularization, T &weights)
double add_regularization(vw &all, bfgs &b, float regularization, T &weights)
void print_audit_features(vw &all, example &ec)
constexpr bool test_example(example &ec) noexcept
void accumulate(vw &all, parameters &weights, size_t offset)
void initialize_regressor(vw &all, T &weights)
base_learner * bfgs_setup(options_i &options, vw &all)
uint64_t stride_shift(const stagewise_poly &poly, uint64_t idx)
double holdout_sum_loss_since_last_pass
float direction_magnitude(vw &, T &weights)
virtual float second_derivative(shared_data *, float prediction, float label)=0
int process_pass(vw &all, bfgs &b)
void process_example(vw &all, bfgs &b, example &ec)
float dot_with_direction(vw &all, example &ec)
base_learner * make_base(learner< T, E > &base)
void finalize_regressor(vw &all, std::string reg_name)
virtual void add_and_parse(const option_group_definition &group)=0
void bfgs_iter_middle(vw &all, bfgs &b, float *mem, double *rho, double *alpha, int &lastj, int &origin, T &weights)
bool summarize_holdout_set(vw &all, size_t &no_win_counter)
virtual float first_derivative(shared_data *, float prediction, float label)=0
float inline_predict(vw &all, example &ec)
void save_predictor(vw &all, std::string reg_name, size_t current_pass)
double derivative_in_direction(vw &, bfgs &b, float *mem, int &origin, T &weights)
void(* set_minmax)(shared_data *sd, float label)
size_t bin_text_write_fixed(io_buf &io, char *data, size_t len, std::stringstream &msg, bool text)
void save_load(bfgs &b, io_buf &model_file, bool read, bool text)
size_t bin_read_fixed(char *data, size_t len, const char *read_message)
constexpr float max_precond_ratio
constexpr const char * curv_message
v_array< float > predictions
virtual float getLoss(shared_data *, float prediction, float label)=0
learner< T, E > & init_learner(free_ptr< T > &dat, L *base, void(*learn)(T &, L &, E &), void(*predict)(T &, L &, E &), size_t ws, prediction_type::prediction_type_t pred_type)
void push_back(const T &new_ele)
typed_option< T > & get_typed_option(const std::string &key)
void end_pass(example &ec, vw &all)
void save_load_regularizer(vw &all, bfgs &b, io_buf &model_file, bool read, bool text)
void update_preconditioner(vw &all, example &ec)
void regularizer_to_weight(vw &, bfgs &b, T &weights)
double weighted_holdout_examples_since_last_pass
std::string per_feature_regularizer_output
constexpr uint64_t constant
double wolfe_eval(vw &all, bfgs &b, float *mem, double loss_sum, double previous_loss_sum, double step_size, double importance_weight_sum, int &origin, double &wolfe1, T &weights)
dense_parameters dense_weights
double regularizer_direction_magnitude(vw &, bfgs &b, double regularizer, T &weights)
void zero_derivative(vw &all)
void set_zero(size_t offset)
void add_grad(float &d, float f, float &fw)
std::string per_feature_regularizer_text
void update_weight(vw &, float step_size, T &w)
option_group_definition & add(T &&op)
struct timeb t_start_global, t_end_global
void init_driver(bfgs &b)
void finalize_preconditioner(vw &, bfgs &b, float regularization, T &weights)
typed_option< T > make_option(std::string name, T &location)
sparse_parameters sparse_weights
float accumulate_scalar(vw &all, float local_sum)
float predict_and_gradient(vw &all, example &ec)
void reset_state(vw &all, bfgs &b, bool zero)
std::string per_feature_regularizer_input
void predict(bfgs &b, base_learner &, example &ec)
void save_load_regressor(vw &all, io_buf &model_file, bool read, bool text, T &weights)
double importance_weight_sum
std::string final_regressor_name
void add_DIR(float &p, const float fx, float &fw)
void learn(bfgs &b, base_learner &base, example &ec)
void add_precond(float &d, float f, float &fw)
size_t bin_text_read_write_fixed(io_buf &io, char *data, size_t len, const char *read_message, bool read, std::stringstream &msg, bool text)
void bfgs_iter_start(vw &all, bfgs &b, float *mem, int &lastj, double importance_weight_sum, int &origin, T &weights)