Vowpal Wabbit
audit_regressor.cc
Go to the documentation of this file.
1 /*
2 Copyright (c) by respective owners including Yahoo!, Microsoft, and
3 individual contributors. All rights reserved. Released under a BSD (revised)
4 license as described in the file LICENSE.
5  */
6 
7 #include "reductions.h"
8 #include "interactions.h"
9 #include "parse_args.h"
10 #include "vw.h"
11 
12 using namespace VW::config;
13 
// Member list of the state object the functions in this file receive as audit_regressor_data.
// The struct header and several members (e.g. total_class_cnt, out_file, values_audited,
// loaded_regressor_values, all used below) are absent from this listing.
15 {
16  vw* all;  // VW instance whose weights are audited
17  size_t increment;  // added to ec.ft_offset after each class pass in audit_regressor
18  size_t cur_class;  // class currently being audited; written as the record prefix when there is more than one class
20  std::vector<std::string>* ns_pre;  // stack of name fragments for the feature being visited; heap-allocated in setup, deleted in end_examples
24 };
25 
// Body of audit_regressor_interaction(audit_regressor_data& dat, const audit_strings* f);
// the signature line is absent from this listing.
// Maintains the name stack dat.ns_pre: a null f pops the most recent fragment, a non-null f
// builds a "[*][namespace^]feature" fragment from the (namespace, feature) pair and pushes it.
// NOTE(review): nothing is pushed when f->second is empty, yet audit_regressor pops after
// every push attempt -- confirm feature names are never empty.
27 {
28  // same as audit_interaction in gd.cc
29  if (f == nullptr)
30  {
31  dat.ns_pre->pop_back();
32  return;
33  }
34 
35  std::string ns_pre;
// '*' separates this fragment from the fragments already on the stack.
36  if (!dat.ns_pre->empty())
37  ns_pre += '*';
38 
// The namespace part is omitted when the namespace name is empty or a single space.
39  if (f->first != "" && ((f->first) != " "))
40  {
41  ns_pre.append(f->first);
42  ns_pre += '^';
43  }
44  if (f->second != "")
45  {
46  ns_pre.append(f->second);
47  dat.ns_pre->push_back(ns_pre);
48  }
49 }
50 
51 inline void audit_regressor_feature(audit_regressor_data& dat, const float, const uint64_t ft_idx)
52 {
53  parameters& weights = dat.all->weights;
54  if (weights[ft_idx] != 0)
55  ++dat.values_audited;
56  else
57  return;
58 
59  std::string ns_pre;
60  for (std::vector<std::string>::const_iterator s = dat.ns_pre->begin(); s != dat.ns_pre->end(); ++s) ns_pre += *s;
61 
62  std::ostringstream tempstream;
63  tempstream << ':' << ((ft_idx & weights.mask()) >> weights.stride_shift()) << ':' << weights[ft_idx];
64 
65  std::string temp = ns_pre + tempstream.str() + '\n';
66  if (dat.total_class_cnt > 1) // add class prefix for multiclass problems
67  temp = std::to_string(dat.cur_class) + ':' + temp;
68 
69  dat.out_file->bin_write_fixed(temp.c_str(), (uint32_t)temp.size());
70 
71  weights[ft_idx] = 0.; // mark value audited
72 }
73 
// Body of audit_regressor_lda(audit_regressor_data& rd, LEARNER::single_learner&, example& ec);
// the signature line is absent from this listing.
// For every feature of the example it emits one line
//   "\t<namespace>^<feature>:<index>:<w_0>:...:<w_{lda-1}>"
// and zeroes each weight it has printed. The text is accumulated in a string stream and
// written to rd.out_file in a single call at the end.
// Unlike audit_regressor_feature, this path neither skips zero weights nor updates
// rd.values_audited.
75 {
76  vw& all = *rd.all;
77 
78  std::ostringstream tempstream;
79  parameters& weights = rd.all->weights;
80  for (unsigned char* i = ec.indices.begin(); i != ec.indices.end(); i++)
81  {
82  features& fs = ec.feature_space[*i];
83  for (size_t j = 0; j < fs.size(); ++j)
84  {
// Printed index: the raw index shifted down by the stride, then masked with parse_mask.
85  tempstream << '\t' << fs.space_names[j].get()->first << '^' << fs.space_names[j].get()->second << ':'
86  << ((fs.indicies[j] >> weights.stride_shift()) & all.parse_mask);
// The all.lda weights that follow the feature's base index are printed one after another.
87  for (size_t k = 0; k < all.lda; k++)
88  {
89  weight& w = weights[(fs.indicies[j] + k)];
90  tempstream << ':' << w;
91  w = 0.;
92  }
93  tempstream << std::endl;
94  }
95  }
96 
// NOTE(review): tempstream.str() is evaluated twice here, so the buffer is copied twice.
97  rd.out_file->bin_write_fixed(tempstream.str().c_str(), (uint32_t)tempstream.str().size());
98 }
99 
100 // This is a learner which does nothing with examples.
101 // void learn(audit_regressor_data&, LEARNER::base_learner&, example&) {}
102 
// Body of audit_regressor(audit_regressor_data& rd, LEARNER::single_learner& base, example& ec);
// the signature line and parts of the generate_interactions calls are absent from this listing.
// With LDA enabled the work is delegated to audit_regressor_lda. Otherwise the example is
// walked once per class: every feature (and, via generate_interactions, every interaction)
// is passed to audit_regressor_feature, with ec.ft_offset advanced by rd.increment between
// classes and restored afterwards.
104 {
105  vw& all = *rd.all;
106 
107  if (all.lda > 0)
108  audit_regressor_lda(rd, base, ec);
109  else
110  {
111  rd.cur_class = 0;
112  uint64_t old_offset = ec.ft_offset;
113 
114  while (rd.cur_class < rd.total_class_cnt)
115  {
116  for (unsigned char* i = ec.indices.begin(); i != ec.indices.end(); ++i)
117  {
118  features& fs = ec.feature_space[(size_t)*i];
// Named features: push the name, audit the weight, then pop the name again.
// NOTE(review): the (uint32_t) cast narrows the feature index before the offset is added,
// while audit_regressor_feature takes a uint64_t -- confirm indices always fit in 32 bits.
119  if (fs.space_names.size() > 0)
120  for (size_t j = 0; j < fs.size(); ++j)
121  {
122  audit_regressor_interaction(rd, fs.space_names[j].get());
123  audit_regressor_feature(rd, fs.values[j], (uint32_t)fs.indicies[j] + ec.ft_offset);
124  audit_regressor_interaction(rd, NULL);
125  }
126  else
// Features without stored names are audited with whatever is currently on the name stack.
127  for (size_t j = 0; j < fs.size(); ++j)
128  audit_regressor_feature(rd, fs.values[j], (uint32_t)fs.indicies[j] + ec.ft_offset);
129  }
130 
// Interaction features: the same call is made for the sparse and the dense weight store
// (the first lines of the sparse call and one line of the dense call are missing here).
131  if (rd.all->weights.sparse)
134  rd.all->interactions, rd.all->permutations, ec, rd, rd.all->weights.sparse_weights);
135  else
136  INTERACTIONS::generate_interactions<audit_regressor_data, const uint64_t, audit_regressor_feature, true,
138  rd.all->interactions, rd.all->permutations, ec, rd, rd.all->weights.dense_weights);
139 
// Move on to the next class's weights.
140  ec.ft_offset += rd.increment;
141  ++rd.cur_class;
142  }
143 
144  ec.ft_offset = old_offset; // make sure example is not changed.
145  }
146 }
// Body of end_examples(audit_regressor_data& d); the signature line is absent from this listing.
// Flushes and closes the output file, then frees the two heap objects allocated in
// audit_regressor_setup (out_file and ns_pre) and nulls the pointers.
148 {
149  d.out_file->flush(); // close_file() should do this for me ...
150  d.out_file->close_file();
151  delete (d.out_file);
152  d.out_file = NULL;
153  delete d.ns_pre;
154  d.ns_pre = NULL;
155 }
156 
157 inline void print_ex(vw& all, size_t ex_processed, size_t vals_found, size_t progress)
158 {
159  all.trace_message << std::left << std::setw(shared_data::col_example_counter) << ex_processed << " " << std::right
160  << std::setw(9) << vals_found << " " << std::right << std::setw(12) << progress << '%' << std::endl;
161 }
162 
// Body of the per-example finish hook of this reduction (it uses `all`, `dd` and `ec`); the
// signature line and three interior lines are absent from this listing.
// Updates the progress bookkeeping when the dump interval is reached, stops the run once
// everything has been audited, and finally hands the example to VW::finish_example.
164 {
165  bool printed = false;
166  if (ec.example_counter + 1 >= all.sd->dump_interval && !all.quiet)
167  {
// (listing gap before and after the next line -- presumably the print_ex call and the
// update_dump_interval call; TODO confirm against the original file)
169  all.sd->weighted_unlabeled_examples = (double)(ec.example_counter + 1); // used in update_dump_interval
171  printed = true;
172  }
173 
// (listing gap: the condition guarding the following block is absent)
175  {
176  // all regressor values were audited
177  if (!printed)
178  print_ex(all, ec.example_counter + 1, dd.values_audited, 100);
179  set_done(all);
180  }
181 
182  VW::finish_example(all, ec);
183 }
184 
// Body of the reduction's finish hook (registered with set_finish in audit_regressor_setup);
// the signature line and the line preceding the message are absent from this listing.
// Reports how many of the loaded regressor values were found in the dataset.
// NOTE(review): the wording implies this is printed only when some values were not found;
// the guarding condition is not visible here.
186 {
188  dat.all->trace_message << "Note: for some reason audit couldn't find all regressor values in dataset ("
189  << dat.values_audited << " of " << dat.loaded_regressor_values << " found)." << std::endl;
190 }
191 
// Template regressor_values(audit_regressor_data& dat, T& w); the signature line and the
// statement executed for each non-zero weight are absent from this listing.
// Iterates over every weight of the container w and acts on the non-zero ones
// (presumably incrementing dat.loaded_regressor_values -- TODO confirm).
192 template <class T>
194 {
195  for (typename T::iterator iter = w.begin(); iter != w.end(); ++iter)
196  if (*iter != 0)
198 }
199 
// Body of init_driver(audit_regressor_data& dat); the signature line and the two
// weight-counting calls are absent from this listing.
// Rejects cache usage, resets the progress counters, derives the per-class index stride and
// the class count from the learner, requires at least one non-zero weight, and prints the
// header of the progress table unless quiet mode is on.
201 {
202  // checks a few settings that might be applied after audit_regressor_setup() is called
203  if ((dat.all->options->was_supplied("cache_file") || dat.all->options->was_supplied("cache")) &&
204  !dat.all->options->was_supplied("kill_cache"))
205  {
206  THROW("audit_regressor is incompatible with a cache file. Use it in single pass mode only.");
207  }
208 
209  dat.all->sd->dump_interval = 1.; // regressor could initialize these if saved with --save_resume
210  dat.all->sd->example_number = 0;
211 
// Per-class stride: the learner's total increment divided by its number of weight sets.
212  dat.increment = dat.all->l->increment / dat.all->l->weights;
213  dat.total_class_cnt = dat.all->l->weights;
214 
// With --csoaa, the class count given on the command line overrides the learner's value
// whenever the two differ, and the stride is recomputed accordingly.
215  if (dat.all->options->was_supplied("csoaa"))
216  {
217  size_t n = dat.all->options->get_typed_option<uint32_t>("csoaa").value();
218  if (n != dat.total_class_cnt)
219  {
220  dat.total_class_cnt = n;
221  dat.increment = dat.all->l->increment / n;
222  }
223  }
224 
225  // count non-null feature values in regressor
// (listing gap: the statement under each branch below is absent)
226  if (dat.all->weights.sparse)
228  else
230 
231  if (dat.loaded_regressor_values == 0)
232  THROW("regressor has no non-zero weights. Nothing to audit.");
233 
// Two-line column header for the rows later produced by print_ex.
234  if (!dat.all->quiet)
235  {
236  dat.all->trace_message << "Regressor contains " << dat.loaded_regressor_values << " values\n";
237  dat.all->trace_message << std::left << std::setw(shared_data::col_example_counter) << "example"
238  << " " << std::setw(shared_data::col_example_weight) << "values"
239  << " " << std::setw(shared_data::col_current_label) << "total" << std::endl;
240  dat.all->trace_message << std::left << std::setw(shared_data::col_example_counter) << "counter"
241  << " " << std::setw(shared_data::col_example_weight) << "audited"
242  << " " << std::setw(shared_data::col_current_label) << "progress" << std::endl;
243  }
244 }
245 
// Body of audit_regressor_setup(options_i& options, vw& all), which returns a
// LEARNER::base_learner*; the signature line and most of the learner-construction lines are
// absent from this listing.
// Registers the --audit_regressor option and returns nullptr when it was not supplied.
// Otherwise it validates the arguments, turns on all.audit, allocates the reduction state,
// opens the output file for writing and returns the configured learner.
247 {
248  std::string out_file;
249 
250  option_group_definition new_options("Audit Regressor");
251  new_options.add(make_option("audit_regressor", out_file)
252  .keep()
253  .help("stores feature names and their regressor values. Same dataset must be used for both "
254  "regressor training and this mode."));
255  options.add_and_parse(new_options);
256 
257  if (!options.was_supplied("audit_regressor"))
258  return nullptr;
259 
// Both argument checks run before anything is allocated.
260  if (out_file.empty())
261  THROW("audit_regressor argument (output filename) is missing.");
262 
263  if (all.numpasses > 1)
264  THROW("audit_regressor can't be used with --passes > 1.");
265 
266  all.audit = true;
267 
// ns_pre and out_file are raw owning pointers; they are released in end_examples.
268  auto dat = scoped_calloc_or_throw<audit_regressor_data>();
269  dat->all = &all;
270  dat->ns_pre = new std::vector<std::string>(); // explicitly invoking std::vector's constructor
271  dat->out_file = new io_buf();
272  dat->out_file->open_file(out_file.c_str(), all.stdin_off, io_buf::WRITE);
273 
// (listing gap: the lines that create `ret` and register the other callbacks are absent)
278  ret.set_finish(finish);
280 
281  return LEARNER::make_base<audit_regressor_data>(ret);
282 }
v_array< namespace_index > indices
void print_ex(vw &all, size_t ex_processed, size_t vals_found, size_t progress)
void set_init_driver(void(*f)(T &))
Definition: learner.h:299
size_t example_counter
Definition: example.h:64
void set_done(vw &all)
Definition: parser.cc:578
LEARNER::base_learner * audit_regressor_setup(options_i &options, vw &all)
parameters weights
Definition: global_data.h:537
VW::config::options_i * options
Definition: global_data.h:428
double weighted_unlabeled_examples
Definition: global_data.h:143
static constexpr int WRITE
Definition: io_buf.h:72
virtual bool close_file()
Definition: io_buf.h:204
static constexpr int col_current_label
Definition: global_data.h:182
v_array< feature_index > indicies
the core definition of a set of features.
bool quiet
Definition: global_data.h:487
v_array< feature_value > values
virtual void add_and_parse(const option_group_definition &group)=0
void finish(vw &all, bool delete_all)
Definition: parse_args.cc:1823
T *& begin()
Definition: v_array.h:42
void audit_regressor_lda(audit_regressor_data &rd, LEARNER::single_learner &, example &ec)
bool progress_add
Definition: global_data.h:545
size_t size() const
Definition: v_array.h:68
std::vector< std::string > * ns_pre
void regressor_values(audit_regressor_data &dat, T &w)
uint32_t lda
Definition: global_data.h:508
static constexpr int col_example_weight
Definition: global_data.h:180
std::array< features, NUM_NAMESPACES > feature_space
single_learner * as_singleline(learner< T, E > *l)
Definition: learner.h:476
size_t size() const
void set_finish_example(void(*f)(vw &all, T &, E &))
Definition: learner.h:307
virtual void flush()
Definition: io_buf.h:194
learner< T, E > & init_learner(free_ptr< T > &dat, L *base, void(*learn)(T &, L &, E &), void(*predict)(T &, L &, E &), size_t ws, prediction_type::prediction_type_t pred_type)
Definition: learner.h:369
shared_data * sd
Definition: global_data.h:375
typed_option< T > & get_typed_option(const std::string &key)
Definition: options.h:120
float progress_arg
Definition: global_data.h:546
vw_ostream trace_message
Definition: global_data.h:424
virtual bool was_supplied(const std::string &key)=0
static constexpr int col_example_counter
Definition: global_data.h:179
void generate_interactions(vw &all, example_predict &ec, R &dat)
Definition: interactions.h:45
dense_parameters dense_weights
Definition: io_buf.h:54
void finish_example(vw &, example &)
Definition: parser.cc:881
T *& end()
Definition: v_array.h:43
void init_driver(audit_regressor_data &dat)
size_t numpasses
Definition: global_data.h:451
void audit_regressor_feature(audit_regressor_data &dat, const float, const uint64_t ft_idx)
float weight
option_group_definition & add(T &&op)
Definition: options.h:90
uint64_t example_number
Definition: global_data.h:137
v_array< audit_strings_ptr > space_names
uint64_t parse_mask
Definition: global_data.h:453
size_t increment
Definition: learner.h:153
typed_option< T > make_option(std::string name, T &location)
Definition: options.h:80
size_t bin_write_fixed(const char *data, size_t len)
Definition: io_buf.h:252
void audit_regressor_interaction(audit_regressor_data &dat, const audit_strings *f)
void set_finish(void(*f)(T &))
Definition: learner.h:265
sparse_parameters sparse_weights
void audit_regressor(audit_regressor_data &rd, LEARNER::single_learner &base, example &ec)
std::vector< std::string > interactions
Definition: global_data.h:457
LEARNER::base_learner * l
Definition: global_data.h:383
uint32_t stride_shift()
void end_examples(audit_regressor_data &d)
bool permutations
Definition: global_data.h:454
bool audit
Definition: global_data.h:486
LEARNER::base_learner * setup_base(options_i &options, vw &all)
Definition: parse_args.cc:1222
size_t weights
Definition: learner.h:152
bool stdin_off
Definition: global_data.h:527
float dump_interval
Definition: global_data.h:147
uint64_t mask()
#define THROW(args)
Definition: vw_exception.h:181
float f
Definition: cache.cc:40
const char * to_string(prediction_type_t prediction_type)
Definition: learner.cc:12
void set_end_examples(void(*f)(T &))
Definition: learner.h:295
void update_dump_interval(bool progress_add, float progress_arg)
Definition: global_data.h:215
std::pair< std::string, std::string > audit_strings
Definition: feature_group.h:22