Vowpal Wabbit
parse_example.cc
Go to the documentation of this file.
1 /*
2 Copyright (c) by respective owners including Yahoo!, Microsoft, and
3 individual contributors. All rights reserved. Released under a BSD (revised)
4 license as described in the file LICENSE.
5  */
6 
7 #include <cmath>
8 #include <math.h>
9 #include <cctype>
10 #include "parse_example.h"
11 #include "hash.h"
12 #include "unique_sort.h"
13 #include "global_data.h"
14 #include "constant.h"
15 
16 size_t read_features(vw* all, char*& line, size_t& num_chars)
17 {
18  line = nullptr;
19  size_t num_chars_initial = readto(*(all->p->input), line, '\n');
20  if (num_chars_initial < 1)
21  return num_chars_initial;
22  num_chars = num_chars_initial;
23  if (line[0] == '\xef' && num_chars >= 3 && line[1] == '\xbb' && line[2] == '\xbf')
24  {
25  line += 3;
26  num_chars -= 3;
27  }
28  if (num_chars > 0 && line[num_chars - 1] == '\n')
29  num_chars--;
30  if (num_chars > 0 && line[num_chars - 1] == '\r')
31  num_chars--;
32  return num_chars_initial;
33 }
34 
36 {
37  char* line;
38  size_t num_chars;
39  size_t num_chars_initial = read_features(all, line, num_chars);
40  if (num_chars_initial < 1)
41  return (int)num_chars_initial;
42 
43  substring example = {line, line + num_chars};
44  substring_to_example(all, examples[0], example);
45 
46  return (int)num_chars_initial;
47 }
48 
// Recursive-descent parser for the feature portion of one text example.
//
// The constructor runs the whole parse: it calls listNameSpace(), which calls
// nameSpace() for every '|', which in turn uses nameSpaceInfo(),
// nameSpaceInfoValue(), listFeatures(), maybeFeature(), read_name() and
// featureValue(). Each method notes the grammar production it implements.
// Parsed features are appended to ae->feature_space[...].
//
// Template parameter `audit`: when true, human-readable names are additionally
// recorded in the space_names of each feature group that is written.
//
// NOTE(review): this listing is missing several original lines (the embedded
// numbering skips 56, 63, 65-66, 69, 75, 85, 124, 169, 205, 248, 250, 290 and
// 329). Members `p`, `ae`, `spelling`, `cur_channel_v` and `redefine_some` are
// used below but their declarations are not visible here.
49 template <bool audit>
50 class TC_parser
51 {
52  public:
    // Start of the text given to the constructor; only used to print context in warnings.
53  char* beginLine;
    // Current read position; every parsing method advances it.
54  char* reading_head;
    // End of the text being parsed (as passed to the constructor).
55  char* endLine;
    // Set when the current namespace's feature group was empty at the time the
    // namespace was opened; nameSpace() uses it to decide whether to push `index`
    // onto ae->indices.
57  bool new_index;
    // Counter for features with an empty name; reset to 0 by nameSpace().
58  size_t anon;
    // Hash of the current namespace name; used as the seed/offset for feature hashes.
59  uint64_t channel_hash;
    // Audit only: C string holding the current namespace name; released with free().
60  char* base;
    // Index of the current namespace in ae->feature_space (first byte of its name,
    // optionally remapped through *redefine).
61  unsigned char index;
    // Value of the feature currently being parsed.
62  float v;
    // Pointers into the corresponding arrays of the vw instance (set by the constructor).
64  std::array<unsigned char, NUM_NAMESPACES>* redefine;
67  std::array<uint64_t, NUM_NAMESPACES>* affix_features;
68  std::array<bool, NUM_NAMESPACES>* spelling_features;
    // Copied from the vw instance by the constructor.
70  uint32_t hash_seed;
71  uint64_t parse_mask;
72 
73  std::array<std::vector<feature_dict*>, NUM_NAMESPACES>* namespace_dictionaries;
74 
76 
    // Formats a diagnostic of the form
    //   <message><text in [begin,pos)><message2>in Example #<n>: "<whole line>"
    // and, when p->strict_parse is false, writes it to std::cerr.
77  inline void parserWarning(const char* message, char* begin, char* pos, const char* message2)
78  {
79  std::stringstream ss;
80  ss << message << std::string(begin, pos - begin).c_str() << message2 << "in Example #"
81  << this->p->end_parsed_examples << ": \"" << std::string(this->beginLine, this->endLine).c_str() << "\""
82  << std::endl;
83  if (p->strict_parse)
84  {
    // NOTE(review): the statement for strict mode (listing line 85) is missing here.
86  }
87  else
88  {
89  std::cerr << ss.str();
90  }
91  }
92 
    // Parses the optional ":<float>" that follows a feature name.
    // Returns 1 when the name is directly followed by a delimiter or the end of
    // the line, the parsed number after ':', and 0 on a syntax error. A value
    // that parses to NaN is replaced by 0 after a warning.
    // NOTE(review): *reading_head is evaluated before `reading_head == endLine`,
    // so the byte at endLine is read; assumes that byte is readable - confirm.
93  inline float featureValue()
94  {
95  if (*reading_head == ' ' || *reading_head == '\t' || *reading_head == '|' || reading_head == endLine ||
96  *reading_head == '\r')
97  return 1.;
98  else if (*reading_head == ':')
99  {
100  // featureValue --> ':' 'Float'
101  ++reading_head;
102  char* end_read = nullptr;
103  v = parseFloat(reading_head, &end_read, endLine);
    // No characters consumed: warn, but keep going with whatever parseFloat returned.
104  if (end_read == reading_head)
105  {
106  parserWarning("malformed example! Float expected after : \"", beginLine, reading_head, "\"");
107  }
108  if (std::isnan(v))
109  {
110  v = 0.f;
111  parserWarning("warning: invalid feature value:\"", reading_head, end_read, "\" read as NaN. Replacing with 0.");
112  }
113  reading_head = end_read;
114  return v;
115  }
116  else
117  {
118  // syntax error
119  parserWarning("malformed example! '|', ':', space, or EOL expected after : \"", beginLine, reading_head, "\"");
120  return 0.f;
121  }
122  }
123 
    // NOTE(review): the declaration line of this method (listing line 124) is
    // missing; the cross-reference index lists it as `substring read_name()`.
    // The body advances reading_head to the next ' ', ':', '\t', '|', '\r' or
    // endLine and returns the skipped range as a substring.
125  {
126  substring ret;
127  ret.begin = reading_head;
128  while (!(*reading_head == ' ' || *reading_head == ':' || *reading_head == '\t' || *reading_head == '|' ||
129  reading_head == endLine || *reading_head == '\r'))
130  ++reading_head;
131  ret.end = reading_head;
132 
133  return ret;
134  }
135 
    // Parses at most one feature at reading_head and appends it to the current
    // namespace's feature group. Depending on the per-namespace settings it also
    // emits affix, spelling and dictionary features derived from the feature name.
136  inline void maybeFeature()
137  {
138  if (*reading_head == ' ' || *reading_head == '\t' || *reading_head == '|' || reading_head == endLine ||
139  *reading_head == '\r')
140  {
141  // maybeFeature --> ø
142  }
143  else
144  {
145  // maybeFeature --> 'String' FeatureValue
146  substring feature_name = read_name();
    // The feature value is scaled by the namespace value.
147  v = cur_channel_v * featureValue();
148  uint64_t word_hash;
    // Named features are hashed and masked; unnamed ones get consecutive ids
    // starting at channel_hash.
149  if (feature_name.end != feature_name.begin)
150  word_hash = (p->hasher(feature_name, channel_hash) & parse_mask);
151  else
152  word_hash = channel_hash + anon++;
153  if (v == 0)
154  return; // dont add 0 valued features to list of features
155  features& fs = ae->feature_space[index];
156  fs.push_back(v, word_hash);
157  if (audit)
158  {
    // Build a NUL-terminated copy of the feature name for the audit record.
159  v_array<char> feature_v = v_init<char>();
160  push_many(feature_v, feature_name.begin, feature_name.end - feature_name.begin);
161  feature_v.push_back('\0');
162  fs.space_names.push_back(audit_strings_ptr(new audit_strings(base, feature_v.begin())));
163  feature_v.delete_v();
164  }
    // Affix features: only for named features in namespaces with a non-zero affix spec.
165  if ((*affix_features)[index] > 0 && (feature_name.end != feature_name.begin))
166  {
167  features& affix_fs = ae->feature_space[affix_namespace];
168  if (affix_fs.size() == 0)
    // NOTE(review): the statement controlled by this `if` (listing line 169) is
    // missing; as printed, the `if` would govern the declaration below.
170  uint64_t affix = (*affix_features)[index];
    // The spec is consumed 4 bits at a time: bit 0 selects prefix vs. suffix,
    // bits 1-3 give the affix length.
171  while (affix > 0)
172  {
173  bool is_prefix = affix & 0x1;
174  uint64_t len = (affix >> 1) & 0x7;
175  substring affix_name = {feature_name.begin, feature_name.end};
    // Names no longer than `len` are used whole.
176  if (affix_name.end > affix_name.begin + len)
177  {
178  if (is_prefix)
179  affix_name.end = affix_name.begin + len;
180  else
181  affix_name.begin = affix_name.end - len;
182  }
183  word_hash =
184  p->hasher(affix_name, (uint64_t)channel_hash) * (affix_constant + (affix & 0xF) * quadratic_constant);
185  affix_fs.push_back(v, word_hash);
186  if (audit)
187  {
    // Audit name: [<namespace char>]{+|-}<len>=<affix text>
    // NOTE(review): unlike feature_v above, affix_v has no delete_v() call in
    // the visible code - possible leak; confirm.
188  v_array<char> affix_v = v_init<char>();
189  if (index != ' ')
190  affix_v.push_back(index);
191  affix_v.push_back(is_prefix ? '+' : '-');
192  affix_v.push_back('0' + (char)len);
193  affix_v.push_back('=');
194  push_many(affix_v, affix_name.begin, affix_name.end - affix_name.begin);
195  affix_v.push_back('\0');
196  affix_fs.space_names.push_back(audit_strings_ptr(new audit_strings("affix", affix_v.begin())));
197  }
198  affix >>= 4;
199  }
200  }
    // Spelling features: replace every character of the name by its class
    // ('0' digit, 'a' lower case, 'A' upper case, '.' dot, '#' anything else)
    // and hash the resulting pattern.
201  if ((*spelling_features)[index])
202  {
203  features& spell_fs = ae->feature_space[spelling_namespace];
204  if (spell_fs.size() == 0)
    // NOTE(review): the statement controlled by this `if` (listing line 205) is
    // missing; as printed, the `if` would govern spelling.clear() below.
206  // v_array<char> spelling;
207  spelling.clear();
208  for (char* c = feature_name.begin; c != feature_name.end; ++c)
209  {
210  char d = 0;
211  if ((*c >= '0') && (*c <= '9'))
212  d = '0';
213  else if ((*c >= 'a') && (*c <= 'z'))
214  d = 'a';
215  else if ((*c >= 'A') && (*c <= 'Z'))
216  d = 'A';
217  else if (*c == '.')
218  d = '.';
219  else
220  d = '#';
221  // if ((spelling.size() == 0) || (spelling.last() != d))
222  spelling.push_back(d);
223  }
224  substring spelling_ss = {spelling.begin(), spelling.end()};
225  uint64_t word_hash = hashstring(spelling_ss, (uint64_t)channel_hash);
226  spell_fs.push_back(v, word_hash);
227  if (audit)
228  {
    // Audit name: [<namespace char>_]<pattern>
    // NOTE(review): spelling_v has no delete_v() call in the visible code -
    // possible leak; confirm.
229  v_array<char> spelling_v = v_init<char>();
230  if (index != ' ')
231  {
232  spelling_v.push_back(index);
233  spelling_v.push_back('_');
234  }
235  push_many(spelling_v, spelling_ss.begin, spelling_ss.end - spelling_ss.begin);
236  spelling_v.push_back('\0');
237  spell_fs.space_names.push_back(audit_strings_ptr(new audit_strings("spelling", spelling_v.begin())));
238  }
239  }
    // Dictionary features: look the feature name up in every dictionary attached
    // to this namespace and append the stored features of each hit.
240  if ((*namespace_dictionaries)[index].size() > 0)
241  {
242  for (auto map : (*namespace_dictionaries)[index])
243  {
244  uint64_t hash = uniform_hash(feature_name.begin, feature_name.end - feature_name.begin, quadratic_constant);
245  features* feats = map->get(feature_name, hash);
246  if ((feats != nullptr) && (feats->values.size() > 0))
247  {
    // NOTE(review): the declaration of dict_fs (listing line 248) and the
    // statement controlled by the `if` below (listing line 250) are missing.
249  if (dict_fs.size() == 0)
251  push_many(dict_fs.values, feats->values.begin(), feats->values.size());
252  push_many(dict_fs.indicies, feats->indicies.begin(), feats->indicies.size());
253  dict_fs.sum_feat_sq += feats->sum_feat_sq;
254  if (audit)
255  for (size_t i = 0; i < feats->indicies.size(); ++i)
256  {
    // Audit name: <namespace char>_<feature name>=<dictionary feature index>
257  uint64_t id = feats->indicies[i];
258  std::stringstream ss;
259  ss << index << '_';
260  for (char* fc = feature_name.begin; fc != feature_name.end; ++fc) ss << *fc;
261  ss << '=' << id;
262  dict_fs.space_names.push_back(audit_strings_ptr(new audit_strings("dictionary", ss.str())));
263  }
264  }
265  }
266  }
267  }
268  }
269 
    // Parses the optional ":<float>" that follows a namespace name and stores it
    // in cur_channel_v. A value that parses to NaN is replaced by 1.
    // NOTE(review): parseFloat is called without the endLine argument here,
    // whereas featureValue() passes it; the index lists the parameter as
    // `char *endLine=nullptr` - confirm the difference is intended.
270  inline void nameSpaceInfoValue()
271  {
272  if (*reading_head == ' ' || *reading_head == '\t' || reading_head == endLine || *reading_head == '|' ||
273  *reading_head == '\r')
274  {
275  // nameSpaceInfoValue --> ø
276  }
277  else if (*reading_head == ':')
278  {
279  // nameSpaceInfoValue --> ':' 'Float'
280  ++reading_head;
281  char* end_read = nullptr;
282  cur_channel_v = parseFloat(reading_head, &end_read);
283  if (end_read == reading_head)
284  {
285  parserWarning("malformed example! Float expected after : \"", beginLine, reading_head, "\"");
286  }
287  if (std::isnan(cur_channel_v))
288  {
289  cur_channel_v = 1.f;
    // NOTE(review): the line opening this call (listing line 290) is missing;
    // the arguments below match the parameter list of parserWarning().
291  "warning: invalid namespace value:\"", reading_head, end_read, "\" read as NaN. Replacing with 1.");
292  }
293  reading_head = end_read;
294  }
295  else
296  {
297  // syntax error
298  parserWarning("malformed example! '|',':', space, or EOL expected after : \"", beginLine, reading_head, "\"");
299  }
300  }
301 
    // Parses a namespace name: sets `index` from its first byte (remapped through
    // *redefine when redefine_some is set), records whether that feature group
    // was empty, and hashes the full name into channel_hash.
302  inline void nameSpaceInfo()
303  {
304  if (reading_head == endLine || *reading_head == '|' || *reading_head == ' ' || *reading_head == '\t' ||
305  *reading_head == ':' || *reading_head == '\r')
306  {
307  // syntax error
308  parserWarning("malformed example! String expected after : \"", beginLine, reading_head, "\"");
309  }
310  else
311  {
312  // NameSpaceInfo --> 'String' NameSpaceInfoValue
313  index = (unsigned char)(*reading_head);
314  if (redefine_some)
315  index = (*redefine)[index]; // redefine index
316  if (ae->feature_space[index].size() == 0)
317  new_index = true;
318  substring name = read_name();
319  if (audit)
320  {
    // Keep a NUL-terminated copy of the namespace name in `base`.
    // NOTE(review): `base` takes over base_v_array's buffer and is later
    // released with free(); assumes v_array allocates with the malloc family -
    // confirm.
321  v_array<char> base_v_array = v_init<char>();
322  push_many(base_v_array, name.begin, name.end - name.begin);
323  base_v_array.push_back('\0');
324  if (base != nullptr)
325  free(base);
326  base = base_v_array.begin();
327  }
328  channel_hash = p->hasher(name, this->hash_seed);
    // NOTE(review): listing line 329 is missing here; the production comment
    // above suggests a call to nameSpaceInfoValue() - confirm against upstream.
330  }
331  }
332 
    // Consumes a run of features separated by spaces/tabs, then checks that the
    // next character is '|', '\r' or the end of the line.
    // NOTE(review): *reading_head is tested before `reading_head < endLine`.
333  inline void listFeatures()
334  {
335  while ((*reading_head == ' ' || *reading_head == '\t') && (reading_head < endLine))
336  {
337  // listFeatures --> ' ' MaybeFeature ListFeatures
338  ++reading_head;
339  maybeFeature();
340  }
341  if (!(*reading_head == '|' || reading_head == endLine || *reading_head == '\r'))
342  {
343  // syntax error
344  parserWarning("malformed example! '|',space, or EOL expected after : \"", beginLine, reading_head, "\"");
345  }
346  }
347 
    // Parses everything after one '|': either an unnamed namespace (features
    // start right away; index is ' ') or a named one. Afterwards, if the
    // namespace's feature group was empty before and is non-empty now, its index
    // is appended to ae->indices.
348  inline void nameSpace()
349  {
    // Reset per-namespace state.
350  cur_channel_v = 1.0;
351  index = 0;
352  new_index = false;
353  anon = 0;
354  if (*reading_head == ' ' || *reading_head == '\t' || reading_head == endLine || *reading_head == '|' ||
355  *reading_head == '\r')
356  {
357  // NameSpace --> ListFeatures
358  index = (unsigned char)' ';
359  if (ae->feature_space[index].size() == 0)
360  new_index = true;
361  if (audit)
362  {
    // The audit name of the unnamed namespace is a single space.
363  if (base != nullptr)
364  free(base);
365  base = calloc_or_throw<char>(2);
366  base[0] = ' ';
367  base[1] = '\0';
368  }
    // Hash of the empty name; 0 when the seed is 0.
369  channel_hash = this->hash_seed == 0 ? 0 : uniform_hash("", 0, this->hash_seed);
370  listFeatures();
371  }
372  else if (*reading_head != ':')
373  {
374  // NameSpace --> NameSpaceInfo ListFeatures
375  nameSpaceInfo();
376  listFeatures();
377  }
378  else
379  {
380  // syntax error
381  parserWarning("malformed example! '|',String,space, or EOL expected after : \"", beginLine, reading_head, "\"");
382  }
383  if (new_index && ae->feature_space[index].size() > 0)
384  ae->indices.push_back(index);
385  }
386 
    // Top-level production: parses every '|'-introduced namespace, then checks
    // that the parse stopped at the end of the line or at '\r'.
    // NOTE(review): *reading_head is tested before `reading_head < endLine`.
387  inline void listNameSpace()
388  {
389  while ((*reading_head == '|') && (reading_head < endLine)) // ListNameSpace --> '|' NameSpace ListNameSpace
390  {
391  ++reading_head;
392  nameSpace();
393  }
394  if (reading_head != endLine && *reading_head != '\r')
395  {
396  // syntax error
397  parserWarning("malformed example! '|' or EOL expected after : \"", beginLine, reading_head, "\"");
398  }
399  }
400 
    // Parses [reading_head, endLine) into *ae using the settings of `all`.
    // Nothing is parsed (and no member other than `spelling` is set) when the
    // range is empty. The audit name buffer `base` is released before returning.
    // NOTE(review): `spelling` is initialised here but no matching release is
    // visible in this listing - confirm.
401  TC_parser(char* reading_head, char* endLine, vw& all, example* ae)
402  {
403  spelling = v_init<char>();
404  if (endLine != reading_head)
405  {
406  this->beginLine = reading_head;
407  this->reading_head = reading_head;
408  this->endLine = endLine;
409  this->p = all.p;
410  this->redefine_some = all.redefine_some;
411  this->redefine = &all.redefine;
412  this->ae = ae;
413  this->affix_features = &all.affix_features;
414  this->spelling_features = &all.spelling_features;
415  this->namespace_dictionaries = &all.namespace_dictionaries;
416  this->base = nullptr;
417  this->hash_seed = all.hash_seed;
418  this->parse_mask = all.parse_mask;
419  listNameSpace();
420  if (base != nullptr)
421  free(base);
422  }
423  }
424 };
425 
427 {
428  all->p->lp.default_label(&ae->l);
429  char* bar_location = safe_index(example.begin, '|', example.end);
430  char* tab_location = safe_index(example.begin, '\t', bar_location);
431  substring label_space;
432  if (tab_location != bar_location)
433  {
434  label_space.begin = tab_location + 1;
435  }
436  else
437  {
438  label_space.begin = example.begin;
439  }
440  label_space.end = bar_location;
441 
442  if (*example.begin == '|')
443  {
444  all->p->words.clear();
445  }
446  else
447  {
448  tokenize(' ', label_space, all->p->words);
449  if (!all->p->words.empty() &&
450  (all->p->words.last().end == label_space.end ||
451  *(all->p->words.last().begin) == '\'')) // The last field is a tag, so record and strip it off
452  {
453  substring tag = all->p->words.pop();
454  if (*tag.begin == '\'')
455  tag.begin++;
456  push_many(ae->tag, tag.begin, tag.end - tag.begin);
457  }
458  }
459 
460  if (!all->p->words.empty())
461  all->p->lp.parse_label(all->p, all->sd, &ae->l, all->p->words);
462 
463  if (all->audit || all->hash_inv)
464  TC_parser<true> parser_line(bar_location, example.end, *all, ae);
465  else
466  TC_parser<false> parser_line(bar_location, example.end, *all, ae);
467 }
468 
// Splits `phrase` into the pieces separated by `delimiter`.
//
// phrase     - NUL-terminated input; a null pointer is treated as "".
// delimiter  - separator string. An empty delimiter never matches, so the
//              whole input is returned as a single piece (searching for ""
//              would otherwise match at every position and never terminate).
//
// Returns the pieces in order. Empty pieces are kept, so the result always has
// (number of delimiter occurrences + 1) elements; e.g. "a\n" split on "\n"
// yields {"a", ""}.
std::vector<std::string> split(char* phrase, const std::string& delimiter)
{
  std::vector<std::string> list;
  const std::string s = (phrase != nullptr) ? std::string(phrase) : std::string();

  if (delimiter.empty())
  {
    list.push_back(s);
    return list;
  }

  // Scan forward from `start` instead of erasing the consumed prefix, which
  // keeps the whole split linear in the input length.
  size_t start = 0;
  size_t pos = 0;
  while ((pos = s.find(delimiter, start)) != std::string::npos)
  {
    list.push_back(s.substr(start, pos - start));
    start = pos + delimiter.length();
  }
  list.push_back(s.substr(start));
  return list;
}
484 
485 namespace VW
486 {
487 void read_line(vw& all, example* ex, char* line)
488 {
489  substring ss = {line, line + strlen(line)};
490  while ((ss.end >= ss.begin) && (*(ss.end - 1) == '\n')) ss.end--;
491  substring_to_example(&all, ex, ss);
492 }
493 
494 void read_lines(vw* all, char* line, size_t /*len*/, v_array<example*>& examples)
495 {
496  auto lines = split(line, "\n");
497  for (size_t i = 0; i < lines.size(); i++)
498  {
499  // Check if a new empty example needs to be added.
500  if (examples.size() < i + 1)
501  {
502  examples.push_back(&VW::get_unused_example(all));
503  }
504  read_line(*all, examples[i], const_cast<char*>(lines[i].c_str()));
505  }
506 }
507 
508 } // namespace VW
TC_parser(char *reading_head, char *endLine, vw &all, example *ae)
void nameSpaceInfoValue()
v_array< char > tag
Definition: example.h:63
size_t anon
v_array< namespace_index > indices
std::array< bool, NUM_NAMESPACES > spelling_features
Definition: global_data.h:477
std::array< std::vector< feature_dict * >, NUM_NAMESPACES > namespace_dictionaries
Definition: global_data.h:482
substring read_name()
T pop()
Definition: v_array.h:58
unsigned char index
void listNameSpace()
uint64_t channel_hash
void push_back(feature_value v, feature_index i)
void read_line(vw &all, example *ex, char *line)
v_array< substring > words
Definition: parser.h:63
char * endLine
std::shared_ptr< audit_strings > audit_strings_ptr
Definition: feature_group.h:23
bool hash_inv
Definition: global_data.h:541
char * end
Definition: hashstring.h:10
constexpr unsigned char affix_namespace
Definition: constant.h:27
char * begin
Definition: hashstring.h:9
constexpr int quadratic_constant
Definition: constant.h:7
v_array< feature_index > indicies
int read_features_string(vw *all, v_array< example *> &examples)
void(* default_label)(void *)
Definition: label_parser.h:12
bool redefine_some
Definition: global_data.h:467
the core definition of a set of features.
VW_STD14_CONSTEXPR uint64_t uniform_hash(const void *key, size_t len, uint64_t seed)
Definition: hash.h:67
#define THROW_EX(ex, args)
Definition: vw_exception.h:188
io_buf * input
Definition: parser.h:69
std::array< std::vector< feature_dict * >, NUM_NAMESPACES > * namespace_dictionaries
bool redefine_some
v_array< feature_value > values
hash_func_t hasher
Definition: parser.h:73
std::array< uint64_t, NUM_NAMESPACES > affix_features
Definition: global_data.h:476
float cur_channel_v
std::array< unsigned char, NUM_NAMESPACES > * redefine
constexpr unsigned char spelling_namespace
Definition: constant.h:28
char * base
void read_lines(vw *all, char *line, size_t, v_array< example *> &examples)
T *& begin()
Definition: v_array.h:42
bool strict_parse
Definition: parser.h:107
uint64_t parse_mask
size_t size() const
Definition: v_array.h:68
std::array< uint64_t, NUM_NAMESPACES > * affix_features
void listFeatures()
parser * p
Definition: global_data.h:377
std::array< features, NUM_NAMESPACES > feature_space
void substring_to_example(vw *all, example *ae, substring example)
size_t size() const
parser * p
size_t readto(io_buf &i, char *&pointer, char terminal)
Definition: io_buf.cc:58
void push_many(v_array< T > &v, const T *_begin, size_t num)
Definition: v_array.h:207
void push_back(const T &new_ele)
Definition: v_array.h:107
shared_data * sd
Definition: global_data.h:375
example * ae
char * safe_index(char *start, char v, char *max)
float id(float in)
Definition: scorer.cc:51
void clear()
Definition: v_array.h:88
void tokenize(char delim, substring s, ContainerT &ret, bool allow_empty=false)
constexpr size_t NUM_NAMESPACES
Definition: constant.h:38
bool new_index
void(* parse_label)(parser *, shared_data *, void *, v_array< substring > &)
Definition: label_parser.h:13
void nameSpaceInfo()
constexpr unsigned char dictionary_namespace
Definition: constant.h:30
uint32_t hash_seed
std::array< unsigned char, NUM_NAMESPACES > redefine
Definition: global_data.h:468
T *& end()
Definition: v_array.h:43
v_array< audit_strings_ptr > space_names
constexpr int affix_constant
Definition: constant.h:10
float featureValue()
polylabel l
Definition: example.h:57
uint64_t parse_mask
Definition: global_data.h:453
void maybeFeature()
float sum_feat_sq
VW_STD14_CONSTEXPR uint64_t hashstring(substring s, uint64_t h)
Definition: hashstring.h:18
uint64_t end_parsed_examples
Definition: parser.h:82
Definition: autolink.cc:11
bool empty() const
Definition: v_array.h:59
char * reading_head
uint32_t hash_seed
Definition: global_data.h:401
Definition: parser.h:38
bool audit
Definition: global_data.h:486
size_t read_features(vw *all, char *&line, size_t &num_chars)
v_array< char > spelling
T last() const
Definition: v_array.h:57
std::vector< std::string > split(char *phrase, const std::string &delimiter)
void delete_v()
Definition: v_array.h:98
example & get_unused_example(vw *all)
Definition: parser.cc:664
float parseFloat(char *p, char **end, char *endLine=nullptr)
std::array< bool, NUM_NAMESPACES > * spelling_features
constexpr uint64_t c
Definition: rand48.cc:12
char * beginLine
label_parser lp
Definition: parser.h:102
void parserWarning(const char *message, char *begin, char *pos, const char *message2)
void nameSpace()
std::pair< std::string, std::string > audit_strings
Definition: feature_group.h:22