Vowpal Wabbit
Classes | Namespaces | Functions
parse_example.cc File Reference
#include <cmath>
#include <math.h>
#include <cctype>
#include "parse_example.h"
#include "hash.h"
#include "unique_sort.h"
#include "global_data.h"
#include "constant.h"

Go to the source code of this file.

Classes

class  TC_parser< audit >
 

Namespaces

 VW
 

Functions

size_t read_features (vw *all, char *&line, size_t &num_chars)
 
int read_features_string (vw *all, v_array< example *> &examples)
 
void substring_to_example (vw *all, example *ae, substring example)
 
std::vector< std::string > split (char *phrase, const std::string &delimiter)
 
void VW::read_line (vw &all, example *ex, char *line)
 
void VW::read_lines (vw *all, char *line, size_t, v_array< example *> &examples)
 

Function Documentation

◆ read_features()

size_t read_features ( vw *  all,
char *&  line,
size_t &  num_chars 
)

Definition at line 16 of file parse_example.cc.

References parser::input, vw::p, and readto().

Referenced by read_features_json(), and read_features_string().

// Reads the next line from all->p->input by calling readto() with '\n' as the
// terminal character, then reports the usable extent of that line.
//   line       - out: reset to nullptr, then handed to readto() (which presumably
//                points it at the data read - confirm in io_buf.cc); advanced past
//                a leading UTF-8 byte-order mark when one is present.
//   num_chars  - out: length of the line after removing the byte-order mark and a
//                trailing '\n' and/or '\r'. Not assigned when readto() reports < 1.
// Returns the untrimmed count reported by readto(), not num_chars.
17 {
18  line = nullptr;
19  size_t num_chars_initial = readto(*(all->p->input), line, '\n');
// Nothing was read: return the count as-is and leave num_chars untouched.
20  if (num_chars_initial < 1)
21  return num_chars_initial;
22  num_chars = num_chars_initial;
// EF BB BF is the UTF-8 byte-order mark. line[0] is safe to read here because
// num_chars >= 1; line[1] and line[2] are only read once num_chars >= 3 holds.
23  if (line[0] == '\xef' && num_chars >= 3 && line[1] == '\xbb' && line[2] == '\xbf')
24  {
25  line += 3;
26  num_chars -= 3;
27  }
// Drop the line terminator: first a trailing '\n', then a '\r' (handles "\r\n").
28  if (num_chars > 0 && line[num_chars - 1] == '\n')
29  num_chars--;
30  if (num_chars > 0 && line[num_chars - 1] == '\r')
31  num_chars--;
32  return num_chars_initial;
33 }
io_buf * input
Definition: parser.h:69
parser * p
Definition: global_data.h:377
size_t readto(io_buf &i, char *&pointer, char terminal)
Definition: io_buf.cc:58

◆ read_features_string()

int read_features_string ( vw *  all,
v_array< example *> &  examples 
)

Definition at line 35 of file parse_example.cc.

References read_features(), and substring_to_example().

Referenced by enable_sources(), and reset_source().

// Reads one line of text with read_features() and parses it into examples[0]
// via substring_to_example().
// Returns the count reported by read_features(), cast to int. When that count is
// below 1 it is returned immediately and `examples` is not touched.
36 {
37  char* line;
38  size_t num_chars;
39  size_t num_chars_initial = read_features(all, line, num_chars);
40  if (num_chars_initial < 1)
41  return (int)num_chars_initial;
42 
// [line, line + num_chars) is the text with BOM and line terminator already removed
// by read_features().
43  substring example = {line, line + num_chars};
44  substring_to_example(all, examples[0], example);
45 
46  return (int)num_chars_initial;
47 }
void substring_to_example(vw *all, example *ae, substring example)
size_t read_features(vw *all, char *&line, size_t &num_chars)

◆ split()

std::vector<std::string> split ( char *  phrase,
const std::string &  delimiter 
)

Definition at line 469 of file parse_example.cc.

Referenced by VW::read_lines().

/// Splits the NUL-terminated string `phrase` on every occurrence of `delimiter`.
///
/// @param phrase     Text to split. A null pointer yields an empty vector.
/// @param delimiter  Separator to split on. An empty delimiter performs no
///                   splitting: the whole phrase is returned as a single token
///                   (searching for "" always matches at the current position,
///                   which previously made the loop spin forever).
/// @return The tokens in order of appearance. Adjacent delimiters and a trailing
///         delimiter produce empty tokens, so a non-null phrase always yields at
///         least one element.
std::vector<std::string> split(char* phrase, const std::string& delimiter)
{
  std::vector<std::string> list;
  if (phrase == nullptr)
    return list;

  const std::string s(phrase);
  size_t start = 0;
  if (!delimiter.empty())
  {
    // Walk forward with a search offset instead of erasing the consumed prefix,
    // which would re-copy the remainder of the string for every token.
    size_t pos = 0;
    while ((pos = s.find(delimiter, start)) != std::string::npos)
    {
      list.push_back(s.substr(start, pos - start));
      start = pos + delimiter.length();
    }
  }
  // Whatever follows the last delimiter (possibly empty) is the final token.
  list.push_back(s.substr(start));
  return list;
}

◆ substring_to_example()

void substring_to_example ( vw *  all,
example *  ae,
substring  example 
)

Definition at line 426 of file parse_example.cc.

References vw::audit, substring::begin, v_array< T >::clear(), label_parser::default_label, v_array< T >::empty(), substring::end, vw::hash_inv, example::l, v_array< T >::last(), parser::lp, vw::p, label_parser::parse_label, v_array< T >::pop(), push_many(), safe_index(), vw::sd, example::tag, tokenize(), and parser::words.

Referenced by prepare_for_learner(), read_features_string(), and VW::read_line().

// Parses one text example held in `example` ([begin, end)) into `ae`:
//   1. resets ae->l with the active label parser's default_label,
//   2. tokenizes the text before the first '|' on spaces into all->p->words,
//   3. moves a trailing tag token, if any, into ae->tag,
//   4. passes the remaining tokens to the label parser's parse_label,
//   5. constructs a TC_parser over [bar_location, example.end).
// NOTE(review): safe_index() is not shown here; the comparisons below read as if
// it returns `max` when the character is not found - confirm.
427 {
428  all->p->lp.default_label(&ae->l);
429  char* bar_location = safe_index(example.begin, '|', example.end);
430  char* tab_location = safe_index(example.begin, '\t', bar_location);
431  substring label_space;
// A tab found before the bar: the label section starts just after the tab, so
// whatever precedes the tab is ignored. Otherwise it starts at the beginning.
432  if (tab_location != bar_location)
433  {
434  label_space.begin = tab_location + 1;
435  }
436  else
437  {
438  label_space.begin = example.begin;
439  }
440  label_space.end = bar_location;
441 
// Text that opens directly with '|' has no label section, so no tokens are kept.
// NOTE(review): this dereferences example.begin even if the substring is empty -
// confirm callers always provide a readable byte there.
442  if (*example.begin == '|')
443  {
444  all->p->words.clear();
445  }
446  else
447  {
448  tokenize(' ', label_space, all->p->words);
// The last token is treated as a tag when it ends exactly at the bar (no space
// between it and '|') or when it begins with a single quote.
449  if (!all->p->words.empty() &&
450  (all->p->words.last().end == label_space.end ||
451  *(all->p->words.last().begin) == '\'')) // The last field is a tag, so record and strip it off
452  {
453  substring tag = all->p->words.pop();
// The leading quote marks the token as a tag and is not stored in ae->tag.
454  if (*tag.begin == '\'')
455  tag.begin++;
456  push_many(ae->tag, tag.begin, tag.end - tag.begin);
457  }
458  }
459 
// Only invoke the label parser when label tokens remain after tag removal.
460  if (!all->p->words.empty())
461  all->p->lp.parse_label(all->p, all->sd, &ae->l, all->p->words);
462 
// The TC_parser object is a temporary local in each branch; presumably its
// constructor does the feature parsing into `ae` (class body not shown - confirm).
// The <true> instantiation is selected when audit or hash_inv is set.
463  if (all->audit || all->hash_inv)
464  TC_parser<true> parser_line(bar_location, example.end, *all, ae);
465  else
466  TC_parser<false> parser_line(bar_location, example.end, *all, ae);
467 }
v_array< char > tag
Definition: example.h:63
T pop()
Definition: v_array.h:58
v_array< substring > words
Definition: parser.h:63
bool hash_inv
Definition: global_data.h:541
char * end
Definition: hashstring.h:10
char * begin
Definition: hashstring.h:9
void(* default_label)(void *)
Definition: label_parser.h:12
parser * p
Definition: global_data.h:377
void push_many(v_array< T > &v, const T *_begin, size_t num)
Definition: v_array.h:207
shared_data * sd
Definition: global_data.h:375
char * safe_index(char *start, char v, char *max)
void clear()
Definition: v_array.h:88
void tokenize(char delim, substring s, ContainerT &ret, bool allow_empty=false)
void(* parse_label)(parser *, shared_data *, void *, v_array< substring > &)
Definition: label_parser.h:13
polylabel l
Definition: example.h:57
bool empty() const
Definition: v_array.h:59
bool audit
Definition: global_data.h:486
T last() const
Definition: v_array.h:57
label_parser lp
Definition: parser.h:102