19 size_t num_chars_initial =
readto(*(all->
p->
input), line,
'\n');
20 if (num_chars_initial < 1)
21 return num_chars_initial;
22 num_chars = num_chars_initial;
23 if (line[0] ==
'\xef' && num_chars >= 3 && line[1] ==
'\xbb' && line[2] ==
'\xbf')
28 if (num_chars > 0 && line[num_chars - 1] ==
'\n')
30 if (num_chars > 0 && line[num_chars - 1] ==
'\r')
32 return num_chars_initial;
39 size_t num_chars_initial =
read_features(all, line, num_chars);
40 if (num_chars_initial < 1)
41 return (
int)num_chars_initial;
46 return (
int)num_chars_initial;
64 std::array<unsigned char, NUM_NAMESPACES>*
redefine;
77 inline void parserWarning(
const char* message,
char* begin,
char* pos,
const char* message2)
80 ss << message << std::string(begin, pos - begin).c_str() << message2 <<
"in Example #" 81 << this->p->
end_parsed_examples <<
": \"" << std::string(this->beginLine, this->endLine).c_str() <<
"\"" 89 std::cerr << ss.str();
95 if (*reading_head ==
' ' || *reading_head ==
'\t' || *reading_head ==
'|' || reading_head == endLine ||
96 *reading_head ==
'\r')
98 else if (*reading_head ==
':')
102 char* end_read =
nullptr;
103 v =
parseFloat(reading_head, &end_read, endLine);
104 if (end_read == reading_head)
106 parserWarning(
"malformed example! Float expected after : \"", beginLine, reading_head,
"\"");
111 parserWarning(
"warning: invalid feature value:\"", reading_head, end_read,
"\" read as NaN. Replacing with 0.");
113 reading_head = end_read;
119 parserWarning(
"malformed example! '|', ':', space, or EOL expected after : \"", beginLine, reading_head,
"\"");
128 while (!(*reading_head ==
' ' || *reading_head ==
':' || *reading_head ==
'\t' || *reading_head ==
'|' ||
129 reading_head == endLine || *reading_head ==
'\r'))
138 if (*reading_head ==
' ' || *reading_head ==
'\t' || *reading_head ==
'|' || reading_head == endLine ||
139 *reading_head ==
'\r')
149 if (feature_name.
end != feature_name.
begin)
152 word_hash = channel_hash + anon++;
165 if ((*affix_features)[index] > 0 && (feature_name.
end != feature_name.
begin))
168 if (affix_fs.
size() == 0)
170 uint64_t affix = (*affix_features)[
index];
173 bool is_prefix = affix & 0x1;
174 uint64_t len = (affix >> 1) & 0x7;
176 if (affix_name.
end > affix_name.
begin + len)
179 affix_name.
end = affix_name.
begin + len;
181 affix_name.
begin = affix_name.
end - len;
191 affix_v.
push_back(is_prefix ?
'+' :
'-');
201 if ((*spelling_features)[
index])
204 if (spell_fs.
size() == 0)
208 for (
char*
c = feature_name.
begin;
c != feature_name.
end; ++
c)
211 if ((*
c >=
'0') && (*
c <=
'9'))
213 else if ((*
c >=
'a') && (*
c <=
'z'))
215 else if ((*
c >=
'A') && (*
c <=
'Z'))
225 uint64_t word_hash =
hashstring(spelling_ss, (uint64_t)channel_hash);
240 if ((*namespace_dictionaries)[
index].size() > 0)
242 for (
auto map : (*namespace_dictionaries)[
index])
245 features* feats = map->get(feature_name, hash);
246 if ((feats !=
nullptr) && (feats->
values.
size() > 0))
249 if (dict_fs.
size() == 0)
258 std::stringstream ss;
260 for (
char* fc = feature_name.
begin; fc != feature_name.
end; ++fc) ss << *fc;
272 if (*reading_head ==
' ' || *reading_head ==
'\t' || reading_head == endLine || *reading_head ==
'|' ||
273 *reading_head ==
'\r')
277 else if (*reading_head ==
':')
281 char* end_read =
nullptr;
282 cur_channel_v =
parseFloat(reading_head, &end_read);
283 if (end_read == reading_head)
285 parserWarning(
"malformed example! Float expected after : \"", beginLine, reading_head,
"\"");
287 if (std::isnan(cur_channel_v))
291 "warning: invalid namespace value:\"", reading_head, end_read,
"\" read as NaN. Replacing with 1.");
293 reading_head = end_read;
298 parserWarning(
"malformed example! '|',':', space, or EOL expected after : \"", beginLine, reading_head,
"\"");
304 if (reading_head == endLine || *reading_head ==
'|' || *reading_head ==
' ' || *reading_head ==
'\t' ||
305 *reading_head ==
':' || *reading_head ==
'\r')
308 parserWarning(
"malformed example! String expected after : \"", beginLine, reading_head,
"\"");
313 index = (
unsigned char)(*reading_head);
315 index = (*redefine)[
index];
326 base = base_v_array.
begin();
328 channel_hash = p->
hasher(name, this->hash_seed);
335 while ((*reading_head ==
' ' || *reading_head ==
'\t') && (reading_head < endLine))
341 if (!(*reading_head ==
'|' || reading_head == endLine || *reading_head ==
'\r'))
344 parserWarning(
"malformed example! '|',space, or EOL expected after : \"", beginLine, reading_head,
"\"");
354 if (*reading_head ==
' ' || *reading_head ==
'\t' || reading_head == endLine || *reading_head ==
'|' ||
355 *reading_head ==
'\r')
358 index = (
unsigned char)
' ';
365 base = calloc_or_throw<char>(2);
369 channel_hash = this->hash_seed == 0 ? 0 :
uniform_hash(
"", 0, this->hash_seed);
372 else if (*reading_head !=
':')
381 parserWarning(
"malformed example! '|',String,space, or EOL expected after : \"", beginLine, reading_head,
"\"");
389 while ((*reading_head ==
'|') && (reading_head < endLine))
394 if (reading_head != endLine && *reading_head !=
'\r')
397 parserWarning(
"malformed example! '|' or EOL expected after : \"", beginLine, reading_head,
"\"");
403 spelling = v_init<char>();
404 if (endLine != reading_head)
416 this->base =
nullptr;
432 if (tab_location != bar_location)
434 label_space.
begin = tab_location + 1;
440 label_space.
end = bar_location;
442 if (*example.
begin ==
'|')
454 if (*tag.
begin ==
'\'')
469 std::vector<std::string>
split(
char* phrase,
const std::string& delimiter)
471 std::vector<std::string> list;
472 std::string s = std::string(phrase);
475 while ((pos = s.find(delimiter)) != std::string::npos)
477 token = s.substr(0, pos);
478 list.push_back(token);
479 s.erase(0, pos + delimiter.length());
489 substring ss = {line, line + strlen(line)};
496 auto lines =
split(line,
"\n");
497 for (
size_t i = 0; i < lines.size(); i++)
500 if (examples.
size() < i + 1)
504 read_line(*all, examples[i], const_cast<char*>(lines[i].c_str()));
TC_parser(char *reading_head, char *endLine, vw &all, example *ae)
void nameSpaceInfoValue()
v_array< namespace_index > indices
std::array< bool, NUM_NAMESPACES > spelling_features
std::array< std::vector< feature_dict * >, NUM_NAMESPACES > namespace_dictionaries
void push_back(feature_value v, feature_index i)
void read_line(vw &all, example *ex, char *line)
v_array< substring > words
std::shared_ptr< audit_strings > audit_strings_ptr
constexpr unsigned char affix_namespace
constexpr int quadratic_constant
v_array< feature_index > indicies
int read_features_string(vw *all, v_array< example *> &examples)
void(* default_label)(void *)
the core definition of a set of features.
VW_STD14_CONSTEXPR uint64_t uniform_hash(const void *key, size_t len, uint64_t seed)
#define THROW_EX(ex, args)
std::array< std::vector< feature_dict * >, NUM_NAMESPACES > * namespace_dictionaries
v_array< feature_value > values
std::array< uint64_t, NUM_NAMESPACES > affix_features
std::array< unsigned char, NUM_NAMESPACES > * redefine
constexpr unsigned char spelling_namespace
void read_lines(vw *all, char *line, size_t, v_array< example *> &examples)
std::array< uint64_t, NUM_NAMESPACES > * affix_features
std::array< features, NUM_NAMESPACES > feature_space
void substring_to_example(vw *all, example *ae, substring example)
size_t readto(io_buf &i, char *&pointer, char terminal)
void push_many(v_array< T > &v, const T *_begin, size_t num)
void push_back(const T &new_ele)
char * safe_index(char *start, char v, char *max)
void tokenize(char delim, substring s, ContainerT &ret, bool allow_empty=false)
constexpr size_t NUM_NAMESPACES
void(* parse_label)(parser *, shared_data *, void *, v_array< substring > &)
constexpr unsigned char dictionary_namespace
std::array< unsigned char, NUM_NAMESPACES > redefine
v_array< audit_strings_ptr > space_names
constexpr int affix_constant
VW_STD14_CONSTEXPR uint64_t hashstring(substring s, uint64_t h)
uint64_t end_parsed_examples
size_t read_features(vw *all, char *&line, size_t &num_chars)
std::vector< std::string > split(char *phrase, const std::string &delimiter)
example & get_unused_example(vw *all)
float parseFloat(char *p, char **end, char *endLine=nullptr)
std::array< bool, NUM_NAMESPACES > * spelling_features
void parserWarning(const char *message, char *begin, char *pos, const char *message2)
std::pair< std::string, std::string > audit_strings