Vowpal Wabbit
cache.cc
Go to the documentation of this file.
1 /*
2 Copyright (c) by respective owners including Yahoo!, Microsoft, and
3 individual contributors. All rights reserved. Released under a BSD (revised)
4 license as described in the file LICENSE.
5  */
6 #include "cache.h"
7 #include "unique_sort.h"
8 #include "global_data.h"
9 #include "vw.h"
10 
11 constexpr size_t int_size = 11;
12 constexpr size_t char_size = 2;
13 constexpr size_t neg_1 = 1;
14 constexpr size_t general = 2;
15 
16 inline char* run_len_decode(char* p, uint64_t& i)
17 {
18  // read an int 7 bits at a time.
19  size_t count = 0;
20  while (*p & 128) i = i | ((uint64_t)(*(p++) & 127) << 7 * count++);
21  i = i | ((uint64_t)(*(p++)) << 7 * count);
22  return p;
23 }
24 
25 inline char* run_len_encode(char* p, uint64_t i)
26 {
27  // store an int 7 bits at a time.
28  while (i >= 128)
29  {
30  *(p++) = (i & 127) | 128;
31  i = i >> 7;
32  }
33  *(p++) = (i & 127);
34  return p;
35 }
36 
37 inline int64_t ZigZagDecode(uint64_t n) { return (n >> 1) ^ -static_cast<int64_t>(n & 1); }
38 
39 size_t read_cached_tag(io_buf& cache, example* ae)
40 {
41  char* c;
42  size_t tag_size;
43  if (cache.buf_read(c, sizeof(tag_size)) < sizeof(tag_size))
44  return 0;
45  tag_size = *(size_t*)c;
46  c += sizeof(tag_size);
47  cache.set(c);
48  if (cache.buf_read(c, tag_size) < tag_size)
49  return 0;
50 
51  ae->tag.clear();
52  push_many(ae->tag, c, tag_size);
53  return tag_size + sizeof(tag_size);
54 }
55 
56 struct one_float
57 {
58  float f;
59 }
60 #ifndef _WIN32
61 __attribute__((packed))
62 #endif
63 ;
64 
66 {
67  example* ae = examples[0];
68  ae->sorted = all->p->sorted_cache;
69  io_buf* input = all->p->input;
70 
71  size_t total = all->p->lp.read_cached_label(all->sd, &ae->l, *input);
72  if (total == 0)
73  return 0;
74  if (read_cached_tag(*input, ae) == 0)
75  return 0;
76  char* c;
77  unsigned char num_indices = 0;
78  if (input->buf_read(c, sizeof(num_indices)) < sizeof(num_indices))
79  return 0;
80  num_indices = *(unsigned char*)c;
81  c += sizeof(num_indices);
82 
83  all->p->input->set(c);
84  for (; num_indices > 0; num_indices--)
85  {
86  size_t temp;
87  unsigned char index = 0;
88  if ((temp = input->buf_read(c, sizeof(index) + sizeof(size_t))) < sizeof(index) + sizeof(size_t))
89  {
90  all->trace_message << "truncated example! " << temp << " " << char_size + sizeof(size_t) << std::endl;
91  return 0;
92  }
93 
94  index = *(unsigned char*)c;
95  c += sizeof(index);
96  ae->indices.push_back((size_t)index);
97  features& ours = ae->feature_space[index];
98  size_t storage = *(size_t*)c;
99  c += sizeof(size_t);
100  all->p->input->set(c);
101  total += storage;
102  if (input->buf_read(c, storage) < storage)
103  {
104  all->trace_message << "truncated example! wanted: " << storage << " bytes" << std::endl;
105  return 0;
106  }
107 
108  char* end = c + storage;
109 
110  uint64_t last = 0;
111 
112  for (; c != end;)
113  {
114  feature_index i = 0;
115  c = run_len_decode(c, i);
116  feature_value v = 1.f;
117  if (i & neg_1)
118  v = -1.;
119  else if (i & general)
120  {
121  v = ((one_float*)c)->f;
122  c += sizeof(float);
123  }
124  uint64_t diff = i >> 2;
125  int64_t s_diff = ZigZagDecode(diff);
126  if (s_diff < 0)
127  ae->sorted = false;
128  i = last + s_diff;
129  last = i;
130  ours.push_back(v, i);
131  }
132  all->p->input->set(c);
133  }
134 
135  return (int)total;
136 }
137 
138 inline uint64_t ZigZagEncode(int64_t n)
139 {
140  uint64_t ret = (n << 1) ^ (n >> 63);
141  return ret;
142 }
143 
144 void output_byte(io_buf& cache, unsigned char s)
145 {
146  char* c;
147 
148  cache.buf_write(c, 1);
149  *(c++) = s;
150  cache.set(c);
151 }
152 
153 void output_features(io_buf& cache, unsigned char index, features& fs, uint64_t mask)
154 {
155  char* c;
156  size_t storage = fs.size() * int_size;
157  for (feature_value f : fs.values)
158  if (f != 1. && f != -1.)
159  storage += sizeof(feature_value);
160 
161  cache.buf_write(c, sizeof(index) + storage + sizeof(size_t));
162  *reinterpret_cast<unsigned char*>(c) = index;
163  c += sizeof(index);
164 
165  char* storage_size_loc = c;
166  c += sizeof(size_t);
167 
168  uint64_t last = 0;
169  for (features::iterator& f : fs)
170  {
171  feature_index fi = f.index() & mask;
172  int64_t s_diff = (fi - last);
173  uint64_t diff = ZigZagEncode(s_diff) << 2;
174  last = fi;
175 
176  if (f.value() == 1.)
177  c = run_len_encode(c, diff);
178  else if (f.value() == -1.)
179  c = run_len_encode(c, diff | neg_1);
180  else
181  {
182  c = run_len_encode(c, diff | general);
183  memcpy(c, &f.value(), sizeof(feature_value));
184  c += sizeof(feature_value);
185  }
186  }
187 
188  cache.set(c);
189  *(size_t*)storage_size_loc = c - storage_size_loc - sizeof(size_t);
190 }
191 
192 void cache_tag(io_buf& cache, v_array<char> tag)
193 {
194  char* c;
195  cache.buf_write(c, sizeof(size_t) + tag.size());
196  *(size_t*)c = tag.size();
197  c += sizeof(size_t);
198  memcpy(c, tag.begin(), tag.size());
199  c += tag.size();
200  cache.set(c);
201 }
202 
203 void cache_features(io_buf& cache, example* ae, uint64_t mask)
204 {
205  cache_tag(cache, ae->tag);
206  output_byte(cache, (unsigned char)ae->indices.size());
207 
208  for (namespace_index ns : ae->indices) output_features(cache, ns, ae->feature_space[ns], mask);
209 }
210 
211 uint32_t VW::convert(size_t number)
212 {
213  if (number > UINT32_MAX)
214  {
215  THROW("size_t value is out of bounds of uint32_t.")
216  }
217  return static_cast<uint32_t>(number);
218 }
v_array< char > tag
Definition: example.h:63
v_array< namespace_index > indices
size_t read_cached_tag(io_buf &cache, example *ae)
Definition: cache.cc:39
void push_back(feature_value v, feature_index i)
constexpr size_t char_size
Definition: cache.cc:12
bool sorted_cache
Definition: parser.h:78
uint64_t ZigZagEncode(int64_t n)
Definition: cache.cc:138
bool sorted
Definition: example.h:78
constexpr size_t int_size
Definition: cache.cc:11
the core definition of a set of features.
float feature_value
Definition: feature_group.h:20
io_buf * input
Definition: parser.h:69
char * run_len_decode(char *p, uint64_t &i)
Definition: cache.cc:16
void output_byte(io_buf &cache, unsigned char s)
Definition: cache.cc:144
v_array< feature_value > values
void set(char *p)
Definition: io_buf.h:163
constexpr size_t neg_1
Definition: cache.cc:13
T *& begin()
Definition: v_array.h:42
size_t size() const
Definition: v_array.h:68
parser * p
Definition: global_data.h:377
float f
Definition: cache.cc:58
std::array< features, NUM_NAMESPACES > feature_space
size_t size() const
void push_many(v_array< T > &v, const T *_begin, size_t num)
Definition: v_array.h:207
void output_features(io_buf &cache, unsigned char index, features &fs, uint64_t mask)
Definition: cache.cc:153
void push_back(const T &new_ele)
Definition: v_array.h:107
shared_data * sd
Definition: global_data.h:375
void clear()
Definition: v_array.h:88
vw_ostream trace_message
Definition: global_data.h:424
unsigned char namespace_index
struct one_float __attribute__((packed))
uint64_t feature_index
Definition: feature_group.h:21
Definition: io_buf.h:54
void buf_write(char *&pointer, size_t n)
Definition: io_buf.cc:94
iterator over values and indicies
polylabel l
Definition: example.h:57
void cache_features(io_buf &cache, example *ae, uint64_t mask)
Definition: cache.cc:203
int64_t ZigZagDecode(uint64_t n)
Definition: cache.cc:37
char * run_len_encode(char *p, uint64_t i)
Definition: cache.cc:25
int read_cached_features(vw *all, v_array< example *> &examples)
Definition: cache.cc:65
uint32_t convert(size_t number)
Definition: cache.cc:211
size_t(* read_cached_label)(shared_data *, void *, io_buf &cache)
Definition: label_parser.h:15
constexpr size_t general
Definition: cache.cc:14
void cache_tag(io_buf &cache, v_array< char > tag)
Definition: cache.cc:192
#define THROW(args)
Definition: vw_exception.h:181
constexpr uint64_t c
Definition: rand48.cc:12
label_parser lp
Definition: parser.h:102
size_t buf_read(char *&pointer, size_t n)
Definition: io_buf.cc:12