Classification with Vowpal Wabbit

Note: This should be run in Binder instead of interactively on this page. This is because it depends on some files which the interactive page doesn’t pick up.

import re
import pandas as pd
import string
# Load the pre-split iris data; the CSV files must sit next to this notebook
# (their generation is shown at the end of this document).
training_data = pd.read_csv('iris-training.csv')
testing_data = pd.read_csv('iris-testing.csv')
# Quick sanity check: summary statistics of the 120 training rows.
training_data.describe()
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) y
count 120.000000 120.000000 120.000000 120.000000 120.000000
mean 5.785833 3.045833 3.742500 1.213333 2.008333
std 0.749935 0.424243 1.677867 0.758031 0.804356
min 4.300000 2.000000 1.000000 0.100000 1.000000
25% 5.100000 2.800000 1.600000 0.375000 1.000000
50% 5.750000 3.000000 4.300000 1.300000 2.000000
75% 6.325000 3.300000 5.100000 1.800000 3.000000
max 7.700000 4.400000 6.700000 2.500000 3.000000
def to_vw_format(line):
    """Convert one DataFrame row into a Vowpal Wabbit example string.

    Parameters
    ----------
    line : pd.Series
        A row whose ``y`` entry is the (numeric) class label; every other
        entry is treated as a named numeric feature.

    Returns
    -------
    str
        ``"<label> | name:value name:value ..."`` — VW's plain-text input
        format, with punctuation/whitespace runs in feature names collapsed
        to ``_`` (VW treats ``:``, ``|`` and spaces as delimiters).
    """
    chars = re.escape(string.punctuation)
    res = f'{int(line.y)} |'
    # .items() replaces .iteritems(), which was removed in pandas 2.0.
    for idx, value in line.drop(['y']).items():
        # Raw string for the regex so '\s' is a proper whitespace class.
        feature_name = re.sub(r'([' + chars + r']|\s)+', '_', idx)
        res += f' {feature_name}:{value}'
    return res

Vowpal Wabbit input format

Vowpal Wabbit has its own input format we can use. Let's see what it looks like.

# Preview the first ten training rows converted to VW's input format.
for example_line in training_data.head(10).apply(to_vw_format, axis=1):
    print(example_line)
3 | sepal_length_cm_:6.4 sepal_width_cm_:3.1 petal_length_cm_:5.5 petal_width_cm_:1.8
1 | sepal_length_cm_:5.1 sepal_width_cm_:3.5 petal_length_cm_:1.4 petal_width_cm_:0.2
1 | sepal_length_cm_:5.4 sepal_width_cm_:3.7 petal_length_cm_:1.5 petal_width_cm_:0.2
3 | sepal_length_cm_:6.2 sepal_width_cm_:2.8 petal_length_cm_:4.8 petal_width_cm_:1.8
3 | sepal_length_cm_:5.8 sepal_width_cm_:2.7 petal_length_cm_:5.1 petal_width_cm_:1.9
1 | sepal_length_cm_:4.9 sepal_width_cm_:3.1 petal_length_cm_:1.5 petal_width_cm_:0.2
1 | sepal_length_cm_:4.6 sepal_width_cm_:3.2 petal_length_cm_:1.4 petal_width_cm_:0.2
3 | sepal_length_cm_:5.8 sepal_width_cm_:2.7 petal_length_cm_:5.1 petal_width_cm_:1.9
1 | sepal_length_cm_:4.3 sepal_width_cm_:3.0 petal_length_cm_:1.1 petal_width_cm_:0.1
2 | sepal_length_cm_:6.7 sepal_width_cm_:3.0 petal_length_cm_:5.0 petal_width_cm_:1.7
from vowpalwabbit import pyvw

# One-against-all classifier over the 3 iris classes (labels 1..3).
vw = pyvw.vw("--oaa 3")

# Train online on every example in the training set.
for example in training_data.apply(to_vw_format, axis=1):
    vw.learn(example)

# Collect one predicted class per testing example.
predictions = [
    vw.predict(example)
    for example in testing_data.apply(to_vw_format, axis=1)
]
Num weight bits = 18
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = 
num sources = 1
Enabled reductions: gd, scorer-identity, oaa
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
1.000000 1.000000            1            1.0        3        1        5
1.000000 1.000000            2            2.0        1        3        5
0.750000 0.500000            4            4.0        3        1        5
0.500000 0.250000            8            8.0        3        3        5
0.500000 0.500000           16           16.0        2        2        5
0.375000 0.250000           32           32.0        3        3        5
0.375000 0.375000           64           64.0        2        3        5
0.312500 0.250000          128          128.0        1        1        5
# Accuracy = fraction of testing rows whose true label equals the prediction.
correct_rows = testing_data[testing_data.y == predictions]
accuracy = len(correct_rows) / len(testing_data)

f'Model accuracy {accuracy}'
'Model accuracy 0.7666666666666667'

How was this data set generated?

# How the CSV files used above were produced.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import os

iris = load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
# sklearn targets are 0-based, but VW's --oaa expects labels starting at 1.
df["y"] = iris.target + 1

# Fixed random_state so the 80/20 split is reproducible.
training_data, testing_data = train_test_split(df, random_state=2019, test_size=0.2)

training_data.to_csv(os.path.join(os.getcwd(), 'iris-training.csv'), index=False)
testing_data.to_csv(os.path.join(os.getcwd(), 'iris-testing.csv'), index=False)