Classification

import pandas as pd
import sklearn
import sklearn.model_selection
import sklearn.datasets
import vowpalwabbit
# Load the iris dataset and put its feature matrix into a DataFrame whose
# columns are named after the dataset's feature names.
iris_dataset = sklearn.datasets.load_iris()
feature_frame = pd.DataFrame(
    iris_dataset.data,
    columns=iris_dataset.feature_names,
)
# vw expects labels starting from 1, so the dataset's targets are shifted up
# by one before being stored in the "y" column.
iris_dataframe = feature_frame.assign(y=iris_dataset.target + 1)
# Split the labelled rows into a training frame and a testing frame
# (test_size=0.2).
training_data, testing_data = sklearn.model_selection.train_test_split(
    iris_dataframe,
    test_size=0.2,
)
def to_vw_format(row):
    """Render one labelled row as a Vowpal Wabbit text example.

    The result has the shape ``"<label> | <name>:<value> ..."``: the label is
    ``row.y`` converted to ``int``, and every other entry of the row becomes
    one ``name:value`` pair, in row order.

    Args:
        row: A pandas Series with a ``y`` entry holding the class label and
            string index labels for the remaining (feature) entries.

    Returns:
        The example as a single string. A row with no entries besides ``y``
        yields just ``"<label> |"``.
    """
    pairs = []
    # Series.iteritems() was removed in pandas 2.0; items() yields the same
    # (index, value) pairs and is available on older versions as well.
    for idx, value in row.drop(["y"]).items():
        # Sanitize the column name: spaces become underscores and
        # parentheses are dropped.
        feature_name = idx.replace(" ", "_").replace("(", "").replace(")", "")
        pairs.append(f" {feature_name}:{value}")
    return f"{int(row.y)} |" + "".join(pairs)

Vowpal Wabbit input format

Vowpal Wabbit has its own input format we can use. Let’s see what it looks like.

# Show the first ten training rows after conversion with to_vw_format,
# one example per line.
preview_rows = training_data.head(10)
for vw_line in preview_rows.apply(to_vw_format, axis=1):
    print(vw_line)
1 | sepal_length_cm:5.1 sepal_width_cm:3.7 petal_length_cm:1.5 petal_width_cm:0.4
3 | sepal_length_cm:5.8 sepal_width_cm:2.8 petal_length_cm:5.1 petal_width_cm:2.4
2 | sepal_length_cm:5.9 sepal_width_cm:3.2 petal_length_cm:4.8 petal_width_cm:1.8
2 | sepal_length_cm:6.1 sepal_width_cm:2.8 petal_length_cm:4.0 petal_width_cm:1.3
2 | sepal_length_cm:7.0 sepal_width_cm:3.2 petal_length_cm:4.7 petal_width_cm:1.4
3 | sepal_length_cm:5.9 sepal_width_cm:3.0 petal_length_cm:5.1 petal_width_cm:1.8
1 | sepal_length_cm:4.9 sepal_width_cm:3.6 petal_length_cm:1.4 petal_width_cm:0.1
1 | sepal_length_cm:5.1 sepal_width_cm:3.3 petal_length_cm:1.7 petal_width_cm:0.5
3 | sepal_length_cm:5.8 sepal_width_cm:2.7 petal_length_cm:5.1 petal_width_cm:1.9
1 | sepal_length_cm:4.9 sepal_width_cm:3.1 petal_length_cm:1.5 petal_width_cm:0.1
# Create the VW workspace with the options "--oaa 3 --quiet".
# NOTE(review): presumably --oaa 3 selects a 3-class multiclass reduction
# matching the labels 1..3 produced above - confirm against the VW docs.
vw = vowpalwabbit.Workspace("--oaa 3 --quiet")

# Train: feed every training row to the learner once (a single pass over
# the training set), converted to VW text format.
training_examples = training_data.apply(to_vw_format, axis=1)
for vw_line in training_examples:
    vw.learn(vw_line)

# Predict: collect the learner's output for every row of the testing set,
# in the same order as the rows of testing_data.
testing_examples = testing_data.apply(to_vw_format, axis=1)
predictions = [vw.predict(vw_line) for vw_line in testing_examples]
# Accuracy is the fraction of testing rows whose label in the "y" column
# equals the prediction at the same position.
is_correct = testing_data.y == predictions
accuracy = int(is_correct.sum()) / len(testing_data)

print(f"Model accuracy {accuracy}")
Model accuracy 0.6666666666666666