Contextual Bandits

import vowpalwabbit
import pandas as pd

# Sample logged data for the CB to learn from; it could originate from a previous
# randomized trial such as an A/B test.
## The rows are equivalent to the example in
## https://github.com/VowpalWabbit/vowpal_wabbit/wiki/Logged-Contextual-Bandit-Example
train_columns = ("action", "cost", "probability", "feature1", "feature2", "feature3")
train_rows = [
    (1, 2, 0.4, "a", "c", ""),
    (3, 0, 0.2, "b", "d", ""),
    (4, 1, 0.5, "a", "b", ""),
    (2, 1, 0.3, "a", "b", "c"),
    (3, 1, 0.7, "a", "d", ""),
]
## one dict per logged row, keyed by column name
train_data = [dict(zip(train_columns, row)) for row in train_rows]

## build the frame and number the rows 1..n through a column named "index",
## which then becomes the frame's index
train_df = (
    pd.DataFrame(train_data)
    .assign(index=range(1, len(train_data) + 1))
    .set_index("index")
)

# Test data the CB should make decisions for (exploit), e.g. features describing new users.
test_columns = ("feature1", "feature2", "feature3")
test_rows = [
    ("b", "c", ""),
    ("a", "", "b"),
    ("b", "b", ""),
    ("a", "", "b"),
]
## one dict per row, keyed by column name
test_data = [dict(zip(test_columns, row)) for row in test_rows]

## build the frame and number the rows 1..n through a column named "index",
## which then becomes the frame's index
test_df = (
    pd.DataFrame(test_data)
    .assign(index=range(1, len(test_data) + 1))
    .set_index("index")
)

# Display both frames: training data first, then test data.
for frame in (train_df, test_df):
    print(frame)

       action  cost  probability feature1 feature2 feature3
index                                                      
1           1     2          0.4        a        c         
2           3     0          0.2        b        d         
3           4     1          0.5        a        b         
4           2     1          0.3        a        b        c
5           3     1          0.7        a        d         
      feature1 feature2 feature3
index                           
1            b        c         
2            a                 b
3            b        b         
4            a                 b

# Create the Python model: the vw Workspace object stores the model parameters.
# "--cb 4" sets up a contextual bandit with four possible actions.
vw = vowpalwabbit.Workspace("--cb 4", quiet=True)

# Train the model with the learn method, one row at a time. Each example is passed
# as text in the form "action:cost:probability | feature1 feature2 feature3".
## itertuples yields each row once as a named tuple, so no per-cell .loc lookups are needed
for row in train_df.itertuples(index=False):
    ## label part: logged action, its cost, and the logged probability
    label = f"{row.action}:{row.cost}:{row.probability}"
    ## feature part: an empty feature value just leaves an empty token in the text
    features = f"{row.feature1} {row.feature2} {row.feature3}"
    ## do the actual learning
    vw.learn(f"{label} | {features}")

# Use the same model object that was trained above to perform predictions.

# Predict row by row and print each row's index label next to the model's choice.
## itertuples() puts the index label in row.Index and the columns in named fields
for row in test_df.itertuples():
    ## prediction examples carry only the feature part, with no label before the "|"
    choice = vw.predict(f"| {row.feature1} {row.feature2} {row.feature3}")
    print(row.Index, choice)

# The CB assigns every test instance to action 3, as expected from the cost structure of the training data (action 3 is the only action logged with a cost of 0). You can change the costs to see the CB update its predictions accordingly.

# BONUS: save and load the CB model
## write the trained model to a file, then drop the in-memory object
vw.save("cb.model")
del vw
## create a new workspace that loads the saved file
vw = vowpalwabbit.Workspace("--cb 4 -i cb.model", quiet=True)
## ask the reloaded model for a prediction on a sample context
sample_prediction = vw.predict("| a b")
print(sample_prediction)