Slates

from vowpalwabbit import pyvw
import matplotlib.pyplot as plt
import pandas as pd
import random
import numpy as np

NUM_ITERATIONS = 2500
shared_contexts = ["corporate", "trade"]
torso_items = ["tshirt", "buttonupshirt", "highvis"]
legs_items = ["workpants", "formalpants", "shorts"]
feet_items = ["formalshoes", "runners", "flipflops", "boots"]

Scenario: Outfit optimization

  • Slots are different clothing types

  • Actions are the individual pieces of clothing for each slot

[Figure: slates_scenario.png — each slot (torso, legs, feet) has its own set of candidate actions]
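
This structure is what makes slates attractive: each slot is scored independently, so the learner only ever deals with 3 + 3 + 4 = 10 actions, whereas enumerating whole outfits gives 3 × 3 × 4 = 36 combinations. A quick check of those sizes:

num_slot_actions = len(torso_items) + len(legs_items) + len(feet_items)   # 10
num_combinations = len(torso_items) * len(legs_items) * len(feet_items)  # 36
print(num_slot_actions, num_combinations)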

Reward function

The chosen actions and the context are supplied to this function to determine the reward. Noise is injected to make the learning problem harder and to reflect that, in reality, the reward for the same choice may vary, possibly because of missing information.

def noise(center, stddev=0.075):
    # Sample a value around `center` with the given standard deviation.
    return np.random.normal(loc=center, scale=stddev)

def reward_function(shared_context, torso_index, legs_index, feet_index):
    # Mean reward for each item, per context; noise() perturbs every draw.
    if shared_context == "corporate":
        torso_values = [noise(0.2), noise(0.3), noise(0.1)]
        legs_values = [noise(0.1), noise(0.3), noise(0.2)]
        feet_values = [noise(0.4), noise(0.3), noise(0.05), noise(0.1)]
    elif shared_context == "trade":
        torso_values = [noise(0.15), noise(0.2), noise(0.3)]
        legs_values = [noise(0.4), noise(0.2), noise(0.35)]
        feet_values = [noise(0.15), noise(0.2), noise(0.1), noise(0.3)]

    # The slate's reward is the sum of the chosen item in each slot.
    return torso_values[torso_index] + legs_values[legs_index] + feet_values[feet_index]
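
From the means above, the best outfit in either context has an expected reward of about 1.0 (for "corporate": 0.3 + 0.3 + 0.4). The sketch below, using an estimate_best helper introduced purely for illustration, verifies this empirically:

import itertools

def estimate_best(shared_context, n=200):
    # Average the noisy reward over n draws for every (torso, legs, feet)
    # combination and return the combination with the highest mean.
    return max(
        itertools.product(range(3), range(3), range(4)),
        key=lambda c: sum(reward_function(shared_context, *c) for _ in range(n)) / n,
    )

print(estimate_best("corporate"))  # usually (1, 1, 0): buttonupshirt, formalpants, formalshoes
print(estimate_best("trade"))      # usually (2, 0, 3): highvis, workpants, boots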
    

Slates

def generate_slates_text_format(shared_context):
    return [
        f"slates shared |User {shared_context}",
        "slates action 0 |Action tshirt",
        "slates action 0 |Action buttonupshirt",
        "slates action 0 |Action highvis",
        "slates action 1 |Action workpants",
        "slates action 1 |Action formalpants",
        "slates action 1 |Action shorts",
        "slates action 2 |Action formalshoes",
        "slates action 2 |Action runners",
        "slates action 2 |Action flipflops",
        "slates action 2 |Action boots",
        "slates slot |Slot torso",
        "slates slot |Slot legs",
        "slates slot |Slot feet"
    ]

def generate_slates_text_format_with_label(shared_context, reward, chosen_torso_index, chosen_torso_prob, chosen_legs_index, chosen_legs_prob, chosen_feet_index, chosen_feet_prob):
    return [
        # VW minimizes cost, so the slate's label is the negated reward.
        f"slates shared {-1*reward} |User {shared_context}",
        "slates action 0 |Action tshirt",
        "slates action 0 |Action buttonupshirt",
        "slates action 0 |Action highvis",
        "slates action 1 |Action workpants",
        "slates action 1 |Action formalpants",
        "slates action 1 |Action shorts",
        "slates action 2 |Action formalshoes",
        "slates action 2 |Action runners",
        "slates action 2 |Action flipflops",
        "slates action 2 |Action boots",
        # Each slot records the chosen action's index and its probability.
        f"slates slot {chosen_torso_index}:{chosen_torso_prob} |Slot torso",
        f"slates slot {chosen_legs_index}:{chosen_legs_prob} |Slot legs",
        f"slates slot {chosen_feet_index}:{chosen_feet_prob} |Slot feet"
    ]
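
Note the label layout: the cost (negated reward) sits on the shared line and applies to the whole slate, while each slot line carries the chosen action index and the probability with which it was chosen. For example, with made-up choices and probabilities:

example = generate_slates_text_format_with_label(
    "corporate", 0.9, 1, 0.8, 1, 0.8, 0, 0.85  # illustrative values only
)
print(example[0])   # slates shared -0.9 |User corporate
print(example[11])  # slates slot 1:0.8 |Slot torso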
# S, A and U in --interactions refer to the Slot, Action and User namespaces.
slates_vw = pyvw.Workspace("--slates --epsilon 0.2 --interactions SA UAS US UA -l 0.05 --power_t 0")

slates_rewards = []
for _ in range(NUM_ITERATIONS):
    shared_context = random.choice(shared_contexts)
    slates_prediction = slates_vw.predict(generate_slates_text_format(shared_context))
    # Each slot's prediction is a list of (action_index, probability) pairs,
    # with the sampled action first.
    torso_index, torso_prob = slates_prediction[0][0]
    legs_index, legs_prob = slates_prediction[1][0]
    feet_index, feet_prob = slates_prediction[2][0]
    reward = reward_function(shared_context, torso_index, legs_index, feet_index)
    slates_rewards.append(reward)
    slates_vw.learn(generate_slates_text_format_with_label(shared_context, reward, torso_index, torso_prob, legs_index, legs_prob, feet_index, feet_prob))

slates_vw.finish()
creating features for following interactions: SA UAS US UA
Num weight bits = 18
learning rate = 0.05
initial_t = 0
power_t = 0
using no cache
Reading datafile = stdin
num sources = 1
Enabled reductions: gd, generate_interactions, scorer-identity, csoaa_ldf-rank, cb_adf, cb_explore_adf_greedy, cb_sample, shared_feature_merger, ccb_explore_adf, slates
Input label = slates
Output pred = decision_probs
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
    n.a.     n.a.            1            1.0  ?,?,... 0,0,0,...      293
-0.701601 -0.701601            2            2.0  0,0,... 0,0,0,...      293
-0.746992 -0.792384            4            4.0  0,1,... 0,1,0,...      293
-0.766895 -0.786797            8            8.0  0,0,... 0,0,0,...      293
-0.680798 -0.594702           16           16.0  0,2,... 0,2,2,...      293
-0.665260 -0.649722           32           32.0  0,0,... 0,0,0,...      293
-0.636922 -0.608585           64           64.0  1,1,... 1,1,0,...      293
-0.654133 -0.671344          128          128.0  1,1,... 1,1,2,...      293
-0.706673 -0.759212          256          256.0  1,2,... 1,2,0,...      293
-0.796487 -0.886301          512          512.0  2,0,... 2,0,1,...      293
-0.845190 -0.893893         1024         1024.0  1,1,... 1,1,0,...      293
-0.881608 -0.918026         2048         2048.0  1,1,... 1,1,0,...      293
-0.899521 -0.917434         4096         4096.0  2,0,... 2,0,3,...      293

finished run
number of examples = 5000
weighted example sum = 5000.000000
weighted label sum = 0.000000
average loss = -0.904476
total feature number = 1465000
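
Because the label is the negated reward, the final average loss of -0.904476 corresponds to an average reward of roughly 0.90 per slate, close to the optimum of about 1.0 implied by the means in reward_function. The observed rewards can be checked directly:

# Mean observed reward across all iterations; should roughly match the
# negated average loss reported above once the policy has converged.
print(sum(slates_rewards) / len(slates_rewards))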

Contextual Bandit

To solve the same problem with a standard contextual bandit, every combination of items across the three slots must be expanded into a single action: 3 torso × 3 legs × 4 feet = 36 actions in total.

def generate_combinations(shared_context, torso_items, legs_items, feet_items):
    examples = [f"shared |User {shared_context}"]
    descriptions = []
    for i, torso in enumerate(torso_items):
        for j, legs in enumerate(legs_items):
            for k, feet in enumerate(feet_items):
                examples.append(f"|Action torso={torso} legs={legs} feet={feet}")
                descriptions.append((i,j,k))
                
    return examples, descriptions
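
Each call produces one shared example followed by 36 action lines, with descriptions mapping each flattened action back to its (torso, legs, feet) index triple:

examples, descriptions = generate_combinations("corporate", torso_items, legs_items, feet_items)
print(len(examples))    # 37: one shared line plus 36 action lines
print(descriptions[0])  # (0, 0, 0)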

def sample_custom_pmf(pmf):
    # Normalize, then draw an index with probability proportional to its mass.
    total = sum(pmf)
    pmf = [x / total for x in pmf]
    draw = random.random()
    sum_prob = 0.0
    for index, prob in enumerate(pmf):
        sum_prob += prob
        if sum_prob > draw:
            return index, prob
    # Guard against floating point rounding leaving the draw unmatched.
    return len(pmf) - 1, pmf[-1]
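
As a quick illustration, sampling repeatedly from an unnormalized pmf such as [8, 1, 1] should return index 0 about 80% of the time:

counts = [0, 0, 0]
for _ in range(1000):
    index, _ = sample_custom_pmf([8, 1, 1])
    counts[index] += 1
print(counts)  # roughly [800, 100, 100]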
# Note: AA and AAU repeat the Action namespace, which triggers the
# duplicate-characters warning visible in the output below.
cb_vw = pyvw.Workspace("--cb_explore_adf --epsilon 0.2 --interactions AA AU AAU -l 0.05 --power_t 0")

cb_rewards = []
for _ in range(NUM_ITERATIONS):
    shared_context = random.choice(shared_contexts)
    examples, indices = generate_combinations(shared_context, torso_items, legs_items, feet_items)
    cb_prediction = cb_vw.predict(examples)
    chosen_index, prob = sample_custom_pmf(cb_prediction)
    torso_index, legs_index, feet_index = indices[chosen_index]
    reward = reward_function(shared_context, torso_index, legs_index, feet_index)
    cb_rewards.append(reward)
    # Prepend the cb label "action:cost:probability" to the chosen action line;
    # the offset of 1 skips the shared example, and the action id before the
    # first colon is unused in ADF format.
    examples[chosen_index + 1] = f"0:{-1*reward}:{prob} {examples[chosen_index + 1]}"
    cb_vw.learn(examples)

cb_vw.finish()
creating features for following interactions: AA AU AAU
Num weight bits = 18
learning rate = 0.05
initial_t = 0
power_t = 0
using no cache
Reading datafile = stdin
num sources = 1
Enabled reductions: gd, scorer-identity, csoaa_ldf-rank, cb_adf, cb_explore_adf_greedy, shared_feature_merger
Input label = cb
Output pred = action_probs
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
    n.a.     n.a.            1            1.0  unknown        0:0.0277778...      720
-0.415299 -0.415299            2            2.0 33:-0.42:0.028        0:0.0277778...      720
-0.519694 -0.624090            4            4.0 33:-0.62:0.81       33:0.805556...      720
-0.592130 -0.664566            8            8.0 33:-0.54:0.81       33:0.805556...      720
-0.698954 -0.805777           16           16.0 33:-0.96:0.81       33:0.805556...      720
-0.718624 -0.738295           32           32.0 33:-0.63:0.81       33:0.805556...      720
-0.674117 -0.629610           64           64.0 9:-0.71:0.81        9:0.805556...      720
-0.693832 -0.713547          128          128.0 12:-0.67:0.0056       25:0.805556...      720
-0.731688 -0.769544          256          256.0 13:-0.77:0.81       13:0.805556...      720
-0.781812 -0.831936          512          512.0 15:-0.52:0.0056       21:0.805556...      720
[warning] Some interactions contain duplicate characters and their characters order has been changed. Interactions affected: 1.
-0.802762 -0.823712         1024         1024.0 27:-0.94:0.81       27:0.805556...      720
-0.842221 -0.881679         2048         2048.0 27:-0.95:0.81       27:0.805556...      720
-0.888638 -0.935056         4096         4096.0 35:-0.99:0.81       35:0.805556...      720

finished run
number of examples = 5000
weighted example sum = 5000.000000
weighted label sum = 0.000000
average loss = -0.894039
total feature number = 3600000

Comparison

plt.plot(pd.Series(cb_rewards).expanding().mean())
plt.plot(pd.Series(slates_rewards).expanding().mean())
plt.xlabel('Iterations')
plt.ylabel('Average reward')
plt.legend(['cb', 'slates'])
[Figure: expanding mean of reward per iteration for cb vs slates]
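
Both approaches end up at a similar average reward (the runs above report average losses of -0.904 for slates and -0.894 for cb), but the slates run touched 1,465,000 total features versus 3,600,000 for the expanded cb run, since it scores 10 slot actions rather than 36 combinations per event. The final values can be read off the same series used for the plot:

print(pd.Series(cb_rewards).expanding().mean().iloc[-1])
print(pd.Series(slates_rewards).expanding().mean().iloc[-1])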