Slates

This example compares the slates reduction with a standard contextual bandit on the same outfit-optimization problem.

from vowpalwabbit import pyvw
import matplotlib.pyplot as plt
import pandas as pd
import random
import math
import numpy as np

NUM_ITERATIONS = 2500
shared_contexts = ["corporate", "trade"]
torso_items = ["tshirt", "buttonupshirt", "highvis"]
legs_items = ["workpants", "formalpants", "shorts"]
feet_items = ["formalshoes", "runners", "flipflops", "boots"]

Scenario: Outfit optimization

  • Slots are different clothing types

  • Actions are the individual pieces of clothing for each slot

[Figure: slates_scenario.png — the outfit slots (torso, legs, feet) and their candidate actions]

Reward function

The chosen actions and the context are supplied to this function to determine the reward. Noise is injected to make the learning problem harder and to reflect that, in reality, the reward for the same choice may vary, possibly because of missing information.

def noise(center, stddev=0.075):
    # Draw a value around `center` with Gaussian noise.
    return np.random.normal(loc=center, scale=stddev)

def reward_function(shared_context, torso_index, legs_index, feet_index):
    # Each list holds the (noised) value of every item in a slot for this context.
    if shared_context == "corporate":
        torso_values = [noise(0.2), noise(0.3), noise(0.1)]
        legs_values = [noise(0.1), noise(0.3), noise(0.2)]
        feet_values = [noise(0.4), noise(0.3), noise(0.05), noise(0.1)]
    elif shared_context == "trade":
        torso_values = [noise(0.15), noise(0.2), noise(0.3)]
        legs_values = [noise(0.4), noise(0.2), noise(0.35)]
        feet_values = [noise(0.15), noise(0.2), noise(0.1), noise(0.3)]

    # Total reward is the sum of the chosen item's value in each slot.
    return torso_values[torso_index] + legs_values[legs_index] + feet_values[feet_index]
    
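As a quick sanity check (our addition, not part of the original walkthrough), we can compute the noise-free value of the best outfit in each context. Both work out to 1.0, which bounds the average reward either learner can reach under epsilon-greedy exploration.

best_corporate = max([0.2, 0.3, 0.1]) + max([0.1, 0.3, 0.2]) + max([0.4, 0.3, 0.05, 0.1])
best_trade = max([0.15, 0.2, 0.3]) + max([0.4, 0.2, 0.35]) + max([0.15, 0.2, 0.1, 0.3])
print(best_corporate, best_trade)  # 1.0 1.0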

Slates

In the slates text format, each event consists of a shared context line, one line per candidate action tagged with the index of the slot it belongs to, and one line per slot. When learning, the global cost (the negated reward, since VW minimizes cost) goes on the shared line, and each slot line carries the chosen action's index and its probability.

def generate_slates_text_format(shared_context):
    return [
       f"slates shared |User {shared_context}",
        "slates action 0 |Action tshirt", 
        "slates action 0 |Action buttonupshirt", 
        "slates action 0 |Action highvis", 
        "slates action 1 |Action workpants", 
        "slates action 1 |Action formalpants", 
        "slates action 1 |Action shorts", 
        "slates action 2 |Action formalshoes", 
        "slates action 2 |Action runners", 
        "slates action 2 |Action flipflops", 
        "slates action 2 |Action boots", 
        "slates slot |Slot torso", 
        "slates slot |Slot legs",
        "slates slot |Slot feet"
    ]

def generate_slates_text_format_with_label(shared_context, reward, chosen_torso_index, chosen_torso_prob, chosen_legs_index, chosen_legs_prob, chosen_feet_index, chosen_feet_prob):
    return [
       f"slates shared {-1*reward} |User {shared_context}",
        "slates action 0 |Action tshirt", 
        "slates action 0 |Action buttonupshirt", 
        "slates action 0 |Action highvis", 
        "slates action 1 |Action workpants", 
        "slates action 1 |Action formalpants", 
        "slates action 1 |Action shorts", 
        "slates action 2 |Action formalshoes", 
        "slates action 2 |Action runners", 
        "slates action 2 |Action flipflops", 
        "slates action 2 |Action boots", 
        f"slates slot {chosen_torso_index}:{chosen_torso_prob} |Slot torso", 
        f"slates slot {chosen_legs_index}:{chosen_legs_prob} |Slot legs",
        f"slates slot {chosen_feet_index}:{chosen_feet_prob} |Slot feet"

    ]
The slates learner uses epsilon-greedy exploration and interactions over the User (U), Action (A), and Slot (S) namespaces.

slates_vw = pyvw.vw("--slates --epsilon 0.2 --interactions SA UAS US UA -l 0.05 --power_t 0")

slates_rewards = []
for _ in range(NUM_ITERATIONS):
    shared_context = random.choice(shared_contexts)
    # predict returns one list of (action_index, probability) pairs per slot,
    # with the sampled action first.
    slates_prediction = slates_vw.predict(generate_slates_text_format(shared_context))
    torso_index, torso_prob = slates_prediction[0][0]
    legs_index, legs_prob = slates_prediction[1][0]
    feet_index, feet_prob = slates_prediction[2][0]
    reward = reward_function(shared_context, torso_index, legs_index, feet_index)
    slates_rewards.append(reward)
    slates_vw.learn(generate_slates_text_format_with_label(shared_context, reward, torso_index, torso_prob, legs_index, legs_prob, feet_index, feet_prob))

slates_vw.finish()
creating features for following interactions: SA UAS US UA
Num weight bits = 18
learning rate = 0.05
initial_t = 0
power_t = 0
using no cache
Reading datafile = 
num sources = 1
Enabled reductions: gd, generate_interactions, scorer-identity, csoaa_ldf-rank, cb_adf, cb_explore_adf_greedy, cb_sample, shared_feature_merger, ccb_explore_adf, slates
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
    n.a.     n.a.            1            1.0  ?,?,... 0,0,0,...      138
-0.744372 -0.744372            2            2.0  0,0,... 0,0,0,...      138
-0.744115 -0.743858            4            4.0  0,1,... 0,1,0,...      138
-0.667008 -0.589900            8            8.0  0,0,... 0,0,0,...      138
-0.688857 -0.710706           16           16.0  0,2,... 0,2,0,...      138
-0.688585 -0.688314           32           32.0  0,0,... 0,0,1,...      138
-0.739713 -0.790840           64           64.0  1,2,... 1,2,0,...      138
-0.798039 -0.856364          128          128.0  1,1,... 1,1,2,...      138
-0.808496 -0.818954          256          256.0  1,1,... 1,1,0,...      138
-0.848536 -0.888575          512          512.0  1,1,... 1,1,0,...      138
-0.871250 -0.893964         1024         1024.0  2,0,... 2,0,3,...      138
-0.886220 -0.901190         2048         2048.0  1,1,... 1,1,0,...      138
-0.907769 -0.929318         4096         4096.0  1,1,... 1,1,0,...      138

finished run
number of examples = 5000
weighted example sum = 5000.000000
weighted label sum = 0.000000
average loss = -0.910293
total feature number = 690000
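
To make the indexing in the training loop concrete, here is an illustrative way to inspect a raw slates prediction (a fresh workspace is created because slates_vw has already been finished; the printed probabilities are examples only and depend on the run):

demo_vw = pyvw.vw("--slates --epsilon 0.2 --quiet")
# One list of (action_index, probability) pairs per slot; the sampled action comes first.
print(demo_vw.predict(generate_slates_text_format("corporate")))
demo_vw.finish()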

Contextual Bandit

The same problem can be posed as a standard contextual bandit by expanding every combination of slot choices into a single action: 3 torso × 3 legs × 4 feet = 36 combined actions.

def generate_combinations(shared_context, torso_items, legs_items, feet_items):
    examples = [f"shared |User {shared_context}"]
    descriptions = []
    for i, torso in enumerate(torso_items):
        for j, legs in enumerate(legs_items):
            for k, feet in enumerate(feet_items):
                examples.append(f"|Action torso={torso} legs={legs} feet={feet}")
                descriptions.append((i,j,k))
                
    return examples, descriptions
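
As an illustrative check (our addition), the expansion yields one shared example plus 36 action lines:

examples, indices = generate_combinations("corporate", torso_items, legs_items, feet_items)
print(len(examples) - 1)  # 36 combined actions (the first element is the shared example)
print(examples[1])        # |Action torso=tshirt legs=workpants feet=formalshoes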

def sample_custom_pmf(pmf):
    # Normalize, then draw an index with probability proportional to its mass.
    total = sum(pmf)
    pmf = [x / total for x in pmf]
    draw = random.random()
    sum_prob = 0.0
    for index, prob in enumerate(pmf):
        sum_prob += prob
        if sum_prob > draw:
            return index, prob
    # Guard against floating-point rounding leaving the draw unmatched.
    return len(pmf) - 1, pmf[-1]
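
An equivalent sketch using numpy's built-in categorical sampling (np.random.choice; the helper name here is ours, not part of the original tutorial):

def sample_custom_pmf_np(pmf):
    # Copy and normalize so the probabilities sum to 1.
    pmf = np.array(pmf, dtype=float)
    pmf /= pmf.sum()
    index = np.random.choice(len(pmf), p=pmf)
    return index, pmf[index]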
The contextual bandit learner interacts the combined Action namespace with itself and with the User namespace (the duplicate characters in AA and AAU trigger the warning in the output below).

cb_vw = pyvw.vw("--cb_explore_adf --epsilon 0.2 --interactions AA AU AAU -l 0.05 --power_t 0")

cb_rewards = []
for _ in range(NUM_ITERATIONS):
    shared_context = random.choice(shared_contexts)
    examples, indices = generate_combinations(shared_context, torso_items, legs_items, feet_items)
    # predict returns a pmf over the 36 combined actions; sample one and keep its probability.
    cb_prediction = cb_vw.predict(examples)
    chosen_index, prob = sample_custom_pmf(cb_prediction)
    torso_index, legs_index, feet_index = indices[chosen_index]
    reward = reward_function(shared_context, torso_index, legs_index, feet_index)
    cb_rewards.append(reward)
    # Label the chosen action line as action:cost:probability, where cost is the
    # negated reward; the +1 offset skips the shared example.
    examples[chosen_index + 1] = f"0:{-1*reward}:{prob} {examples[chosen_index + 1]}"
    cb_vw.learn(examples)

cb_vw.finish()
creating features for following interactions: AA AU AAU
WARNING: some interactions contain duplicate characters and their characters order has been changed. Interactions affected: 1.
Num weight bits = 18
learning rate = 0.05
initial_t = 0
power_t = 0
using no cache
Reading datafile = 
num sources = 1
Enabled reductions: gd, scorer-identity, csoaa_ldf-rank, cb_adf, cb_explore_adf_greedy, shared_feature_merger
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
    n.a.     n.a.            1            1.0  unknown        0:0.0277778...      684
-0.605279 -0.605279            2            2.0 10:-0.61:0.028        0:0.0277778...      684
-0.461621 -0.317962            4            4.0 10:-0.32:0.81       10:0.805556...      684
-0.671651 -0.881682            8            8.0 33:-0.9:0.81       33:0.805556...      684
-0.698590 -0.725528           16           16.0 33:-0.57:0.81       33:0.805556...      684
-0.726397 -0.754204           32           32.0 16:-0.97:0.0056       33:0.805556...      684
-0.770160 -0.813924           64           64.0 33:-0.79:0.81       33:0.805556...      684
-0.807442 -0.844724          128          128.0 32:-0.65:0.81       32:0.805556...      684
-0.839516 -0.871589          256          256.0 15:-0.35:0.0056       16:0.805556...      684
-0.870240 -0.900965          512          512.0 25:-0.92:0.81       25:0.805556...      684
-0.880160 -0.890079         1024         1024.0 20:-0.68:0.0056       33:0.805556...      684
-0.909002 -0.937844         2048         2048.0 16:-0.97:0.81       16:0.805556...      684
-0.917558 -0.926114         4096         4096.0 0:-0.46:0.0056       27:0.805556...      684
finished run
number of examples = 5000
weighted example sum = 5000.000000
weighted label sum = 0.000000
average loss = -0.916926
total feature number = 3430000
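
Before plotting, the two reward streams can be compared numerically (our addition; exact values vary from run to run):

print(f"cb mean reward:     {np.mean(cb_rewards):.3f}")
print(f"slates mean reward: {np.mean(slates_rewards):.3f}")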

Comparison

Both approaches reach a similar average reward, but the expanded contextual bandit is much more expensive per example: 684 features per event versus 138 for slates in the runs above (3,430,000 vs 690,000 features in total).

plt.plot(pd.Series(cb_rewards).expanding().mean())
plt.plot(pd.Series(slates_rewards).expanding().mean())
plt.xlabel('Iterations')
plt.ylabel('Average reward')
plt.legend(['cb', 'slates'])
[Plot: expanding mean of average reward per iteration for cb and slates]