Slates

from vowpalwabbit import pyvw
import matplotlib.pyplot as plt
import pandas as pd
import random
import math
import numpy as np

NUM_ITERATIONS = 2500
shared_contexts = ["corporate", "trade"]
torso_items = ["tshirt", "buttonupshirt", "highvis"]
legs_items = ["workpants", "formalpants", "shorts"]
feet_items = ["formalshoes", "runners", "flipflops", "boots"]

Scenario: Outfit optimization

  • Slots are different clothing types

  • Actions are the individual pieces of clothing for each slot

[Image: slates_scenario.png]

Reward function

The chosen actions and the context are supplied to this function to determine the reward. Noise is injected to make the learning problem harder and to reflect that, in reality, the reward for the same inputs may vary, possibly because of missing information.

def noise(center, stddev=0.075):
    # Draw from a normal distribution centered on the true reward component.
    return np.random.normal(loc=center, scale=stddev)

def reward_function(shared_context, torso_index, legs_index, feet_index):
    if shared_context == "corporate":
        torso_values = [noise(0.2), noise(0.3), noise(0.1)]
        legs_values = [noise(0.1), noise(0.3), noise(0.2)]
        feet_values = [noise(0.4), noise(0.3), noise(0.05), noise(0.1)]
    elif shared_context == "trade":
        torso_values = [noise(0.15), noise(0.2), noise(0.3)]
        legs_values = [noise(0.4), noise(0.2), noise(0.35)]
        feet_values = [noise(0.15), noise(0.2), noise(0.1), noise(0.3)]

    # The overall reward is the sum of the noisy per-slot values of the chosen items.
    return torso_values[torso_index] + legs_values[legs_index] + feet_values[feet_index]
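As a quick sanity check (not part of the original walkthrough), the best outfit for the "corporate" context is buttonupshirt, formalpants and formalshoes, with an expected reward of 0.3 + 0.3 + 0.4 = 1.0:

# Illustrative check: average reward of the best corporate outfit over many samples.
samples = [reward_function("corporate", 1, 1, 0) for _ in range(10000)]
print(sum(samples) / len(samples))  # approximately 1.0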
    

Slates

def generate_slates_text_format(shared_context):
    # A slates example consists of one shared example, one example per
    # candidate action (tagged with the index of the slot it belongs to),
    # and one example per slot.
    return [
        f"slates shared |User {shared_context}",
        "slates action 0 |Action tshirt",
        "slates action 0 |Action buttonupshirt",
        "slates action 0 |Action highvis",
        "slates action 1 |Action workpants",
        "slates action 1 |Action formalpants",
        "slates action 1 |Action shorts",
        "slates action 2 |Action formalshoes",
        "slates action 2 |Action runners",
        "slates action 2 |Action flipflops",
        "slates action 2 |Action boots",
        "slates slot |Slot torso",
        "slates slot |Slot legs",
        "slates slot |Slot feet"
    ]

def generate_slates_text_format_with_label(shared_context, reward, chosen_torso_index, chosen_torso_prob, chosen_legs_index, chosen_legs_prob, chosen_feet_index, chosen_feet_prob):
    # The label places the global cost (negative reward, since VW minimizes
    # cost) on the shared example, and the chosen action index with its
    # probability on each slot example.
    return [
        f"slates shared {-1*reward} |User {shared_context}",
        "slates action 0 |Action tshirt",
        "slates action 0 |Action buttonupshirt",
        "slates action 0 |Action highvis",
        "slates action 1 |Action workpants",
        "slates action 1 |Action formalpants",
        "slates action 1 |Action shorts",
        "slates action 2 |Action formalshoes",
        "slates action 2 |Action runners",
        "slates action 2 |Action flipflops",
        "slates action 2 |Action boots",
        f"slates slot {chosen_torso_index}:{chosen_torso_prob} |Slot torso",
        f"slates slot {chosen_legs_index}:{chosen_legs_prob} |Slot legs",
        f"slates slot {chosen_feet_index}:{chosen_feet_prob} |Slot feet"
    ]
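For illustration, with shared context "corporate", a reward of 0.7, and the torso slot having chosen action 1 with probability 0.87 (values here are made up), the shared and torso slot lines of the labeled example would render as:

slates shared -0.7 |User corporate
slates slot 1:0.87 |Slot torso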
# --slates enables the slates reduction; exploration is epsilon-greedy
# (epsilon 0.2) and feature interactions are generated across the User (U),
# Action (A) and Slot (S) namespaces.
slates_vw = pyvw.vw("--slates --epsilon 0.2 --interactions SA UAS US UA -l 0.05 --power_t 0")

slates_rewards = []
for _ in range(NUM_ITERATIONS):
    shared_context = random.choice(shared_contexts)
    # The prediction holds, for each slot, a list of (action_index, probability)
    # pairs with the chosen action first.
    slates_prediction = slates_vw.predict(generate_slates_text_format(shared_context))
    torso_index, torso_prob = slates_prediction[0][0]
    legs_index, legs_prob = slates_prediction[1][0]
    feet_index, feet_prob = slates_prediction[2][0]
    reward = reward_function(shared_context, torso_index, legs_index, feet_index)
    slates_rewards.append(reward)
    slates_vw.learn(generate_slates_text_format_with_label(shared_context, reward, torso_index, torso_prob, legs_index, legs_prob, feet_index, feet_prob))

slates_vw.finish()
creating features for following interactions: SA UAS US UA 
Num weight bits = 18
learning rate = 0.05
initial_t = 0
power_t = 0
using no cache
Reading datafile = 
num sources = 1
Enabled reductions: gd, generate_interactions, scorer-identity, csoaa_ldf-rank, cb_adf, cb_explore_adf_greedy, cb_sample, shared_feature_merger, ccb_explore_adf, slates
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
    n.a.     n.a.            1            1.0  ?,?,... 0,0,0,...      138
-0.607109 -0.607109            2            2.0  0,0,... 0,0,0,...      138
-0.752050 -0.896991            4            4.0  0,1,... 0,1,0,...      138
-0.794053 -0.836055            8            8.0  0,0,... 0,0,0,...      138
-0.737228 -0.680403           16           16.0  0,0,... 0,0,0,...      138
-0.673894 -0.610560           32           32.0  0,0,... 0,0,2,...      138
-0.657438 -0.640983           64           64.0  2,0,... 2,0,0,...      138
-0.734342 -0.811246          128          128.0  1,1,... 1,1,3,...      138
-0.782441 -0.830540          256          256.0  1,1,... 1,1,3,...      138
-0.831416 -0.880391          512          512.0  2,0,... 2,0,1,...      138
-0.861248 -0.891079         1024         1024.0  1,1,... 1,1,0,...      138
-0.885195 -0.909143         2048         2048.0  1,1,... 1,1,0,...      138
-0.900961 -0.916727         4096         4096.0  2,2,... 2,2,3,...      138
finished run
number of examples = 5000
weighted example sum = 5000.000000
weighted label sum = 0.000000
average loss = -0.904777
total feature number = 690000

Contextual Bandit

A standard contextual bandit can solve the same problem by expanding every combination of torso, legs, and feet items into a single action, giving 3 * 3 * 4 = 36 actions in total.

def generate_combinations(shared_context, torso_items, legs_items, feet_items):
    examples = [f"shared |User {shared_context}"]
    descriptions = []
    # Every (torso, legs, feet) triple becomes a single combined action.
    for i, torso in enumerate(torso_items):
        for j, legs in enumerate(legs_items):
            for k, feet in enumerate(feet_items):
                examples.append(f"|Action torso={torso} legs={legs} feet={feet}")
                descriptions.append((i, j, k))

    return examples, descriptions
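As a quick illustration (not part of the original walkthrough), the expansion produces one shared example plus 3 * 3 * 4 = 36 combined actions:

examples, indices = generate_combinations("corporate", torso_items, legs_items, feet_items)
print(len(examples))  # 37: 1 shared example + 36 action examples
print(indices[0])     # (0, 0, 0) -> tshirt, workpants, formalshoes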

def sample_custom_pmf(pmf):
    # Normalize in case the probabilities do not sum exactly to 1.
    total = sum(pmf)
    pmf = [x / total for x in pmf]
    draw = random.random()
    sum_prob = 0.0
    for index, prob in enumerate(pmf):
        sum_prob += prob
        if sum_prob > draw:
            return index, prob
    # Guard against floating point rounding; fall back to the last action.
    return len(pmf) - 1, pmf[-1]
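A small usage sketch (the seed and values are illustrative): sampling returns the drawn index together with its normalized probability.

random.seed(0)  # with this seed, random.random() is ~0.844
print(sample_custom_pmf([0.1, 0.2, 0.7]))  # (2, 0.7)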
# --cb_explore_adf treats each expanded combination as one action; interactions
# are generated within and across the Action (A) and User (U) namespaces.
cb_vw = pyvw.vw("--cb_explore_adf --epsilon 0.2 --interactions AA AU AAU -l 0.05 --power_t 0")

cb_rewards = []
for _ in range(NUM_ITERATIONS):
    shared_context = random.choice(shared_contexts)
    examples, indices = generate_combinations(shared_context, torso_items, legs_items, feet_items)
    # predict returns a PMF over the expanded actions; sample an action from it.
    cb_prediction = cb_vw.predict(examples)
    chosen_index, prob = sample_custom_pmf(cb_prediction)
    torso_index, legs_index, feet_index = indices[chosen_index]
    reward = reward_function(shared_context, torso_index, legs_index, feet_index)
    cb_rewards.append(reward)
    # Label the chosen action's example with action:cost:probability; the +1
    # offset skips the shared example at the front of the list.
    examples[chosen_index + 1] = f"0:{-1*reward}:{prob} {examples[chosen_index + 1]}"
    cb_vw.learn(examples)

cb_vw.finish()
creating features for following interactions: AA AU AAU 
WARNING: some interactions contain duplicate characters and their characters order has been changed. Interactions affected: 1.
Num weight bits = 18
learning rate = 0.05
initial_t = 0
power_t = 0
using no cache
Reading datafile = 
num sources = 1
Enabled reductions: gd, scorer-identity, csoaa_ldf-rank, cb_adf, cb_explore_adf_greedy, shared_feature_merger
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
    n.a.     n.a.            1            1.0  unknown        0:0.0277778...      684
-0.750953 -0.750953            2            2.0 24:-0.75:0.028        0:0.0277778...      684
-0.593408 -0.435862            4            4.0 24:-0.44:0.81       24:0.805556...      684
-0.694582 -0.795757            8            8.0 24:-0.87:0.81       24:0.805556...      684
-0.644825 -0.595068           16           16.0 7:-0.71:0.81        7:0.805556...      684
-0.676694 -0.708563           32           32.0 31:-0.42:0.81       31:0.805556...      684
-0.658771 -0.640848           64           64.0 26:-0.72:0.81       26:0.805556...      684
-0.698468 -0.738166          128          128.0 24:-1:0.0056       33:0.805556...      684
-0.778972 -0.859477          256          256.0 20:-0.78:0.81       20:0.805556...      684
-0.815785 -0.852597          512          512.0 26:-0.88:0.81       26:0.805556...      684
-0.816240 -0.816696         1024         1024.0 30:-0.12:0.0056       20:0.805556...      684
-0.831836 -0.847432         2048         2048.0 16:-1.1:0.81       16:0.805556...      684
-0.864029 -0.896223         4096         4096.0 16:-1.1:0.81       16:0.805556...      684

finished run
number of examples = 5000
weighted example sum = 5000.000000
weighted label sum = 0.000000
average loss = -0.876520
total feature number = 3430000

Comparison

plt.plot(pd.Series(cb_rewards).expanding().mean())
plt.plot(pd.Series(slates_rewards).expanding().mean())
plt.xlabel('Iterations')
plt.ylabel('Average reward')
plt.legend(['cb', 'slates'])
[Figure: average reward over iterations for the cb and slates approaches]
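As a final summary (not in the original notebook), the overall average reward of each approach can also be printed directly; note from the run logs above that each slates example used 138 features versus 684 for the expanded contextual bandit:

print(f"cb average reward: {np.mean(cb_rewards):.3f}")
print(f"slates average reward: {np.mean(slates_rewards):.3f}")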