Poisson Regression

import numpy as np
import matplotlib.pyplot as plt
from vowpalwabbit import pyvw

%matplotlib inline
# Generate some count data with a Poisson distribution
# z ~ poisson(x + y), x \in [0,10), y \in [0,10)
x = np.random.choice(range(0,10), 100)
y = np.random.choice(range(0,10), 100)
z = np.random.poisson(x + y)
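
As a quick, optional sanity check on the simulated data, the sample mean of z should be close to the mean of the rate x + y, since E[z] = E[x + y] for Poisson counts (a minimal check using only the arrays above):

# Optional check: the average count should track the average rate x + y
print("mean rate x + y:", np.mean(x + y))
print("mean of z      :", np.mean(z))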

We will model this data in two ways:

  • log transform the labels and use linear prediction (square loss)

  • model it directly using poisson loss

The first model predicts mean(log(label)), while the second predicts log(mean(label)). By Jensen's inequality, the first approach therefore produces a systematic negative bias in the original domain, as the quick check below illustrates.
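
To see the size of this effect on the simulated data, compare the exponentiated mean of the log-labels with the mean of the labels themselves (a small check, reusing only the arrays defined above):

# Jensen's inequality: exp(mean(log z)) <= mean(z), so a model trained on
# log-labels and exponentiated back will underestimate the true mean.
print("exp(mean(log z)):", np.exp(np.mean(np.log(0.001 + z))))
print("mean(z)         :", np.mean(z))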

# Train log-transform model
training_samples = []
logz = np.log(0.001 + z)  # small offset avoids log(0) when z == 0
vw = pyvw.vw("-b 2 --loss_function squared -l 0.1 --holdout_off -f vw.log.model --readable_model vw.readable.log.model")
for i in range(len(logz)):
    training_samples.append("{label} | x:{x} y:{y}".format(label=logz[i], x=x[i], y=y[i]))
# Do a hundred passes over the data and store the model in vw.log.model
for iteration in range(100):
    for i in range(len(training_samples)):
        vw.learn(training_samples[i])
vw.finish()
final_regressor = vw.log.model
Num weight bits = 2
learning rate = 0.1
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = 
num sources = 1
Enabled reductions: gd, scorer-identity
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
2.590933 2.590933            1            1.0   1.6096   0.0000        3
3.137801 3.684668            2            2.0   2.3027   0.3831        3
2.864668 2.591535            4            4.0   1.9461   0.2947        3
2.528114 2.191559            8            8.0   2.8333   0.3745        3
2.437603 2.347092           16           16.0   1.7919   0.7085        3
2.241253 2.044903           32           32.0   2.3980   1.1270        3
1.489080 0.736907           64           64.0   1.9461   1.1384        2
0.957040 0.424999          128          128.0   2.5650   2.1992        3
0.588098 0.219157          256          256.0   2.7727   2.6029        3
0.408366 0.228634          512          512.0   1.7919   1.8709        3
0.313180 0.217993         1024         1024.0   2.1973   2.3406        3
0.259826 0.206473         2048         2048.0   2.1973   2.7365        3
0.232338 0.204851         4096         4096.0   2.4850   2.2066        3
0.217545 0.202752         8192         8192.0   1.9461   1.9921        3

finished run
number of examples = 10000
weighted example sum = 10000.000000
weighted label sum = 19609.791941
average loss = 0.214645
best constant = 1.960979
total feature number = 27800
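
Since --readable_model was passed, the learned weights can also be inspected from the text file it produced (a minimal sketch; the exact layout of the file is a VW implementation detail, so we simply print it):

# Print the human-readable model written by the training run above
with open("vw.readable.log.model") as f:
    print(f.read())
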
# Generate predictions from the log-transform model
vw = pyvw.vw("-i vw.log.model -t")
log_predictions = [vw.predict(sample) for sample in training_samples]
# Measure bias in the log domain and in the original (count) domain
log_bias = np.mean(log_predictions - logz)
bias = np.mean(np.exp(log_predictions) - z)
only testing
Num weight bits = 2
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = 
num sources = 1
Enabled reductions: gd, scorer-identity
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
0.002911 0.002911            1            1.0   1.6096   1.5557        3
0.134465 0.266020            2            2.0   2.3027   2.8185        3
0.097165 0.059864            4            4.0   1.9461   2.2910        3
0.298797 0.500430            8            8.0   2.8333   1.8520        3
0.351343 0.403888           16           16.0   1.7919   2.0723        3
0.251262 0.151181           32           32.0   2.3980   2.1374        3
0.207267 0.163273           64           64.0   1.9461   1.4021        2

Although the model is relatively unbiased in the log domain where it was trained, in the original domain it underpredicts, as we expected from Jensen's inequality.
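
Poisson regression avoids this by fitting the mean on the original scale through a log link: the linear score p is interpreted as the log of the expected count, and the quantity being minimized is, up to terms that do not depend on p, the Poisson negative log-likelihood. Below is a minimal NumPy sketch of the per-example gradient step this implies; it illustrates the idea only and is not VW's actual implementation:

# Poisson negative log-likelihood for count y and linear score p,
# up to the constant log(y!):  nll(p; y) = exp(p) - y * p
# Its gradient with respect to p is exp(p) - y.
def poisson_sgd_step(w, features, y, lr=0.1):
    p = np.dot(w, features)             # linear score = log of the predicted mean
    grad = (np.exp(p) - y) * features   # gradient of the Poisson NLL w.r.t. w
    return w - lr * grad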

# Train original domain model using poisson regression
training_samples = []
vw = pyvw.vw("-b 2 --loss_function poisson -l 0.1 --holdout_off -f vw.poisson.model --readable_model vw.readable.poisson.model")
for i in range(len(z)):
    training_samples.append("{label} | x:{x} y:{y}".format(label=z[i], x=x[i], y=y[i]))
# Do a hundred passes over the data and store the model in vw.poisson.model
for iteration in range(100):
    for i in range(len(training_samples)):
        vw.learn(training_samples[i])
vw.finish()
final_regressor = vw.poisson.model
Num weight bits = 2
learning rate = 0.1
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = 
num sources = 1
Enabled reductions: gd, scorer-identity
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features

finished run
number of examples = 100
weighted example sum = 100.000000
weighted label sum = 196.097919
average loss = 0.201313
best constant = 1.960979
total feature number = 278
8.094381 8.094381            1            1.0   5.0000   0.0000        3
14.608094 21.121807            2            2.0  10.0000   0.3949        3
12.516924 10.425754            4            4.0   7.0000   0.3018        3
14.036211 15.555498            8            8.0  17.0000   0.3668        3
15.067509 16.098808           16           16.0   6.0000   0.7694        3
14.834533 14.601557           32           32.0  11.0000   1.2046        3
9.955944 5.077355           64           64.0   7.0000   1.2338        2
6.278758 2.601573          128          128.0  13.0000   2.3966        3
3.718418 1.158078          256          256.0  16.0000   2.7284        3
2.455249 1.192080          512          512.0   6.0000   1.9758        3
1.808541 1.161833         1024         1024.0   9.0000   2.4087        3
1.469546 1.130552         2048         2048.0   9.0000   2.7301        3
1.300288 1.131029         4096         4096.0  12.0000   2.2320        3
1.215360 1.130432         8192         8192.0   7.0000   2.1158        3
finished run
number of examples = 10000
weighted example sum = 10000.000000
weighted label sum = 88200.000000
average loss = 1.198926
total feature number = 27800
# Generate predictions from the poisson model
vw = pyvw.vw("-i vw.poisson.model")
# The prediction is the linear score, i.e. the log of the expected count, so exponentiate it
poisson_predictions = [np.exp(vw.predict(sample)) for sample in training_samples]
poisson_bias = np.mean(poisson_predictions - z)
Num weight bits = 2
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = 
num sources = 1
Enabled reductions: gd, scorer-identity
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
10.746887 10.746887            1            1.0   5.0000   1.7218        3
31.373859 52.000832            2            2.0  10.0000   2.7888        3
25.164916 18.955973            4            4.0   7.0000   2.3855        3
42.893786 60.622657            8            8.0  17.0000   1.9918        3
55.455202 68.016617           16           16.0   6.0000   2.1023        3
72.149586 88.843969           32           32.0  11.0000   2.2391        3
67.988767 63.827948           64           64.0   7.0000   1.5753        2
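
Before plotting, the three bias estimates can be compared directly (a small optional check using the quantities computed above):

# The log-transform model should show a clear negative bias in the original
# domain, while the Poisson model's bias should be close to zero.
print("log-domain bias      :", log_bias)
print("original-domain bias :", bias)
print("poisson model bias   :", poisson_bias)
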
plt.figure(figsize=(18,6))
# Predictions vs. labels in the log domain
plt.subplot(131)
plt.plot(logz, log_predictions, '.')
plt.plot(logz, logz, 'r')
plt.title('Log-domain bias:%f'%(log_bias))
plt.xlabel('label')
plt.ylabel('prediction')

plt.subplot(132)
plt.plot(z, np.exp(log_predictions), '.')
plt.plot(z, z, 'r')
plt.title('Original-domain bias:%f'%(bias))
plt.xlabel('label')
plt.ylabel('prediction')

plt.subplot(133)
plt.plot(z, poisson_predictions, '.')
plt.plot(z, z, 'r')
plt.title('Poisson bias:%f'%(poisson_bias))
plt.xlabel('label')
plt.ylabel('prediction')
[Figure: three scatter plots of predictions against labels: log-domain bias and original-domain bias for the log-transform model, and bias for the Poisson model. The red line marks perfect prediction.]