Poisson Regression

import numpy as np
import matplotlib.pyplot as plt
from vowpalwabbit import pyvw

%matplotlib inline
# Generate some count data with a Poisson distribution
# z ~ poisson(x + y), x \in [0,10), y \in [0,10)
x = np.random.choice(range(0,10), 100)
y = np.random.choice(range(0,10), 100)
z = np.random.poisson(x + y)
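
As a quick sanity check, since each z[i] is drawn from a Poisson with rate x[i] + y[i], the sample mean of z should be close to the sample mean of x + y:

# Sanity check: empirical mean of the counts vs. mean of the Poisson rates
print("mean(z) = %.2f, mean(x + y) = %.2f" % (z.mean(), (x + y).mean()))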

We will model this data in two ways:

  • log transform the labels and use linear prediction (square loss)

  • model it directly using poisson loss

The first model predicts mean(log(label)); the second predicts log(mean(label)). By Jensen's inequality, mean(log(label)) <= log(mean(label)), so the first approach carries a systematic negative bias once its predictions are mapped back to the original count scale.
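
We can see Jensen's inequality at work directly on the generated counts (using the same 0.001 offset as the training code below to keep log(0) finite):

# mean of the log is at most the log of the mean: mean(log z) <= log(mean z)
print("mean(log z) = %.3f" % np.mean(np.log(z + 0.001)))
print("log(mean z) = %.3f" % np.log(np.mean(z) + 0.001))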

# Train log-transform model
training_samples = []
logz = np.log(0.001 + z)
vw = pyvw.vw("-b 2 --loss_function squared -l 0.1 --holdout_off -f vw.log.model --readable_model vw.readable.log.model")
for i in range(len(logz)):
    training_samples.append("{label} | x:{x} y:{y}".format(label=logz[i], x=x[i], y=y[i]))
# Do a hundred passes over the data and store the model in vw.log.model
for iteration in range(100):
    for i in range(len(training_samples)):
        vw.learn(training_samples[i])
vw.finish()
final_regressor = vw.log.model
Num weight bits = 2
learning rate = 0.1
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = 
num sources = 1
Enabled reductions: gd, scorer-identity, count_label
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
5.302358 5.302358            1            1.0   2.3027   0.0000        3
6.305630 7.308902            2            2.0   2.8333   0.1298        3
4.742760 3.179891            4            4.0   1.3865   0.1875        3
4.219957 3.697154            8            8.0   2.7081   0.5648        3
3.196161 2.172365           16           16.0   2.0796   0.6042        2
2.356141 1.516122           32           32.0   1.0989   0.7966        3
4.323998 6.291855           64           64.0   1.0989   0.8057        3
2.513822 0.703645          128          128.0   2.0796   1.4883        2
2.799848 3.085875          256          256.0   2.3027   2.2876        3
2.210689 1.621529          512          512.0   2.3027   1.9261        2
2.048691 1.886692         1024         1024.0   1.6096   1.4650        3
1.955284 1.861877         2048         2048.0  -6.9078   0.6679        2
1.866178 1.777073         4096         4096.0   2.7081   2.8429        3
1.780606 1.695034         8192         8192.0   2.3980   3.2179        3

finished run
number of examples = 10000
weighted example sum = 10000.000000
weighted label sum = 16456.317532
average loss = 1.757985
best constant = 1.645632
total feature number = 27800
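
Since -b 2 leaves room for only four weight slots, the learned model is small enough to inspect directly; the human-readable copy written via --readable_model can simply be printed (assuming it was written to the working directory):

# Optional: dump the human-readable weights saved via --readable_model
with open("vw.readable.log.model") as f:
    print(f.read())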
# Generate predictions from the log-transform model
vw = pyvw.vw("-i vw.log.model -t")
log_predictions = [vw.predict(sample) for sample in training_samples]
# Measure bias in the log-domain
log_bias = np.mean(log_predictions - logz)
bias = np.mean(np.exp(log_predictions) - z)
only testing
Num weight bits = 2
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = 
num sources = 1
Enabled reductions: gd, scorer-identity, count_label
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
0.050297 0.050297            1            1.0   2.3027   2.5270        3
0.332390 0.614482            2            2.0   2.8333   2.0494        3
0.562207 0.792024            4            4.0   1.3865   0.4554        3
0.325790 0.089373            8            8.0   2.7081   3.2581        3
0.273723 0.221656           16           16.0   2.0796   1.4327        2
0.218938 0.164154           32           32.0   1.0989   1.3662        3
2.400028 4.581117           64           64.0   1.0989   0.6831        3

Although the model is relatively unbiased in the log domain where it was trained, in the original domain it systematically underpredicts, as expected from Jensen's inequality.
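
Printing the two bias estimates side by side makes this concrete (exact values depend on the random draw above):

# Bias in the log domain vs. bias after exponentiating back to counts
print("log-domain bias: %.4f" % log_bias)
print("original-domain bias: %.4f" % bias)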

# Train original domain model using poisson regression
training_samples = []
vw = pyvw.vw("-b 2 --loss_function poisson -l 0.1 --holdout_off -f vw.poisson.model --readable_model vw.readable.poisson.model")
for i in range(len(z)):
    training_samples.append("{label} | x:{x} y:{y}".format(label=z[i], x=x[i], y=y[i]))
# Do a hundred passes over the data and store the model in vw.poisson.model
for iteration in range(100):
    for i in range(len(training_samples)):
        vw.learn(training_samples[i])
vw.finish()
final_regressor = vw.poisson.model
Num weight bits = 2
learning rate = 0.1
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = 
num sources = 1
Enabled reductions: gd, scorer-identity, count_label
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features

finished run
number of examples = 100
weighted example sum = 100.000000
weighted label sum = 164.563175
average loss = 1.654389
best constant = 1.645632
total feature number = 278
28.051704 28.051704            1            1.0  10.0000   0.0000        3
44.066526 60.081348            2            2.0  17.0000   0.1333        3
30.821339 17.576151            4            4.0   4.0000   0.1999        3
32.840433 34.859528            8            8.0  15.0000   0.5761        3
23.846012 14.851590           16           16.0   8.0000   0.5707        2
17.698584 11.551157           32           32.0   3.0000   0.7527        3
12.867062 8.035540           64           64.0   3.0000   0.9238        3
8.591750 4.316438          128          128.0   8.0000   1.7430        2
5.080934 1.570118          256          256.0  10.0000   2.5733        3
3.132797 1.184661          512          512.0  10.0000   2.0972        2
2.159466 1.186135         1024         1024.0   5.0000   1.7583        3
1.657445 1.155424         2048         2048.0   0.0000   1.2288        2
1.399527 1.141610         4096         4096.0  15.0000   2.6864        3
1.263766 1.128004         8192         8192.0  11.0000   2.8324        3

finished run
number of examples = 10000
weighted example sum = 10000.000000
weighted label sum = 82400.000000
average loss = 1.237994
total feature number = 27800
# Generate predictions from the poisson model
vw = pyvw.vw("-i vw.poisson.model")
poisson_predictions = [np.exp(vw.predict(sample)) for sample in training_samples]
poisson_bias = np.mean(poisson_predictions - z)
Num weight bits = 2
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = 
num sources = 1
Enabled reductions: gd, scorer-identity, count_label
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
57.140408 57.140408            1            1.0  10.0000   2.4409        3
138.400251 219.660095            2            2.0  17.0000   2.1791        3
96.490940 54.581629            4            4.0   4.0000   1.2623        3
133.538922 170.586904            8            8.0  15.0000   2.9644        3
97.535066 61.531210           16           16.0   8.0000   1.7858        2
76.750609 55.966153           32           32.0   3.0000   1.7862        3
65.387952 54.025294           64           64.0   3.0000   1.3933        3
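
Before plotting, the original-domain bias of the two models can be compared directly (again, exact numbers vary with the random data):

# The Poisson model should sit much closer to zero bias in the count domain
print("log-transform model bias: %.4f" % bias)
print("poisson model bias: %.4f" % poisson_bias)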
plt.figure(figsize=(18,6))
# Plot log-domain predictions against labels
plt.subplot(131)
plt.plot(logz, log_predictions, '.')
plt.plot(logz, logz, 'r')
plt.title('Log-domain bias:%f'%(log_bias))
plt.xlabel('label')
plt.ylabel('prediction')

plt.subplot(132)
plt.plot(z, np.exp(log_predictions), '.')
plt.plot(z, z, 'r')
plt.title('Original-domain bias:%f'%(bias))
plt.xlabel('label')
plt.ylabel('prediction')

plt.subplot(133)
plt.plot(z, poisson_predictions, '.')
plt.plot(z, z, 'r')
plt.title('Poisson bias:%f'%(poisson_bias))
plt.xlabel('label')
plt.ylabel('prediction')
[Figure: prediction vs. label scatter plots for the log-domain model, the exponentiated log-transform model, and the Poisson model, each titled with its measured bias]