Poisson Regression

import numpy as np
import matplotlib.pyplot as plt
from vowpalwabbit import pyvw

%matplotlib inline
# Generate some count data with a Poisson distribution:
# z ~ Poisson(x + y), where x and y are integers drawn uniformly from [0, 10)
x = np.random.choice(range(0,10), 100)
y = np.random.choice(range(0,10), 100)
z = np.random.poisson(x + y)
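
A quick sanity check: since a Poisson variable's mean equals its rate, the sample mean of z should roughly match the sample mean of x + y.

# The rate of each count is x + y, so the two sample means should be close
print(z.mean(), (x + y).mean())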

We will model this data in two ways:

  • log-transform the labels and use linear prediction (squared loss)

  • model the counts directly using Poisson loss

The first model predicts mean(log(label)); the second predicts log(mean(label)). By Jensen's inequality, mean(log(z)) ≤ log(mean(z)), so the first approach carries a systematic negative bias once its predictions are exponentiated back into the original domain.
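
A quick numeric check of this gap on the generated data (the 0.001 offset mirrors the one used below to avoid log(0)); the first number should come out smaller than the second:

# Jensen's inequality: the mean of the logs is at most the log of the mean
print(np.mean(np.log(0.001 + z)))   # mean(log(label)), what the squared-loss model targets
print(np.log(np.mean(0.001 + z)))   # log(mean(label)), what the Poisson model targets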

# Train log-transform model
training_samples = []
logz = np.log(0.001 + z)
vw = pyvw.Workspace("-b 2 --loss_function squared -l 0.1 --holdout_off -f vw.log.model --readable_model vw.readable.log.model")
for i in range(len(logz)):
    training_samples.append("{label} | x:{x} y:{y}".format(label=logz[i], x=x[i], y=y[i]))
# Do a hundred passes over the data and store the model in vw.log.model
for iteration in range(100):
    for i in range(len(training_samples)):
        vw.learn(training_samples[i])
vw.finish()
final_regressor = vw.log.model
Num weight bits = 2
learning rate = 0.1
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = stdin
num sources = 1
Enabled reductions: gd, scorer-identity, count_label
Input label = simple
Output pred = scalar
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
1.922505 1.922505            1            1.0   1.3865   0.0000        2
4.331072 6.739639            2            2.0   2.7081   0.1120        3
3.496160 2.661248            4            4.0   1.3865   0.2164        2
2.976132 2.456104            8            8.0   1.6096   0.3757        2
6.283954 9.591776           16           16.0   2.4850   0.8025        3
5.879286 5.474617           32           32.0   2.8904   1.3215        3
3.439870 1.000454           64           64.0   0.6936   0.8076        2
3.837745 4.235621          128          128.0   2.5650   1.5728        3
3.011081 2.184417          256          256.0   2.3027   1.9678        3
2.765541 2.520002          512          512.0   1.7919   2.0015        3
2.677266 2.588990         1024         1024.0   2.4850   1.5645        3
2.563488 2.449711         2048         2048.0   2.5650   2.9012        3
2.471620 2.379752         4096         4096.0   2.5650   2.2378        3
2.394410 2.317199         8192         8192.0   2.3980   3.0911        3
finished run
number of examples = 10000
weighted example sum = 10000.000000
weighted label sum = 16384.216747
average loss = 2.372263
best constant = 1.638422
total feature number = 27400
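
Since --readable_model was passed, VW also writes a human-readable dump of the weights to vw.readable.log.model (with -b 2 there are only four weight slots). One minimal way to inspect it, with contents depending on your run:

# Print the human-readable weight dump produced by --readable_model
with open("vw.readable.log.model") as f:
    print(f.read())
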
# Generate predictions from the log-transform model
vw = pyvw.Workspace("-i vw.log.model -t")
log_predictions = [vw.predict(sample) for sample in training_samples]
# Measure bias in the log domain and in the original domain
log_bias = np.mean(np.array(log_predictions) - logz)
bias = np.mean(np.exp(log_predictions) - z)
only testing
Num weight bits = 2
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = stdin
num sources = 1
Enabled reductions: gd, scorer-identity, count_label
Input label = simple
Output pred = scalar
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
0.060010 0.060010            1            1.0   1.3865   1.1416        2
0.046599 0.033187            2            2.0   2.7081   2.5259        3
0.082396 0.118193            4            4.0   1.3865   0.9443        2
0.167461 0.252526            8            8.0   1.6096   1.1733        2
3.468061 6.768661           16           16.0   2.4850   2.9680        3
3.382959 3.297857           32           32.0   2.8904   3.0911        3
1.931388 0.479817           64           64.0   0.6936   0.4705        2

Although the model is relatively unbiased in the log domain where it was trained, in the original domain it underpredicts, as we expected from Jensen's inequality.
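
Printing the two bias estimates computed above makes this concrete; the log-domain bias should be close to zero while the original-domain bias should come out negative (exact values depend on the random draw):

# Near zero in the log domain, negative (underprediction) in the original domain
print("log-domain bias:", log_bias)
print("original-domain bias:", bias)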

# Train original domain model using poisson regression
training_samples = []
vw = pyvw.Workspace("-b 2 --loss_function poisson -l 0.1 --holdout_off -f vw.poisson.model --readable_model vw.readable.poisson.model")
for i in range(len(z)):
    training_samples.append("{label} | x:{x} y:{y}".format(label=z[i], x=x[i], y=y[i]))
# Do a hundred passes over the data and store the model in vw.poisson.model
for iteration in range(100):
    for i in range(len(training_samples)):
        vw.learn(training_samples[i])
vw.finish()
final_regressor = vw.poisson.model
Num weight bits = 2
learning rate = 0.1
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = stdin
num sources = 1
Enabled reductions: gd, scorer-identity, count_label
Input label = simple
Output pred = scalar
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features

finished run
number of examples = 100
weighted example sum = 100.000000
weighted label sum = 163.842167
average loss = 2.271243
best constant = 1.638422
total feature number = 274
5.090357 5.090357            1            1.0   4.0000   0.0000        2
27.563361 50.036366            2            2.0  15.0000   0.1150        3
20.233322 12.903283            4            4.0   4.0000   0.2223        2
19.189488 18.145653            8            8.0   5.0000   0.3696        2
20.158799 21.128111           16           16.0  12.0000   0.8780        3
16.796496 13.434193           32           32.0  18.0000   1.4774        3
11.046193 5.295890           64           64.0   2.0000   1.0289        2
7.104954 3.163716          128          128.0  13.0000   1.9516        3
4.282213 1.459471          256          256.0  10.0000   2.2517        3
2.841242 1.400271          512          512.0   6.0000   2.2145        3
2.130148 1.419053         1024         1024.0  12.0000   1.8831        3
1.761096 1.392045         2048         2048.0  13.0000   2.7171        3
1.581082 1.401068         4096         4096.0  13.0000   2.2985        3
1.487864 1.394645         8192         8192.0  11.0000   2.6950        3

finished run
number of examples = 10000
weighted example sum = 10000.000000
weighted label sum = 84300.000000
average loss = 1.469801
total feature number = 27400
# Generate predictions from the poisson model
# With --loss_function poisson the raw prediction is on the log scale, so exponentiate to get expected counts
vw = pyvw.Workspace("-i vw.poisson.model")
poisson_predictions = [np.exp(vw.predict(sample)) for sample in training_samples]
poisson_bias = np.mean(np.array(poisson_predictions) - z)
Num weight bits = 2
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = stdin
num sources = 1
Enabled reductions: gd, scorer-identity, count_label
Input label = simple
Output pred = scalar
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
5.266941 5.266941            1            1.0   4.0000   1.7050        2
81.524408 157.781876            2            2.0  15.0000   2.4389        3
56.656250 31.788092            4            4.0   4.0000   1.6980        2
64.666263 72.676276            8            8.0   5.0000   1.8142        2
77.723592 90.780921           16           16.0  12.0000   2.6168        3
76.058522 74.393452           32           32.0  18.0000   2.9291        3
63.205591 50.352660           64           64.0   2.0000   1.4109        2
plt.figure(figsize=(18,6))
# Plot predictions vs. labels for the log-transform model in the log domain
plt.subplot(131)
plt.plot(logz, log_predictions, '.')
plt.plot(logz, logz, 'r')
plt.title('Log-domain bias:%f'%(log_bias))
plt.xlabel('label')
plt.ylabel('prediction')

plt.subplot(132)
plt.plot(z, np.exp(log_predictions), '.')
plt.plot(z, z, 'r')
plt.title('Original-domain bias:%f'%(bias))
plt.xlabel('label')
plt.ylabel('prediction')

plt.subplot(133)
plt.plot(z, poisson_predictions, '.')
plt.plot(z, z, 'r')
plt.title('Poisson bias:%f'%(poisson_bias))
plt.xlabel('label')
plt.ylabel('prediction')
[Figure: three scatter plots of prediction vs. label (log-domain bias, original-domain bias, and Poisson bias panels), each with the identity line y = x in red.]