Poisson Regression

import numpy as np
import matplotlib.pyplot as plt
import vowpalwabbit

%matplotlib inline
# Generate some count data that has poisson distribution
# z ~ poisson(x + y), x \in [0,10), y \in [0,10)
x = np.random.choice(range(0, 10), 100)
y = np.random.choice(range(0, 10), 100)
z = np.random.poisson(x + y)

We will model this data in two ways:

  • log transform the labels and use linear prediction (square loss)

  • model it directly using poisson loss

The first model predicts mean(log(label)), while the second predicts log(mean(label)). By Jensen's inequality, the first approach produces a systematic negative bias.
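Concretely, since log is concave, Jensen's inequality gives

$$\mathbb{E}[\log Z] \le \log \mathbb{E}[Z] \quad\Longrightarrow\quad \exp\!\big(\mathbb{E}[\log Z]\big) \le \mathbb{E}[Z],$$

so exponentiating a prediction of mean(log(label)) back into the original domain underestimates mean(label).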

# Train log-transform model
training_samples = []
logz = np.log(0.001 + z)  # small offset avoids log(0) for zero counts
vw = vowpalwabbit.Workspace(
    "-b 2 --loss_function squared -l 0.1 --holdout_off -f vw.log.model --readable_model vw.readable.log.model",
    quiet=True,
)
for i in range(len(logz)):
    training_samples.append(
        "{label} | x:{x} y:{y}".format(label=logz[i], x=x[i], y=y[i])
    )
# Do a hundred passes over the data and store the model in vw.log.model
for iteration in range(100):
    for i in range(len(training_samples)):
        vw.learn(training_samples[i])
vw.finish()
# Generate predictions from the log-transform model
vw = vowpalwabbit.Workspace("-i vw.log.model -t", quiet=True)
log_predictions = np.array([vw.predict(sample) for sample in training_samples])
# Measure bias in the log domain and in the original count domain
log_bias = np.mean(log_predictions - logz)
bias = np.mean(np.exp(log_predictions) - z)

Although the model is relatively unbiased in the log domain where it was trained, in the original domain it systematically underpredicts, as expected from Jensen's inequality.
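As a quick numeric illustration (a minimal sketch, not part of the original notebook), the gap between mean(log) and log(mean) is easy to see directly on raw Poisson draws:

# Sketch: the Jensen gap on Poisson samples with mean 10
samples = np.random.poisson(10, 100000)
print(np.mean(np.log(0.001 + samples)))  # mean of logs: noticeably below log(10) ~= 2.303
print(np.log(np.mean(samples)))  # log of the mean: very close to log(10)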

# Train original domain model using poisson regression
training_samples = []
vw = vowpalwabbit.Workspace(
    "-b 2 --loss_function poisson -l 0.1 --holdout_off -f vw.poisson.model --readable_model vw.readable.poisson.model",
    quiet=True,
)
for i in range(len(z)):
    training_samples.append("{label} | x:{x} y:{y}".format(label=z[i], x=x[i], y=y[i]))
# Do a hundred passes over the data and store the model in vw.poisson.model
for iteration in range(100):
    for i in range(len(training_samples)):
        vw.learn(training_samples[i])
vw.finish()
# Generate predictions from the poisson model; the raw prediction is the
# log of the conditional mean, so exponentiate to return to the count domain
vw = vowpalwabbit.Workspace("-i vw.poisson.model -t", quiet=True)
poisson_predictions = np.exp([vw.predict(sample) for sample in training_samples])
poisson_bias = np.mean(poisson_predictions - z)
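# Added sanity check (not in the original notebook): compare mean predictions
# to the mean label in the original domain. The log-transform model should
# come in low; the poisson model should sit near the empirical mean.
print("mean(z):", z.mean())
print("log-transform model:", np.mean(np.exp(log_predictions)))
print("poisson model:", np.mean(poisson_predictions))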
plt.figure(figsize=(18, 6))
# Plot predictions vs. labels in the log domain
plt.subplot(131)
plt.plot(logz, log_predictions, ".")
plt.plot(logz, logz, "r")
plt.title("Log-domain bias:%f" % (log_bias))
plt.xlabel("label")
plt.ylabel("prediction")

plt.subplot(132)
plt.plot(z, np.exp(log_predictions), ".")
plt.plot(z, z, "r")
plt.title("Original-domain bias:%f" % (bias))
plt.xlabel("label")
plt.ylabel("prediction")

plt.subplot(133)
plt.plot(z, poisson_predictions, ".")
plt.plot(z, z, "r")
plt.title("Poisson bias:%f" % (poisson_bias))
plt.xlabel("label")
plt.ylabel("prediction")
(Figure: three prediction-vs-label scatter plots with y = x reference lines, titled "Log-domain bias", "Original-domain bias", and "Poisson bias".)
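To confirm the effect is not specific to the training sample, here is a minimal sketch (not in the original notebook) that repeats the bias measurement on freshly generated data, reusing the model files saved above:

# Sketch: measure both biases on held-out data
x2 = np.random.choice(range(0, 10), 100)
y2 = np.random.choice(range(0, 10), 100)
z2 = np.random.poisson(x2 + y2)
test_samples = ["| x:{x} y:{y}".format(x=x2[i], y=y2[i]) for i in range(len(z2))]

vw_log = vowpalwabbit.Workspace("-i vw.log.model -t", quiet=True)
vw_poisson = vowpalwabbit.Workspace("-i vw.poisson.model -t", quiet=True)
log_test_bias = np.mean(np.exp([vw_log.predict(s) for s in test_samples]) - z2)
poisson_test_bias = np.mean(np.exp([vw_poisson.predict(s) for s in test_samples]) - z2)
print("log-transform model bias:", log_test_bias)
print("poisson model bias:", poisson_test_bias)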