vowpalwabbit.sklearn#

This is an optional module which implements sklearn compatibility.

Deprecated alias#

Deprecated since version 9.0.0: The module name vowpalwabbit.sklearn_vw has been renamed to vowpalwabbit.sklearn. Please use the new module name instead.

Example usage#

import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from vowpalwabbit.sklearn import VWClassifier
    # generate some data
X, y = datasets.make_hastie_10_2(n_samples=10000, random_state=1)
X = X.astype(np.float32)
    # split train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=256)
    # build model
model = VWClassifier()
model.fit(X_train, y_train)
    # predict model
y_pred = model.predict(X_test)
    # evaluate model
model.score(X_train, y_train)
model.score(X_test, y_test)

Module contents#

Utilities to support integration of Vowpal Wabbit and scikit-learn

class vowpalwabbit.sklearn.LinearClassifierMixin#

Bases: LogisticRegression

__init__()#

class vowpalwabbit.sklearn.VW(convert_to_vw=True, convert_labels=True, ring_size=None, strict_parse=None, learning_rate=None, l=None, power_t=None, decay_learning_rate=None, initial_t=None, feature_mask=None, initial_regressor=None, i=None, initial_weight=None, random_weights=None, normal_weights=None, truncated_normal_weights=None, sparse_weights=None, input_feature_regularizer=None, quiet=True, random_seed=None, hash=None, hash_seed=None, ignore=None, ignore_linear=None, keep=None, redefine=None, bit_precision=None, b=None, noconstant=None, constant=None, C=None, ngram=None, skips=None, feature_limit=None, affix=None, spelling=None, dictionary=None, dictionary_path=None, interactions=None, permutations=None, leave_duplicate_interactions=None, quadratic=None, q=None, cubic=None, testonly=None, t=None, holdout_off=None, holdout_period=None, holdout_after=None, early_terminate=None, passes=1, initial_pass_length=None, examples=None, min_prediction=None, max_prediction=None, sort_features=None, loss_function=None, quantile_tau=None, l1=None, l2=None, no_bias_regularization=None, named_labels=None, final_regressor=None, f=None, readable_model=None, invert_hash=None, save_resume=None, preserve_performance_counters=None, output_feature_regularizer_binary=None, output_feature_regularizer_text=None, oaa=None, ect=None, csoaa=None, wap=None, probabilities=None, nn=None, inpass=None, multitask=None, dropout=None, meanfield=None, conjugate_gradient=None, bfgs=None, hessian_on=None, mem=None, termination=None, lda=None, lda_alpha=None, lda_rho=None, lda_D=None, lda_epsilon=None, minibatch=None, svrg=None, stage_size=None, ftrl=None, coin=None, pistol=None, ftrl_alpha=None, ftrl_beta=None, ksvm=None, kernel=None, bandwidth=None, degree=None, sgd=None, adaptive=None, invariant=None, normalized=None, link=None, stage_poly=None, sched_exponent=None, batch_sz=None, batch_sz_no_doubling=None, lrq=None, lrqdropout=None, lrqfa=None, data=None, d=None, cache=None, c=None, 
cache_file=None, json=None, kill_cache=None, k=None)#

Bases: BaseEstimator

Vowpal Wabbit Scikit-learn Base Estimator wrapper

__init__(convert_to_vw=True, convert_labels=True, ring_size=None, strict_parse=None, learning_rate=None, l=None, power_t=None, decay_learning_rate=None, initial_t=None, feature_mask=None, initial_regressor=None, i=None, initial_weight=None, random_weights=None, normal_weights=None, truncated_normal_weights=None, sparse_weights=None, input_feature_regularizer=None, quiet=True, random_seed=None, hash=None, hash_seed=None, ignore=None, ignore_linear=None, keep=None, redefine=None, bit_precision=None, b=None, noconstant=None, constant=None, C=None, ngram=None, skips=None, feature_limit=None, affix=None, spelling=None, dictionary=None, dictionary_path=None, interactions=None, permutations=None, leave_duplicate_interactions=None, quadratic=None, q=None, cubic=None, testonly=None, t=None, holdout_off=None, holdout_period=None, holdout_after=None, early_terminate=None, passes=1, initial_pass_length=None, examples=None, min_prediction=None, max_prediction=None, sort_features=None, loss_function=None, quantile_tau=None, l1=None, l2=None, no_bias_regularization=None, named_labels=None, final_regressor=None, f=None, readable_model=None, invert_hash=None, save_resume=None, preserve_performance_counters=None, output_feature_regularizer_binary=None, output_feature_regularizer_text=None, oaa=None, ect=None, csoaa=None, wap=None, probabilities=None, nn=None, inpass=None, multitask=None, dropout=None, meanfield=None, conjugate_gradient=None, bfgs=None, hessian_on=None, mem=None, termination=None, lda=None, lda_alpha=None, lda_rho=None, lda_D=None, lda_epsilon=None, minibatch=None, svrg=None, stage_size=None, ftrl=None, coin=None, pistol=None, ftrl_alpha=None, ftrl_beta=None, ksvm=None, kernel=None, bandwidth=None, degree=None, sgd=None, adaptive=None, invariant=None, normalized=None, link=None, stage_poly=None, sched_exponent=None, batch_sz=None, batch_sz_no_doubling=None, lrq=None, lrqdropout=None, lrqfa=None, data=None, d=None, cache=None, c=None, cache_file=None, json=None, 
kill_cache=None, k=None)#

VW model constructor, exposing all supported parameters to keep sklearn happy

Parameters

convert_to_vw (bool) – flag to convert X input to vw format
convert_labels (bool) – Convert labels of the form [0,1] to [-1,1]
ring_size (int) – size of example ring
strict_parse (bool) – throw on malformed examples
learning_rate (float) – Set learning rate
l (float) – Set learning rate
power_t (float) – t power value
decay_learning_rate (float) – Set Decay factor for learning_rate between passes
initial_t (float) – initial t value
feature_mask (str) – Use existing regressor to determine which parameters may be updated. If no initial_regressor given, also used for initial weights.
initial_regressor (str) – Initial regressor(s)
i (str) – Initial regressor(s)
initial_weight (float) – Set all weights to an initial value of arg.
random_weights (bool) – make initial weights random
normal_weights (bool) – make initial weights normal
truncated_normal_weights (bool) – make initial weights truncated normal
sparse_weights (float) – Use a sparse datastructure for weights
input_feature_regularizer (str) – Per feature regularization input file
quiet (bool) – Don’t output diagnostics and progress updates
random_seed (integer) – seed random number generator
hash (str) – how to hash the features. Available options: strings, all
hash_seed (int) – seed for hash function
ignore (str) – ignore namespaces beginning with character <arg>
ignore_linear (str) – ignore namespaces beginning with character <arg> for linear terms only
keep (str) – keep namespaces beginning with character <arg>
redefine (str) – Redefine namespaces beginning with characters of string S as namespace N. <arg> shall be in form ‘N:=S’ where := is operator. Empty N or S are treated as default namespace. Use ‘:’ as a wildcard in S.
bit_precision (integer) – number of bits in the feature table
b (integer) – number of bits in the feature table
noconstant (bool) – Don’t add a constant feature
constant (float) – Set initial value of constant
C (float) – Set initial value of constant
ngram (str) – Generate N grams. To generate N grams for a single namespace ‘foo’, arg should be fN.
skips (str) – Generate skips in N grams. This in conjunction with the ngram tag can be used to generate generalized n-skip-k-gram. To generate n-skips for a single namespace ‘foo’, arg should be fN.
feature_limit (str) – limit to N features. To apply to a single namespace ‘foo’, arg should be fN
affix (str) – generate prefixes/suffixes of features; argument ‘+2a,-3b,+1’ means generate 2-char prefixes for namespace a, 3-char suffixes for b and 1 char prefixes for default namespace
spelling (str) – compute spelling features for a given namespace (use ‘_’ for default namespace)
dictionary (str) – read a dictionary for additional features (arg either ‘x:file’ or just ‘file’)
dictionary_path (str) – look in this directory for dictionaries; defaults to current directory or env{PATH}
interactions (str) – Create feature interactions of any level between namespaces.
permutations (bool) – Use permutations instead of combinations for feature interactions of same namespace.
leave_duplicate_interactions (bool) – Don’t remove interactions with duplicate combinations of namespaces. For ex. this is a duplicate: ‘-q ab -q ba’ and a lot more in ‘-q ::’.
quadratic (str) – Create and use quadratic features, q:: corresponds to a wildcard for all printable characters
q (str) – Create and use quadratic features, q:: corresponds to a wildcard for all printable characters
cubic (str) – Create and use cubic features
testonly (bool) – Ignore label information and just test
t (bool) – Ignore label information and just test
holdout_off (bool) – no holdout data in multiple passes
holdout_period (int) – holdout period for test only
holdout_after (int) – holdout after n training examples
early_terminate (int) – Specify the number of passes tolerated when holdout loss doesn’t decrease before early termination
passes (int) – Number of Training Passes
initial_pass_length (int) – initial number of examples per pass
examples (int) – number of examples to parse
min_prediction (float) – Smallest prediction to output
max_prediction (float) – Largest prediction to output
sort_features (bool) – turn this on to disregard order in which features have been defined. This will lead to smaller cache sizes
loss_function (str) – Specify the loss function to be used, uses squared by default. Currently available ones are squared, classic, hinge, logistic and quantile.
quantile_tau (float) – Parameter tau associated with Quantile loss. Defaults to 0.5
l1 (float) – l_1 lambda (L1 regularization)
l2 (float) – l_2 lambda (L2 regularization)
no_bias_regularization (bool) – no bias in regularization
named_labels (str) – use names for labels (multiclass, etc.) rather than integers; the argument specifies all possible labels, comma-separated, e.g. “--named_labels Noun,Verb,Adj,Punc”
final_regressor (str) – Final regressor
f (str) – Final regressor
readable_model (str) – Output human-readable final regressor with numeric features
invert_hash (str) – Output human-readable final regressor with feature names. Computationally expensive.
save_resume (bool) – save extra state so learning can be resumed later with new data
preserve_performance_counters (bool) – reset performance counters when warmstarting
output_feature_regularizer_binary (str) – Per feature regularization output file
output_feature_regularizer_text (str) – Per feature regularization output file, in text
oaa (integer) – Use one-against-all multiclass learning with labels
oaa_subsample (int) – subsample this number of negative examples when learning
ect (integer) – Use error correcting tournament multiclass learning
csoaa (integer) – Use cost sensitive one-against-all multiclass learning
wap (integer) – Use weighted all pairs multiclass learning
probabilities (bool) – predict probabilities of all classes
nn (integer) – Use a sigmoidal feed-forward neural network with N hidden units
inpass (bool) – Train or test sigmoidal feed-forward network with input pass-through
multitask (bool) – Share hidden layer across all reduced tasks
dropout (bool) – Train or test sigmoidal feed-forward network using dropout
meanfield (bool) – Train or test sigmoidal feed-forward network using mean field
conjugate_gradient (bool) – use conjugate gradient based optimization
bfgs (bool) – use bfgs updates
hessian_on (bool) – use second derivative in line search
mem (int) – memory in bfgs
termination (float) – termination threshold
lda (int) – Run lda with <int> topics
lda_alpha (float) – Prior on sparsity of per-document topic weights
lda_rho (float) – Prior on sparsity of topic distributions
lda_D (int) – Number of documents
lda_epsilon (float) – Loop convergence threshold
minibatch (int) – Minibatch size for LDA
svrg (bool) – Streaming Stochastic Variance Reduced Gradient
stage_size (int) – Number of passes per SVRG stage
ftrl (bool) – Run Follow the Proximal Regularized Leader
coin (bool) – Coin betting optimizer
pistol (bool) – PiSTOL - Parameter free STOchastic Learning
ftrl_alpha (float) – Alpha parameter for FTRL optimization
ftrl_beta (float) – Beta parameters for FTRL optimization
ksvm (bool) – kernel svm
kernel (str) – type of kernel (rbf or linear (default))
bandwidth (int) – bandwidth of rbf kernel
degree (int) – degree of poly kernel
sgd (bool) – use regular stochastic gradient descent update
adaptive (bool) – use adaptive, individual learning rates
adax (bool) – use adaptive learning rates with x^2 instead of g^2x^2
invariant (bool) – use safe/importance aware updates
normalized (bool) – use per feature normalized updates
link (str) – Specify the link function - identity, logistic, glf1 or poisson
stage_poly (bool) – use stagewise polynomial feature learning
sched_exponent (int) – exponent controlling quantity of included features
batch_sz (int) – multiplier on batch size before including more features
batch_sz_no_doubling (bool) – batch_sz does not double
lrq (bool) – use low rank quadratic features
lrqdropout (bool) – use dropout training for low rank quadratic features
lrqfa (bool) – use low rank quadratic features with field aware weights
data (str) – path to data file for fitting external to sklearn
d (str) – path to data file for fitting external to sklearn
cache (str) – use a cache. default is <data>.cache
c (str) – use a cache. default is <data>.cache
cache_file (str) – path to cache file to use
json (bool) – enable JSON parsing
kill_cache (bool) – do not reuse existing cache file, create a new one always
k (bool) – do not reuse existing cache file, create a new one always

convert_labels: bool = True#: Convert labels of the form [0,1] to [-1,1]

convert_to_vw: bool = True#: flag to convert X input to vw format

fit(X=None, y=None, sample_weight=None)#

Fit the model according to the given training data

Todo

For first pass create and store example objects. For N-1 passes use example objects directly (simulate cache file…but in memory for faster processing)

Parameters

X – {array-like, sparse matrix}, shape (n_samples, n_features or 1 if not convert_to_vw). Training vector, where n_samples is the number of samples and n_features is the number of features. If not using convert_to_vw, X is expected to be a list of vw formatted feature vector strings with labels.
y – array-like, shape (n_samples,), optional if not convert_to_vw. Target vector relative to X.
sample_weight – array-like, shape (n_samples,) sample weight vector relative to X.

Returns

self

get_coefs()#

Returns coefficient weights as ordered sparse matrix

Returns: coefficient weights for model
Return type: sparse matrix

get_intercept()#

Returns intercept weight for model

Returns: intercept value. 0 if no constant
Return type: int

get_params(deep=True)#: This returns the full set of vw and estimator parameters currently in use

get_vw()#

Get the vw instance

Returns: instance
Return type: vowpalwabbit.Workspace

load(filename)#: Load model from file

predict(X)#

Predict with Vowpal Wabbit model

Parameters

X ({array-like, sparse matrix}, shape (n_samples, n_features or 1)) – Training vector, where n_samples is the number of samples and n_features is the number of features. If not using convert_to_vw, X is expected to be a list of vw formatted feature vector strings with labels

Returns

Output vector relative to X.

Return type

array-like, shape (n_samples, 1 or n_classes)

save(filename)#: Save model to file

set_coefs(coefs)#

Sets coefficients weights from ordered sparse matrix

Parameters: coefs (sparse matrix) – coefficient weights for model

set_params(**kwargs)#: This destroys and recreates the Vowpal Wabbit model with updated parameters. Any parameters not provided will remain as they are currently.

vw_: vowpalwabbit.pyvw.Workspace = None#

class vowpalwabbit.sklearn.VWClassifier(loss_function='logistic', **kwargs)#

Bases: VW, LinearClassifierMixin

Vowpal Wabbit Classifier model for binary classification. Use VWMultiClassifier for multiclass classification. Note - We are assuming the VW.predict returns logits; applying link=logistic will break this assumption.

__init__(loss_function='logistic', **kwargs)#

VW model constructor, exposing all supported parameters to keep sklearn happy

Parameters

convert_to_vw (bool) – flag to convert X input to vw format
convert_labels (bool) – Convert labels of the form [0,1] to [-1,1]
ring_size (int) – size of example ring
strict_parse (bool) – throw on malformed examples
learning_rate (float) – Set learning rate
l (float) – Set learning rate
power_t (float) – t power value
decay_learning_rate (float) – Set Decay factor for learning_rate between passes
initial_t (float) – initial t value
feature_mask (str) – Use existing regressor to determine which parameters may be updated. If no initial_regressor given, also used for initial weights.
initial_regressor (str) – Initial regressor(s)
i (str) – Initial regressor(s)
initial_weight (float) – Set all weights to an initial value of arg.
random_weights (bool) – make initial weights random
normal_weights (bool) – make initial weights normal
truncated_normal_weights (bool) – make initial weights truncated normal
sparse_weights (float) – Use a sparse datastructure for weights
input_feature_regularizer (str) – Per feature regularization input file
quiet (bool) – Don’t output diagnostics and progress updates
random_seed (integer) – seed random number generator
hash (str) – how to hash the features. Available options: strings, all
hash_seed (int) – seed for hash function
ignore (str) – ignore namespaces beginning with character <arg>
ignore_linear (str) – ignore namespaces beginning with character <arg> for linear terms only
keep (str) – keep namespaces beginning with character <arg>
redefine (str) – Redefine namespaces beginning with characters of string S as namespace N. <arg> shall be in form ‘N:=S’ where := is operator. Empty N or S are treated as default namespace. Use ‘:’ as a wildcard in S.
bit_precision (integer) – number of bits in the feature table
b (integer) – number of bits in the feature table
noconstant (bool) – Don’t add a constant feature
constant (float) – Set initial value of constant
C (float) – Set initial value of constant
ngram (str) – Generate N grams. To generate N grams for a single namespace ‘foo’, arg should be fN.
skips (str) – Generate skips in N grams. This in conjunction with the ngram tag can be used to generate generalized n-skip-k-gram. To generate n-skips for a single namespace ‘foo’, arg should be fN.
feature_limit (str) – limit to N features. To apply to a single namespace ‘foo’, arg should be fN
affix (str) – generate prefixes/suffixes of features; argument ‘+2a,-3b,+1’ means generate 2-char prefixes for namespace a, 3-char suffixes for b and 1 char prefixes for default namespace
spelling (str) – compute spelling features for a given namespace (use ‘_’ for default namespace)
dictionary (str) – read a dictionary for additional features (arg either ‘x:file’ or just ‘file’)
dictionary_path (str) – look in this directory for dictionaries; defaults to current directory or env{PATH}
interactions (str) – Create feature interactions of any level between namespaces.
permutations (bool) – Use permutations instead of combinations for feature interactions of same namespace.
leave_duplicate_interactions (bool) – Don’t remove interactions with duplicate combinations of namespaces. For ex. this is a duplicate: ‘-q ab -q ba’ and a lot more in ‘-q ::’.
quadratic (str) – Create and use quadratic features, q:: corresponds to a wildcard for all printable characters
q (str) – Create and use quadratic features, q:: corresponds to a wildcard for all printable characters
cubic (str) – Create and use cubic features
testonly (bool) – Ignore label information and just test
t (bool) – Ignore label information and just test
holdout_off (bool) – no holdout data in multiple passes
holdout_period (int) – holdout period for test only
holdout_after (int) – holdout after n training examples
early_terminate (int) – Specify the number of passes tolerated when holdout loss doesn’t decrease before early termination
passes (int) – Number of Training Passes
initial_pass_length (int) – initial number of examples per pass
examples (int) – number of examples to parse
min_prediction (float) – Smallest prediction to output
max_prediction (float) – Largest prediction to output
sort_features (bool) – turn this on to disregard order in which features have been defined. This will lead to smaller cache sizes
loss_function (str) – Specify the loss function to be used, uses squared by default. Currently available ones are squared, classic, hinge, logistic and quantile.
quantile_tau (float) – Parameter tau associated with Quantile loss. Defaults to 0.5
l1 (float) – l_1 lambda (L1 regularization)
l2 (float) – l_2 lambda (L2 regularization)
no_bias_regularization (bool) – no bias in regularization
named_labels (str) – use names for labels (multiclass, etc.) rather than integers; the argument specifies all possible labels, comma-separated, e.g. “--named_labels Noun,Verb,Adj,Punc”
final_regressor (str) – Final regressor
f (str) – Final regressor
readable_model (str) – Output human-readable final regressor with numeric features
invert_hash (str) – Output human-readable final regressor with feature names. Computationally expensive.
save_resume (bool) – save extra state so learning can be resumed later with new data
preserve_performance_counters (bool) – reset performance counters when warmstarting
output_feature_regularizer_binary (str) – Per feature regularization output file
output_feature_regularizer_text (str) – Per feature regularization output file, in text
oaa (integer) – Use one-against-all multiclass learning with labels
oaa_subsample (int) – subsample this number of negative examples when learning
ect (integer) – Use error correcting tournament multiclass learning
csoaa (integer) – Use cost sensitive one-against-all multiclass learning
wap (integer) – Use weighted all pairs multiclass learning
probabilities (bool) – predict probabilities of all classes
nn (integer) – Use a sigmoidal feed-forward neural network with N hidden units
inpass (bool) – Train or test sigmoidal feed-forward network with input pass-through
multitask (bool) – Share hidden layer across all reduced tasks
dropout (bool) – Train or test sigmoidal feed-forward network using dropout
meanfield (bool) – Train or test sigmoidal feed-forward network using mean field
conjugate_gradient (bool) – use conjugate gradient based optimization
bfgs (bool) – use bfgs updates
hessian_on (bool) – use second derivative in line search
mem (int) – memory in bfgs
termination (float) – termination threshold
lda (int) – Run lda with <int> topics
lda_alpha (float) – Prior on sparsity of per-document topic weights
lda_rho (float) – Prior on sparsity of topic distributions
lda_D (int) – Number of documents
lda_epsilon (float) – Loop convergence threshold
minibatch (int) – Minibatch size for LDA
svrg (bool) – Streaming Stochastic Variance Reduced Gradient
stage_size (int) – Number of passes per SVRG stage
ftrl (bool) – Run Follow the Proximal Regularized Leader
coin (bool) – Coin betting optimizer
pistol (bool) – PiSTOL - Parameter free STOchastic Learning
ftrl_alpha (float) – Alpha parameter for FTRL optimization
ftrl_beta (float) – Beta parameters for FTRL optimization
ksvm (bool) – kernel svm
kernel (str) – type of kernel (rbf or linear (default))
bandwidth (int) – bandwidth of rbf kernel
degree (int) – degree of poly kernel
sgd (bool) – use regular stochastic gradient descent update
adaptive (bool) – use adaptive, individual learning rates
adax (bool) – use adaptive learning rates with x^2 instead of g^2x^2
invariant (bool) – use safe/importance aware updates
normalized (bool) – use per feature normalized updates
link (str) – Specify the link function - identity, logistic, glf1 or poisson
stage_poly (bool) – use stagewise polynomial feature learning
sched_exponent (int) – exponent controlling quantity of included features
batch_sz (int) – multiplier on batch size before including more features
batch_sz_no_doubling (bool) – batch_sz does not double
lrq (bool) – use low rank quadratic features
lrqdropout (bool) – use dropout training for low rank quadratic features
lrqfa (bool) – use low rank quadratic features with field aware weights
data (str) – path to data file for fitting external to sklearn
d (str) – path to data file for fitting external to sklearn
cache (str) – use a cache. default is <data>.cache
c (str) – use a cache. default is <data>.cache
cache_file (str) – path to cache file to use
json (bool) – enable JSON parsing
kill_cache (bool) – do not reuse existing cache file, create a new one always
k (bool) – do not reuse existing cache file, create a new one always

classes_ = array([-1., 1.])#: Binary class labels

coef_ = None#: Empty sparse matrix used to check if model has been fit

decision_function(X)#

Predict confidence scores for samples. The confidence score for a sample is the signed distance of that sample to the hyperplane.

Parameters

X – array_like or sparse matrix, shape (n_samples, n_features) Samples.

Returns

array, shape=(n_samples,) if n_classes == 2 else (n_samples, n_classes): Confidence scores per (sample, class) combination. In the binary case, confidence score for self.classes_[1] where >0 means this class would be predicted.

fit(X=None, y=None, sample_weight=None)#

Fit the model according to the given training data.

Parameters

X – {array-like, sparse matrix} of shape (n_samples, n_features) Training vector, where n_samples is the number of samples and n_features is the number of features.
y – array-like of shape (n_samples,) Target vector relative to X.
sample_weight – array-like of shape (n_samples,) default=None Array of weights that are assigned to individual samples. If not provided, then each sample is given unit weight.

Returns

self

predict(X)#

Predict class labels for samples in X.

Parameters

X – array_like or sparse matrix, shape (n_samples, n_features) Samples.

Returns

Predicted class label per sample.

Return type

array, shape [n_samples]

predict_proba(X)#

Predict probabilities for samples

Parameters

X – {array-like, sparse matrix}, shape = (n_samples, n_features) Samples.

Returns

Returns the probability of the sample for each class in the model,
where classes are ordered as they are in self.classes_.

Return type

array-like of shape (n_samples, n_classes)

class vowpalwabbit.sklearn.VWMultiClassifier(probabilities=True, **kwargs)#

Bases: VWClassifier

Vowpal Wabbit MultiClassifier model. Note: this class assumes that VW.predict returns probabilities; setting probabilities=False will break this assumption.

__init__(probabilities=True, **kwargs)#

VW model constructor, exposing all supported parameters to keep sklearn happy

Parameters

convert_to_vw (bool) – flag to convert X input to vw format
convert_labels (bool) – Convert labels of the form [0,1] to [-1,1]
ring_size (int) – size of example ring
strict_parse (bool) – throw on malformed examples
learning_rate (float) – Set learning rate
l (float) – Set learning rate
power_t (float) – t power value
decay_learning_rate (float) – Set Decay factor for learning_rate between passes
initial_t (float) – initial t value
feature_mask (str) – Use existing regressor to determine which parameters may be updated. If no initial_regressor given, also used for initial weights.
initial_regressor (str) – Initial regressor(s)
i (str) – Initial regressor(s)
initial_weight (float) – Set all weights to an initial value of arg.
random_weights (bool) – make initial weights random
normal_weights (bool) – make initial weights normal
truncated_normal_weights (bool) – make initial weights truncated normal
sparse_weights (float) – Use a sparse datastructure for weights
input_feature_regularizer (str) – Per feature regularization input file
quiet (bool) – Don’t output diagnostics and progress updates
random_seed (integer) – seed random number generator
hash (str) – how to hash the features. Available options: strings, all
hash_seed (int) – seed for hash function
ignore (str) – ignore namespaces beginning with character <arg>
ignore_linear (str) – ignore namespaces beginning with character <arg> for linear terms only
keep (str) – keep namespaces beginning with character <arg>
redefine (str) – Redefine namespaces beginning with characters of string S as namespace N. <arg> shall be in form ‘N:=S’ where := is operator. Empty N or S are treated as default namespace. Use ‘:’ as a wildcard in S.
bit_precision (integer) – number of bits in the feature table
b (integer) – number of bits in the feature table
noconstant (bool) – Don’t add a constant feature
constant (float) – Set initial value of constant
C (float) – Set initial value of constant
ngram (str) – Generate N grams. To generate N grams for a single namespace ‘foo’, arg should be fN.
skips (str) – Generate skips in N grams. This in conjunction with the ngram tag can be used to generate generalized n-skip-k-gram. To generate n-skips for a single namespace ‘foo’, arg should be fN.
feature_limit (str) – limit to N features. To apply to a single namespace ‘foo’, arg should be fN
affix (str) – generate prefixes/suffixes of features; argument ‘+2a,-3b,+1’ means generate 2-char prefixes for namespace a, 3-char suffixes for b and 1 char prefixes for default namespace
spelling (str) – compute spelling features for a given namespace (use ‘_’ for default namespace)
dictionary (str) – read a dictionary for additional features (arg either ‘x:file’ or just ‘file’)
dictionary_path (str) – look in this directory for dictionaries; defaults to current directory or env{PATH}
interactions (str) – Create feature interactions of any level between namespaces.
permutations (bool) – Use permutations instead of combinations for feature interactions of same namespace.
leave_duplicate_interactions (bool) – Don’t remove interactions with duplicate combinations of namespaces. For ex. this is a duplicate: ‘-q ab -q ba’ and a lot more in ‘-q ::’.
quadratic (str) – Create and use quadratic features, q:: corresponds to a wildcard for all printable characters
q (str) – Create and use quadratic features, q:: corresponds to a wildcard for all printable characters
cubic (str) – Create and use cubic features
testonly (bool) – Ignore label information and just test
t (bool) – Ignore label information and just test
holdout_off (bool) – no holdout data in multiple passes
holdout_period (int) – holdout period for test only
holdout_after (int) – holdout after n training examples
early_terminate (int) – Specify the number of passes tolerated when holdout loss doesn’t decrease before early termination
passes (int) – Number of Training Passes
initial_pass_length (int) – initial number of examples per pass
examples (int) – number of examples to parse
min_prediction (float) – Smallest prediction to output
max_prediction (float) – Largest prediction to output
sort_features (bool) – turn this on to disregard order in which features have been defined. This will lead to smaller cache sizes
loss_function (str) – Specify the loss function to be used, uses squared by default. Currently available ones are squared, classic, hinge, logistic and quantile.
quantile_tau (float) – Parameter tau associated with Quantile loss. Defaults to 0.5
l1 (float) – l_1 lambda (L1 regularization)
l2 (float) – l_2 lambda (L2 regularization)
no_bias_regularization (bool) – no bias in regularization
named_labels (str) – use names for labels (multiclass, etc.) rather than integers; the argument specifies all possible labels, comma-separated, e.g. “--named_labels Noun,Verb,Adj,Punc”
final_regressor (str) – Final regressor
f (str) – Final regressor
readable_model (str) – Output human-readable final regressor with numeric features
invert_hash (str) – Output human-readable final regressor with feature names. Computationally expensive.
save_resume (bool) – save extra state so learning can be resumed later with new data
preserve_performance_counters (bool) – preserve performance counters (do not reset them) when warm-starting from an existing model
output_feature_regularizer_binary (str) – Per feature regularization output file
output_feature_regularizer_text (str) – Per feature regularization output file, in text
oaa (integer) – Use one-against-all multiclass learning with labels
oaa_subsample (int) – subsample this number of negative examples when learning
ect (integer) – Use error correcting tournament multiclass learning
csoaa (integer) – Use cost sensitive one-against-all multiclass learning
wap (integer) – Use weighted all pairs multiclass learning
probabilities (bool) – predict probabilities of all classes
nn (integer) – Use a sigmoidal feed-forward neural network with N hidden units
inpass (bool) – Train or test sigmoidal feed-forward network with input pass-through
multitask (bool) – Share hidden layer across all reduced tasks
dropout (bool) – Train or test sigmoidal feed-forward network using dropout
meanfield (bool) – Train or test sigmoidal feed-forward network using mean field
conjugate_gradient (bool) – use conjugate gradient based optimization
bfgs (bool) – use bfgs updates
hessian_on (bool) – use second derivative in line search
mem (int) – memory in bfgs
termination (float) – termination threshold
lda (int) – Run lda with <int> topics
lda_alpha (float) – Prior on sparsity of per-document topic weights
lda_rho (float) – Prior on sparsity of topic distributions
lda_D (int) – Number of documents
lda_epsilon (float) – Loop convergence threshold
minibatch (int) – Minibatch size for LDA
svrg (bool) – Streaming Stochastic Variance Reduced Gradient
stage_size (int) – Number of passes per SVRG stage
ftrl (bool) – Run Follow the Proximal Regularized Leader
coin (bool) – Coin betting optimizer
pistol (bool) – PiSTOL - Parameter free STOchastic Learning
ftrl_alpha (float) – Alpha parameter for FTRL optimization
ftrl_beta (float) – Beta parameters for FTRL optimization
ksvm (bool) – kernel svm
kernel (str) – type of kernel (rbf or linear (default))
bandwidth (int) – bandwidth of rbf kernel
degree (int) – degree of poly kernel
sgd (bool) – use regular stochastic gradient descent update
adaptive (bool) – use adaptive, individual learning rates
adax (bool) – use adaptive learning rates with x^2 instead of g^2x^2
invariant (bool) – use safe/importance aware updates
normalized (bool) – use per feature normalized updates
link (str) – Specify the link function - identity, logistic, glf1 or poisson
stage_poly (bool) – use stagewise polynomial feature learning
sched_exponent (int) – exponent controlling quantity of included features
batch_sz (int) – multiplier on batch size before including more features
batch_sz_no_doubling (bool) – batch_sz does not double
lrq (bool) – use low rank quadratic features
lrqdropout (bool) – use dropout training for low rank quadratic features
lrqfa (bool) – use low rank quadratic features with field aware weights
data (str) – path to data file for fitting external to sklearn
d (str) – path to data file for fitting external to sklearn
cache (str) – use a cache. default is <data>.cache
c (str) – use a cache. default is <data>.cache
cache_file (str) – path to cache file to use
json (bool) – enable JSON parsing
kill_cache (bool) – do not reuse existing cache file, create a new one always
k (bool) – do not reuse existing cache file, create a new one always

classes_ = None#: Class labels

decision_function(X)#

Predict confidence scores for samples. The confidence score for a sample is the signed distance of that sample to the hyperplane.

Parameters: X – array_like or sparse matrix, shape (n_samples, n_features) Samples.
Returns: Confidence scores per (sample, class) combination.
Return type: array, shape=(n_samples, n_classes)

estimator_ = None#: type of estimator to use [csoaa, ect, oaa, wap] and number of classes

fit(X=None, y=None, sample_weight=None)#

Fit the model according to the given training data.

Parameters

X – {array-like, sparse matrix} of shape (n_samples, n_features) Training vector, where n_samples is the number of samples and n_features is the number of features.
y – array-like of shape (n_samples,) Target vector relative to X.
sample_weight – array-like of shape (n_samples,) default=None Array of weights that are assigned to individual samples. If not provided, then each sample is given unit weight.

Returns

self

predict_proba(X)#

Predict probabilities for each class.

Parameters

X – {array-like, sparse matrix}, shape = (n_samples, n_features) Samples.

Returns

array, shape=(n_samples, n_classes): Probability of each sample for each class in the model, where classes are ordered as they are in self.classes_.

Examples

>>> import numpy as np
>>> X = np.array([ [10, 10], [8, 10], [-5, 5.5], [-5.4, 5.5], [-20, -20],  [-15, -20] ])
>>> y = np.array([1, 1, 2, 2, 3, 3])
>>> from vowpalwabbit.sklearn import VWMultiClassifier
>>> model = VWMultiClassifier(oaa=3, loss_function='logistic')
>>> _ = model.fit(X, y)
>>> model.predict_proba(X)
array([[0.38928846, 0.30534211, 0.30536944],
       [0.40664235, 0.29666999, 0.29668769],
       [0.52324486, 0.23841164, 0.23834346],
       [0.5268591 , 0.23660533, 0.23653553],
       [0.65397811, 0.17312808, 0.17289382],
       [0.61190444, 0.19416356, 0.19393198]])

class vowpalwabbit.sklearn.VWRegressor(convert_labels=False, **kwargs)#

Bases: VW, RegressorMixin

Vowpal Wabbit Regressor model

__init__(convert_labels=False, **kwargs)#

VW model constructor, exposing all supported parameters to keep sklearn happy

Parameters

convert_to_vw (bool) – flag to convert X input to vw format
convert_labels (bool) – Convert labels of the form [0,1] to [-1,1]
ring_size (int) – size of example ring
strict_parse (bool) – throw on malformed examples
learning_rate (float) – Set learning rate
l (float) – Set learning rate
power_t (float) – t power value
decay_learning_rate (float) – Set Decay factor for learning_rate between passes
initial_t (float) – initial t value
feature_mask (str) – Use existing regressor to determine which parameters may be updated. If no initial_regressor given, also used for initial weights.
initial_regressor (str) – Initial regressor(s)
i (str) – Initial regressor(s)
initial_weight (float) – Set all weights to an initial value of arg.
random_weights (bool) – make initial weights random
normal_weights (bool) – make initial weights normal
truncated_normal_weights (bool) – make initial weights truncated normal
sparse_weights (float) – Use a sparse datastructure for weights
input_feature_regularizer (str) – Per feature regularization input file
quiet (bool) – Don’t output diagnostics and progress updates
random_seed (integer) – seed random number generator
hash (str) – how to hash the features. Available options: strings, all
hash_seed (int) – seed for hash function
ignore (str) – ignore namespaces beginning with character <arg>
ignore_linear (str) – ignore namespaces beginning with character <arg> for linear terms only
keep (str) – keep namespaces beginning with character <arg>
redefine (str) – Redefine namespaces beginning with characters of string S as namespace N. <arg> shall be in form ‘N:=S’ where := is operator. Empty N or S are treated as default namespace. Use ‘:’ as a wildcard in S.
bit_precision (integer) – number of bits in the feature table
b (integer) – number of bits in the feature table
noconstant (bool) – Don’t add a constant feature
constant (float) – Set initial value of constant
C (float) – Set initial value of constant
ngram (str) – Generate N grams. To generate N grams for a single namespace ‘foo’, arg should be fN.
skips (str) – Generate skips in N grams. This in conjunction with the ngram tag can be used to generate generalized n-skip-k-gram. To generate n-skips for a single namespace ‘foo’, arg should be fN.
feature_limit (str) – limit to N features. To apply to a single namespace ‘foo’, arg should be fN
affix (str) – generate prefixes/suffixes of features; argument ‘+2a,-3b,+1’ means generate 2-char prefixes for namespace a, 3-char suffixes for b and 1 char prefixes for default namespace
spelling (str) – compute spelling features for a given namespace (use ‘_’ for default namespace)
dictionary (str) – read a dictionary for additional features (arg either ‘x:file’ or just ‘file’)
dictionary_path (str) – look in this directory for dictionaries; defaults to current directory or env{PATH}
interactions (str) – Create feature interactions of any level between namespaces.
permutations (bool) – Use permutations instead of combinations for feature interactions of same namespace.
leave_duplicate_interactions (bool) – Don’t remove interactions with duplicate combinations of namespaces. For ex. this is a duplicate: ‘-q ab -q ba’ and a lot more in ‘-q ::’.
quadratic (str) – Create and use quadratic features, q:: corresponds to a wildcard for all printable characters
q (str) – Create and use quadratic features, q:: corresponds to a wildcard for all printable characters
cubic (str) – Create and use cubic features
testonly (bool) – Ignore label information and just test
t (bool) – Ignore label information and just test
holdout_off (bool) – no holdout data in multiple passes
holdout_period (int) – holdout period for test only
holdout_after (int) – holdout after n training examples
early_terminate (int) – Specify the number of passes tolerated when holdout loss doesn’t decrease before early termination
passes (int) – Number of Training Passes
initial_pass_length (int) – initial number of examples per pass
examples (int) – number of examples to parse
min_prediction (float) – Smallest prediction to output
max_prediction (float) – Largest prediction to output
sort_features (bool) – turn this on to disregard order in which features have been defined. This will lead to smaller cache sizes
loss_function (str) – Specify the loss function to be used, uses squared by default. Currently available ones are squared, classic, hinge, logistic and quantile.
quantile_tau (float) – Parameter tau associated with Quantile loss. Defaults to 0.5
l1 (float) – l_1 lambda (L1 regularization)
l2 (float) – l_2 lambda (L2 regularization)
no_bias_regularization (bool) – no bias in regularization
named_labels (str) – use names for labels (multiclass, etc.) rather than integers; the argument specifies all possible labels, comma-separated, e.g. “--named_labels Noun,Verb,Adj,Punc”
final_regressor (str) – Final regressor
f (str) – Final regressor
readable_model (str) – Output human-readable final regressor with numeric features
invert_hash (str) – Output human-readable final regressor with feature names. Computationally expensive.
save_resume (bool) – save extra state so learning can be resumed later with new data
preserve_performance_counters (bool) – preserve performance counters (do not reset them) when warm-starting from an existing model
output_feature_regularizer_binary (str) – Per feature regularization output file
output_feature_regularizer_text (str) – Per feature regularization output file, in text
oaa (integer) – Use one-against-all multiclass learning with labels
oaa_subsample (int) – subsample this number of negative examples when learning
ect (integer) – Use error correcting tournament multiclass learning
csoaa (integer) – Use cost sensitive one-against-all multiclass learning
wap (integer) – Use weighted all pairs multiclass learning
probabilities (bool) – predict probabilities of all classes
nn (integer) – Use a sigmoidal feed-forward neural network with N hidden units
inpass (bool) – Train or test sigmoidal feed-forward network with input pass-through
multitask (bool) – Share hidden layer across all reduced tasks
dropout (bool) – Train or test sigmoidal feed-forward network using dropout
meanfield (bool) – Train or test sigmoidal feed-forward network using mean field
conjugate_gradient (bool) – use conjugate gradient based optimization
bfgs (bool) – use bfgs updates
hessian_on (bool) – use second derivative in line search
mem (int) – memory in bfgs
termination (float) – termination threshold
lda (int) – Run lda with <int> topics
lda_alpha (float) – Prior on sparsity of per-document topic weights
lda_rho (float) – Prior on sparsity of topic distributions
lda_D (int) – Number of documents
lda_epsilon (float) – Loop convergence threshold
minibatch (int) – Minibatch size for LDA
svrg (bool) – Streaming Stochastic Variance Reduced Gradient
stage_size (int) – Number of passes per SVRG stage
ftrl (bool) – Run Follow the Proximal Regularized Leader
coin (bool) – Coin betting optimizer
pistol (bool) – PiSTOL - Parameter free STOchastic Learning
ftrl_alpha (float) – Alpha parameter for FTRL optimization
ftrl_beta (float) – Beta parameters for FTRL optimization
ksvm (bool) – kernel svm
kernel (str) – type of kernel (rbf or linear (default))
bandwidth (int) – bandwidth of rbf kernel
degree (int) – degree of poly kernel
sgd (bool) – use regular stochastic gradient descent update
adaptive (bool) – use adaptive, individual learning rates
adax (bool) – use adaptive learning rates with x^2 instead of g^2x^2
invariant (bool) – use safe/importance aware updates
normalized (bool) – use per feature normalized updates
link (str) – Specify the link function - identity, logistic, glf1 or poisson
stage_poly (bool) – use stagewise polynomial feature learning
sched_exponent (int) – exponent controlling quantity of included features
batch_sz (int) – multiplier on batch size before including more features
batch_sz_no_doubling (bool) – batch_sz does not double
lrq (bool) – use low rank quadratic features
lrqdropout (bool) – use dropout training for low rank quadratic features
lrqfa (bool) – use low rank quadratic features with field aware weights
data (str) – path to data file for fitting external to sklearn
d (str) – path to data file for fitting external to sklearn
cache (str) – use a cache. default is <data>.cache
c (str) – use a cache. default is <data>.cache
cache_file (str) – path to cache file to use
json (bool) – enable JSON parsing
kill_cache (bool) – do not reuse existing cache file, create a new one always
k (bool) – do not reuse existing cache file, create a new one always

vowpalwabbit.sklearn.tovw(x, y=None, sample_weight=None, convert_labels=False)#

Convert array or sparse matrix to Vowpal Wabbit format

Parameters

x – {array-like, sparse matrix}, shape (n_samples, n_features) Training vector, where n_samples is the number of samples and n_features is the number of features.
y – {array-like}, shape (n_samples,), optional Target vector relative to X.
sample_weight – {array-like}, shape (n_samples,), optional sample weight vector relative to X.
convert_labels – {bool} convert labels of the form [0,1] to [-1,1]

Returns

{array-like}, shape (n_samples, 1): Training vectors in VW string format

Examples

>>> import pandas as pd
>>> from sklearn.feature_extraction.text import HashingVectorizer
>>> from vowpalwabbit.sklearn import tovw
>>> X = pd.Series(['cat', 'dog', 'cat', 'cat'], name='catdog')
>>> y = pd.Series([-1, 1, -1, -1], name='label')
>>> hv = HashingVectorizer()
>>> hashed = hv.fit_transform(X)
>>> tovw(x=hashed, y=y)
['-1 1 | 300839:1', '1 1 | 980517:-1', '-1 1 | 300839:1', '-1 1 | 300839:1']