vowpalwabbit.sklearn#

This is an optional module which implements sklearn compatibility.

Deprecated alias#

Deprecated since version 9.0.0: The module name vowpalwabbit.sklearn_vw has been renamed to vowpalwabbit.sklearn. Please use the new module name instead.

Example usage#

import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from vowpalwabbit.sklearn import VWClassifier
    # generate some data
X, y = datasets.make_hastie_10_2(n_samples=10000, random_state=1)
X = X.astype(np.float32)
    # split train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=256)
    # build model
model = VWClassifier()
model.fit(X_train, y_train)
    # predict model
y_pred = model.predict(X_test)
    # evaluate model
model.score(X_train, y_train)
model.score(X_test, y_test)

Module contents#

Utilities to support integration of Vowpal Wabbit and scikit-learn

class vowpalwabbit.sklearn.LinearClassifierMixin#

Bases: LogisticRegression

__init__()#
class vowpalwabbit.sklearn.VW(convert_to_vw=True, convert_labels=True, ring_size=None, strict_parse=None, learning_rate=None, l=None, power_t=None, decay_learning_rate=None, initial_t=None, feature_mask=None, initial_regressor=None, i=None, initial_weight=None, random_weights=None, normal_weights=None, truncated_normal_weights=None, sparse_weights=None, input_feature_regularizer=None, quiet=True, random_seed=None, hash=None, hash_seed=None, ignore=None, ignore_linear=None, keep=None, redefine=None, bit_precision=None, b=None, noconstant=None, constant=None, C=None, ngram=None, skips=None, feature_limit=None, affix=None, spelling=None, dictionary=None, dictionary_path=None, interactions=None, permutations=None, leave_duplicate_interactions=None, quadratic=None, q=None, cubic=None, testonly=None, t=None, holdout_off=None, holdout_period=None, holdout_after=None, early_terminate=None, passes=1, initial_pass_length=None, examples=None, min_prediction=None, max_prediction=None, sort_features=None, loss_function=None, quantile_tau=None, l1=None, l2=None, no_bias_regularization=None, named_labels=None, final_regressor=None, f=None, readable_model=None, invert_hash=None, save_resume=None, preserve_performance_counters=None, output_feature_regularizer_binary=None, output_feature_regularizer_text=None, oaa=None, ect=None, csoaa=None, wap=None, probabilities=None, nn=None, inpass=None, multitask=None, dropout=None, meanfield=None, conjugate_gradient=None, bfgs=None, hessian_on=None, mem=None, termination=None, lda=None, lda_alpha=None, lda_rho=None, lda_D=None, lda_epsilon=None, minibatch=None, svrg=None, stage_size=None, ftrl=None, coin=None, pistol=None, ftrl_alpha=None, ftrl_beta=None, ksvm=None, kernel=None, bandwidth=None, degree=None, sgd=None, adaptive=None, invariant=None, normalized=None, link=None, stage_poly=None, sched_exponent=None, batch_sz=None, batch_sz_no_doubling=None, lrq=None, lrqdropout=None, lrqfa=None, data=None, d=None, cache=None, c=None, 
cache_file=None, json=None, kill_cache=None, k=None)#

Bases: BaseEstimator

Vowpal Wabbit Scikit-learn Base Estimator wrapper

__init__(convert_to_vw=True, convert_labels=True, ring_size=None, strict_parse=None, learning_rate=None, l=None, power_t=None, decay_learning_rate=None, initial_t=None, feature_mask=None, initial_regressor=None, i=None, initial_weight=None, random_weights=None, normal_weights=None, truncated_normal_weights=None, sparse_weights=None, input_feature_regularizer=None, quiet=True, random_seed=None, hash=None, hash_seed=None, ignore=None, ignore_linear=None, keep=None, redefine=None, bit_precision=None, b=None, noconstant=None, constant=None, C=None, ngram=None, skips=None, feature_limit=None, affix=None, spelling=None, dictionary=None, dictionary_path=None, interactions=None, permutations=None, leave_duplicate_interactions=None, quadratic=None, q=None, cubic=None, testonly=None, t=None, holdout_off=None, holdout_period=None, holdout_after=None, early_terminate=None, passes=1, initial_pass_length=None, examples=None, min_prediction=None, max_prediction=None, sort_features=None, loss_function=None, quantile_tau=None, l1=None, l2=None, no_bias_regularization=None, named_labels=None, final_regressor=None, f=None, readable_model=None, invert_hash=None, save_resume=None, preserve_performance_counters=None, output_feature_regularizer_binary=None, output_feature_regularizer_text=None, oaa=None, ect=None, csoaa=None, wap=None, probabilities=None, nn=None, inpass=None, multitask=None, dropout=None, meanfield=None, conjugate_gradient=None, bfgs=None, hessian_on=None, mem=None, termination=None, lda=None, lda_alpha=None, lda_rho=None, lda_D=None, lda_epsilon=None, minibatch=None, svrg=None, stage_size=None, ftrl=None, coin=None, pistol=None, ftrl_alpha=None, ftrl_beta=None, ksvm=None, kernel=None, bandwidth=None, degree=None, sgd=None, adaptive=None, invariant=None, normalized=None, link=None, stage_poly=None, sched_exponent=None, batch_sz=None, batch_sz_no_doubling=None, lrq=None, lrqdropout=None, lrqfa=None, data=None, d=None, cache=None, c=None, cache_file=None, json=None, 
kill_cache=None, k=None)#

VW model constructor, exposing all supported parameters to keep sklearn happy

Parameters
  • convert_to_vw (bool) – flag to convert X input to vw format

  • convert_labels (bool) – Convert labels of the form [0,1] to [-1,1]

  • ring_size (int) – size of example ring

  • strict_parse (bool) – throw on malformed examples

  • learning_rate (float) – Set learning rate

  • l (float) – Set learning rate

  • power_t (float) – t power value

  • decay_learning_rate (float) – Set Decay factor for learning_rate between passes

  • initial_t (float) – initial t value

  • feature_mask (str) – Use existing regressor to determine which parameters may be updated. If no initial_regressor given, also used for initial weights.

  • initial_regressor (str) – Initial regressor(s)

  • i (str) – Initial regressor(s)

  • initial_weight (float) – Set all weights to an initial value of arg.

  • random_weights (bool) – make initial weights random

  • normal_weights (bool) – make initial weights normal

  • truncated_normal_weights (bool) – make initial weights truncated normal

  • sparse_weights (float) – Use a sparse data structure for weights

  • input_feature_regularizer (str) – Per feature regularization input file

  • quiet (bool) – Don’t output diagnostics and progress updates

  • random_seed (integer) – seed random number generator

  • hash (str) – how to hash the features; one of ‘strings’ or ‘all’

  • hash_seed (int) – seed for hash function

  • ignore (str) – ignore namespaces beginning with character <arg>

  • ignore_linear (str) – ignore namespaces beginning with character <arg> for linear terms only

  • keep (str) – keep namespaces beginning with character <arg>

  • redefine (str) – Redefine namespaces beginning with characters of string S as namespace N. <arg> shall be in form ‘N:=S’ where := is operator. Empty N or S are treated as default namespace. Use ‘:’ as a wildcard in S.

  • bit_precision (integer) – number of bits in the feature table

  • b (integer) – number of bits in the feature table

  • noconstant (bool) – Don’t add a constant feature

  • constant (float) – Set initial value of constant

  • C (float) – Set initial value of constant

  • ngram (str) – Generate N grams. To generate N grams for a single namespace ‘foo’, arg should be fN.

  • skips (str) – Generate skips in N grams. This in conjunction with the ngram tag can be used to generate generalized n-skip-k-gram. To generate n-skips for a single namespace ‘foo’, arg should be fN.

  • feature_limit (str) – limit to N features. To apply to a single namespace ‘foo’, arg should be fN

  • affix (str) – generate prefixes/suffixes of features; argument ‘+2a,-3b,+1’ means generate 2-char prefixes for namespace a, 3-char suffixes for b and 1 char prefixes for default namespace

  • spelling (str) – compute spelling features for a given namespace (use ‘_’ for default namespace)

  • dictionary (str) – read a dictionary for additional features (arg either ‘x:file’ or just ‘file’)

  • dictionary_path (str) – look in this directory for dictionaries; defaults to current directory or env{PATH}

  • interactions (str) – Create feature interactions of any level between namespaces.

  • permutations (bool) – Use permutations instead of combinations for feature interactions of same namespace.

  • leave_duplicate_interactions (bool) – Don’t remove interactions with duplicate combinations of namespaces. For ex. this is a duplicate: ‘-q ab -q ba’ and a lot more in ‘-q ::’.

  • quadratic (str) – Create and use quadratic features, q:: corresponds to a wildcard for all printable characters

  • q (str) – Create and use quadratic features, q:: corresponds to a wildcard for all printable characters

  • cubic (str) – Create and use cubic features

  • testonly (bool) – Ignore label information and just test

  • t (bool) – Ignore label information and just test

  • holdout_off (bool) – no holdout data in multiple passes

  • holdout_period (int) – holdout period for test only

  • holdout_after (int) – holdout after n training examples

  • early_terminate (int) – Specify the number of passes tolerated when holdout loss doesn’t decrease before early termination

  • passes (int) – Number of Training Passes

  • initial_pass_length (int) – initial number of examples per pass

  • examples (int) – number of examples to parse

  • min_prediction (float) – Smallest prediction to output

  • max_prediction (float) – Largest prediction to output

  • sort_features (bool) – turn this on to disregard order in which features have been defined. This will lead to smaller cache sizes

  • loss_function (str) – Specify the loss function to be used; defaults to ‘squared’. Currently available ones are squared, classic, hinge, logistic and quantile.

  • quantile_tau (float) – Parameter tau associated with Quantile loss. Defaults to 0.5

  • l1 (float) – l_1 lambda (L1 regularization)

  • l2 (float) – l_2 lambda (L2 regularization)

  • no_bias_regularization (bool) – no bias in regularization

  • named_labels (str) – use names for labels (multiclass, etc.) rather than integers, argument specified all possible labels, comma-sep, eg “–named_labels Noun,Verb,Adj,Punc”

  • final_regressor (str) – Final regressor

  • f (str) – Final regressor

  • readable_model (str) – Output human-readable final regressor with numeric features

  • invert_hash (str) – Output human-readable final regressor with feature names. Computationally expensive.

  • save_resume (bool) – save extra state so learning can be resumed later with new data

  • preserve_performance_counters (bool) – do not reset performance counters when warmstarting

  • output_feature_regularizer_binary (str) – Per feature regularization output file

  • output_feature_regularizer_text (str) – Per feature regularization output file, in text

  • oaa (integer) – Use one-against-all multiclass learning with labels

  • oaa_subsample (int) – subsample this number of negative examples when learning

  • ect (integer) – Use error correcting tournament multiclass learning

  • csoaa (integer) – Use cost sensitive one-against-all multiclass learning

  • wap (integer) – Use weighted all pairs multiclass learning

  • probabilities (float) – predict probabilities of all classes

  • nn (integer) – Use a sigmoidal feed-forward neural network with N hidden units

  • inpass (bool) – Train or test sigmoidal feed-forward network with input pass-through

  • multitask (bool) – Share hidden layer across all reduced tasks

  • dropout (bool) – Train or test sigmoidal feed-forward network using dropout

  • meanfield (bool) – Train or test sigmoidal feed-forward network using mean field

  • conjugate_gradient (bool) – use conjugate gradient based optimization

  • bfgs (bool) – use bfgs updates

  • hessian_on (bool) – use second derivative in line search

  • mem (int) – memory in bfgs

  • termination (float) – termination threshold

  • lda (int) – Run lda with <int> topics

  • lda_alpha (float) – Prior on sparsity of per-document topic weights

  • lda_rho (float) – Prior on sparsity of topic distributions

  • lda_D (int) – Number of documents

  • lda_epsilon (float) – Loop convergence threshold

  • minibatch (int) – Minibatch size for LDA

  • svrg (bool) – Streaming Stochastic Variance Reduced Gradient

  • stage_size (int) – Number of passes per SVRG stage

  • ftrl (bool) – Run Follow the Proximal Regularized Leader

  • coin (bool) – Coin betting optimizer

  • pistol (bool) – PiSTOL - Parameter free STOchastic Learning

  • ftrl_alpha (float) – Alpha parameter for FTRL optimization

  • ftrl_beta (float) – Beta parameters for FTRL optimization

  • ksvm (bool) – kernel svm

  • kernel (str) – type of kernel (rbf or linear (default))

  • bandwidth (int) – bandwidth of rbf kernel

  • degree (int) – degree of poly kernel

  • sgd (bool) – use regular stochastic gradient descent update

  • adaptive (bool) – use adaptive, individual learning rates

  • adax (bool) – use adaptive learning rates with x^2 instead of g^2x^2

  • invariant (bool) – use safe/importance aware updates

  • normalized (bool) – use per feature normalized updates

  • link (str) – Specify the link function - identity, logistic, glf1 or poisson

  • stage_poly (bool) – use stagewise polynomial feature learning

  • sched_exponent (int) – exponent controlling quantity of included features

  • batch_sz (int) – multiplier on batch size before including more features

  • batch_sz_no_doubling (bool) – batch_sz does not double

  • lrq (bool) – use low rank quadratic features

  • lrqdropout (bool) – use dropout training for low rank quadratic features

  • lrqfa (bool) – use low rank quadratic features with field aware weights

  • data (str) – path to data file for fitting external to sklearn

  • d (str) – path to data file for fitting external to sklearn

  • cache (str) – use a cache. default is <data>.cache

  • c (str) – use a cache. default is <data>.cache

  • cache_file (str) – path to cache file to use

  • json (bool) – enable JSON parsing

  • kill_cache (bool) – do not reuse existing cache file, create a new one always

  • k (bool) – do not reuse existing cache file, create a new one always

convert_labels: bool = True#

Convert labels of the form [0,1] to [-1,1]

convert_to_vw: bool = True#

flag to convert X input to vw format

fit(X=None, y=None, sample_weight=None)#

Fit the model according to the given training data

Todo

For first pass create and store example objects. For N-1 passes use example objects directly (simulate cache file…but in memory for faster processing)

Parameters
  • X – {array-like, sparse matrix}, shape (n_samples, n_features) (or (n_samples,) if not convert_to_vw). Training vector, where n_samples is the number of samples and n_features is the number of features. If not using convert_to_vw, X is expected to be a list of vw formatted feature vector strings with labels

  • y – array-like, shape (n_samples,), optional if not convert_to_vw Target vector relative to X.

  • sample_weight – array-like, shape (n_samples,) sample weight vector relative to X.

Returns

self

get_coefs()#

Returns coefficient weights as ordered sparse matrix

Returns

coefficient weights for model

Return type

sparse matrix

get_intercept()#

Returns intercept weight for model

Returns

intercept value. 0 if no constant

Return type

int

get_params(deep=True)#

This returns the full set of vw and estimator parameters currently in use

get_vw()#

Get the vw instance

Returns

instance

Return type

vowpalwabbit.Workspace

load(filename)#

Load model from file

predict(X)#

Predict with Vowpal Wabbit model

Parameters

X ({array-like, sparse matrix}, shape (n_samples, n_features or 1)) – Training vector, where n_samples is the number of samples and n_features is the number of features. If not using convert_to_vw, X is expected to be a list of vw formatted feature vector strings with labels

Returns

  1. Output vector relative to X.

Return type

array-like, shape (n_samples, 1 or n_classes)

save(filename)#

Save model to file

set_coefs(coefs)#

Sets coefficients weights from ordered sparse matrix

Parameters

coefs (sparse matrix) – coefficient weights for model

set_params(**kwargs)#

This destroys and recreates the Vowpal Wabbit model with updated parameters any parameters not provided will remain as they are currently

vw_: vowpalwabbit.pyvw.Workspace = None#
class vowpalwabbit.sklearn.VWClassifier(loss_function='logistic', **kwargs)#

Bases: VW, LinearClassifierMixin

Vowpal Wabbit Classifier model for binary classification Use VWMultiClassifier for multiclass classification Note - We are assuming the VW.predict returns logits, applying link=logistic will break this assumption

__init__(loss_function='logistic', **kwargs)#

VW model constructor, exposing all supported parameters to keep sklearn happy

Parameters
  • convert_to_vw (bool) – flag to convert X input to vw format

  • convert_labels (bool) – Convert labels of the form [0,1] to [-1,1]

  • ring_size (int) – size of example ring

  • strict_parse (bool) – throw on malformed examples

  • learning_rate (float) – Set learning rate

  • l (float) – Set learning rate

  • power_t (float) – t power value

  • decay_learning_rate (float) – Set Decay factor for learning_rate between passes

  • initial_t (float) – initial t value

  • feature_mask (str) – Use existing regressor to determine which parameters may be updated. If no initial_regressor given, also used for initial weights.

  • initial_regressor (str) – Initial regressor(s)

  • i (str) – Initial regressor(s)

  • initial_weight (float) – Set all weights to an initial value of arg.

  • random_weights (bool) – make initial weights random

  • normal_weights (bool) – make initial weights normal

  • truncated_normal_weights (bool) – make initial weights truncated normal

  • sparse_weights (float) – Use a sparse data structure for weights

  • input_feature_regularizer (str) – Per feature regularization input file

  • quiet (bool) – Don’t output diagnostics and progress updates

  • random_seed (integer) – seed random number generator

  • hash (str) – how to hash the features; one of ‘strings’ or ‘all’

  • hash_seed (int) – seed for hash function

  • ignore (str) – ignore namespaces beginning with character <arg>

  • ignore_linear (str) – ignore namespaces beginning with character <arg> for linear terms only

  • keep (str) – keep namespaces beginning with character <arg>

  • redefine (str) – Redefine namespaces beginning with characters of string S as namespace N. <arg> shall be in form ‘N:=S’ where := is operator. Empty N or S are treated as default namespace. Use ‘:’ as a wildcard in S.

  • bit_precision (integer) – number of bits in the feature table

  • b (integer) – number of bits in the feature table

  • noconstant (bool) – Don’t add a constant feature

  • constant (float) – Set initial value of constant

  • C (float) – Set initial value of constant

  • ngram (str) – Generate N grams. To generate N grams for a single namespace ‘foo’, arg should be fN.

  • skips (str) – Generate skips in N grams. This in conjunction with the ngram tag can be used to generate generalized n-skip-k-gram. To generate n-skips for a single namespace ‘foo’, arg should be fN.

  • feature_limit (str) – limit to N features. To apply to a single namespace ‘foo’, arg should be fN

  • affix (str) – generate prefixes/suffixes of features; argument ‘+2a,-3b,+1’ means generate 2-char prefixes for namespace a, 3-char suffixes for b and 1 char prefixes for default namespace

  • spelling (str) – compute spelling features for a given namespace (use ‘_’ for default namespace)

  • dictionary (str) – read a dictionary for additional features (arg either ‘x:file’ or just ‘file’)

  • dictionary_path (str) – look in this directory for dictionaries; defaults to current directory or env{PATH}

  • interactions (str) – Create feature interactions of any level between namespaces.

  • permutations (bool) – Use permutations instead of combinations for feature interactions of same namespace.

  • leave_duplicate_interactions (bool) – Don’t remove interactions with duplicate combinations of namespaces. For ex. this is a duplicate: ‘-q ab -q ba’ and a lot more in ‘-q ::’.

  • quadratic (str) – Create and use quadratic features, q:: corresponds to a wildcard for all printable characters

  • q (str) – Create and use quadratic features, q:: corresponds to a wildcard for all printable characters

  • cubic (str) – Create and use cubic features

  • testonly (bool) – Ignore label information and just test

  • t (bool) – Ignore label information and just test

  • holdout_off (bool) – no holdout data in multiple passes

  • holdout_period (int) – holdout period for test only

  • holdout_after (int) – holdout after n training examples

  • early_terminate (int) – Specify the number of passes tolerated when holdout loss doesn’t decrease before early termination

  • passes (int) – Number of Training Passes

  • initial_pass_length (int) – initial number of examples per pass

  • examples (int) – number of examples to parse

  • min_prediction (float) – Smallest prediction to output

  • max_prediction (float) – Largest prediction to output

  • sort_features (bool) – turn this on to disregard order in which features have been defined. This will lead to smaller cache sizes

  • loss_function (str) – Specify the loss function to be used; defaults to ‘squared’. Currently available ones are squared, classic, hinge, logistic and quantile.

  • quantile_tau (float) – Parameter tau associated with Quantile loss. Defaults to 0.5

  • l1 (float) – l_1 lambda (L1 regularization)

  • l2 (float) – l_2 lambda (L2 regularization)

  • no_bias_regularization (bool) – no bias in regularization

  • named_labels (str) – use names for labels (multiclass, etc.) rather than integers, argument specified all possible labels, comma-sep, eg “–named_labels Noun,Verb,Adj,Punc”

  • final_regressor (str) – Final regressor

  • f (str) – Final regressor

  • readable_model (str) – Output human-readable final regressor with numeric features

  • invert_hash (str) – Output human-readable final regressor with feature names. Computationally expensive.

  • save_resume (bool) – save extra state so learning can be resumed later with new data

  • preserve_performance_counters (bool) – do not reset performance counters when warmstarting

  • output_feature_regularizer_binary (str) – Per feature regularization output file

  • output_feature_regularizer_text (str) – Per feature regularization output file, in text

  • oaa (integer) – Use one-against-all multiclass learning with labels

  • oaa_subsample (int) – subsample this number of negative examples when learning

  • ect (integer) – Use error correcting tournament multiclass learning

  • csoaa (integer) – Use cost sensitive one-against-all multiclass learning

  • wap (integer) – Use weighted all pairs multiclass learning

  • probabilities (float) – predict probabilities of all classes

  • nn (integer) – Use a sigmoidal feed-forward neural network with N hidden units

  • inpass (bool) – Train or test sigmoidal feed-forward network with input pass-through

  • multitask (bool) – Share hidden layer across all reduced tasks

  • dropout (bool) – Train or test sigmoidal feed-forward network using dropout

  • meanfield (bool) – Train or test sigmoidal feed-forward network using mean field

  • conjugate_gradient (bool) – use conjugate gradient based optimization

  • bfgs (bool) – use bfgs updates

  • hessian_on (bool) – use second derivative in line search

  • mem (int) – memory in bfgs

  • termination (float) – termination threshold

  • lda (int) – Run lda with <int> topics

  • lda_alpha (float) – Prior on sparsity of per-document topic weights

  • lda_rho (float) – Prior on sparsity of topic distributions

  • lda_D (int) – Number of documents

  • lda_epsilon (float) – Loop convergence threshold

  • minibatch (int) – Minibatch size for LDA

  • svrg (bool) – Streaming Stochastic Variance Reduced Gradient

  • stage_size (int) – Number of passes per SVRG stage

  • ftrl (bool) – Run Follow the Proximal Regularized Leader

  • coin (bool) – Coin betting optimizer

  • pistol (bool) – PiSTOL - Parameter free STOchastic Learning

  • ftrl_alpha (float) – Alpha parameter for FTRL optimization

  • ftrl_beta (float) – Beta parameters for FTRL optimization

  • ksvm (bool) – kernel svm

  • kernel (str) – type of kernel (rbf or linear (default))

  • bandwidth (int) – bandwidth of rbf kernel

  • degree (int) – degree of poly kernel

  • sgd (bool) – use regular stochastic gradient descent update

  • adaptive (bool) – use adaptive, individual learning rates

  • adax (bool) – use adaptive learning rates with x^2 instead of g^2x^2

  • invariant (bool) – use safe/importance aware updates

  • normalized (bool) – use per feature normalized updates

  • link (str) – Specify the link function - identity, logistic, glf1 or poisson

  • stage_poly (bool) – use stagewise polynomial feature learning

  • sched_exponent (int) – exponent controlling quantity of included features

  • batch_sz (int) – multiplier on batch size before including more features

  • batch_sz_no_doubling (bool) – batch_sz does not double

  • lrq (bool) – use low rank quadratic features

  • lrqdropout (bool) – use dropout training for low rank quadratic features

  • lrqfa (bool) – use low rank quadratic features with field aware weights

  • data (str) – path to data file for fitting external to sklearn

  • d (str) – path to data file for fitting external to sklearn

  • cache (str) – use a cache. default is <data>.cache

  • c (str) – use a cache. default is <data>.cache

  • cache_file (str) – path to cache file to use

  • json (bool) – enable JSON parsing

  • kill_cache (bool) – do not reuse existing cache file, create a new one always

  • k (bool) – do not reuse existing cache file, create a new one always

classes_ = array([-1.,  1.])#

Binary class labels

coef_ = None#

Empty sparse matrix used the check if model has been fit

decision_function(X)#

Predict confidence scores for samples. The confidence score for a sample is the signed distance of that sample to the hyperplane.

Parameters

X – array_like or sparse matrix, shape (n_samples, n_features) Samples.

Returns

array, shape=(n_samples,) if n_classes == 2 else (n_samples, n_classes)

Confidence scores per (sample, class) combination. In the binary case, confidence score for self.classes_[1] where >0 means this class would be predicted.

fit(X=None, y=None, sample_weight=None)#

Fit the model according to the given training data.

Parameters
  • X – {array-like, sparse matrix} of shape (n_samples, n_features) Training vector, where n_samples is the number of samples and n_features is the number of features.

  • y – array-like of shape (n_samples,) Target vector relative to X.

  • sample_weight – array-like of shape (n_samples,) default=None Array of weights that are assigned to individual samples. If not provided, then each sample is given unit weight.

Returns

self

predict(X)#

Predict class labels for samples in X.

Parameters

X – array_like or sparse matrix, shape (n_samples, n_features) Samples.

Returns

  1. Predicted class label per sample.

Return type

array, shape [n_samples]

predict_proba(X)#

Predict probabilities for samples

Parameters

X – {array-like, sparse matrix}, shape = (n_samples, n_features) Samples.

Returns

  1. Returns the probability of the sample for each class in the model,

    where classes are ordered as they are in self.classes_.

Return type

array-like of shape (n_samples, n_classes)

class vowpalwabbit.sklearn.VWMultiClassifier(probabilities=True, **kwargs)#

Bases: VWClassifier

Vowpal Wabbit MultiClassifier model. Note: we assume that VW.predict returns probabilities; setting probabilities=False will break this assumption

__init__(probabilities=True, **kwargs)#

VW model constructor, exposing all supported parameters to keep sklearn happy

Parameters
  • convert_to_vw (bool) – flag to convert X input to vw format

  • convert_labels (bool) – Convert labels of the form [0,1] to [-1,1]

  • ring_size (int) – size of example ring

  • strict_parse (bool) – throw on malformed examples

  • learning_rate (float) – Set learning rate

  • l (float) – Set learning rate

  • power_t (float) – t power value

  • decay_learning_rate (float) – Set Decay factor for learning_rate between passes

  • initial_t (float) – initial t value

  • feature_mask (str) – Use existing regressor to determine which parameters may be updated. If no initial_regressor given, also used for initial weights.

  • initial_regressor (str) – Initial regressor(s)

  • i (str) – Initial regressor(s)

  • initial_weight (float) – Set all weights to an initial value of arg.

  • random_weights (bool) – make initial weights random

  • normal_weights (bool) – make initial weights normal

  • truncated_normal_weights (bool) – make initial weights truncated normal

  • sparse_weights (float) – Use a sparse datastructure for weights

  • input_feature_regularizer (str) – Per feature regularization input file

  • quiet (bool) – Don’t output diagnostics and progress updates

  • random_seed (integer) – seed random number generator

  • hash (str) – how to hash the feature names; available options: strings, all

  • hash_seed (int) – seed for hash function

  • ignore (str) – ignore namespaces beginning with character <arg>

  • ignore_linear (str) – ignore namespaces beginning with character <arg> for linear terms only

  • keep (str) – keep namespaces beginning with character <arg>

  • redefine (str) – Redefine namespaces beginning with characters of string S as namespace N. <arg> shall be in form ‘N:=S’ where := is operator. Empty N or S are treated as default namespace. Use ‘:’ as a wildcard in S.

  • bit_precision (integer) – number of bits in the feature table

  • b (integer) – number of bits in the feature table

  • noconstant (bool) – Don’t add a constant feature

  • constant (float) – Set initial value of constant

  • C (float) – Set initial value of constant

  • ngram (str) – Generate N grams. To generate N grams for a single namespace ‘foo’, arg should be fN.

  • skips (str) – Generate skips in N grams. This in conjunction with the ngram tag can be used to generate generalized n-skip-k-gram. To generate n-skips for a single namespace ‘foo’, arg should be fN.

  • feature_limit (str) – limit to N features. To apply to a single namespace ‘foo’, arg should be fN

  • affix (str) – generate prefixes/suffixes of features; argument ‘+2a,-3b,+1’ means generate 2-char prefixes for namespace a, 3-char suffixes for b and 1 char prefixes for default namespace

  • spelling (str) – compute spelling features for a given namespace (use ‘_’ for default namespace)

  • dictionary (str) – read a dictionary for additional features (arg either ‘x:file’ or just ‘file’)

  • dictionary_path (str) – look in this directory for dictionaries; defaults to current directory or env{PATH}

  • interactions (str) – Create feature interactions of any level between namespaces.

  • permutations (bool) – Use permutations instead of combinations for feature interactions of same namespace.

  • leave_duplicate_interactions (bool) – Don’t remove interactions with duplicate combinations of namespaces. For ex. this is a duplicate: ‘-q ab -q ba’ and a lot more in ‘-q ::’.

  • quadratic (str) – Create and use quadratic features, q:: corresponds to a wildcard for all printable characters

  • q (str) – Create and use quadratic features, q:: corresponds to a wildcard for all printable characters

  • cubic (str) – Create and use cubic features

  • testonly (bool) – Ignore label information and just test

  • t (bool) – Ignore label information and just test

  • holdout_off (bool) – no holdout data in multiple passes

  • holdout_period (int) – holdout period for test only

  • holdout_after (int) – holdout after n training examples

  • early_terminate (int) – Specify the number of passes tolerated when holdout loss doesn’t decrease before early termination

  • passes (int) – Number of Training Passes

  • initial_pass_length (int) – initial number of examples per pass

  • examples (int) – number of examples to parse

  • min_prediction (float) – Smallest prediction to output

  • max_prediction (float) – Largest prediction to output

  • sort_features (bool) – turn this on to disregard order in which features have been defined. This will lead to smaller cache sizes

  • loss_function (str) – Specify the loss function to be used; uses squared by default. Currently available ones are squared, classic, hinge, logistic and quantile.

  • quantile_tau (float) – Parameter tau associated with Quantile loss. Defaults to 0.5

  • l1 (float) – l_1 lambda (L1 regularization)

  • l2 (float) – l_2 lambda (L2 regularization)

  • no_bias_regularization (bool) – no bias in regularization

  • named_labels (str) – use names for labels (multiclass, etc.) rather than integers, argument specified all possible labels, comma-sep, eg “--named_labels Noun,Verb,Adj,Punc”

  • final_regressor (str) – Final regressor

  • f (str) – Final regressor

  • readable_model (str) – Output human-readable final regressor with numeric features

  • invert_hash (str) – Output human-readable final regressor with feature names. Computationally expensive.

  • save_resume (bool) – save extra state so learning can be resumed later with new data

  • preserve_performance_counters (bool) – do not reset performance counters when warmstarting

  • output_feature_regularizer_binary (str) – Per feature regularization output file

  • output_feature_regularizer_text (str) – Per feature regularization output file, in text

  • oaa (integer) – Use one-against-all multiclass learning with labels

  • oaa_subsample (int) – subsample this number of negative examples when learning

  • ect (integer) – Use error correcting tournament multiclass learning

  • csoaa (integer) – Use cost sensitive one-against-all multiclass learning

  • wap (integer) – Use weighted all pairs multiclass learning

  • probabilities (bool) – predict probabilities of all classes

  • nn (integer) – Use a sigmoidal feed-forward neural network with N hidden units

  • inpass (bool) – Train or test sigmoidal feed-forward network with input pass-through

  • multitask (bool) – Share hidden layer across all reduced tasks

  • dropout (bool) – Train or test sigmoidal feed-forward network using dropout

  • meanfield (bool) – Train or test sigmoidal feed-forward network using mean field

  • conjugate_gradient (bool) – use conjugate gradient based optimization

  • bfgs (bool) – use bfgs updates

  • hessian_on (bool) – use second derivative in line search

  • mem (int) – memory in bfgs

  • termination (float) – termination threshold

  • lda (int) – Run lda with <int> topics

  • lda_alpha (float) – Prior on sparsity of per-document topic weights

  • lda_rho (float) – Prior on sparsity of topic distributions

  • lda_D (int) – Number of documents

  • lda_epsilon (float) – Loop convergence threshold

  • minibatch (int) – Minibatch size for LDA

  • svrg (bool) – Streaming Stochastic Variance Reduced Gradient

  • stage_size (int) – Number of passes per SVRG stage

  • ftrl (bool) – Run Follow the Proximal Regularized Leader

  • coin (bool) – Coin betting optimizer

  • pistol (bool) – PiSTOL - Parameter free STOchastic Learning

  • ftrl_alpha (float) – Alpha parameter for FTRL optimization

  • ftrl_beta (float) – Beta parameters for FTRL optimization

  • ksvm (bool) – kernel svm

  • kernel (str) – type of kernel (rbf or linear (default))

  • bandwidth (int) – bandwidth of rbf kernel

  • degree (int) – degree of poly kernel

  • sgd (bool) – use regular stochastic gradient descent update

  • adaptive (bool) – use adaptive, individual learning rates

  • adax (bool) – use adaptive learning rates with x^2 instead of g^2x^2

  • invariant (bool) – use safe/importance aware updates

  • normalized (bool) – use per feature normalized updates

  • link (str) – Specify the link function - identity, logistic, glf1 or poisson

  • stage_poly (bool) – use stagewise polynomial feature learning

  • sched_exponent (int) – exponent controlling quantity of included features

  • batch_sz (int) – multiplier on batch size before including more features

  • batch_sz_no_doubling (bool) – batch_sz does not double

  • lrq (bool) – use low rank quadratic features

  • lrqdropout (bool) – use dropout training for low rank quadratic features

  • lrqfa (bool) – use low rank quadratic features with field aware weights

  • data (str) – path to data file for fitting external to sklearn

  • d (str) – path to data file for fitting external to sklearn

  • cache (str) – use a cache. default is <data>.cache

  • c (str) – use a cache. default is <data>.cache

  • cache_file (str) – path to cache file to use

  • json (bool) – enable JSON parsing

  • kill_cache (bool) – do not reuse existing cache file, create a new one always

  • k (bool) – do not reuse existing cache file, create a new one always

classes_ = None#

Class labels

decision_function(X)#

Predict confidence scores for samples. The confidence score for a sample is the signed distance of that sample to the hyperplane.

Parameters

X – array_like or sparse matrix, shape (n_samples, n_features) Samples.

Returns

Confidence scores per (sample, class) combination.

Return type

array, shape=(n_samples, n_classes)

estimator_ = None#

type of estimator to use [csoaa, ect, oaa, wap] and number of classes

fit(X=None, y=None, sample_weight=None)#

Fit the model according to the given training data.

Parameters
  • X – {array-like, sparse matrix} of shape (n_samples, n_features) Training vector, where n_samples is the number of samples and n_features is the number of features.

  • y – array-like of shape (n_samples,) Target vector relative to X.

  • sample_weight – array-like of shape (n_samples,) default=None Array of weights that are assigned to individual samples. If not provided, then each sample is given unit weight.

Returns

self

predict_proba(X)#

Predict probabilities for each class.

Parameters

X – {array-like, sparse matrix}, shape = (n_samples, n_features) Samples.

Returns

array, shape=(n_samples, n_classes)

Probability of the sample for each class in the model, where classes are ordered as they are in self.classes_.

Examples

>>> import numpy as np
>>> X = np.array([ [10, 10], [8, 10], [-5, 5.5], [-5.4, 5.5], [-20, -20],  [-15, -20] ])
>>> y = np.array([1, 1, 2, 2, 3, 3])
>>> from vowpalwabbit.sklearn import VWMultiClassifier
>>> model = VWMultiClassifier(oaa=3, loss_function='logistic')
>>> _ = model.fit(X, y)
>>> model.predict_proba(X)
array([[0.38928846, 0.30534211, 0.30536944],
       [0.40664235, 0.29666999, 0.29668769],
       [0.52324486, 0.23841164, 0.23834346],
       [0.5268591 , 0.23660533, 0.23653553],
       [0.65397811, 0.17312808, 0.17289382],
       [0.61190444, 0.19416356, 0.19393198]])
class vowpalwabbit.sklearn.VWRegressor(convert_labels=False, **kwargs)#

Bases: VW, RegressorMixin

Vowpal Wabbit Regressor model

__init__(convert_labels=False, **kwargs)#

VW model constructor, exposing all supported parameters to keep sklearn happy

Parameters
  • convert_to_vw (bool) – flag to convert X input to vw format

  • convert_labels (bool) – Convert labels of the form [0,1] to [-1,1]

  • ring_size (int) – size of example ring

  • strict_parse (bool) – throw on malformed examples

  • learning_rate (float) – Set learning rate

  • l (float) – Set learning rate

  • power_t (float) – t power value

  • decay_learning_rate (float) – Set Decay factor for learning_rate between passes

  • initial_t (float) – initial t value

  • feature_mask (str) – Use existing regressor to determine which parameters may be updated. If no initial_regressor given, also used for initial weights.

  • initial_regressor (str) – Initial regressor(s)

  • i (str) – Initial regressor(s)

  • initial_weight (float) – Set all weights to an initial value of arg.

  • random_weights (bool) – make initial weights random

  • normal_weights (bool) – make initial weights normal

  • truncated_normal_weights (bool) – make initial weights truncated normal

  • sparse_weights (float) – Use a sparse datastructure for weights

  • input_feature_regularizer (str) – Per feature regularization input file

  • quiet (bool) – Don’t output diagnostics and progress updates

  • random_seed (integer) – seed random number generator

  • hash (str) – how to hash the feature names; available options: strings, all

  • hash_seed (int) – seed for hash function

  • ignore (str) – ignore namespaces beginning with character <arg>

  • ignore_linear (str) – ignore namespaces beginning with character <arg> for linear terms only

  • keep (str) – keep namespaces beginning with character <arg>

  • redefine (str) – Redefine namespaces beginning with characters of string S as namespace N. <arg> shall be in form ‘N:=S’ where := is operator. Empty N or S are treated as default namespace. Use ‘:’ as a wildcard in S.

  • bit_precision (integer) – number of bits in the feature table

  • b (integer) – number of bits in the feature table

  • noconstant (bool) – Don’t add a constant feature

  • constant (float) – Set initial value of constant

  • C (float) – Set initial value of constant

  • ngram (str) – Generate N grams. To generate N grams for a single namespace ‘foo’, arg should be fN.

  • skips (str) – Generate skips in N grams. This in conjunction with the ngram tag can be used to generate generalized n-skip-k-gram. To generate n-skips for a single namespace ‘foo’, arg should be fN.

  • feature_limit (str) – limit to N features. To apply to a single namespace ‘foo’, arg should be fN

  • affix (str) – generate prefixes/suffixes of features; argument ‘+2a,-3b,+1’ means generate 2-char prefixes for namespace a, 3-char suffixes for b and 1 char prefixes for default namespace

  • spelling (str) – compute spelling features for a given namespace (use ‘_’ for default namespace)

  • dictionary (str) – read a dictionary for additional features (arg either ‘x:file’ or just ‘file’)

  • dictionary_path (str) – look in this directory for dictionaries; defaults to current directory or env{PATH}

  • interactions (str) – Create feature interactions of any level between namespaces.

  • permutations (bool) – Use permutations instead of combinations for feature interactions of same namespace.

  • leave_duplicate_interactions (bool) – Don’t remove interactions with duplicate combinations of namespaces. For ex. this is a duplicate: ‘-q ab -q ba’ and a lot more in ‘-q ::’.

  • quadratic (str) – Create and use quadratic features, q:: corresponds to a wildcard for all printable characters

  • q (str) – Create and use quadratic features, q:: corresponds to a wildcard for all printable characters

  • cubic (str) – Create and use cubic features

  • testonly (bool) – Ignore label information and just test

  • t (bool) – Ignore label information and just test

  • holdout_off (bool) – no holdout data in multiple passes

  • holdout_period (int) – holdout period for test only

  • holdout_after (int) – holdout after n training examples

  • early_terminate (int) – Specify the number of passes tolerated when holdout loss doesn’t decrease before early termination

  • passes (int) – Number of Training Passes

  • initial_pass_length (int) – initial number of examples per pass

  • examples (int) – number of examples to parse

  • min_prediction (float) – Smallest prediction to output

  • max_prediction (float) – Largest prediction to output

  • sort_features (bool) – turn this on to disregard order in which features have been defined. This will lead to smaller cache sizes

  • loss_function (str) – Specify the loss function to be used; uses squared by default. Currently available ones are squared, classic, hinge, logistic and quantile.

  • quantile_tau (float) – Parameter tau associated with Quantile loss. Defaults to 0.5

  • l1 (float) – l_1 lambda (L1 regularization)

  • l2 (float) – l_2 lambda (L2 regularization)

  • no_bias_regularization (bool) – no bias in regularization

  • named_labels (str) – use names for labels (multiclass, etc.) rather than integers, argument specified all possible labels, comma-sep, eg “--named_labels Noun,Verb,Adj,Punc”

  • final_regressor (str) – Final regressor

  • f (str) – Final regressor

  • readable_model (str) – Output human-readable final regressor with numeric features

  • invert_hash (str) – Output human-readable final regressor with feature names. Computationally expensive.

  • save_resume (bool) – save extra state so learning can be resumed later with new data

  • preserve_performance_counters (bool) – do not reset performance counters when warmstarting

  • output_feature_regularizer_binary (str) – Per feature regularization output file

  • output_feature_regularizer_text (str) – Per feature regularization output file, in text

  • oaa (integer) – Use one-against-all multiclass learning with labels

  • oaa_subsample (int) – subsample this number of negative examples when learning

  • ect (integer) – Use error correcting tournament multiclass learning

  • csoaa (integer) – Use cost sensitive one-against-all multiclass learning

  • wap (integer) – Use weighted all pairs multiclass learning

  • probabilities (bool) – predict probabilities of all classes

  • nn (integer) – Use a sigmoidal feed-forward neural network with N hidden units

  • inpass (bool) – Train or test sigmoidal feed-forward network with input pass-through

  • multitask (bool) – Share hidden layer across all reduced tasks

  • dropout (bool) – Train or test sigmoidal feed-forward network using dropout

  • meanfield (bool) – Train or test sigmoidal feed-forward network using mean field

  • conjugate_gradient (bool) – use conjugate gradient based optimization

  • bfgs (bool) – use bfgs updates

  • hessian_on (bool) – use second derivative in line search

  • mem (int) – memory in bfgs

  • termination (float) – termination threshold

  • lda (int) – Run lda with <int> topics

  • lda_alpha (float) – Prior on sparsity of per-document topic weights

  • lda_rho (float) – Prior on sparsity of topic distributions

  • lda_D (int) – Number of documents

  • lda_epsilon (float) – Loop convergence threshold

  • minibatch (int) – Minibatch size for LDA

  • svrg (bool) – Streaming Stochastic Variance Reduced Gradient

  • stage_size (int) – Number of passes per SVRG stage

  • ftrl (bool) – Run Follow the Proximal Regularized Leader

  • coin (bool) – Coin betting optimizer

  • pistol (bool) – PiSTOL - Parameter free STOchastic Learning

  • ftrl_alpha (float) – Alpha parameter for FTRL optimization

  • ftrl_beta (float) – Beta parameters for FTRL optimization

  • ksvm (bool) – kernel svm

  • kernel (str) – type of kernel (rbf or linear (default))

  • bandwidth (int) – bandwidth of rbf kernel

  • degree (int) – degree of poly kernel

  • sgd (bool) – use regular stochastic gradient descent update

  • adaptive (bool) – use adaptive, individual learning rates

  • adax (bool) – use adaptive learning rates with x^2 instead of g^2x^2

  • invariant (bool) – use safe/importance aware updates

  • normalized (bool) – use per feature normalized updates

  • link (str) – Specify the link function - identity, logistic, glf1 or poisson

  • stage_poly (bool) – use stagewise polynomial feature learning

  • sched_exponent (int) – exponent controlling quantity of included features

  • batch_sz (int) – multiplier on batch size before including more features

  • batch_sz_no_doubling (bool) – batch_sz does not double

  • lrq (bool) – use low rank quadratic features

  • lrqdropout (bool) – use dropout training for low rank quadratic features

  • lrqfa (bool) – use low rank quadratic features with field aware weights

  • data (str) – path to data file for fitting external to sklearn

  • d (str) – path to data file for fitting external to sklearn

  • cache (str) – use a cache. default is <data>.cache

  • c (str) – use a cache. default is <data>.cache

  • cache_file (str) – path to cache file to use

  • json (bool) – enable JSON parsing

  • kill_cache (bool) – do not reuse existing cache file, create a new one always

  • k (bool) – do not reuse existing cache file, create a new one always

vowpalwabbit.sklearn.tovw(x, y=None, sample_weight=None, convert_labels=False)#

Convert array or sparse matrix to Vowpal Wabbit format

Parameters
  • x – {array-like, sparse matrix}, shape (n_samples, n_features) Training vector, where n_samples is the number of samples and n_features is the number of features.

  • y – {array-like}, shape (n_samples,), optional Target vector relative to X.

  • sample_weight – {array-like}, shape (n_samples,), optional sample weight vector relative to X.

  • convert_labels – {bool} convert labels of the form [0,1] to [-1,1]

Returns

{array-like}, shape (n_samples, 1)

Training vectors in VW string format

Examples

>>> import pandas as pd
>>> from sklearn.feature_extraction.text import HashingVectorizer
>>> from vowpalwabbit.sklearn import tovw
>>> X = pd.Series(['cat', 'dog', 'cat', 'cat'], name='catdog')
>>> y = pd.Series([-1, 1, -1, -1], name='label')
>>> hv = HashingVectorizer()
>>> hashed = hv.fit_transform(X)
>>> tovw(x=hashed, y=y)
['-1 1 | 300839:1', '1 1 | 980517:-1', '-1 1 | 300839:1', '-1 1 | 300839:1']