# Source code for speedml.model
"""
Speedml Model component with methods that work with the sklearn model workflow. Contact author https://twitter.com/manavsehgal. Code, docs and demos https://speedml.com.
"""
from __future__ import (absolute_import, division,
print_function, unicode_literals)
from builtins import *
from speedml.base import Base
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
class Model(Base):
    """
    Model component: prepares model inputs from the ``Base`` datasets and ranks a fixed set of classifiers by accuracy.

    Inputs and results are exchanged through attributes set on ``Base`` (``train_X``, ``train_y``, ``test_X``, ``model_ranking``).
    """

    def data(self):
        """
        Prepare model input data ``Base.train_y`` as Series, ``Base.train_X``, and ``Base.test_X`` datasets as Matrix.

        :return: message string reporting the shapes of ``train_X``, ``train_y`` and ``test_X``.
        """
        Base.train_y = Base.train[Base.target]
        # ``.values`` replaces ``as_matrix()``, which newer pandas releases no longer provide.
        Base.train_X = Base.train.drop([Base.target], axis=1).values
        Base.test_X = Base.test.values

        message = 'train_X: {} train_y: {} test_X: {}'
        return message.format(Base.train_X.shape,
                              Base.train_y.shape,
                              Base.test_X.shape)

    def evaluate(self, n_splits=10, test_size=0.1, random_state=0):
        """
        Model evaluation across multiple classifiers based on accuracy of predictions.

        Every classifier is fitted on each stratified shuffle split of ``Base.train_X`` / ``Base.train_y`` and scored on the held-out part of that split. The mean accuracy per classifier is stored in ``Base.model_ranking`` (columns ``Classifier`` and ``Accuracy``), sorted by descending accuracy.

        :param n_splits: number of stratified shuffle splits the accuracy is averaged over.
        :param test_size: ``test_size`` passed to ``StratifiedShuffleSplit``.
        :param random_state: ``random_state`` passed to ``StratifiedShuffleSplit``.
        """
        classifiers = [
            xgb.XGBClassifier(**Base.xgb_params),
            KNeighborsClassifier(3),
            SVC(probability=True),
            DecisionTreeClassifier(),
            RandomForestClassifier(),
            AdaBoostClassifier(),
            GradientBoostingClassifier(),
            GaussianNB(),
            LogisticRegression()]
        names = [clf.__class__.__name__ for clf in classifiers]
        log_cols = ["Classifier", "Accuracy"]

        sss = StratifiedShuffleSplit(n_splits=n_splits,
                                     test_size=test_size,
                                     random_state=random_state)

        X = Base.train_X
        # The splitter yields positional indices. Indexing the target Series
        # directly would treat them as index labels, so use a plain array.
        y = np.asarray(Base.train_y)

        acc_dict = dict.fromkeys(names, 0.0)
        for train_index, test_index in sss.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            for name, clf in zip(names, classifiers):
                clf.fit(X_train, y_train)
                test_predictions = clf.predict(X_test)
                acc_dict[name] += accuracy_score(y_test, test_predictions)

        # Average over the number of splits actually run, and build the
        # ranking frame in one step instead of appending row by row.
        rows = [[name, acc_dict[name] / float(n_splits)] for name in names]
        ranking = pd.DataFrame(rows, columns=log_cols)
        ranking = ranking.sort_values(by='Accuracy', ascending=False)
        Base.model_ranking = ranking.reset_index(drop=True)

    def ranks(self):
        """
        Returns DataFrame of model ranking sorted by Accuracy.

        Also stores the accuracy recorded for ``XGBClassifier`` in ``self.xgb_accuracy``.
        """
        ranking = Base.model_ranking
        is_xgb = ranking['Classifier'] == 'XGBClassifier'
        # Take the first matching row by position so the lookup does not
        # depend on which index labels the ranking frame carries.
        self.xgb_accuracy = ranking.loc[is_xgb, 'Accuracy'].iloc[0]
        return ranking.sort_values(by='Accuracy', ascending=False)