# Source code for speedml.xgb

"""
Speedml Xgb component with methods that work on XGBoost model workflow. Contact author https://twitter.com/manavsehgal. Code, docs and demos https://speedml.com.
"""

from __future__ import (absolute_import, division,
                        print_function, unicode_literals)
from builtins import *

from speedml.base import Base

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel

class Xgb(Base):
    """
    XGBoost workflow helpers. Methods read the datasets and the trained model
    from attributes on ``Base`` (``train_X``, ``train_y``, ``test_X``,
    ``xgb_params``, ``xgb_model``) and store their results either on ``Base``
    or on this instance.
    """

    def sample_accuracy(self):
        """
        Calculate the accuracy of an XGBoost model based on number of correct
        labels in prediction.

        Predicts on ``Base.train_X`` with ``Base.xgb_model``, rounds the
        predictions to integer labels and compares them with ``Base.train_y``.
        Stores the accuracy percentage (rounded to 2 decimals) in
        ``self.sample_accuracy`` and returns a human-readable summary string.
        """
        train_preds = Base.xgb_model.predict(Base.train_X)
        rounded_preds = np.round(train_preds).astype(int).flatten()
        correct_labels = len(np.where(rounded_preds == Base.train_y)[0])
        total_labels = Base.train_y.shape[0]

        # NOTE(review): this assignment replaces the bound method with a number
        # on the instance, so a second call on the same object fails. The name
        # is kept because code outside this class may read the attribute -
        # confirm before renaming.
        self.sample_accuracy = round(correct_labels / total_labels * 100, 2)

        message = 'Accuracy = {}%. Found {} correct of {} total labels'
        return message.format(self.sample_accuracy, correct_labels, total_labels)

    def hyper(self, select_params, fixed_params):
        """
        Tune XGBoost hyper-parameters by grid search over the combinations of
        values in the ``select_params`` dictionary. Remaining parameters with
        single values are specified by the ``fixed_params`` dictionary.

        The search is scored on accuracy with 5-fold cross-validation over
        ``Base.train_X`` / ``Base.train_y``.

        Returns a dataframe with columns ``rank`` and ``params``, sorted by
        rank.
        """
        optimized_GBM = GridSearchCV(
            xgb.XGBClassifier(**fixed_params),
            select_params,
            scoring='accuracy',
            cv=5,
            n_jobs=-1)
        optimized_GBM.fit(Base.train_X, Base.train_y)

        results = pd.DataFrame(optimized_GBM.cv_results_)
        df = results[['rank_test_score', 'params']].sort_values(by='rank_test_score')
        df.rename(columns={'rank_test_score': 'rank'}, inplace=True)
        return df

    def cv(self, grid_params):
        """
        Calculate the Cross-Validation (CV) score for XGBoost model based on
        ``grid_params`` parameters.

        Sets ``self.cv_results`` to the dataframe returned by ``xgboost.cv``
        and ``self.error`` to the ``test-error-mean`` value of its last row.
        """
        xgdmat = xgb.DMatrix(Base.train_X, Base.train_y)
        self.cv_results = xgb.cv(
            params=grid_params,
            dtrain=xgdmat,
            num_boost_round=1000,
            nfold=5,
            metrics=['error'],
            early_stopping_rounds=20)

        # ``DataFrame.get_value`` was removed in pandas 1.0; ``.at`` is the
        # label-based scalar accessor that replaces it.
        last_round = len(self.cv_results) - 1
        self.error = self.cv_results.at[last_round, 'test-error-mean']

    def params(self, params):
        """
        Sets Base.xgb_params to ``params`` dictionary.
        """
        Base.xgb_params = params

    def classifier(self):
        """
        Creates the XGBoost Classifier with Base.xgb_params dictionary of
        model hyper-parameters and stores it in ``self.clf``.
        """
        self.clf = xgb.XGBClassifier(**Base.xgb_params)

    def fit(self):
        """
        Sets Base.xgb_model with trained XGBoost model. ``classifier()`` must
        have been called first so that ``self.clf`` exists.
        """
        Base.xgb_model = self.clf.fit(Base.train_X, Base.train_y)

    def predict(self):
        """
        Sets ``self.predictions`` with predictions from the XGBoost model on
        ``Base.test_X``.
        """
        self.predictions = Base.xgb_model.predict(Base.test_X)

    @staticmethod
    def _rounded_accuracy(model, features, labels):
        """
        Return the accuracy score of ``model`` on ``features`` after rounding
        each prediction to the nearest label.
        """
        predictions = [round(value) for value in model.predict(features)]
        return accuracy_score(labels, predictions)

    def feature_selection(self):
        """
        Prints threshold and accuracy for ``n`` number of features.

        Fits a default XGBoost classifier on a 70/30 split, stores its
        hold-out accuracy percentage in ``self.feature_accuracy``, then uses
        each feature importance as a selection threshold, retrains on the
        selected features and prints the resulting accuracy. Returns nothing.
        """
        Base.data_n()
        X = Base.train_n.drop([Base.target], axis=1)
        # NOTE(review): the target is read from Base.train while the features
        # come from Base.train_n - presumably both share the same rows; verify.
        Y = Base.train[Base.target]

        # Split data into train and test sets
        X_train, X_test, y_train, y_test = train_test_split(
            X, Y, test_size=0.3, random_state=7)

        # Fit model on all training data
        model = xgb.XGBClassifier()
        model.fit(X_train, y_train)

        # Make predictions for test data and evaluate
        accuracy = self._rounded_accuracy(model, X_test, y_test)
        self.feature_accuracy = round(accuracy * 100.0, 2)
        print("Accuracy: %f%%" % (self.feature_accuracy))

        # Fit model using each importance as a threshold
        thresholds = np.sort(model.feature_importances_)
        for thresh in thresholds:
            # Select features using threshold; prefit=True reuses the model
            # fitted above instead of refitting it.
            selection = SelectFromModel(model, threshold=thresh, prefit=True)
            select_X_train = selection.transform(X_train)

            # Train model
            selection_model = xgb.XGBClassifier()
            selection_model.fit(select_X_train, y_train)

            # Evaluate model
            select_X_test = selection.transform(X_test)
            accuracy = self._rounded_accuracy(
                selection_model, select_X_test, y_test)
            print("Thresh=%.3f, n=%d, Accuracy: %.2f%%" % (
                thresh, select_X_train.shape[1], accuracy * 100.0))