Source code for speedml.xgb

"""
Speedml Xgb component with methods that support the XGBoost model workflow. Contact the author at https://twitter.com/manavsehgal. Code, docs and demos are at https://speedml.com.
"""

from __future__ import (absolute_import, division,
                        print_function, unicode_literals)
from builtins import *

from speedml.base import Base

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel

class Xgb(Base):
    """
    XGBoost workflow helpers that share state through ``Base`` class attributes.

    Methods read ``Base.train_X``, ``Base.train_y`` and ``Base.test_X``, and store the
    hyper-parameters and the trained model on ``Base.xgb_params`` and ``Base.xgb_model``.
    Results are kept on the instance: ``cv_results``, ``error``, ``clf``, ``predictions``,
    ``sample_accuracy`` and ``feature_accuracy``.
    """

    def sample_accuracy(self):
        """
        Calculate the accuracy of an XGBoost model based on number of correct labels in prediction.

        Predictions are made with ``Base.xgb_model`` on ``Base.train_X`` and compared with
        ``Base.train_y``. Returns a message string with the accuracy percentage and the
        correct/total label counts. The percentage is also stored on ``self.sample_accuracy``.
        """
        train_preds = Base.xgb_model.predict(Base.train_X)
        rounded_preds = np.round(train_preds).astype(int).flatten()
        correct_labels = len(np.where(rounded_preds == Base.train_y)[0])
        total_labels = Base.train_y.shape[0]
        # NOTE(review): this assignment replaces the bound method with a number on this
        # instance, so calling sample_accuracy() a second time on the same object raises
        # TypeError. Kept unchanged because code outside this class may read the attribute
        # as a value -- confirm before renaming.
        self.sample_accuracy = round(correct_labels / total_labels * 100, 2)
        message = 'Accuracy = {}%. Found {} correct of {} total labels'
        return message.format(self.sample_accuracy,
                              correct_labels,
                              total_labels)

    def hyper(self, select_params, fixed_params):
        """
        Tune XGBoost hyper-parameters by selecting from permutations of values from the ``select_params`` dictionary. Remaining parameters with single values are specified by the ``fixed_params`` dictionary. Returns a dataframe with ranking of ``select_params`` items.

        The search is a 5-fold ``GridSearchCV`` scored on accuracy, fitted on
        ``Base.train_X`` / ``Base.train_y``. The returned dataframe has the columns
        ``rank`` and ``params``, sorted with the best rank first.
        """
        optimized_GBM = GridSearchCV(
            xgb.XGBClassifier(**fixed_params), select_params,
            scoring='accuracy', cv=5, n_jobs=-1)
        optimized_GBM.fit(Base.train_X, Base.train_y)
        ranking = pd.DataFrame(optimized_GBM.cv_results_)[['rank_test_score', 'params']]
        return (ranking
                .sort_values(by='rank_test_score')
                .rename(columns={'rank_test_score': 'rank'}))

    def cv(self, grid_params):
        """
        Calculate the Cross-Validation (CV) score for XGBoost model based on ``grid_params`` parameters. Sets xgb.cv_results variable to the resulting dataframe.

        Also sets ``self.error`` to the ``test-error-mean`` value of the last row of
        ``self.cv_results``. Returns nothing.
        """
        xgdmat = xgb.DMatrix(Base.train_X, Base.train_y)
        self.cv_results = xgb.cv(
            params=grid_params, dtrain=xgdmat,
            num_boost_round=1000, nfold=5,
            metrics=['error'], early_stopping_rounds=20)
        # DataFrame.get_value() was removed in pandas 1.0. Taking the last row by
        # position selects the same cell as the old label lookup at len - 1
        # (assumes the default 0..n-1 index on the xgb.cv result -- TODO confirm).
        self.error = self.cv_results['test-error-mean'].iloc[-1]

    def params(self, params):
        """
        Sets Base.xgb_params to ``params`` dictionary.
        """
        Base.xgb_params = params

    def classifier(self):
        """
        Creates the XGBoost Classifier with Base.xgb_params dictionary of model hyper-parameters.

        The untrained classifier is stored on ``self.clf``.
        """
        self.clf = xgb.XGBClassifier(**Base.xgb_params)

    def fit(self):
        """
        Sets Base.xgb_model with trained XGBoost model.

        Fits ``self.clf`` on ``Base.train_X`` / ``Base.train_y``, so ``classifier()``
        must have been called first.
        """
        Base.xgb_model = self.clf.fit(Base.train_X, Base.train_y)

    def predict(self):
        """
        Sets xgb.predictions with predictions from the XGBoost model.

        Predictions are made with ``Base.xgb_model`` on ``Base.test_X``.
        """
        self.predictions = Base.xgb_model.predict(Base.test_X)

    def feature_selection(self):
        """
        Prints threshold and accuracy for ``n`` number of features.

        A default ``XGBClassifier`` is trained on a 70/30 split (``random_state=7``) of
        ``Base.train_n`` without the target column. Its hold-out accuracy is printed and
        stored, as a percentage, on ``self.feature_accuracy``. Then every feature
        importance of that model is used in turn as a ``SelectFromModel`` threshold: a
        fresh classifier is trained on the selected features and the threshold, the
        number of selected features and the hold-out accuracy are printed.
        Returns nothing.
        """
        Base.data_n()
        X = Base.train_n.drop([Base.target], axis=1)
        Y = Base.train[Base.target]

        # Split data into train and test sets
        X_train, X_test, y_train, y_test = train_test_split(
            X, Y, test_size=0.3, random_state=7)

        # Fit the baseline model on all features of the training split
        model = xgb.XGBClassifier()
        model.fit(X_train, y_train)

        # Make predictions for the hold-out split and evaluate
        y_pred = model.predict(X_test)
        predictions = [round(value) for value in y_pred]
        accuracy = accuracy_score(y_test, predictions)
        self.feature_accuracy = round(accuracy * 100.0, 2)
        print("Accuracy: %f%%" % (self.feature_accuracy))

        # Fit a model using each importance as a threshold, smallest threshold first
        thresholds = np.sort(model.feature_importances_)
        for thresh in thresholds:
            # Select features using threshold; prefit=True reuses the baseline model
            selection = SelectFromModel(model, threshold=thresh, prefit=True)
            select_X_train = selection.transform(X_train)

            # Train model
            selection_model = xgb.XGBClassifier()
            selection_model.fit(select_X_train, y_train)

            # Evaluate model
            select_X_test = selection.transform(X_test)
            y_pred = selection_model.predict(select_X_test)
            predictions = [round(value) for value in y_pred]
            accuracy = accuracy_score(y_test, predictions)
            print("Thresh=%.3f, n=%d, Accuracy: %.2f%%" % (thresh, select_X_train.shape[1], accuracy*100.0))