# code for loading the format for the notebook
import os
# path : store the current path to convert back to it later
path = os.getcwd()
os.chdir(os.path.join('..', 'notebook_format'))
from formats import load_style
load_style(css_style='custom2.css', plot_style=False)
os.chdir(path)
# 1. magic for inline plot
# 2. magic to print version
# 3. magic so that the notebook will reload external python modules
# 4. magic to enable retina (high resolution) plots
# https://gist.github.com/minrk/3301035
%matplotlib inline
%load_ext watermark
%load_ext autoreload
%autoreload 2
%config InlineBackend.figure_format='retina'
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from operator import itemgetter
from xgboost import XGBClassifier
from xgboost.callback import EarlyStopping
from scipy.stats import randint, uniform
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, RandomizedSearchCV
%watermark -a 'Ethen' -d -u -v -p numpy,pandas,xgboost,sklearn,matplotlib
Both xgboost (Extreme Gradient Boosting) and gbm follow the principle of gradient boosting. The name xgboost, though, actually refers to the engineering goal of pushing the limit of computational resources for boosted tree algorithms, which is the reason why many people use xgboost. As for the model itself, it might be more suitable to call it regularized gradient boosting, as it uses a more regularized model formalization to control overfitting.
In this toy example, we will be dealing with a binary classification task. We start off by generating a 20-dimensional artificial dataset with 1000 samples, where 8 features hold information, 3 are redundant and 2 are repeated, and then perform a train/test split. The test data will be useful for validating the performance of our algorithms.
seed = 104
X, y = make_classification(n_samples=1000, n_features=20,
                           n_informative=8, n_redundant=3,
                           n_repeated=2, random_state=seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
# confirm that the dataset is balanced,
# that is, the target variable is equally
# distributed across both datasets
print('Train label distribution:')
print(np.bincount(y_train))
print('\nTest label distribution:')
print(np.bincount(y_test))
We can use a decision tree classifier to establish our baseline and see if a more complex model is capable of beating it.
tree = DecisionTreeClassifier(random_state=seed, max_depth=6)
# train classifier
tree.fit(X_train, y_train)
# predict output
tree_y_pred = tree.predict(X_test)
tree_y_pred_prob = tree.predict_proba(X_test)[:, 1]
# evaluation
tree_auc = roc_auc_score(y_test, tree_y_pred_prob)
print('auc:', tree_auc)
We start by training an xgboost model using a fixed set of parameters. For further details on the parameters (using the scikit-learn like API), refer to the XGBoost Documentation: Python API documentation.
xgb_params_fixed = {
    'learning_rate': 0.1,
    # use 'multi:softprob' for multi-class problems
    'objective': 'binary:logistic',
    # length of the longest path from a root to a leaf
    'max_depth': 6,
    # subsample ratio of columns when constructing each tree
    'colsample_bytree': 0.8,
    # setting it to a positive value
    # might help when the classes are extremely imbalanced
    # as it makes the update more conservative
    'max_delta_step': 1,
    'n_estimators': 150,
    # use all possible cores for training
    'n_jobs': -1
}
model_xgb = XGBClassifier(**xgb_params_fixed)
# we also specify the evaluation dataset and metric
# to record the model's performance history, note that
# we can supply multiple evaluation metrics by passing a
# list to `eval_metric`
eval_set = [(X_train, y_train), (X_test, y_test)]
model_xgb.fit(X_train, y_train, eval_metric='auc', eval_set=eval_set, verbose=False)
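As a quick, hedged sketch (not part of the original run), the list form mentioned in the comment above could look like the following; recording both logloss and auc means the performance history will contain both metrics for each evaluation dataset. The name model_xgb_multi is only used for this illustration.
# hypothetical variant: track more than one metric by passing a list to `eval_metric`
model_xgb_multi = XGBClassifier(**xgb_params_fixed)
model_xgb_multi.fit(X_train, y_train, eval_metric=['logloss', 'auc'],
                    eval_set=eval_set, verbose=False)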
We can retrieve the performance of the model on the evaluation datasets and plot it to get insight into the training process. The evals_result_ dictionary stores validation_0 and validation_1 as its first-level keys. These correspond to the order in which the datasets were provided to the eval_set argument. The second-level key is the eval_metric that was provided.
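To make that structure concrete, here is a quick inspection sketch (the keys shown in the comments are what we expect given the fit call above, not output captured from the original notebook):
# peek at the nested dictionary: evaluation sets first, metric names second
history = model_xgb.evals_result_
print(list(history.keys()))                  # expect ['validation_0', 'validation_1']
print(list(history['validation_0'].keys()))  # expect ['auc']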
# change default figure and font size
plt.rcParams['figure.figsize'] = 10, 8
plt.rcParams['font.size'] = 12
history = model_xgb.evals_result_
x_axis = range(len(history['validation_0']['auc']))
plt.plot(x_axis, history['validation_0']['auc'], label='Train')
plt.plot(x_axis, history['validation_1']['auc'], label='Test')
plt.legend(loc='best')
plt.ylabel('AUC')
plt.title('Xgboost AUC')
plt.show()
From reviewing our plot, it looks like there is an opportunity to stop the training process at an earlier stage, since the test dataset's auc score stops increasing at around 80 estimators. Luckily, xgboost supports this functionality.
Early stopping works by monitoring the performance of the model being trained on a separate validation dataset and stopping the training procedure once the performance on the validation dataset has not improved after a fixed number of training iterations (we can specify this number). This can potentially save us a lot of time by not continuing to train a model whose performance is no longer improving.
The evaluation measure may be the loss function that is being optimized to train our model (such as logarithmic loss), or an external metric of interest to the problem in general (such as the auc score that we've used above). The full list of performance measures that we can directly specify can be found in the eval_metric section of the XGBoost Doc: Learning Task Parameters.
In addition to specifying an evaluation metric and dataset, to use early stopping we also need to specify the rounds argument in our callback function. This essentially tells our model to stop the training process if the evaluation dataset's evaluation metric does not improve over this many rounds. Note that if multiple evaluation datasets or multiple evaluation metrics are provided in a list, then early stopping will use the last one in the list.
For example, we can check for no improvement in auc over 5 rounds as follows:
# we set verbose to 10 so that it will print out the evaluation metric for the
# evaluation datasets every 10 rounds
early_stop = EarlyStopping(rounds=5, save_best=True)
xgb_params_fixed['callbacks'] = [early_stop]
model_xgb = XGBClassifier(**xgb_params_fixed)
model_xgb.fit(
    X_train, y_train,
    eval_metric='auc',
    eval_set=eval_set,
    verbose=10
)
# we can then access the best number of trees and use it later for prediction
print('best number of trees:', model_xgb.best_ntree_limit)
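As a hedged sketch of what "use it later for prediction" could look like, we can explicitly cap prediction at the best number of trees. With save_best=True the stored model is already truncated to the best iteration, so this is mostly illustrative, and newer xgboost versions favor the iteration_range argument over ntree_limit.
# cap prediction at the number of trees found by early stopping (illustrative)
y_pred_prob_best = model_xgb.predict_proba(
    X_test, ntree_limit=model_xgb.best_ntree_limit)[:, 1]
print('auc (capped at best trees):', roc_auc_score(y_test, y_pred_prob_best))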
We can see from the result below that this is already better than our original decision tree model.
# print the model's performance
y_pred_prob = model_xgb.predict_proba(X_test)[:, 1]
print('auc:', roc_auc_score(y_test, y_pred_prob))
# ensure we can serialize and deserialize the model
model_checkpoint = 'model.xgb'
model_xgb.save_model(model_checkpoint)
model_xgb_loaded = XGBClassifier()
model_xgb_loaded.load_model(model_checkpoint)
y_pred_prob = model_xgb_loaded.predict_proba(X_test)[:, 1]
print('auc:', roc_auc_score(y_test, y_pred_prob))
def plot_xgboost_importance(xgboost_model, feature_names, threshold=5):
    """
    Improvements on xgboost's plot_importance function, where
    1. the importances are scaled relative to the max importance, and
       features whose relative importance falls below the threshold
       (default 5% of the max importance) are chopped off
    2. we supply the actual feature names so the labels won't just
       show up as feature 1, feature 2, which are not very interpretable

    returns the important features' indices sorted in descending order
    """
    # convert from dictionary to tuples and sort by the
    # importance score in ascending order for plotting purposes
    importance = xgboost_model.get_booster().get_score(importance_type='gain')
    tuples = [(int(k[1:]), importance[k]) for k in importance]
    tuples = sorted(tuples, key=itemgetter(1))
    labels, values = zip(*tuples)

    # make importances relative to the max importance,
    # and filter out those whose relative importance is
    # smaller than the threshold (chosen arbitrarily)
    labels, values = np.array(labels), np.array(values)
    values = np.round(100 * values / np.max(values), 2)
    mask = values > threshold
    labels, values = labels[mask], values[mask]
    feature_labels = feature_names[labels]

    ylocs = np.arange(values.shape[0])
    plt.barh(ylocs, values, align='center')
    for x, y in zip(values, ylocs):
        plt.text(x + 1, y, x, va='center')

    plt.ylabel('Features')
    plt.xlabel('Relative Importance Score')
    plt.title('Feature Importance Score')
    plt.xlim([0, 110])
    plt.yticks(ylocs, feature_labels)

    # revert the ordering of the importances (most important first)
    return labels[::-1]
# we don't actually have the features' real names as those
# were simply randomly generated numbers, thus we simply supply
# numbers ranging from 0 to the number of features
feature_names = np.arange(X_train.shape[1])
plot_xgboost_importance(xgboost_model=model_xgb, feature_names=feature_names)
Side note: apart from using the built-in evaluation metrics, we can also define one ourselves. The evaluation metric should be a function that takes two arguments, y_pred and y_true (they don't have to be named like this). It is assumed that y_true will be a DMatrix object, so that we can call the get_label method to access the true labels. As for the return value, the function must return a (str, value) pair, where the str is a name for the evaluation metric and value is the value of the evaluation. This objective is always minimized.
def misclassified(y_pred, y_true):
    """
    custom evaluation metric for xgboost, the metric
    counts the number of misclassified examples assuming
    that classes with p > 0.5 are positive
    """
    labels = y_true.get_label()  # obtain true labels
    preds = y_pred > 0.5  # obtain predicted values
    return 'misclassified', np.sum(labels != preds)
early_stop = EarlyStopping(rounds=5, save_best=True)
xgb_params_fixed['callbacks'] = [early_stop]
model_xgb = XGBClassifier(**xgb_params_fixed)
model_xgb.fit(
    X_train, y_train,
    eval_metric=misclassified,
    eval_set=eval_set,
    verbose=10
)
y_pred_prob = model_xgb.predict_proba(X_test)[:, 1]
print('auc:', roc_auc_score(y_test, y_pred_prob))
Another example: writing a customized rsquared evaluation metric.
def rsquared(y_pred, y_true):
    """rsquared evaluation metric for xgboost's regression"""
    labels = y_true.get_label()
    sse = np.sum((labels - y_pred) ** 2)
    sst = np.sum((labels - np.mean(labels)) ** 2)
    rsquared = 1 - sse / sst

    # note that the documentation says the
    # objective function is minimized, thus
    # we take the negative sign of rsquared
    return 'r2', -rsquared
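This function is not used in the rest of the notebook, but a minimal usage sketch could look like the following, assuming a hypothetical synthetic regression dataset and an XGBRegressor (neither is part of the original example):
# hypothetical regression data and model, just to exercise the custom metric
from xgboost import XGBRegressor
from sklearn.datasets import make_regression

X_reg, y_reg = make_regression(n_samples=500, n_features=10, random_state=seed)
reg = XGBRegressor(n_estimators=100, learning_rate=0.1, n_jobs=-1)
reg.fit(X_reg, y_reg, eval_metric=rsquared,
        eval_set=[(X_reg, y_reg)], verbose=20)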
Next, since overfitting is a common problem with sophisticated algorithms like gradient boosting, we'll introduce ways to tune the model's hyperparameters and deal with it. If an xgboost model is too complex we can try adjusting:
- max_depth, the depth of each tree.
- min_child_weight, the minimum sum of observation weights needed in a child (think of it as the number of observations needed in a tree's node).
- gamma, the minimum loss reduction required to make a further partition.
- reg_lambda (l2 regularization) and reg_alpha (l1 regularization).
- the subsample (the fraction of observations to be randomly sampled for fitting each tree) and colsample_bytree (the fraction of columns to be randomly sampled for fitting each tree) parameters.
We'll use a Random Search to tune the model's hyperparameters, as shown in the sketch and function below.
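Since the random search draws hyperparameter values from distributions rather than a fixed grid, the following quick check (illustrative only, these exact draws are not from the original run) shows the kind of values that randint and uniform produce:
# randint(low, high) samples integers in [low, high);
# uniform(loc, scale) samples floats in [loc, loc + scale)
print(randint(low=3, high=15).rvs(size=5, random_state=seed))
print(uniform(loc=0.7, scale=0.3).rvs(size=5, random_state=seed))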
def build_xgboost(X_train, y_train, X_test, y_test, n_iter):
    """
    random search hyperparameter tuning for an xgboost
    classification task, n_iter controls the number
    of hyperparameter combinations that it will search over
    """
    # xgboost base parameters:
    early_stop = EarlyStopping(rounds=5, save_best=True)
    xgb_param_fixed = {
        # setting it to a positive value
        # might help when the classes are extremely imbalanced
        # as it makes the update more conservative
        'max_delta_step': 1,
        # use all possible cores for training
        'n_jobs': -1,
        # set the number of estimators to a large number
        # and the learning rate to a small number,
        # we'll let early stopping decide when to stop
        'n_estimators': 300,
        'learning_rate': 0.1,
        'callbacks': [early_stop]
    }
    xgb_base = XGBClassifier(**xgb_param_fixed)

    # random search's parameters:
    # scikit-learn's random search works with distributions,
    # which must provide a rvs method for sampling values,
    # such as those from scipy.stats.distributions
    # randint: discrete random variable ranging from low to high
    # uniform: continuous random variable between loc and loc + scale
    xgb_param_options = {
        'max_depth': randint(low=3, high=15),
        'colsample_bytree': uniform(loc=0.7, scale=0.3),
        'subsample': uniform(loc=0.7, scale=0.3)}

    eval_set = [(X_train, y_train), (X_test, y_test)]
    xgb_fit_params = {
        'eval_metric': 'auc',
        'eval_set': eval_set,
        'verbose': False
    }
    model_xgb = RandomizedSearchCV(
        estimator=xgb_base,
        param_distributions=xgb_param_options,
        cv=10,
        # number of parameter settings that are sampled
        n_iter=n_iter,
        # n_jobs can be a parameter (since it's a fast task
        # for this toy dataset, we'll simply use 1 job)
        n_jobs=1,
        verbose=1
    ).fit(X_train, y_train, **xgb_fit_params)

    print('Best score obtained: {0}'.format(model_xgb.best_score_))
    print('Best Parameters:')
    for param, value in model_xgb.best_params_.items():
        print('\t{}: {}'.format(param, value))
    return model_xgb.best_estimator_
xgb_model = build_xgboost(X_train, y_train, X_test, y_test, n_iter=15)
y_pred_prob = xgb_model.predict_proba(X_test)[:, 1]
print('auc:', roc_auc_score(y_test, y_pred_prob))