In [1]:
# code for loading the format for the notebook
import os

# path : store the current path to convert back to it later
path = os.getcwd()
os.chdir(os.path.join('..',  'notebook_format'))

from formats import load_style
load_style(plot_style=False)
Out[1]:
In [2]:
os.chdir(path)

# 1. magic for inline plot
# 2. magic to print version
# 3. magic so that the notebook will reload external python modules
# 4. magic to enable retina (high resolution) plots
# https://gist.github.com/minrk/3301035
%matplotlib inline
%load_ext watermark
%load_ext autoreload
%autoreload 2
%config InlineBackend.figure_format='retina'

import xgboost as xgb
import sklearn.datasets
import sklearn.metrics as metrics
from ray import tune
from sklearn.model_selection import train_test_split
from ray.tune.integration.xgboost import TuneReportCallback

%watermark -a 'Ethen' -u -d -v -iv
Author: Ethen

Last updated: 2022-07-10

Python implementation: CPython
Python version       : 3.7.11
IPython version      : 7.27.0

xgboost: 1.6.1
sklearn: 1.0.2
ray    : 1.6.0

HyperParameter Tuning with Ray Tune and HyperBand

One of the steps in training a machine learning model is hyperparameter tuning, and the two most common hyperparameter tuning strategies we might first come across are grid search and random search.

In this article, we will take a look at how we can perform hyperparameter tuning using Ray Tune, as well as explore another hyperparameter tuning strategy called HyperBand.

We'll be using the xgboost library as well as a sample dataset provided by scikit-learn in this example. There will be no feature preprocessing, as that is not the focus of this post.

In [3]:
bunch = sklearn.datasets.load_breast_cancer(return_X_y=False)
print(f'number of rows: {bunch.data.shape[0]}, cols: {bunch.data.shape[1]}')
bunch.data
number of rows: 569, cols: 30
Out[3]:
array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
        1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
        8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
        8.758e-02],
       ...,
       [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
        7.820e-02],
       [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
        1.240e-01],
       [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
        7.039e-02]])
In [4]:
X_train, X_test, y_train, y_test = train_test_split(bunch.data, bunch.target, test_size=0.25)

We first train a model using the default parameters to get a baseline performance number.

In [5]:
model_xgb = xgb.XGBClassifier()

eval_set = [(X_train, y_train), (X_test, y_test)]
model_xgb.fit(X_train, y_train, eval_set=eval_set, verbose=10)
[0]	validation_0-logloss:0.46159	validation_1-logloss:0.49076
[10]	validation_0-logloss:0.04348	validation_1-logloss:0.15069
[20]	validation_0-logloss:0.01570	validation_1-logloss:0.12307
[30]	validation_0-logloss:0.01028	validation_1-logloss:0.11609
[40]	validation_0-logloss:0.00816	validation_1-logloss:0.11306
[50]	validation_0-logloss:0.00732	validation_1-logloss:0.11206
[60]	validation_0-logloss:0.00679	validation_1-logloss:0.11029
[70]	validation_0-logloss:0.00637	validation_1-logloss:0.10848
[80]	validation_0-logloss:0.00603	validation_1-logloss:0.10662
[90]	validation_0-logloss:0.00576	validation_1-logloss:0.10662
[99]	validation_0-logloss:0.00556	validation_1-logloss:0.10615
Out[5]:
XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)
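
For an apples-to-apples comparison with the tuning runs later on, we can also compute the baseline model's test set log loss directly. This is a small convenience sketch; the same quantity already appears as the final validation_1-logloss in the training log above.

log_loss_test = metrics.log_loss(y_test, model_xgb.predict_proba(X_test)[:, 1])
print(f'baseline test log loss: {round(log_loss_test, 3)}')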

Hyperparameter Tuning

To use hyperparameter tuning with ray, we need to:

  • Have a config dictionary defining the search space, so tune can choose from a range of valid options for each hyperparameter.
  • Use the sampled config dictionary when constructing our model object.
  • Once we are done training the model, report all the necessary metrics back to tune.
In [6]:
config = {
    #"n_estimators": tune.randint(30, 100),
    "max_depth": tune.randint(2, 6),
    "colsample_bytree": tune.uniform(0.8, 1.0),
    "subsample": tune.uniform(0.8, 1.0),
    "learning_rate": tune.loguniform(1e-4, 1e-1)
}


def ray_train(config, X_train, y_train, X_test, y_test):
    model = xgb.XGBClassifier(**config)
    eval_set = [(X_train, y_train), (X_test, y_test)]
    model.fit(X_train, y_train, eval_set=eval_set, verbose=False)

    log_loss_test = metrics.log_loss(y_test, model.predict_proba(X_test)[:, 1])
    tune_report_metrics = {'validation_1-logloss': round(log_loss_test, 3)}
    tune.report(**tune_report_metrics)

For running hyperparameter tuning:

  • We pass our training function/callable, ray_train, as the first parameter. Here we leverage with_parameters so we can broadcast large objects to our trainable.
  • We specify additional necessary arguments such as what metrics to optimize for as well as resources, and the hyperparameter tuning config space.
  • ray allows us to specify a time budget along with a num_samples of -1; this lets us keep sampling configurations indefinitely until the time budget is met.
In [7]:
def ray_hyperparameter_tuning(config, time_budget_s: int):
    analysis = tune.run(
        tune.with_parameters(ray_train, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test),
        config=config,
        metric='validation_1-logloss',
        mode='min',
        num_samples=-1,
        resources_per_trial={'cpu': 8},
        time_budget_s=time_budget_s,
        verbose=1
    )
    return analysis
In [8]:
analysis = ray_hyperparameter_tuning(config, time_budget_s=120)
== Status ==
Current time: 2022-07-09 23:18:29 (running for 00:02:03.06)
Memory usage on this node: 204.9/1007.3 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/80 CPUs, 0/0 GPUs, 0.0/810.33 GiB heap, 0.0/186.26 GiB objects
Current best trial: 27502_00264 with validation_1-logloss=0.113 and parameters={'max_depth': 2, 'colsample_bytree': 0.8522675185448618, 'subsample': 0.8360198940451542, 'learning_rate': 0.09915622430990471}
Result logdir: /home/mingyuliu/ray_results/ray_train_2022-07-09_23-16-25
Number of trials: 1222/infinite (1222 TERMINATED)

2022-07-09 23:18:29,903	INFO tune.py:748 -- Total run time: 124.52 seconds (121.16 seconds for the tuning loop).
In [9]:
num_done_experiments = analysis.results_df[analysis.results_df['done'] == True].shape[0]

print(f'ran {num_done_experiments} hyperparameter tuning experiments')
print('best metric: ', analysis.best_result['validation_1-logloss'])
print('best config: ', analysis.best_result['config'])
ran 1124 hyperparameter tuning experiments
best metric:  0.113
best config:  {'max_depth': 2, 'colsample_bytree': 0.8522675185448618, 'subsample': 0.8360198940451542, 'learning_rate': 0.09915622430990471}
/home/mingyuliu/.local/lib/python3.7/site-packages/ray/tune/analysis/experiment_analysis.py:304: UserWarning: Dataframes will use '/' instead of '.' to delimit nested result keys in future versions of Ray. For forward compatibility, set the environment variable TUNE_RESULT_DELIM='/'
  "Dataframes will use '/' instead of '.' to delimit "

HyperParameter Tuning with HyperBand

Apart from grid and random search, ray tune offers multiple hyperparameter tuning strategies; here we will be looking at one of them, called Hyperband.

Hyperband can be seen as a successive halving algorithm on steroids that focuses on speeding up configuration evaluation, where a configuration refers to one specific set of hyperparameters. To elaborate, successive halving works by allocating a certain amount of budget to a set of hyperparameter configurations, i.e. it runs the configurations for a few iterations to get a sense of their performance, after which it allocates more resources to the more promising configurations while tossing away the non-performing ones. This process repeats until one configuration remains. One potential drawback of successive halving is that, given some finite budget $B$ (e.g. training time) and number of configurations $n$, it is not clear a priori whether we should consider many configurations (large $n$), each with a small average training time, or the opposite, i.e. a small number of configurations (large $B / n$), each with a larger average training time. In other words, as practitioners, how do we decide whether we want more "depth" or more "breadth"? Let's now take a look at how Hyperband aims to address this issue:

Looking at the pseudocode above, Hyperband takes in two inputs:

  • $R$: The maximum resources that can be allocated to a single configuration, e.g. number of iterations to run the algorithm.
  • $\eta$: Controls the proportion of configurations to be discarded for each round of successive halving.

Then it essentially performs a grid search over different possible values of $n$; associated with each $n$ is a minimum resource $r$ that is allocated to every configuration. Lines 1-2, the outer loop, iterate over different values of $n$ and $r$, whereas the inner loop, lines 3–9, runs successive halving for the fixed $n$ and $r$.
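
For example, with $R = 81$ and $\eta = 3$ (the values we will plug in below), $s_{max} = \lfloor \log_{\eta} R \rfloor = 4$ and $B = (s_{max} + 1) R = 405$. The most aggressive bracket, $s = 4$, starts with $n = \lceil \frac{B}{R (s + 1)} \eta^{s} \rceil = 81$ configurations at $r = R \eta^{-s} = 1$ iteration each, while the most conservative bracket, $s = 0$, runs only $n = 5$ configurations for the full $r = R = 81$ iterations, which is plain random search.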

The following code chunk provides a vanilla implementation, and returns the resource allocation table.

In [10]:
from random import random
from math import log, ceil
import heapq
import pandas as pd


def hyperband(R, eta):
    s_max = int(log(R) / log(eta))
    B = (s_max + 1) * R

    rows = []
    for s in reversed(range(s_max + 1)):
        # initial number of configurations
        n = int(ceil(B / R / (s + 1) * eta ** s))
        # initial number of iterations per config
        r = R * eta ** (-s)

        # get hyperparameter configurations,
        # we use a random value to represent a sampled configuration
        # from a defined hyperparameter search space
        T = [random() for _ in range(n)]
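        # inner loop: run successive halving on this bracket's n configurations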
        for i in range(s + 1):
            n_configs = n * eta ** (-i)
            n_iterations = r * eta ** (i)
    
            # run then return validation loss, here
            # we use a random value to represent the algorithm's
            # performance after taking in n_iterations and config as inputs
            losses = [(random(), t) for t in T]

            # return top k configurations, if we are minimizing the loss
            # then we pick the top k smallest
            top_k_losses = heapq.nsmallest(int(n_configs / eta), losses)
            T = [t for loss, t in top_k_losses]

            row = [s, n_configs, n_iterations]
            rows.append(row)

    return pd.DataFrame(rows, columns=['s', 'n_configs', 'n_iterations'])
In [11]:
R = 81
eta = 3
hyperband(R, eta)
Out[11]:
    s  n_configs  n_iterations
0   4  81.000000           1.0
1   4  27.000000           3.0
2   4   9.000000           9.0
3   4   3.000000          27.0
4   4   1.000000          81.0
5   3  34.000000           3.0
6   3  11.333333           9.0
7   3   3.777778          27.0
8   3   1.259259          81.0
9   2  15.000000           9.0
10  2   5.000000          27.0
11  2   1.666667          81.0
12  1   8.000000          27.0
13  1   2.666667          81.0
14  0   5.000000          81.0

Notice in the last row, $s = 0$, every configuration is allocated $R$ resources; this setting is essentially performing our good old random search. On the other extreme end, in the first row, $s = 4$, we are essentially running 81 configurations for only 1 iteration each, then dropping the bottom-performing 2/3 of the configurations. By mixing more exploratory and more exploitative search strategies, Hyperband automatically accommodates scenarios where an iterative training algorithm converges very slowly and requires more resources to show differentiating performance (in these scenarios, we should consider smaller $n$), as well as the opposite end of the story, where aggressive early stopping provides massive speedups and lets us scan many different combinations.
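
As a quick sanity check on this table (a small sketch re-using the hyperband function and the R, eta values from the cells above), we can confirm that every bracket $s$ performs roughly the same total amount of work, about $B = (s_{max} + 1) \cdot R = 405$ iterations; the small differences come from the ceiling operations.

resource_allocation = hyperband(R, eta)
resource_allocation['budget'] = resource_allocation['n_configs'] * resource_allocation['n_iterations']
print(resource_allocation.groupby('s')['budget'].sum())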

To leverage the Hyperband tuning algorithm, we'll use Ray Tune's ASHAScheduler scheduler (recommended over the standard Hyperband scheduler). We will also need to report our model's loss at every iteration back to tune; Ray Tune already comes with a callback class, TuneReportCallback, that does this without us having to implement it ourselves.

In [12]:
config = {
    #"n_estimators": tune.randint(30, 100),
    "max_depth": tune.randint(2, 6),
    "colsample_bytree": tune.uniform(0.8, 1.0),
    "subsample": tune.uniform(0.8, 1.0),
    "learning_rate": tune.loguniform(1e-4, 1e-1),
    "callbacks": [TuneReportCallback()]
}


def ray_train(config, X_train, y_train, X_test, y_test):
    model = xgb.XGBClassifier(**config)
    eval_set = [(X_train, y_train), (X_test, y_test)]
    model.fit(X_train, y_train, eval_set=eval_set, verbose=False)
In [13]:
def ray_hyperparameter_tuning(config, time_budget_s: int):
    scheduler = tune.schedulers.ASHAScheduler(
        max_t=100,
        grace_period=10,
        reduction_factor=2
    )
    analysis = tune.run(
        tune.with_parameters(ray_train, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test),
        config=config,
        metric='validation_1-logloss',
        mode='min',
        num_samples=-1,
        scheduler=scheduler,
        resources_per_trial={'cpu': 8},
        time_budget_s=time_budget_s,
        verbose=1
    )
    return analysis
In [14]:
analysis = ray_hyperparameter_tuning(config, time_budget_s=120)
== Status ==
Current time: 2022-07-09 23:20:37 (running for 00:02:02.53)
Memory usage on this node: 205.1/1007.3 GiB
Using AsyncHyperBand: num_stopped=1045 Bracket: Iter 80.000: -0.13818851038748464 | Iter 40.000: -0.22655531805712026 | Iter 20.000: -0.4710694114853452 | Iter 10.000: -0.66420287727476
Resources requested: 0/80 CPUs, 0/0 GPUs, 0.0/810.33 GiB heap, 0.0/186.26 GiB objects
Current best trial: 745b9_00477 with validation_1-logloss=0.11607122477355668 and parameters={'max_depth': 2, 'colsample_bytree': 0.9223334898321227, 'subsample': 0.8638757189475169, 'learning_rate': 0.08456338948443516, 'callbacks': []}
Result logdir: /home/mingyuliu/ray_results/ray_train_2022-07-09_23-18-34
Number of trials: 1143/infinite (1143 TERMINATED)

2022-07-09 23:20:37,527	INFO tune.py:748 -- Total run time: 122.87 seconds (120.41 seconds for the tuning loop).
In [15]:
num_done_experiments = analysis.results_df[analysis.results_df['done'] == True].shape[0]

print(f'ran {num_done_experiments} hyperparameter tuning experiments')
print('best metric: ', analysis.best_result['validation_1-logloss'])
print('best config: ', analysis.best_result['config'])
ran 1045 hyperparameter tuning experiments
best metric:  0.11607122477355668
best config:  {'max_depth': 2, 'colsample_bytree': 0.9223334898321227, 'subsample': 0.8638757189475169, 'learning_rate': 0.08456338948443516, 'callbacks': [<ray.tune.integration.xgboost.TuneReportCallback object at 0x7fc949bf1c10>]}

We can retrieve the best config, and re-train our model to check if we get similar performance numbers.

In [16]:
best_config = analysis.best_result['config']
del best_config['callbacks']
In [17]:
model_xgb = xgb.XGBClassifier(**best_config)

eval_set = [(X_train, y_train), (X_test, y_test)]
model_xgb.fit(X_train, y_train, eval_set=eval_set, verbose=10)
[0]	validation_0-logloss:0.62644	validation_1-logloss:0.63045
[10]	validation_0-logloss:0.28323	validation_1-logloss:0.31413
[20]	validation_0-logloss:0.16305	validation_1-logloss:0.21245
[30]	validation_0-logloss:0.10716	validation_1-logloss:0.17367
[40]	validation_0-logloss:0.07839	validation_1-logloss:0.14843
[50]	validation_0-logloss:0.06050	validation_1-logloss:0.13471
[60]	validation_0-logloss:0.04858	validation_1-logloss:0.13113
[70]	validation_0-logloss:0.03952	validation_1-logloss:0.12595
[80]	validation_0-logloss:0.03287	validation_1-logloss:0.12086
[90]	validation_0-logloss:0.02801	validation_1-logloss:0.11741
[99]	validation_0-logloss:0.02380	validation_1-logloss:0.11607
Out[17]:
XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1,
              colsample_bytree=0.9223334898321227, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, gamma=0, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.08456338948443516,
              max_bin=256, max_cat_to_onehot=4, max_delta_step=0, max_depth=2,
              max_leaves=0, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=0,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

Ray Tune provides hyperparameter tuning algorithms beyond classic grid and random search; here we only looked at one of them, Hyperband.

Caveat: if learning rate is a hyperparameter, smaller values will likely result in inferior performance at the beginning, but may outperform other configurations if given a sufficient amount of time. Hence, Hyperband-like hyperparameter tuning methods might not be able to find the small-learning-rate, many-iterations combinations that can squeeze out the best performance.

Reference