In [1]:
# code for loading the format for the notebook
import os

# path : store the current path to convert back to it later
path = os.getcwd()
os.chdir(os.path.join('..', 'notebook_format'))

from formats import load_style
load_style(plot_style=False)
Out[1]:
In [2]:
os.chdir(path)

# 1. magic to print version
# 2. magic so that the notebook will reload external python modules
%load_ext watermark
%load_ext autoreload 
%autoreload 2

import numpy as np
import pandas as pd
from keras.regularizers import l2
from keras.models import Sequential
from keras.callbacks import EarlyStopping
from keras.layers.advanced_activations import PReLU
from keras.wrappers.scikit_learn import KerasClassifier
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from tensorflow.examples.tutorials.mnist import input_data

%watermark -a 'Ethen' -d -t -v -p numpy,pandas,keras,sklearn,tensorflow
Using TensorFlow backend.
Ethen 2017-03-24 10:55:48 

CPython 3.5.2
IPython 5.3.0

numpy 1.12.1
pandas 0.19.2
keras 2.0.2
sklearn 0.18
tensorflow 1.0.1

Keras Hyperparameter Tuning

We'll use MNIST dataset. The downloaded data is split into three parts, 55,000 data points of training data (mnist.train), 10,000 points of test data (mnist.test), and 5,000 points of validation data (mnist.validation).

Every part of the dataset contains the data and label and we can access them via .images and .labels. e.g. the training images are mnist.train.images and the train labels are mnist.train.labels (one-hot encoded).

In [3]:
# convenient one-liner to load the dataset
mnist = input_data.read_data_sets('MNIST_data', one_hot = True)

# extract the training, validation and test set
X_train = mnist.train.images
y_train = mnist.train.labels
X_val = mnist.validation.images
y_val = mnist.validation.labels
X_test = mnist.validation.images
y_test = mnist.validation.labels
print()
print('number of training observations: ', X_train.shape[0])
print('number of validation observations: ', X_val.shape[0])
print('number of testing observations: ', X_test.shape[0])

# the labels have already been one-hot encoded
n_input = X_train.shape[1]
n_class = y_train.shape[1]
print('feature num: ', n_input)
print('class num: ', n_class)
Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz

number of training observations:  55000
number of validation observations:  5000
number of testing observations:  5000
feature num:  784
class num:  10

Keras provides a wrapper class KerasClassifier that allows us to use our deep learning models with scikit-learn, this is especially useful when you want to tune hyperparameters using scikit-learn's RandomizedSearchCV or GridSearchCV.

To use it, we first define a function that takes the arguments that we wish to tune, inside the function, you define the network's structure as usual and compile it. Then the function is passed to KerasClassifier's build_fn parameter. Note that like all other estimators in scikit-learn, build_fn should provide default values for its arguments, so that we could create the estimator even without passing in values for every parameters.

In [6]:
def build_keras_base(hidden_layers = [64, 64, 64], dropout_rate = 0, 
                     l2_penalty = 0.1, optimizer = 'adam',
                     n_input = 100, n_class = 2):
    """
    Keras Multi-layer neural network. Fixed parameters include: 
    1. activation function (PRelu)
    2. always uses batch normalization after the activation
    3. use adam as the optimizer
    
    Parameters
    ----------
    Tunable parameters are (commonly tuned)
    
    hidden_layers: list
        the number of hidden layers, and the size of each hidden layer
    
    dropout_rate: float 0 ~ 1
        if bigger than 0, there will be a dropout layer
    
    l2_penalty: float
        or so called l2 regularization
    
    optimizer: string or keras optimizer
        method to train the network
    
    Returns
    -------
    model : 
        a keras model

    Reference
    ---------
    https://keras.io/scikit-learn-api/
    """   
    model = Sequential()   
    for index, layers in enumerate(hidden_layers):       
        if not index:
            # specify the input_dim to be the number of features for the first layer
            model.add(Dense(layers, input_dim = n_input, kernel_regularizer = l2(l2_penalty)))
        else:
            model.add(Dense(layers, kernel_regularizer = l2(l2_penalty)))
        
        # insert BatchNorm layer immediately after fully connected layers
        # and before activation layer
        model.add(BatchNormalization())
        model.add(PReLU())        
        if dropout_rate:
            model.add(Dropout(p = dropout_rate))
    
    model.add(Dense(n_class))
    model.add(Activation('softmax'))
    
    # the loss for binary and muti-class classification is different 
    loss = 'binary_crossentropy'
    if n_class > 2:
        loss = 'categorical_crossentropy'
    
    model.compile(loss = loss, optimizer = optimizer, metrics = ['accuracy'])   
    return model
In [7]:
# pass in fixed parameters n_input and n_class
model_keras = KerasClassifier(
    build_fn = build_keras_base,
    n_input = n_input,
    n_class = n_class,
)

# specify other extra parameters pass to the .fit
# number of epochs is set to a large number, we'll
# let early stopping terminate the training process
early_stop = EarlyStopping(
    monitor = 'val_loss', min_delta = 0.1, patience = 5, verbose = 0)

callbacks = [early_stop]
keras_fit_params = {   
    'callbacks': callbacks,
    'epochs': 200,
    'batch_size': 2048,
    'validation_data': (X_val, y_val),
    'verbose': 0
}

# random search's parameter:
# specify the options and store them inside the dictionary
# batch size and training method can also be hyperparameters, 
# but it is fixed
dropout_rate_opts  = [0, 0.2, 0.5]
hidden_layers_opts = [[64, 64, 64, 64], [32, 32, 32, 32, 32], [100, 100, 100]]
l2_penalty_opts = [0.01, 0.1, 0.5]
keras_param_options = {
    'hidden_layers': hidden_layers_opts,
    'dropout_rate': dropout_rate_opts,  
    'l2_penalty': l2_penalty_opts
}
In [8]:
# `verbose` 2 will print the class info for every cross validation, 
# kind of too much
rs_keras = RandomizedSearchCV( 
    model_keras, 
    param_distributions = keras_param_options,
    fit_params = keras_fit_params,
    scoring = 'neg_log_loss',
    n_iter = 3, 
    cv = 3,
    n_jobs = -1,
    verbose = 1
)
rs_keras.fit(X_train, y_train)

print('Best score obtained: {0}'.format(rs_keras.best_score_))
print('Parameters:')
for param, value in rs_keras.best_params_.items():
    print('\t{}: {}'.format(param, value))
Fitting 3 folds for each of 3 candidates, totalling 9 fits
18333/18333 [==============================] - 2s     
36192/36667 [============================>.] - ETA: 0s
[Parallel(n_jobs=-1)]: Done   6 out of   9 | elapsed:  3.0min remaining:  1.5min
18333/18333 [==============================] - 1s     
36256/36667 [============================>.] - ETA: 0s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:  3.1min finished
Best score obtained: -0.2284632457149802
Parameters:
	dropout_rate: 0.2
	hidden_layers: [100, 100, 100]
	l2_penalty: 0.5
In [9]:
# flatten the one-hot encoded labels for
# acessing prediction accuracy on the test set
y_true = np.nonzero(y_test)[1]
y_pred = rs_keras.predict(X_test)
accuracy_score(y_true, y_pred)
4960/5000 [============================>.] - ETA: 0s
Out[9]:
0.95979999999999999
In [10]:
# validator.best_estimator_ returns sklearn-wrapped version of best model.
# validator.best_estimator_.model returns the (unwrapped) keras model
best_model = rs_keras.best_estimator_.model
metric_names = best_model.metrics_names
metric_values = best_model.evaluate(X_train, y_train)
for metric, value in zip(metric_names, metric_values):
    print(metric, ': ', value)
54304/55000 [============================>.] - ETA: 0sloss :  0.339452602564
acc :  0.956618181818

Reference