os.chdir(path)

# 1. magic for inline plot
# 2. magic to print version
# 3. magic so that the notebook will reload external python modules
# 4. magic to enable retina (high resolution) plots
# https://gist.github.com/minrk/3301035
%matplotlib inline
%load_ext watermark
%load_ext autoreload
%autoreload 2
%config InlineBackend.figure_format='retina'

import os
import time
import numpy as np
import pandas as pd
from typing import List, Tuple, Dict
from sklearn.model_selection import train_test_split
from keras import layers
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

# prevent scientific notations
pd.set_option('display.float_format', lambda x: '%.3f' % x)

%watermark -a 'Ethen' -d -t -v -p numpy,pandas,sklearn,keras

Using TensorFlow backend.

Ethen 2019-12-09 11:59:08 

CPython 3.6.4
IPython 7.9.0

numpy 1.16.5
pandas 0.25.0
sklearn 0.21.2
keras 2.2.2

Leveraging Pre-trained Word Embedding for Text Classification¶

There are two main ways to obtain word embeddings:

Learn it from scratch: We specify a neural network architecture and learn the word embeddings jointly with the main task at our hand (e.g. sentiment classification). i.e. we would start off with some random word embeddings, and it would update itself along with the word embeddings.
Transfer Learning: The whole idea behind transfer learning is to avoid reinventing the wheel as much as possible. It gives us the capability to transfer knowledge that was gained/learned in some other task and use it to improve the learning of another related task. In practice, one way to do this is for the embedding part of the neural network architecture, we load some other embeddings that were trained on a different machine learning task than the one we are trying to solve and use that to bootstrap the process.

One area that transfer learning shines is when we have little training data available and using our data alone might not be enough to learn an appropriate task specific embedding/features for our vocabulary. In this case, leveraging a word embedding that captures generic aspect of the language can prove to be beneficial from both a performance and time perspective (i.e. we won't have to spend hours/days training a model from scratch to achieve a similar performance). Keep in mind that, as with all machine learning application, everything is still all about trial and error. What makes a embedding good depends heavily on the task at hand: The word embedding for a movie review sentiment classification model may look very different from a legal document classification model as the semantic of the corpus varies between these two tasks.

Data Preparation¶

We'll use the movie review sentiment analysis dataset from Kaggle for this example. It's a binary classification problem with AUC as the ultimate evaluation metric. The next few code chunk performs the usual text preprocessing, build up the word vocabulary and performing a train/test split.

data_dir = 'data'
submission_dir = 'submission'

input_path = os.path.join(data_dir, 'word2vec-nlp-tutorial', 'labeledTrainData.tsv')
df = pd.read_csv(input_path, delimiter='\t')
print(df.shape)
df.head()

(25000, 3)

raw_text = df['review'].iloc[0]
raw_text

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally starts is only on for 20 minutes or so excluding the Smooth Criminal sequence and Joe Pesci is convincing as a psychopathic all powerful drug lord. Why he wants MJ dead so bad is beyond me. Because MJ overheard his plans? Nah, Joe Pesci's character ranted that he wanted people to know it is he who is supplying drugs etc so i dunno, maybe he just hates MJ's music.<br /><br />Lots of cool things in this like MJ turning into a car and a robot and the whole Speed Demon sequence. Also, the director must have had the patience of a saint when it came to filming the kiddy Bad sequence as usually directors hate working with one kid let alone a whole bunch of them performing a complex dance scene.<br /><br />Bottom line, this movie is for people who like MJ on one level or another (which i think is most people). If not, then stay away. It does try and give off a wholesome message and ironically MJ's bestest buddy in this movie is a girl! Michael Jackson is truly one of the most talented people ever to grace this planet but is he guilty? Well, with all the attention i've gave this subject....hmmm well i don't know because people can be different behind closed doors, i know this for a fact. He is either an extremely nice but stupid guy or one of the most sickest liars. I hope he is not the latter."

import re

def clean_str(string: str) -> str:
    string = re.sub(r"\\", "", string)    
    string = re.sub(r"\'", "", string)    
    string = re.sub(r"\"", "", string)    
    return string.strip().lower()

from bs4 import BeautifulSoup

def clean_text(df: pd.DataFrame,
               text_col: str,
               label_col: str) -> Tuple[List[str], List[int]]:
    texts = []
    labels = []
    for raw_text, label in zip(df[text_col], df[label_col]):  
        text = BeautifulSoup(raw_text).get_text()
        cleaned_text = clean_str(text)
        texts.append(cleaned_text)
        labels.append(label)

    return texts, labels

text_col = 'review'
label_col = 'sentiment'
texts, labels = clean_text(df, text_col, label_col)
print('sample text: ', texts[0])
print('corresponding label:', labels[0])

sample text:  with all this stuff going down at the moment with mj ive started listening to his music, watching the odd documentary here and there, watched the wiz and watched moonwalker again. maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. some of it has subtle messages about mjs feeling towards the press and also the obvious message of drugs are bad mkay.visually impressive but of course this is all about michael jackson so unless you remotely like mj in anyway then you are going to hate this and find it boring. some may call mj an egotist for consenting to the making of this movie but mj and most of his fans would say that he made it for the fans which if true is really nice of him.the actual feature film bit when it finally starts is only on for 20 minutes or so excluding the smooth criminal sequence and joe pesci is convincing as a psychopathic all powerful drug lord. why he wants mj dead so bad is beyond me. because mj overheard his plans? nah, joe pescis character ranted that he wanted people to know it is he who is supplying drugs etc so i dunno, maybe he just hates mjs music.lots of cool things in this like mj turning into a car and a robot and the whole speed demon sequence. also, the director must have had the patience of a saint when it came to filming the kiddy bad sequence as usually directors hate working with one kid let alone a whole bunch of them performing a complex dance scene.bottom line, this movie is for people who like mj on one level or another (which i think is most people). if not, then stay away. it does try and give off a wholesome message and ironically mjs bestest buddy in this movie is a girl! michael jackson is truly one of the most talented people ever to grace this planet but is he guilty? well, with all the attention ive gave this subject....hmmm well i dont know because people can be different behind closed doors, i know this for a fact. he is either an extremely nice but stupid guy or one of the most sickest liars. i hope he is not the latter.
corresponding label: 1

random_state = 1234
val_split = 0.2

labels = to_categorical(labels)
texts_train, texts_val, y_train, y_val = train_test_split(
    texts, labels,
    test_size=val_split,
    random_state=random_state)

print('labels shape:', labels.shape)
print('train size: ', len(texts_train))
print('validation size: ', len(texts_val))

labels shape: (25000, 2)
train size:  20000
validation size:  5000

max_num_words = 20000

tokenizer = Tokenizer(num_words=max_num_words, oov_token='<unk>')
tokenizer.fit_on_texts(texts_train)
print('Found %s unique tokens.' % len(tokenizer.word_index))

Found 74207 unique tokens.

max_sequence_len = 1000

sequences_train = tokenizer.texts_to_sequences(texts_train)
x_train = pad_sequences(sequences_train, maxlen=max_sequence_len)

sequences_val = tokenizer.texts_to_sequences(texts_val)
x_val = pad_sequences(sequences_val, maxlen=max_sequence_len)

sequences_train[0][:5]

[50, 1, 863, 2, 17]

Glove¶

There are many different pretrained word embeddings online. The one we'll be using is from Glove. Others include but not limited to FastText, bpemb.

If we look at the project's wiki page, we can find any different pretrained embeddings available for us to experiment.

import requests
from tqdm import tqdm

def download_glove(embedding_type: str='glove.6B.zip'):
    """
    download GloVe word vector representations, this step may take a while
    
    Parameters
    ----------
    embedding_type : str, default 'glove.6B.zip'
        Specifying different glove embeddings to download if not already there.
        {'glove.6B.zip', 'glove.42B.300d.zip', 'glove.840B.300d.zip', 'glove.twitter.27B.zip'}
        Be wary of the size. e.g. 'glove.6B.zip' is a 822 MB zipped, 2GB unzipped
    """

    base_url = 'http://nlp.stanford.edu/data/'
    if not os.path.isfile(embedding_type):
        url = base_url + embedding_type

        # the following section is a pretty generic http get request for
        # saving large files, provides progress bars for checking progress
        response = requests.get(url, stream=True)
        response.raise_for_status()

        content_len = response.headers.get('Content-Length')
        total = int(content_len) if content_len is not None else 0

        with tqdm(unit='B', total=total) as pbar, open(embedding_type, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    pbar.update(len(chunk))
                    f.write(chunk)

        if response.headers.get('Content-Type') == 'application/zip':
            from zipfile import ZipFile
            with ZipFile(embedding_type, 'r') as f:
                f.extractall(embedding_type.strip('.zip'))


download_glove()

The way we'll leverage the pretrained embedding is to first read it in as a dictionary lookup, where the key is the word and the value is the corresponding word embedding. Then for each token in our vocabulary, we'll lookup this dictionary to see if there's a pretrained embedding available, if there is, we'll use the pretrained embedding, if there isn't, we'll leave the embedding for this word in its original randomly initialized form.

The format for this particular pretrained embedding is for every line, we have a space delimited values, where the first token is the word, and the rest are its corresponding embedding values. e.g. the first line from the line looks like:

the -0.038194 -0.24487 0.72812 -0.39961 0.083172 0.043953 -0.39141 0.3344 -0.57545 0.087459 0.28787 -0.06731 0.30906 -0.26384 -0.13231 -0.20757 0.33395 -0.33848 -0.31743 -0.48336 0.1464 -0.37304 0.34577 0.052041 0.44946 -0.46971 0.02628 -0.54155 -0.15518 -0.14107 -0.039722 0.28277 0.14393 0.23464 -0.31021 0.086173 0.20397 0.52624 0.17164 -0.082378 -0.71787 -0.41531 0.20335 -0.12763 0.41367 0.55187 0.57908 -0.33477 -0.36559 -0.54857 -0.062892 0.26584 0.30205 0.99775 -0.80481 -3.0243 0.01254 -0.36942 2.2167 0.72201 -0.24978 0.92136 0.034514 0.46745 1.1079 -0.19358 -0.074575 0.23353 -0.052062 -0.22044 0.057162 -0.15806 -0.30798 -0.41625 0.37972 0.15006 -0.53212 -0.2055 -1.2526 0.071624 0.70565 0.49744 -0.42063 0.26148 -1.538 -0.30223 -0.073438 -0.28312 0.37104 -0.25217 0.016215 -0.017099 -0.38984 0.87424 -0.72569 -0.51058 -0.52028 -0.1459 0.8278 0.27062

def get_embedding_lookup(embedding_path) -> Dict[str, np.ndarray]:
    embedding_lookup = {}
    with open(embedding_path) as f:
        for line in f:
            values = line.split()
            word = values[0]
            coef = np.array(values[1:], dtype=np.float32)
            embedding_lookup[word] = coef

    return embedding_lookup


def get_pretrained_embedding(embedding_path: str,
                             index2word: Dict[int, str],
                             max_features: int) -> np.ndarray:
    embedding_lookup = get_embedding_lookup(embedding_path)

    pretrained_embedding = np.stack(list(embedding_lookup.values()))
    embedding_dim = pretrained_embedding.shape[1]
    embeddings = np.random.normal(pretrained_embedding.mean(),
                                  pretrained_embedding.std(),
                                  (max_features, embedding_dim)).astype(np.float32)
    # we track how many tokens in our vocabulary exists in the pre-trained embedding,
    # i.e. how many tokens has a pre-trained embedding from this particular file
    n_found = 0
    
    # the loop starts from 1 due to keras' Tokenizer reserves 0 for padding index
    for i in range(1, max_features):
        word = index2word[i]
        embedding_vector = embedding_lookup.get(word)
        if embedding_vector is not None:
            embeddings[i] = embedding_vector
            n_found += 1

    print('number of words found:', n_found)
    return embeddings

glove_path = os.path.join('glove.6B', 'glove.6B.100d.txt')
max_features = max_num_words + 1

pretrained_embedding = get_pretrained_embedding(glove_path, tokenizer.index_word, max_features)
pretrained_embedding.shape

number of words found: 19654

(20001, 100)

Model¶

To train our text classifier, we specify a 1D convolutional network. Our embedding layer can either be initialized randomly or loaded from a pre-trained embedding. Note that for the pre-trained embedding case, apart from loading the weights, we also "freeze" the embedding layer, i.e. we set its trainable attribute to False. This idea is often times used in transfer learning, where when parts of a model are pre-trained (in our case, only our Embedding layer), and parts of it are randomly initialized, the pre-trained part should ideally not be trained together with the randomly initialized part. The rationale behind it is that a large gradient update triggered by the randomly initialized layer would become very disruptive to those pre-trained weights.

Once we train the randomly initialized weights for a few iterations, we can then go about un-freezing the layers that were loaded with pre-trained weights, and do an update on the weight for the entire thing. The keras documentation also provides an example of how to do this, although the example is for image models, the same idea can also be applied here, and can be something that's worth experimenting.

def simple_text_cnn(max_sequence_len: int,
                    max_features: int,
                    num_classes: int,
                    optimizer: str='adam',
                    metrics: List[str]=['acc'],
                    pretrained_embedding: np.ndarray=None) -> Model:

    sequence_input = layers.Input(shape=(max_sequence_len,), dtype='int32')
    if pretrained_embedding is None:
        embedded_sequences = layers.Embedding(max_features, 100,
                                              name='embedding')(sequence_input)
    else:
        embedded_sequences = layers.Embedding(max_features, pretrained_embedding.shape[1],
                                              weights=[pretrained_embedding],
                                              name='embedding',
                                              trainable=False)(sequence_input)

    conv1 = layers.Conv1D(128, 5, activation='relu')(embedded_sequences)
    pool1 = layers.MaxPooling1D(5)(conv1)
    conv2 = layers.Conv1D(128, 5, activation='relu')(pool1)
    pool2 = layers.MaxPooling1D(5)(conv2)
    conv3 = layers.Conv1D(128, 5, activation='relu')(pool2)
    pool3 = layers.MaxPooling1D(35)(conv3)
    flatten = layers.Flatten()(pool3)
    dense = layers.Dense(128, activation='relu')(flatten)
    preds = layers.Dense(num_classes, activation='softmax')(dense)

    model = Model(sequence_input, preds)
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=metrics)
    return model

Model with Pretrained Embedding¶

num_classes = 2
model1 = simple_text_cnn(max_sequence_len, max_features, num_classes,
                         pretrained_embedding=pretrained_embedding)
model1.summary()

WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:66: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:541: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4432: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:190: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.

WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:197: The name tf.ConfigProto is deprecated. Please use tf.compat.v1.ConfigProto instead.

WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:203: The name tf.Session is deprecated. Please use tf.compat.v1.Session instead.

WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:207: The name tf.global_variables is deprecated. Please use tf.compat.v1.global_variables instead.

WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:216: The name tf.is_variable_initialized is deprecated. Please use tf.compat.v1.is_variable_initialized instead.

WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:223: The name tf.variables_initializer is deprecated. Please use tf.compat.v1.variables_initializer instead.

WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4267: The name tf.nn.max_pool is deprecated. Please use tf.nn.max_pool2d instead.

WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/optimizers.py:793: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:3576: The name tf.log is deprecated. Please use tf.math.log instead.

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
input_1 (InputLayer)         (None, 1000)              0         
_________________________________________________________________
embedding (Embedding)        (None, 1000, 100)         2000100   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 996, 128)          64128     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 199, 128)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 195, 128)          82048     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 39, 128)           0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 35, 128)           82048     
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 1, 128)            0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 258       
=================================================================
Total params: 2,245,094
Trainable params: 244,994
Non-trainable params: 2,000,100
_________________________________________________________________

We can confirm whether our embedding layer is trainable by looping through each layer and checking the trainable attribute.

df_model_layers = pd.DataFrame(
    [(layer.name, layer.trainable, layer.count_params()) for layer in model1.layers],
    columns=['layer', 'trainable', 'n_params']
)
df_model_layers

# time : 70
# test performance : auc 0.93212
start = time.time()
history1 = model1.fit(x_train, y_train,
                      validation_data=(x_val, y_val),
                      batch_size=128,
                      epochs=8)
end = time.time()
elapse1 = end - start
elapse1

WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/math_grad.py:1424: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:1033: The name tf.assign_add is deprecated. Please use tf.compat.v1.assign_add instead.

WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:1020: The name tf.assign is deprecated. Please use tf.compat.v1.assign instead.

Train on 20000 samples, validate on 5000 samples
Epoch 1/8
20000/20000 [==============================] - 12s 604us/step - loss: 0.5854 - acc: 0.6748 - val_loss: 0.4772 - val_acc: 0.7808
Epoch 2/8
20000/20000 [==============================] - 8s 416us/step - loss: 0.4001 - acc: 0.8186 - val_loss: 0.3766 - val_acc: 0.8352
Epoch 3/8
20000/20000 [==============================] - 8s 414us/step - loss: 0.3428 - acc: 0.8507 - val_loss: 0.4276 - val_acc: 0.7966
Epoch 4/8
20000/20000 [==============================] - 8s 415us/step - loss: 0.2790 - acc: 0.8842 - val_loss: 0.3433 - val_acc: 0.8594
Epoch 5/8
20000/20000 [==============================] - 8s 415us/step - loss: 0.2469 - acc: 0.8987 - val_loss: 0.4015 - val_acc: 0.8310
Epoch 6/8
20000/20000 [==============================] - 8s 420us/step - loss: 0.1782 - acc: 0.9289 - val_loss: 0.4670 - val_acc: 0.8296
Epoch 7/8
20000/20000 [==============================] - 8s 419us/step - loss: 0.1017 - acc: 0.9643 - val_loss: 0.5965 - val_acc: 0.8146
Epoch 8/8
20000/20000 [==============================] - 8s 418us/step - loss: 0.0680 - acc: 0.9758 - val_loss: 0.6876 - val_acc: 0.8332

70.9072277545929

Model without Pretrained Embedding¶

num_classes = 2
model2 = simple_text_cnn(max_sequence_len, max_features, num_classes)
model2.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
input_2 (InputLayer)         (None, 1000)              0         
_________________________________________________________________
embedding (Embedding)        (None, 1000, 100)         2000100   
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 996, 128)          64128     
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 199, 128)          0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 195, 128)          82048     
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 39, 128)           0         
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 35, 128)           82048     
_________________________________________________________________
max_pooling1d_6 (MaxPooling1 (None, 1, 128)            0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_4 (Dense)              (None, 2)                 258       
=================================================================
Total params: 2,245,094
Trainable params: 2,245,094
Non-trainable params: 0
_________________________________________________________________

# time : 86 secs
# test performance : auc 0.92310
start = time.time()
history1 = model2.fit(x_train, y_train,
                      validation_data=(x_val, y_val),
                      batch_size=128,
                      epochs=8)
end = time.time()
elapse1 = end - start
elapse1

Train on 20000 samples, validate on 5000 samples
Epoch 1/8
20000/20000 [==============================] - 11s 570us/step - loss: 0.5010 - acc: 0.7065 - val_loss: 0.3016 - val_acc: 0.8730
Epoch 2/8
20000/20000 [==============================] - 11s 542us/step - loss: 0.2024 - acc: 0.9243 - val_loss: 0.2816 - val_acc: 0.8824
Epoch 3/8
20000/20000 [==============================] - 11s 538us/step - loss: 0.0806 - acc: 0.9734 - val_loss: 0.3552 - val_acc: 0.8812
Epoch 4/8
20000/20000 [==============================] - 11s 535us/step - loss: 0.0272 - acc: 0.9917 - val_loss: 0.4671 - val_acc: 0.8836
Epoch 5/8
20000/20000 [==============================] - 11s 543us/step - loss: 0.0088 - acc: 0.9973 - val_loss: 0.6534 - val_acc: 0.8788
Epoch 6/8
20000/20000 [==============================] - 11s 542us/step - loss: 0.0090 - acc: 0.9973 - val_loss: 0.7522 - val_acc: 0.8740
Epoch 7/8
20000/20000 [==============================] - 11s 542us/step - loss: 0.0104 - acc: 0.9967 - val_loss: 1.0453 - val_acc: 0.8480
Epoch 8/8
20000/20000 [==============================] - 11s 543us/step - loss: 0.0205 - acc: 0.9924 - val_loss: 0.6930 - val_acc: 0.8712

87.63022541999817

Submission¶

For the submission section, we read in and preprocess the test data provided by the competition, then generate the predicted probability column for both the model that uses pretrained embedding and one that doesn't to compare their performance.

input_path = os.path.join(data_dir, 'word2vec-nlp-tutorial', 'testData.tsv')
df_test = pd.read_csv(input_path, delimiter='\t')
print(df_test.shape)
df_test.head()

(25000, 2)

def clean_text_without_label(df: pd.DataFrame, text_col: str) -> List[str]:
    texts = []
    for raw_text in df[text_col]:
        text = BeautifulSoup(raw_text).get_text()
        cleaned_text = clean_str(text)
        texts.append(cleaned_text)

    return texts

texts_test = clean_text_without_label(df_test, text_col)
sequences_test = tokenizer.texts_to_sequences(texts_test)
x_test = pad_sequences(sequences_test, maxlen=max_sequence_len)
len(x_test)

25000

def create_submission(ids, predictions, ids_col, prediction_col, submission_path) -> pd.DataFrame:
    df_submission = pd.DataFrame({
        ids_col: ids,
        prediction_col: predictions
    }, columns=[ids_col, prediction_col])

    if submission_path is not None:
        # create the directory if need be, e.g. if the submission_path = submission/submission.csv
        # we'll create the submission directory first if it doesn't exist
        directory = os.path.split(submission_path)[0]
        if (directory != '' or directory != '.') and not os.path.isdir(directory):
            os.makedirs(directory, exist_ok=True)

        df_submission.to_csv(submission_path, index=False, header=True)

    return df_submission

ids_col = 'id'
prediction_col = 'sentiment'
ids = df_test[ids_col]

models = {
    'pretrained_embedding': model1,
    'without_pretrained_embedding': model2
}

for model_name, model in models.items():
    print('generating submission for: ', model_name)
    submission_path = os.path.join(submission_dir, '{}_submission.csv'.format(model_name))
    predictions = model.predict(x_test, verbose=1)[:, 1]
    df_submission = create_submission(ids, predictions, ids_col, prediction_col, submission_path)

# sanity check to make sure the size and the output of the submission makes sense
print(df_submission.shape)
df_submission.head()

generating submission for:  pretrained_embedding
25000/25000 [==============================] - 6s 228us/step
generating submission for:  without_pretrained_embedding
25000/25000 [==============================] - 6s 222us/step
(25000, 2)

Summary¶

In this article, we took a look at how to leverage pre-trained word embeddings for our text classification task. There're also various Kaggle Kernels here and here that experiments whether different pre-trained embeddings or even an ensemble of models each with a different pre-trained embedding on various text classification tasks to see if it gives us an edge.

	id	sentiment	review
0	5814_8	1	With all this stuff going down at the moment w...
1	2381_9	1	\The Classic War of the Worlds\" by Timothy Hi...
2	7759_3	0	The film starts with a manager (Nicholas Bell)...
3	3630_4	0	It must be assumed that those who praised this...
4	9495_8	1	Superbly trashy and wondrously unpretentious 8...

	id	review
0	12311_10	Naturally in a film who's main themes are of m...
1	8348_2	This movie is a disaster within a disaster fil...
2	5828_4	All in all, this is a movie for kids. We saw i...
3	7186_2	Afraid of the Dark left me with the impression...
4	12128_7	A very accurate depiction of small time mob li...

	id	sentiment
0	12311_10	1.000
1	8348_2	0.000
2	5828_4	0.005
3	7186_2	0.071
4	12128_7	1.000

Table of Contents