# code for loading the format for the notebook
import os
# path : store the current path to convert back to it later
path = os.getcwd()
os.chdir(os.path.join('..', '..', 'notebook_format'))
from formats import load_style
load_style(plot_style=False)
os.chdir(path)
# 1. magic for inline plot
# 2. magic to print version
# 3. magic so that the notebook will reload external python modules
# 4. magic to enable retina (high resolution) plots
# https://gist.github.com/minrk/3301035
%matplotlib inline
%load_ext watermark
%load_ext autoreload
%autoreload 2
%config InlineBackend.figure_format='retina'
import os
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from typing import List, Tuple
from keras import layers
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
# prevent scientific notations
pd.set_option('display.float_format', lambda x: '%.3f' % x)
%watermark -a 'Ethen' -d -t -v -p numpy,pandas,sklearn,keras,sentencepiece
In this notebook, we will be experimenting with subword tokenization. Tokenization is often one of the first mandatory steps in an NLP pipeline, where we break a piece of text down into meaningful individual units/tokens.
There are three major ways of performing tokenization.
Character Level
Treats each character (or unicode symbol) as one individual token.
Word Level
Performs word segmentation on top of our text data.
The blog post Language modeling a billion words also shared some thoughts comparing character-based tokenization versus word-based tokenization. The following is taken directly from the post.
Word-level models have an important advantage over char-level models. Take the following sequence as an example (a quote from Robert A. Heinlein):
Progress isn't made by early risers. It's made by lazy men trying to find easier ways to do something.
After tokenization, the word-level model might view this sequence as containing 22 tokens. On the other hand, the char-level will view this sequence as containing 102 tokens. This longer sequence makes the task of the character model harder than the word model, as it must take into account dependencies between more tokens over more time-steps. Another issue with character language models is that they need to learn spelling in addition to syntax, semantics, etc. In any case, word language models will typically have lower error than character models.
The main advantage of character over word language models is that they have a really small vocabulary. For example, the GBW dataset will contain approximately 800 characters compared to 800,000 words (after pruning low-frequency tokens). In practice this means that character models will require less memory and have faster inference than their word counterparts. Another advantage is that they do not require tokenization as a preprocessing step.
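As a quick sanity check on the numbers in that quote, the short snippet below counts tokens under naive whitespace splitting and under character splitting. The exact word-token count depends on how punctuation and contractions are handled, which is presumably how the post arrives at 22, so treat the word count here as a rough lower bound.
quote = ("Progress isn't made by early risers. "
         "It's made by lazy men trying to find easier ways to do something.")

# naive whitespace splitting keeps punctuation attached to the preceding word
print('word-level tokens (whitespace split):', len(quote.split()))  # 19

# counting every character, including spaces and punctuation
print('character-level tokens:', len(quote))  # 102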
Subword Level
As we can probably imagine, subword level sits somewhere between character level and word level, hence it tries to bring in the pros of both approaches (e.g. being able to handle out-of-vocabulary or rare words better) while mitigating their drawbacks (e.g. becoming too fine-grained for downstream tasks). With subword level, what we are aiming for is to represent an open vocabulary through a fixed-sized vocabulary of variable-length character sequences, e.g. the word highest might be segmented into the subwords high and est.
There are many different methods for generating these subwords, e.g. byte pair encoding (BPE) or the unigram language model, which is what we'll be using through the sentencepiece library later in this notebook.
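To make the idea a bit more concrete, below is a minimal, simplified sketch of one such method, byte pair encoding (BPE), run on a made-up toy vocabulary (the words, counts and number of merges are purely illustrative, and this is not the exact algorithm sentencepiece will run for us later, where we'll pick the unigram language model). Starting from individual characters, BPE repeatedly merges the most frequent adjacent pair of symbols.
from collections import Counter

def get_pair_counts(vocab):
    # count how often each adjacent pair of symbols appears across the vocabulary
    pairs = Counter()
    for word, freq in vocab.items():
        symbols = word.split()
        for i in range(len(symbols) - 1):
            pairs[symbols[i], symbols[i + 1]] += freq
    return pairs

def merge_pair(pair, vocab):
    # merge the chosen pair into a single symbol everywhere it occurs
    merged = ' '.join(pair)
    replacement = ''.join(pair)
    return {word.replace(merged, replacement): freq for word, freq in vocab.items()}

# each word is written as space-separated symbols, starting from single characters
toy_vocab = {'h i g h': 5, 'h i g h e s t': 3, 'l o w e s t': 2}
for _ in range(5):
    pair_counts = get_pair_counts(toy_vocab)
    best_pair = max(pair_counts, key=pair_counts.get)
    toy_vocab = merge_pair(best_pair, toy_vocab)
    print('merged:', best_pair)

print(toy_vocab)
With this toy setup, after five merges highest ends up represented as the two subwords high and est (matching the example above), while the frequent word high survives as a single token.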
We'll use the movie review sentiment analysis dataset from Kaggle for this example. It's a binary classification problem with AUC as the evaluation metric. The next few code chunks perform the usual text preprocessing, build up the word vocabulary and perform a train/test split.
data_dir = 'data'
submission_dir = 'submission'
input_path = os.path.join(data_dir, 'word2vec-nlp-tutorial', 'labeledTrainData.tsv')
df = pd.read_csv(input_path, delimiter='\t')
print(df.shape)
df.head()
raw_text = df['review'].iloc[0]
raw_text
import re
def clean_str(string: str) -> str:
    # strip out backslashes and quote characters, then lowercase the text
    string = re.sub(r"\\", "", string)
    string = re.sub(r"\'", "", string)
    string = re.sub(r"\"", "", string)
    return string.strip().lower()
from bs4 import BeautifulSoup
def clean_text(df: pd.DataFrame,
               text_col: str,
               label_col: str) -> Tuple[List[str], List[int]]:
    texts = []
    labels = []
    for raw_text, label in zip(df[text_col], df[label_col]):
        # strip html tags before applying the string cleanup
        text = BeautifulSoup(raw_text, 'html.parser').get_text()
        cleaned_text = clean_str(text)
        texts.append(cleaned_text)
        labels.append(label)

    return texts, labels
text_col = 'review'
label_col = 'sentiment'
texts, labels = clean_text(df, text_col, label_col)
print('sample text: ', texts[0])
print('corresponding label:', labels[0])
random_state = 1234
val_split = 0.2
labels = to_categorical(labels)
texts_train, texts_val, y_train, y_val = train_test_split(
    texts, labels,
    test_size=val_split,
    random_state=random_state)
print('labels shape:', labels.shape)
print('train size: ', len(texts_train))
print('validation size: ', len(texts_val))
To train our text classifier, we specify a 1D convolutional network. The comparison we'll be experimenting with is whether the subword-level model gives better performance than the word-level model.
def simple_text_cnn(max_sequence_len: int, max_features: int, num_classes: int,
                    optimizer: str='adam', metrics: List[str]=['acc']) -> Model:
    sequence_input = layers.Input(shape=(max_sequence_len,), dtype='int32')
    embedded_sequences = layers.Embedding(max_features, 100,
                                          trainable=True)(sequence_input)
    conv1 = layers.Conv1D(128, 5, activation='relu')(embedded_sequences)
    pool1 = layers.MaxPooling1D(5)(conv1)
    conv2 = layers.Conv1D(128, 5, activation='relu')(pool1)
    pool2 = layers.MaxPooling1D(5)(conv2)
    conv3 = layers.Conv1D(128, 5, activation='relu')(pool2)
    pool3 = layers.MaxPooling1D(35)(conv3)
    flatten = layers.Flatten()(pool3)
    dense = layers.Dense(128, activation='relu')(flatten)
    preds = layers.Dense(num_classes, activation='softmax')(dense)

    model = Model(sequence_input, preds)
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=metrics)
    return model
The next couple of code chunks train the subword vocabulary, encode our original text into these subwords and pad the sequences to a fixed length. Note that the pad_sequences function from keras assumes that index 0 is reserved for padding, hence when learning the subword vocabulary with sentencepiece, we make sure to keep the indices consistent.
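To make the note concrete, here's a quick illustration of that padding behavior with made-up token ids: pad_sequences pre-pads (and truncates) with the value 0 by default, which is why id 0 must not be assigned to any real subword.
from keras.preprocessing.sequence import pad_sequences

toy_ids = [[11, 42, 7]]  # made-up token ids, shorter than maxlen
print(pad_sequences(toy_ids, maxlen=5))  # [[ 0  0 11 42  7]]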
# write the raw text so that sentencepiece can consume it
temp_file = 'train.txt'
with open(temp_file, 'w') as f:
    f.write('\n'.join(texts))
from sentencepiece import SentencePieceTrainer, SentencePieceProcessor
max_num_words = 30000
model_type = 'unigram'
model_prefix = model_type
pad_id = 0
unk_id = 1
bos_id = 2
eos_id = 3
sentencepiece_params = ' '.join([
    '--input={}'.format(temp_file),
    '--model_type={}'.format(model_type),
    '--model_prefix={}'.format(model_prefix),
    '--vocab_size={}'.format(max_num_words),
    '--pad_id={}'.format(pad_id),
    '--unk_id={}'.format(unk_id),
    '--bos_id={}'.format(bos_id),
    '--eos_id={}'.format(eos_id)
])
print(sentencepiece_params)
SentencePieceTrainer.train(sentencepiece_params)
sp = SentencePieceProcessor()
sp.load("{}.model".format(model_prefix))
print('Found %s unique tokens.' % sp.get_piece_size())
max_sequence_len = 1000
sequences_train = [sp.encode_as_ids(text) for text in texts_train]
x_train = pad_sequences(sequences_train, maxlen=max_sequence_len)
sequences_val = [sp.encode_as_ids(text) for text in texts_val]
x_val = pad_sequences(sequences_val, maxlen=max_sequence_len)
sequences_train[0][:5]
print('sample text: ', texts_train[0])
print('sample text: ', sp.encode_as_pieces(sp.decode_ids(x_train[0].tolist())))
num_classes = 2
model1 = simple_text_cnn(max_sequence_len, max_num_words + 1, num_classes)
model1.summary()
# time : 120
# performance : 0.92936
start = time.time()
history1 = model1.fit(x_train, y_train,
                      validation_data=(x_val, y_val),
                      batch_size=128,
                      epochs=8)
end = time.time()
elapse1 = end - start
elapse1
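As a comparison, we now repeat the same training procedure with word-level tokenization, using keras' Tokenizer to build the word vocabulary.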
tokenizer = Tokenizer(num_words=max_num_words, oov_token='<unk>')
tokenizer.fit_on_texts(texts_train)
print('Found %s unique tokens.' % len(tokenizer.word_index))
sequences_train = tokenizer.texts_to_sequences(texts_train)
x_train = pad_sequences(sequences_train, maxlen=max_sequence_len)
sequences_val = tokenizer.texts_to_sequences(texts_val)
x_val = pad_sequences(sequences_val, maxlen=max_sequence_len)
num_classes = 2
model2 = simple_text_cnn(max_sequence_len, max_num_words + 1, num_classes)
model2.summary()
# time : 120
# performance : 0.92520
start = time.time()
history2 = model2.fit(x_train, y_train,
                      validation_data=(x_val, y_val),
                      batch_size=128,
                      epochs=8)
end = time.time()
elapse2 = end - start
elapse2
For the submission section, we read in and preprocess the test data provided by the competition, then generate the predicted probabilities for both the model that uses word-level tokenization and the one that uses subword tokenization so we can compare their performance.
input_path = os.path.join(data_dir, 'word2vec-nlp-tutorial', 'testData.tsv')
df_test = pd.read_csv(input_path, delimiter='\t')
print(df_test.shape)
df_test.head()
def clean_text_without_label(df: pd.DataFrame, text_col: str) -> List[str]:
    texts = []
    for raw_text in df[text_col]:
        text = BeautifulSoup(raw_text, 'html.parser').get_text()
        cleaned_text = clean_str(text)
        texts.append(cleaned_text)

    return texts
texts_test = clean_text_without_label(df_test, text_col)
# word-level
word_sequences_test = tokenizer.texts_to_sequences(texts_test)
word_x_test = pad_sequences(word_sequences_test, maxlen=max_sequence_len)
len(word_x_test)
# subword-level
sentencepiece_sequences_test = [sp.encode_as_ids(text) for text in texts_test]
sentencepiece_x_test = pad_sequences(sentencepiece_sequences_test, maxlen=max_sequence_len)
len(sentencepiece_x_test)
def create_submission(ids, predictions, ids_col, prediction_col, submission_path) -> pd.DataFrame:
    df_submission = pd.DataFrame({
        ids_col: ids,
        prediction_col: predictions
    }, columns=[ids_col, prediction_col])

    if submission_path is not None:
        # create the directory if need be, e.g. if the submission_path = submission/submission.csv
        # we'll create the submission directory first if it doesn't exist
        directory = os.path.split(submission_path)[0]
        if directory not in ('', '.') and not os.path.isdir(directory):
            os.makedirs(directory, exist_ok=True)

        df_submission.to_csv(submission_path, index=False, header=True)

    return df_submission
ids_col = 'id'
prediction_col = 'sentiment'
ids = df_test[ids_col]
predictions_dict = {
    'sentencepiece_cnn': model1.predict(sentencepiece_x_test)[:, 1],  # 0.92936
    'word_cnn': model2.predict(word_x_test)[:, 1]  # 0.92520
}
for model_name, predictions in predictions_dict.items():
    print('generating submission for: ', model_name)
    submission_path = os.path.join(submission_dir, '{}_submission.csv'.format(model_name))
    df_submission = create_submission(ids, predictions, ids_col, prediction_col, submission_path)
# sanity check to make sure the size and the output of the submission makes sense
print(df_submission.shape)
df_submission.head()
We've looked at the performance of leveraging subword tokenization for our text classification task. Note that some other ideas that we did not try out are: