Sequence classification using Recurrent Neural Networks
PyTorch implementation for sequence classification using RNNs
An example of many-to-one (sequence classification)
Original experiment from Hochreiter & Schmidhuber (1997).
The goal here is to classify sequences.
Elements and targets are represented locally (input vectors with only one non-zero bit).
The sequence starts with a B, ends with an E (the “trigger symbol”), and otherwise consists of randomly chosen symbols from the set {a, b, c, d}, except for two elements at positions t1 and t2 that are either X or Y.
For the DifficultyLevel.HARD case, the sequence length is randomly chosen between 100 and 110, t1 is randomly chosen between 10 and 20, and t2 is randomly chosen between 50 and 60.
There are 4 sequence classes Q, R, S, and U, which depend on the temporal order of X and Y.
The rules are:
X, X -> Q,
X, Y -> R,
Y, X -> S,
Y, Y -> U.
from sequential_tasks import TemporalOrderExp6aSequence as QRSU
# Build a generator of example batches. The predefined generators are
# implemented in the sequential_tasks module.
example_generator = QRSU.get_predefined_generator(difficulty_level=QRSU.DifficultyLevel.EASY, batch_size=32)

# Fetch the batch stored at index 1 and report its structure.
example_batch = example_generator[1]
print(f'The return type is a {type(example_batch)} with length {len(example_batch)}.')
print(f'The first item in the tuple is the batch of sequences with shape {example_batch[0].shape}.')
print(f'The first element in the batch of sequences is:\n {example_batch[0][0, :, :]}')
print(f'The second item in the tuple is the corresponding batch of class labels with shape {example_batch[1].shape}.')
print(f'The first element in the batch of class labels is:\n {example_batch[1][0, :]}')

# Turn the first encoded sequence of the batch back into its symbols.
first_sequence = example_batch[0][0, :, :]
sequence_decoded = example_generator.decode_x(first_sequence)
print(f'The sequence is: {sequence_decoded}')

# Turn the encoded class label of that same sequence back into its name.
first_label = example_batch[1][0]
class_label_decoded = example_generator.decode_y(first_label)
print(f'The class label is: {class_label_decoded}')
We can see that our sequence contains 8 elements, starting with B and ending with E. This sequence belongs to class Q as per the rules defined earlier. Each element is one-hot encoded. Thus, we can represent our first sequence (BbXcXcbE) as a sequence of rows of one-hot encoded vectors (as shown above). Similarly, class Q is encoded as the one-hot vector [1,0,0,0].
import torch
import torch.nn as nn
# Fix PyTorch's global random seed so that repeated runs of this script
# give reproducible results (e.g. the randomly initialised weights of the
# models created below).
torch.manual_seed(1)
class SimpleRNN(nn.Module):
    """Single-layer ReLU RNN with a linear read-out applied at every time step.

    Args:
        input_size: Number of features of each sequence element.
        hidden_size: Number of features of the recurrent hidden state.
        output_size: Number of values produced per time step.

    The recurrent layer is built with ``batch_first=True``, so ``forward``
    takes the batch dimension first and returns one output vector per time
    step; callers that only need the last step must slice it out themselves.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # Sub-modules stored as attributes are registered on the parent
        # Module, which makes their parameters visible to `parameters()`.
        self.rnn = nn.RNN(
            input_size,
            hidden_size,
            nonlinearity='relu',
            batch_first=True,
        )
        self.linear = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # No initial hidden state is passed, so the RNN starts from zeros.
        # The final hidden state (second return value) is not needed here.
        per_step_hidden, _ = self.rnn(x)
        return self.linear(per_step_hidden)
def train(model, train_data_gen, criterion, optimizer, device):
    """Run one training pass over every batch of ``train_data_gen``.

    Args:
        model: Module whose output is indexed as ``output[:, -1, :]``, i.e.
            it must return one prediction vector per sequence position.
        train_data_gen: Object supporting ``len()`` (number of batches) and
            integer indexing, where each item is a ``(data, target)`` pair of
            NumPy arrays. ``target`` is reduced with ``argmax(dim=1)``, so it
            is expected to hold one-hot rows.
        criterion: Loss callable taking ``(output, class_indices)``.
        optimizer: Optimizer updating the parameters of ``model``.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Tuple ``(num_correct, loss)``: the number of correctly classified
        sequences over the whole pass and the loss value of the *last* batch
        as a Python float. If the generator yields no batches, the result is
        ``(0, nan)`` instead of failing on an unbound ``loss``.
    """
    # Training mode enables layers that behave differently during
    # evaluation, such as dropout.
    model.train()

    num_correct = 0
    # Stays None when there are no batches, so the return below can tell.
    loss = None

    # Batches are fetched by index; the generator defines its own length.
    for batch_idx in range(len(train_data_gen)):
        data, target = train_data_gen[batch_idx]
        data = torch.from_numpy(data).float().to(device)
        target = torch.from_numpy(target).long().to(device)

        # Step 1: forward pass. Many-to-one classification only needs the
        # prediction at the last sequence position (the original notes state
        # that inputs are pre-padded).
        output = model(data)[:, -1, :]

        # Step 2: loss. Losses such as CrossEntropyLoss expect class indices
        # rather than one-hot labels, e.g. [0, 1, 0, 0] -> 1.
        target = target.argmax(dim=1)
        loss = criterion(output, target)

        # Step 3: clear gradients left over from the previous batch,
        # Step 4: back-propagate, Step 5: update the parameters.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        y_pred = output.argmax(dim=1)
        num_correct += (y_pred == target).sum().item()

    if loss is None:
        return num_correct, float('nan')
    return num_correct, loss.item()
def test(model, test_data_gen, criterion, device):
# Set the model to evaluation mode. This will turn off layers that would
# otherwise behave differently during training, such as dropout.
model.eval()
# Store the number of sequences that were classified correctly
num_correct = 0
# A context manager is used to disable gradient calculations during inference
# to reduce memory usage, as we typically don't need the gradients at this point.
with torch.no_grad():
for batch_idx in range(len(test_data_gen)):
data, target = test_data_gen[batch_idx]
data, target = torch.from_numpy(data).float().to(device), torch.from_numpy(target).long().to(device)
output = model(data)
# Pick only the output corresponding to last sequence element (input is pre padded)
output = output[:, -1, :]
target = target.argmax(dim=1)
loss = criterion(output, target)
y_pred = output.argmax(dim=1)
num_correct += (y_pred == target).sum().item()
return num_correct, loss.item()
import matplotlib.pyplot as plt
from plot_lib import set_default, plot_state, print_colourbar
# Call plot_lib's set_default() before any figure is created below
# (presumably it configures the plotting defaults -- confirm in plot_lib).
set_default()
def train_and_test(model, train_data_gen, test_data_gen, criterion, optimizer, max_epochs, verbose=True):
    """Train ``model`` for ``max_epochs`` epochs, evaluating after each one.

    After the last epoch, the per-epoch loss and accuracy of both the
    training and the test pass are plotted side by side.

    Args:
        model: Module to optimise; it is moved to the selected device.
        train_data_gen: Batch generator for ``train``; must expose ``len()``
            and a ``batch_size`` attribute.
        test_data_gen: Batch generator for ``test``; same requirements.
        criterion: Loss callable forwarded to ``train`` and ``test``.
        optimizer: Optimizer forwarded to ``train``.
        max_epochs: Number of train/test rounds to run.
        verbose: When true, print metrics after every epoch; otherwise only
            after the final epoch.

    Returns:
        The same ``model`` object that was passed in.
    """
    # Use the first CUDA device when one is available, the CPU otherwise,
    # and run both training and testing on it.
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Per-epoch loss and accuracy for each of the two passes.
    history_train = {'loss': [], 'acc': []}
    history_test = {'loss': [], 'acc': []}

    def record(history, data_gen, num_correct, loss):
        # A generator's length is its number of batches, so multiplying by
        # the batch size recovers the total number of sequences.
        total_sequences = len(data_gen) * data_gen.batch_size
        accuracy = float(num_correct) / total_sequences * 100
        history['loss'].append(loss)
        history['acc'].append(accuracy)

    for epoch in range(max_epochs):
        train_correct, train_loss = train(model, train_data_gen, criterion, optimizer, device)
        record(history_train, train_data_gen, train_correct, train_loss)

        test_correct, test_loss = test(model, test_data_gen, criterion, device)
        record(history_test, test_data_gen, test_correct, test_loss)

        is_last_epoch = epoch + 1 == max_epochs
        if verbose or is_last_epoch:
            print(f'[Epoch {epoch + 1}/{max_epochs}]'
                  f" loss: {history_train['loss'][-1]:.4f}, acc: {history_train['acc'][-1]:2.2f}%"
                  f" - test_loss: {history_test['loss'][-1]:.4f}, test_acc: {history_test['acc'][-1]:2.2f}%")

    # Diagnostic plots: one panel per metric, train and test curves overlaid.
    _, axes = plt.subplots(ncols=2, figsize=(9, 4.5))
    for panel, metric_name in zip(axes, ('loss', 'acc')):
        panel.plot(history_train[metric_name])
        panel.plot(history_test[metric_name])
        panel.set_xlabel('epoch', fontsize=12)
        panel.set_ylabel(metric_name, fontsize=12)
        panel.legend(['Train', 'Test'], loc='best')
    plt.show()

    return model
# Data: one generator for training and one for testing, both at the same
# difficulty level and batch size.
difficulty = QRSU.DifficultyLevel.EASY
batch_size = 32
train_data_gen = QRSU.get_predefined_generator(difficulty_level=difficulty, batch_size=batch_size)
test_data_gen = QRSU.get_predefined_generator(difficulty_level=difficulty, batch_size=batch_size)

# Model: input and output widths are taken from the generator; the hidden
# state is kept small.
input_size = train_data_gen.n_symbols
hidden_size = 4
output_size = train_data_gen.n_classes
model = SimpleRNN(input_size=input_size, hidden_size=hidden_size, output_size=output_size)

# Optimisation settings.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.RMSprop(model.parameters(), lr=0.001)
max_epochs = 10

# Run training, reporting metrics after every epoch.
model = train_and_test(
    model,
    train_data_gen,
    test_data_gen,
    criterion,
    optimizer,
    max_epochs,
)
# Show the shape of every parameter tensor of the trained model.
for parameter in model.parameters():
    print(parameter.shape)
# Data: one generator for training and one for testing, both at the same
# difficulty level and batch size.
difficulty = QRSU.DifficultyLevel.EASY
batch_size = 32
train_data_gen = QRSU.get_predefined_generator(difficulty_level=difficulty, batch_size=batch_size)
test_data_gen = QRSU.get_predefined_generator(difficulty_level=difficulty, batch_size=batch_size)

# Model: a fresh network with the same architecture as before.
input_size = train_data_gen.n_symbols
hidden_size = 4
output_size = train_data_gen.n_classes
model = SimpleRNN(input_size=input_size, hidden_size=hidden_size, output_size=output_size)

# Optimisation settings; this run uses 100 epochs.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.RMSprop(model.parameters(), lr=0.001)
max_epochs = 100

# Run training quietly: only the final epoch's metrics are printed.
model = train_and_test(
    model,
    train_data_gen,
    test_data_gen,
    criterion,
    optimizer,
    max_epochs,
    verbose=False,
)
We see that with short 8-element sequences, the RNN reaches only about 50% accuracy after 10 epochs. Increasing the number of epochs to 100 brings the RNN to 100% accuracy, at the cost of a longer training time. For longer sequences, RNNs fail to retain the information. Long Short-Term Memory (LSTM) networks address this long-term memory loss by building up memory cells that preserve past information. For a very detailed explanation of how LSTMs work, please follow this link. In my other notebook, we will see how LSTMs perform on even longer sequence classification tasks.