Lab session: your first Neural Networks

PyTorch intro exercises

Author

Affiliation

Claudio Mirabello

NBIS

PyTorch intro exercises

1. Build a simple sequential model

A model to classify clinical variables into a number of possible outcomes
Can you build a sequential model to reproduce the graph shown in the figure?
Choose whatever activations you want, wherever possible
How many outcomes/classes are we predicting?

import torch.nn as nn
# Add your model here: replace the `...` placeholders with a sequential model
# that reproduces the graph shown in the figure.
model = ...
...
# Printing the model lists its layers, which makes it easy to compare with the figure
print(model)

2. Build a better XOR classifier

Given the model seen in the lecture, how can we make a better classifier (one with higher accuracy)?

More layers? More neurons?
Generate more data?
More epochs?
Different batch size?
Different optimizer?
It’s up to you! Let’s see who does best on validation

Only for Tuesday’s session:

Different activations?
Add Dropout? How large?

Plotting and training helper functions:

from typing import Optional
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import torchmetrics

class LivePlot():
    """Figure widget that is extended point by point while training runs.

    The figure has a single x-axis and two y-axes (left and right). Each
    named series is stored as its own scatter trace; new points are placed at
    the current x position, which is advanced with :meth:`tick`.

    NOTE(review): relies on ``display`` being available in the namespace
    (as it is inside an IPython/Jupyter session).
    """

    def __init__(self, left_label="Loss", right_label="Accuracy"):
        # A single subplot that carries a secondary (right-hand) y-axis
        layout = make_subplots(specs=[[{"secondary_y": True}]])
        self.fig = go.FigureWidget(layout)
        for axis_title, on_right in ((left_label, False), (right_label, True)):
            self.fig.update_yaxes(title_text=axis_title, secondary_y=on_right)

        # series name -> position of its trace in self.fig.data
        self.plot_indices = {}
        # series name -> whether the series is drawn against the right axis
        self.trace_secondary = {}
        display(self.fig)
        # [lower, upper] range of the x-axis
        self.limits = [0, 0]
        # x position at which the next reported values are plotted
        self.current_x = 0

    def report(self, name: str, value: float, secondary_y: bool = False):
        """Append ``value`` to the series ``name`` at the current x position.

        The trace for ``name`` is created on first use; ``secondary_y`` is
        only taken into account at that point.
        """
        if name not in self.plot_indices:
            new_index = len(self.fig.data)
            self.fig.add_scatter(
                x=[], y=[], name=name,
                secondary_y=secondary_y
            )
            self.plot_indices[name] = new_index
            self.trace_secondary[name] = secondary_y

        trace = self.fig.data[self.plot_indices[name]]
        trace.y += (value,)
        trace.x += (self.current_x,)

    def increment(self, n_ticks: int):
        """Extend the upper x-axis limit by ``n_ticks``."""
        self.set_limit(self.limits[1] + n_ticks)

    def set_limit(self, n_ticks: int):
        """Set the upper x-axis limit to ``n_ticks``."""
        self.limits[1] = n_ticks
        self.fig.update_layout(xaxis_range=self.limits)

    def tick(self, n_ticks: Optional[int] = None):
        """Advance the current x position by ``n_ticks`` (one step if omitted)."""
        step = 1 if n_ticks is None else n_ticks
        self.current_x += step

def _summed_batch_loss(loss, criterion, batch_size):
    """Return the scalar batch loss expressed as a sum over the batch examples.

    A criterion with ``reduction='mean'`` (also assumed when the criterion has
    no ``reduction`` attribute) returns the batch average, so it is scaled
    back by the batch size. Any other reduction is used unchanged.
    """
    if getattr(criterion, 'reduction', 'mean') == 'mean':
        return loss.item() * batch_size
    return loss.item()


def train(*,
          model: torch.nn.Module,
          train_loader: DataLoader,
          dev_loader: DataLoader,
          optimizer: torch.optim.Optimizer,
          criterion: torch.nn.Module,
          max_epochs: int,
          metric: Optional["torchmetrics.Metric"] = None,
          device: Optional[torch.device] = None,
          liveplot: Optional["LivePlot"] = None):
    """Train ``model`` for ``max_epochs`` epochs and evaluate it after each one.

    Args:
        model: Network to optimise; it is moved to ``device``.
        train_loader: Yields ``(x_batch, y_batch)`` pairs used for the
            optimisation steps.
        dev_loader: Yields ``(x_batch, y_batch)`` pairs used for evaluation
            only (no gradients, model in eval mode).
        optimizer: Optimizer that updates the parameters of ``model``.
        criterion: Loss called as ``criterion(y_hat, y_batch)``.
        max_epochs: Number of passes over ``train_loader``.
        metric: Optional callable invoked once per development batch as
            ``metric(argmax(y_hat, -1), y_batch)``. If it has a ``to`` method
            it is moved to ``device`` as well.
        device: Device to run on. Defaults to CUDA when available, else CPU.
        liveplot: Optional plot. After every epoch it receives one ``tick()``
            followed by ``report()`` calls for "Training loss",
            "Development loss" and, if ``metric`` is given,
            "Development accuracy" (on the secondary axis).

    The reported losses are averages per example; the reported accuracy is
    the average of the per-batch metric values. Values for a split without
    any example are not reported.
    """
    if device is None:
        device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    model.to(device)
    # The metric is called with tensors living on `device`, so any state it
    # holds has to live there too.
    if metric is not None and hasattr(metric, 'to'):
        metric.to(device)

    for _ in range(max_epochs):
        training_loss_acc = 0.0
        training_examples = 0
        model.train()

        for x_batch, y_batch in train_loader:
            optimizer.zero_grad()

            x_batch = x_batch.to(device)
            y_batch = y_batch.to(device)
            y_hat = model(x_batch)

            loss = criterion(y_hat, y_batch)
            loss.backward()

            optimizer.step()
            batch_size = x_batch.size(0)
            # Accumulate a per-example sum so that dividing by the number of
            # examples below yields a true per-example average.
            training_loss_acc += _summed_batch_loss(loss, criterion, batch_size)
            training_examples += batch_size

        model.eval()
        dev_loss_acc = 0.0
        dev_examples = 0
        dev_batches = 0
        dev_accuracy = 0.0
        with torch.no_grad():
            for x_batch, y_batch in dev_loader:
                x_batch = x_batch.to(device)
                # Targets are moved once and shared by the loss and the metric,
                # so both see tensors on the same device as the predictions.
                y_batch = y_batch.to(device)
                y_hat = model(x_batch)

                batch_size = x_batch.size(0)
                dev_loss_acc += _summed_batch_loss(criterion(y_hat, y_batch), criterion, batch_size)
                dev_examples += batch_size
                dev_batches += 1
                if metric is not None:
                    # float() turns the metric value into a plain number for plotting
                    dev_accuracy += float(metric(torch.argmax(y_hat, -1), y_batch))

        if liveplot is not None:
            liveplot.tick() # Update the liveplot time
            # Guard against empty loaders: there is nothing to average then
            if training_examples:
                liveplot.report("Training loss", training_loss_acc / training_examples)
            if dev_examples:
                liveplot.report("Development loss", dev_loss_acc / dev_examples)
                if metric is not None:
                    liveplot.report("Development accuracy", dev_accuracy / dev_batches, secondary_y=True)

Data generation step:

import numpy as np
# Generate XOR data: 10000 points with three coordinates shifted to [-0.5, 0.5)
data = np.random.random((10000, 3)) - 0.5

# Three-way XOR of the coordinate signs: the label is 1 exactly when an odd
# number of the three coordinates is positive, and 0 otherwise.
labels = ((data > 0).sum(axis=1) % 2).astype(np.float64)

# Print the first few points next to their labels as a quick sanity check of the XOR rule
for row in range(3):
    print("{0: .2f} xor {1: .2f} xor {2: .2f} equals {3:}".format(*data[row], labels[row]))

The baseline network to improve:

class MLP(torch.nn.Module):
    """Multi-layer perceptron with one hidden layer: Linear -> Tanh -> Linear.

    Args:
        input_dim: Number of input features.
        output_dim: Number of output values.
        hidden_dim: Number of units in the hidden layer.
    """

    def __init__(self, input_dim, output_dim, hidden_dim):
        super().__init__()
        hidden = nn.Linear(input_dim, hidden_dim)
        activation = nn.Tanh()
        readout = nn.Linear(hidden_dim, output_dim)
        self.layers = nn.Sequential(hidden, activation, readout)

    def forward(self, x):
        """Pass ``x`` through the layer stack and return its output."""
        return self.layers(x)

# Baseline model from the MLP class: 3 inputs, 2 outputs (one per class) and
# a hidden layer of only 3 units
model = MLP(input_dim=3, output_dim=2, hidden_dim=3)
epochs = 20

# define optimizer and loss function: plain SGD with a cross-entropy loss
learning_rate = 1e-3
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()
# accuracy metric, handed to train() below as `metric`
accuracy = torchmetrics.Accuracy(task='multiclass', num_classes=2, top_k=1)

# convert numpy arrays to torch tensors (labels are cast to integer class indices)
tdata = torch.Tensor(data)
tlabels = torch.Tensor(labels).long()
dataset = TensorDataset(tdata, tlabels)

# split the data randomly: 90% for training, the remainder for development
total_samples = data.shape[0]
train_samples = int(total_samples * 0.9)
train_set, dev_set = torch.utils.data.random_split(dataset, [train_samples, total_samples-train_samples])

# shuffle data at training time (the development data is read in a fixed order)
train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
dev_loader = DataLoader(dev_set, batch_size=32)

# Setup plot: make the x-axis wide enough for all epochs
liveplot = LivePlot()
liveplot.increment(epochs)

# Run the training loop; loss and accuracy are reported to the live plot
train(model=model, 
      train_loader=train_loader, 
      dev_loader=dev_loader, 
      optimizer=optimizer, 
      criterion=criterion,
      metric=accuracy,
      max_epochs=epochs, 
      liveplot=liveplot,
      device=torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu'))

#Add your code here

3. Build a regression model

Take the Boston housing dataset (http://lib.stat.cmu.edu/datasets/boston)
It records a set of variables for residential areas (towns and census tracts) in and around Boston, including among others:
- CRIM per capita crime rate by town
- ZN proportion of residential land zoned for lots over 25,000 sq.ft.
- INDUS proportion of non-retail business acres per town
- CHAS Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
- NOX nitric oxides concentration (parts per 10 million)
- RM average number of rooms per dwelling
Can we use these variables to predict the median value of a house (MEDV, in thousands of dollars)?

Download the data:

# Shell escapes (notebook only): create ./data if it does not exist yet
!mkdir -p data
# Download the Boston housing CSV into ./data
!wget -P ./data/ https://github.com/selva86/datasets/raw/refs/heads/master/BostonHousing.csv

# List the directory to check that the file is there
!ls data/

Load the data with pandas:

import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler # hint

# pandas provides read_csv; it is imported here because the call below needs it
import pandas as pd

# Read the CSV into a DataFrame, show its first rows, then switch to a plain array
data = pd.read_csv("data/BostonHousing.csv")
print(data.head())
data = np.array(data)
print(f"Data shape is: {data.shape}")

# The first 90% of the rows are used for training, the remaining rows for
# development (the rows are split in file order, without shuffling)
train_samples = int(data.shape[0] * 0.9)
# Columns 0-12 are the inputs, everything from column 13 onwards is the target
train_x = data[:train_samples, :13]
train_y = data[:train_samples, 13:]

dev_x = data[train_samples:, :13]
dev_y = data[train_samples:, 13:]

train_set = TensorDataset(torch.Tensor(train_x), torch.Tensor(train_y))
dev_set = TensorDataset(torch.Tensor(dev_x), torch.Tensor(dev_y))

# shuffle data at training time
train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
dev_loader = DataLoader(dev_set, batch_size=32)

class MLP(torch.nn.Module):
    """Network with a single hidden layer and a tanh activation.

    Args:
        input_dim: Size of each input vector.
        output_dim: Size of each output vector.
        hidden_dim: Width of the hidden layer.
    """

    def __init__(self, input_dim, output_dim, hidden_dim):
        super().__init__()
        # Modules are applied in list order
        stack = [
            nn.Linear(input_dim, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, output_dim),
        ]
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Return the output of the layer stack for the input ``x``."""
        return self.layers(x)

# model from MLP class
# (exercise: replace every `...` below with your own choice before running)
model = ...
epochs = ...

# define optimizer and loss function
optimizer = ...
# pick a loss from the list at
# https://docs.pytorch.org/docs/stable/nn.html#loss-functions
criterion = ...

# Setup plot: make the x-axis wide enough for all epochs
liveplot = LivePlot()
liveplot.increment(epochs)

# Run the training loop; no metric is passed, so only the losses are plotted
train(model=model, 
      train_loader=train_loader, 
      dev_loader=dev_loader, 
      optimizer=optimizer, 
      criterion=criterion, 
      max_epochs=epochs, 
      liveplot=liveplot,
      device=torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu'))

4. The IMDB movie review sentiment dataset

This dataset contains 50k reviews for movies in IMDB, split into a train and test set of equal size. You want to predict whether the review is positive or negative.

Download the raw data and read it into a data structure:

# Shell escapes (notebook only): create ./data if it does not exist yet
!mkdir -p data
# Download the IMDB review archive into ./data
!wget -P ./data https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
# Unpack the archive inside ./data
!tar -C ./data -zxf ./data/aclImdb_v1.tar.gz

Load the dataset:

import os
import glob

def imdb_dataset(directory='data/',
                 train=False,
                 test=False,
                 train_directory='train',
                 test_directory='test',
                 extracted_name='aclImdb',
                 check_files=('aclImdb/README',),
                 sentiments=('pos', 'neg')):
    """Read the extracted IMDB review files into lists of examples.

    Every ``*.txt`` file found under
    ``<directory>/<extracted_name>/<split>/<sentiment>/`` becomes one example.

    Args:
        directory: Directory that contains the extracted dataset folder.
        train: Whether to load the training split.
        test: Whether to load the test split.
        train_directory: Name of the training split sub-directory.
        test_directory: Name of the test split sub-directory.
        extracted_name: Name of the extracted dataset folder.
        check_files: Not used by this function; kept so that existing calls
            passing it keep working.
        sentiments: Names of the per-sentiment sub-directories; each name is
            also stored as the label of the examples read from it.

    Returns:
        A list of ``{'text': ..., 'sentiment': ...}`` dicts when exactly one
        split is requested. Otherwise a tuple with one such list per
        requested split, in the order (train, test); the tuple is empty when
        no split is requested.

    Example:
        [{
          'text': 'For a movie that gets no respect there sure are a lot of memorable quotes...',
          'sentiment': 'pos'
        }, {
          'text': 'Bizarre horror movie filled with famous faces but stolen by Cristina Raines...',
          'sentiment': 'pos'
        }]
    """
    splits = [
        dir_ for (requested, dir_) in [(train, train_directory), (test, test_directory)]
        if requested
    ]

    ret = []
    for split_directory in splits:
        full_path = os.path.join(directory, extracted_name, split_directory)

        examples = []
        for sentiment in sentiments:
            pattern = os.path.join(full_path, sentiment, '*.txt')
            # glob yields files in arbitrary order; sorting makes the order of
            # the examples reproducible between runs and machines.
            for filename in sorted(glob.glob(pattern)):
                with open(filename, 'r', encoding="utf-8") as f:
                    # Read the whole file so that a review spanning several
                    # lines is not cut off after its first line.
                    text = f.read()
                examples.append({
                    'text': text,
                    'sentiment': sentiment,
                })
        ret.append(examples)

    if len(ret) == 1:
        return ret[0]
    return tuple(ret)

# Load only the training split; with a single split requested the result is a
# list of {'text', 'sentiment'} dicts
train_data = imdb_dataset(train=True)

# Length (in characters) of the text of the first training example
len(train_data[0]['text'])

How do we build a predictor for this task?

...