TP2: The Convolutional Neural Network

Introduction to CNNs and the CIFAR-10 dataset

By: Alexandre Verine

In this practical session, we will implement a Convolutional Neural Network (CNN) to classify images from the CIFAR-10 dataset. CIFAR-10 consists of 60,000 32x32 colour images in 10 classes, with 6,000 images per class; 50,000 images are used for training and 10,000 for testing. We will compare the results of a traditional Dense Neural Network (DNN) with those of a CNN.

First, we load the necessary libraries:

# PyTorch library, provides tensor computation and deep neural networks
import torch

# Package that provides access to popular datasets and image transformations for computer vision
import torchvision
from torchvision import datasets, transforms

import torch.nn as nn  # Provides classes to define and manipulate neural networks
import torch.nn.functional as F  # Contains functions that do not have any parameters, such as relu, tanh, etc.
import torch.optim as optim  # Package implementing various optimization algorithms

# Library for the Python programming language, adding support for large, multi-dimensional arrays and matrices.
import numpy as np

import matplotlib.pyplot as plt  # Library for creating static, animated, and interactive visualizations in Python

To accelerate the training of the networks, we will use an NVIDIA GPU (CUDA) when available, or the Apple Silicon GPU (M1/M2/M3 chips) through the Metal Performance Shaders (MPS) backend; otherwise we fall back to the CPU.

# Set the device (cuda, cpu or mps)
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print("Device used: {}".format(device))
Device used: cuda
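
Every tensor and model used below is moved to this device with .to(device); a quick illustrative check (the printed device name depends on your machine):

# Illustrative only: tensors and modules are moved to the selected device with .to(device)
x = torch.randn(2, 3).to(device)
print(x.device)  # e.g. cuda:0, mps:0 or cpu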

The CIFAR-10 dataset is available in torchvision.datasets:

# number of subprocesses to use for data loading
num_workers = 0
# how many samples per batch to load
batch_size = 64
# percentage of training set to use as validation
valid_size = 0.2

# convert data to a normalized torch.FloatTensor
transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
)

# Choose the training and test datasets
train_data = datasets.CIFAR10("data", train=True, download=True, transform=transform)
test_data = datasets.CIFAR10("data", train=False, download=True, transform=transform)


# prepare data loaders (combine dataset and sampler)
train_loader = torch.utils.data.DataLoader(
    train_data, batch_size=batch_size, num_workers=num_workers, shuffle=True
)


test_loader = torch.utils.data.DataLoader(
    test_data, batch_size=batch_size, num_workers=num_workers, shuffle=False
)
Files already downloaded and verified
Files already downloaded and verified
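
Note that transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) maps each channel from [0, 1] (after ToTensor) to [-1, 1]. A quick illustrative check on the first training image:

# ToTensor scales pixels to [0, 1]; Normalize with mean 0.5 and std 0.5 then
# applies (x - 0.5) / 0.5, mapping values to [-1, 1].
img, label = train_data[0]
print(img.shape)                           # torch.Size([3, 32, 32])
print(img.min().item(), img.max().item())  # values within [-1.0, 1.0]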

We can load the first batch and display the images:

# CIFAR-10 class names, in label order
classes = ["airplane", "automobile", "bird", "cat", "deer", "dog", "frog", "horse", "ship", "truck"]

def imshow(img):
    img = img / 2 + 0.5  # unnormalize from [-1, 1] back to [0, 1]
    plt.imshow(np.transpose(img, (1, 2, 0)))  # convert from (C, H, W) to (H, W, C)

# Load the first batch of the test data
examples = enumerate(test_loader)
batch_idx, (example_data, example_targets) = next(examples)

# plot the images in the batch, along with the corresponding labels
fig = plt.figure(figsize=(10, 4))
# display 27 images (3 rows of 9)
for idx in np.arange(27):
    ax = fig.add_subplot(3, 9, idx + 1, xticks=[], yticks=[])
    imshow(example_data[idx].detach().numpy())
    ax.set_title(classes[example_targets[idx]])
plt.tight_layout()
plt.show()

We can define the training/eval functions:

def train_n_test(model, optimizer, nepochs, criterion, fname):
    # list of epoch indices (1..nepochs)
    n_epochs = [*range(1, nepochs + 1)]
    # track the best (lowest) validation loss seen so far
    valid_loss_min = np.inf

    # lists to store losses and accuracies for later visualization
    train_losses = []
    train_counter = []
    test_losses = []
    acc_eval = []
    test_counter = [i * len(train_loader.dataset) for i in n_epochs]

    for epoch in n_epochs:

        # keep track of training and validation loss
        valid_loss = 0.0

        ###################
        # train the model #
        ###################
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader):
            # move tensors to GPU/MPS if CUDA/MPS is available
            data, target = data.to(device), target.to(device)
            # clear the gradients of all optimized variables
            optimizer.zero_grad()
            # forward pass: compute predicted outputs by passing inputs to the model
            output = model(data)
            # calculate the batch loss
            loss = criterion(output, target)
            # backward pass: compute gradient of the loss with respect to model parameters
            loss.backward()
            # perform a single optimization step (parameter update)
            optimizer.step()
            # record the training loss and the number of examples seen so far
            train_losses.append(loss.item())
            train_counter.append(
                (batch_idx * batch_size) + ((epoch - 1) * len(train_loader.dataset))
            )

        ######################
        # validate the model #
        ######################
        model.eval()
        correct = 0
        with torch.no_grad():  # no gradients are needed for evaluation
            for data, target in test_loader:
                # move tensors to GPU/MPS if CUDA/MPS is available
                data, target = data.to(device), target.to(device)
                # forward pass: compute predicted outputs by passing inputs to the model
                output = model(data)
                # calculate the batch loss
                loss = criterion(output, target)
                # accumulate the validation loss, weighted by the batch size
                valid_loss += loss.item() * data.size(0)
                # count correct predictions
                pred = output.data.max(1, keepdim=True)[1]
                correct += pred.eq(target.data.view_as(pred)).sum().item()

        # calculate average losses
        train_loss = np.mean(train_losses[(epoch - 1) * len(train_loader) :])
        valid_loss = valid_loss / len(test_loader.dataset)
        acc_eval.append(correct / len(test_loader.dataset) * 100)
        test_losses.append(valid_loss)
        # print training/validation statistics
        print(
            "Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}".format(
                epoch, train_loss, valid_loss
            )
        )

        # save model if validation loss has decreased
        if valid_loss <= valid_loss_min:
            print(
                "Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...".format(
                    valid_loss_min, valid_loss
                )
            )
            torch.save(model.state_dict(), fname)
            valid_loss_min = valid_loss

    torch.save(
        {
            "test_loss": (test_counter, test_losses),
            "train_loss": (train_counter, train_losses),
            "acc_eval": acc_eval,
        },
        f"res_{fname}.pt",
    )
def plot_accuracies(fnames, names):
    num_models = len(fnames)
    blues = plt.cm.Blues(np.linspace(0, 1, num_models + 1))
    accs_eval = []
    plt.figure(figsize=(10, 5))
    for i, fname in enumerate(fnames):
        x = torch.load(f"res_{fname}.pt")
        test_counter, test_losses = x["test_loss"]
        acc_eval = x["acc_eval"]
        accs_eval.append(acc_eval)
        plt.plot(
            test_counter,
            acc_eval,
            color=blues[i + 1],
        )

    plt.legend(
        [
            f"{name} Best Accuracy: {max(acc_eval):.2f}%"
            for name, acc_eval in zip(names, accs_eval)
        ],
        loc="lower right",
    )
    for i, fname in enumerate(fnames):
        x = torch.load(f"res_{fname}.pt")
        test_counter, test_losses = x["test_loss"]
        acc_eval = x["acc_eval"]

        plt.scatter(
            [test_counter[np.argmax(acc_eval)]],
            [max(acc_eval)],
            color="black",
            zorder=+200,
        )
    plt.xlim([min(test_counter), max(test_counter)])
    plt.ylim([0, 100])
    plt.xlabel("Number of Examples Seen by the model")
    plt.ylabel("Accuracy")
    plt.annotate(
        "",
        xy=(min(test_counter) - 0.3, 105),
        xytext=(min(test_counter) - 0.3, -5),
        arrowprops=dict(arrowstyle="->"),
        annotation_clip=False,
        zorder=-100,
    )
    plt.annotate(
        "",
        xy=(max(test_counter) + 10000, -0.0),
        xytext=(min(test_counter) - 1000, -0.0),
        arrowprops=dict(arrowstyle="->"),
        annotation_clip=False,
        zorder=-100,
    )
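
The loss curves below are displayed with a plot(fname) helper that is not reproduced here; a minimal sketch, assuming it simply reloads the res_{fname}.pt dictionary saved by train_n_test and draws the training and validation losses:

def plot(fname):
    # Assumed behaviour: reload the results saved by train_n_test and plot both loss curves.
    x = torch.load(f"res_{fname}.pt")
    train_counter, train_losses = x["train_loss"]
    test_counter, test_losses = x["test_loss"]
    plt.figure(figsize=(10, 5))
    plt.plot(train_counter, train_losses, color="cornflowerblue", label="Training loss")
    plt.scatter(test_counter, test_losses, color="red", zorder=10, label="Validation loss")
    plt.xlabel("Number of Examples Seen by the model")
    plt.ylabel("Cross-entropy loss")
    plt.legend(loc="upper right")
    plt.show()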

We define the loss function:

criterion = nn.CrossEntropyLoss()
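
nn.CrossEntropyLoss expects raw, unnormalized scores (logits) of shape (batch, n_classes) together with integer class labels; the softmax is applied internally. A small illustrative check:

# Illustrative only: 4 random score vectors over 10 classes and their true labels.
dummy_logits = torch.randn(4, 10)
dummy_targets = torch.tensor([3, 1, 0, 7])
print(criterion(dummy_logits, dummy_targets))  # a single scalar (mean loss over the batch)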

First, we define a Dense Neural Network (DNN) with 3 hidden layers:

class FC(nn.Module):
    def __init__(self):
        super(FC, self).__init__()
        self.fc1 = nn.Linear(3 * 32 * 32, 1024)
        self.fc2 = nn.Linear(1024, 256)
        self.fc3 = nn.Linear(256, 64)
        self.fc4 = nn.Linear(64, 10)

    def forward(self, x):
        x = x.view(-1, 3 * 32 * 32)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = self.fc4(x)
        return x

We instantiate the DNN model and count the number of trainable parameters:

model = FC().to(device)  # Create the model and move it to the device
model_parameters = filter(lambda p: p.requires_grad, model.parameters())
params = sum([np.prod(p.size()) for p in model_parameters])
print("Number of parameters: ", params)
Number of parameters:  3426250
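
The count can be verified by hand: each nn.Linear layer has in_features x out_features weights plus out_features biases:

# Parameter count of the FC model, layer by layer (weights + biases).
fc_params = (
    3 * 32 * 32 * 1024 + 1024  # fc1
    + 1024 * 256 + 256         # fc2
    + 256 * 64 + 64            # fc3
    + 64 * 10 + 10             # fc4
)
print(fc_params)  # 3426250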

We can train the DNN model:

optimizer = optim.SGD(model.parameters(), lr=0.001)  # Define the optimizer
train_n_test(model, optimizer, 30, criterion, "sgd_cifar_fc.pt")
Epoch: 1 	Training Loss: 2.298408 	Validation Loss: 2.293379
Validation loss decreased (inf --> 2.293379).  Saving model ...
Epoch: 2 	Training Loss: 2.287825 	Validation Loss: 2.282024
Validation loss decreased (2.293379 --> 2.282024).  Saving model ...
Epoch: 3 	Training Loss: 2.275021 	Validation Loss: 2.267214
Validation loss decreased (2.282024 --> 2.267214).  Saving model ...
Epoch: 4 	Training Loss: 2.257316 	Validation Loss: 2.245834
Validation loss decreased (2.267214 --> 2.245834).  Saving model ...
Epoch: 5 	Training Loss: 2.231110 	Validation Loss: 2.214038
Validation loss decreased (2.245834 --> 2.214038).  Saving model ...
Epoch: 6 	Training Loss: 2.193528 	Validation Loss: 2.170689
Validation loss decreased (2.214038 --> 2.170689).  Saving model ...
Epoch: 7 	Training Loss: 2.147381 	Validation Loss: 2.122519
Validation loss decreased (2.170689 --> 2.122519).  Saving model ...
Epoch: 8 	Training Loss: 2.100531 	Validation Loss: 2.076545
Validation loss decreased (2.122519 --> 2.076545).  Saving model ...
Epoch: 9 	Training Loss: 2.057069 	Validation Loss: 2.033966
Validation loss decreased (2.076545 --> 2.033966).  Saving model ...
Epoch: 10 	Training Loss: 2.017031 	Validation Loss: 1.995049
Validation loss decreased (2.033966 --> 1.995049).  Saving model ...
Epoch: 11 	Training Loss: 1.980282 	Validation Loss: 1.959829
Validation loss decreased (1.995049 --> 1.959829).  Saving model ...
Epoch: 12 	Training Loss: 1.947010 	Validation Loss: 1.929186
Validation loss decreased (1.959829 --> 1.929186).  Saving model ...
Epoch: 13 	Training Loss: 1.917824 	Validation Loss: 1.901648
Validation loss decreased (1.929186 --> 1.901648).  Saving model ...
Epoch: 14 	Training Loss: 1.891703 	Validation Loss: 1.877327
Validation loss decreased (1.901648 --> 1.877327).  Saving model ...
Epoch: 15 	Training Loss: 1.868050 	Validation Loss: 1.854632
Validation loss decreased (1.877327 --> 1.854632).  Saving model ...
Epoch: 16 	Training Loss: 1.846225 	Validation Loss: 1.834231
Validation loss decreased (1.854632 --> 1.834231).  Saving model ...
Epoch: 17 	Training Loss: 1.826443 	Validation Loss: 1.814482
Validation loss decreased (1.834231 --> 1.814482).  Saving model ...
Epoch: 18 	Training Loss: 1.807416 	Validation Loss: 1.796260
Validation loss decreased (1.814482 --> 1.796260).  Saving model ...
Epoch: 19 	Training Loss: 1.789477 	Validation Loss: 1.778376
Validation loss decreased (1.796260 --> 1.778376).  Saving model ...
Epoch: 20 	Training Loss: 1.772407 	Validation Loss: 1.762206
Validation loss decreased (1.778376 --> 1.762206).  Saving model ...
Epoch: 21 	Training Loss: 1.756357 	Validation Loss: 1.746533
Validation loss decreased (1.762206 --> 1.746533).  Saving model ...
Epoch: 22 	Training Loss: 1.741044 	Validation Loss: 1.731481
Validation loss decreased (1.746533 --> 1.731481).  Saving model ...
Epoch: 23 	Training Loss: 1.726189 	Validation Loss: 1.716993
Validation loss decreased (1.731481 --> 1.716993).  Saving model ...
Epoch: 24 	Training Loss: 1.712934 	Validation Loss: 1.703785
Validation loss decreased (1.716993 --> 1.703785).  Saving model ...
Epoch: 25 	Training Loss: 1.699721 	Validation Loss: 1.691575
Validation loss decreased (1.703785 --> 1.691575).  Saving model ...
Epoch: 26 	Training Loss: 1.687766 	Validation Loss: 1.679835
Validation loss decreased (1.691575 --> 1.679835).  Saving model ...
Epoch: 27 	Training Loss: 1.675872 	Validation Loss: 1.668598
Validation loss decreased (1.679835 --> 1.668598).  Saving model ...
Epoch: 28 	Training Loss: 1.664840 	Validation Loss: 1.657806
Validation loss decreased (1.668598 --> 1.657806).  Saving model ...
Epoch: 29 	Training Loss: 1.654181 	Validation Loss: 1.648067
Validation loss decreased (1.657806 --> 1.648067).  Saving model ...
Epoch: 30 	Training Loss: 1.644267 	Validation Loss: 1.638514
Validation loss decreased (1.648067 --> 1.638514).  Saving model ...
plot("sgd_cifar_fc.pt")

Using the Adam optimizer (note that we keep training the same model instance, so it starts from the SGD-trained weights), we reach better performance:

optimizer = optim.Adam(model.parameters(), lr=0.001)  # Define the optimizer
train_n_test(model, optimizer, 30, criterion, "adam_cifar_fc.pt")
Epoch: 1 	Training Loss: 1.647408 	Validation Loss: 1.517360
Validation loss decreased (inf --> 1.517360).  Saving model ...
Epoch: 2 	Training Loss: 1.452193 	Validation Loss: 1.451674
Validation loss decreased (1.517360 --> 1.451674).  Saving model ...
Epoch: 3 	Training Loss: 1.342919 	Validation Loss: 1.386244
Validation loss decreased (1.451674 --> 1.386244).  Saving model ...
Epoch: 4 	Training Loss: 1.251634 	Validation Loss: 1.386287
Epoch: 5 	Training Loss: 1.171663 	Validation Loss: 1.347294
Validation loss decreased (1.386244 --> 1.347294).  Saving model ...
Epoch: 6 	Training Loss: 1.097073 	Validation Loss: 1.361448
Epoch: 7 	Training Loss: 1.026438 	Validation Loss: 1.386757
Epoch: 8 	Training Loss: 0.948088 	Validation Loss: 1.403232
Epoch: 9 	Training Loss: 0.882839 	Validation Loss: 1.444463
Epoch: 10 	Training Loss: 0.814854 	Validation Loss: 1.502697
Epoch: 11 	Training Loss: 0.756677 	Validation Loss: 1.604978
Epoch: 12 	Training Loss: 0.703948 	Validation Loss: 1.698845
Epoch: 13 	Training Loss: 0.654056 	Validation Loss: 1.726790
Epoch: 14 	Training Loss: 0.605096 	Validation Loss: 1.792672
Epoch: 15 	Training Loss: 0.566104 	Validation Loss: 1.950972
Epoch: 16 	Training Loss: 0.538840 	Validation Loss: 1.945389
Epoch: 17 	Training Loss: 0.490365 	Validation Loss: 2.059897
Epoch: 18 	Training Loss: 0.469634 	Validation Loss: 2.034726
Epoch: 19 	Training Loss: 0.447501 	Validation Loss: 2.080638
Epoch: 20 	Training Loss: 0.423825 	Validation Loss: 2.313976
Epoch: 21 	Training Loss: 0.405261 	Validation Loss: 2.340054
Epoch: 22 	Training Loss: 0.384436 	Validation Loss: 2.416563
Epoch: 23 	Training Loss: 0.364228 	Validation Loss: 2.549311
Epoch: 24 	Training Loss: 0.361097 	Validation Loss: 2.448377
Epoch: 25 	Training Loss: 0.342479 	Validation Loss: 2.533963
Epoch: 26 	Training Loss: 0.328682 	Validation Loss: 2.579750
Epoch: 27 	Training Loss: 0.310609 	Validation Loss: 2.578867
Epoch: 28 	Training Loss: 0.304096 	Validation Loss: 2.731312
Epoch: 29 	Training Loss: 0.296812 	Validation Loss: 2.816342
Epoch: 30 	Training Loss: 0.280095 	Validation Loss: 2.909314
plot("adam_cifar_fc.pt")

By comparing the two optimizers, we can see that the Adam optimizer converges faster than the SGD optimizer:

plot_accuracies(["sgd_cifar_fc.pt", "adam_cifar_fc.pt"], ["SGD", "Adam"])

We now consider a small CNN with 2 convolutional layers and 3 fully connected layers:

class CNN(nn.Module):
    def __init__(self):  # Define the layers of the network
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(
            3, 6, 5
        )  # 3 input channels, 6 output channels, 5x5 kernel
        self.pool = nn.MaxPool2d(2, 2)  # 2x2 kernel
        self.conv2 = nn.Conv2d(
            6, 16, 5
        )  # 6 input channels, 16 output channels, 5x5 kernel
        self.fc1 = nn.Linear(
            16 * 5 * 5, 120
        )  # 16x5x5 input features, 120 output features
        self.fc2 = nn.Linear(120, 84)  # 120 input features, 84 output features
        self.fc3 = nn.Linear(84, 10)  # 84 input features, 10 output features

    def forward(self, x):  # Define the forward pass
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
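
To see where the 16 * 5 * 5 flatten size comes from, we can trace the spatial dimensions through the network: each unpadded 5x5 convolution shrinks the feature map by 4 pixels per side, and each 2x2 max-pool halves the resolution. An illustrative check:

# Spatial size trace: 32 -> conv(5x5, no padding) -> 28 -> maxpool(2) -> 14
#                     14 -> conv(5x5, no padding) -> 10 -> maxpool(2) -> 5
# so the tensor entering fc1 has shape (batch, 16, 5, 5), i.e. 16 * 5 * 5 = 400 features.
net = CNN()
dummy = torch.randn(1, 3, 32, 32)
h = net.pool(F.relu(net.conv1(dummy)))
print(h.shape)  # torch.Size([1, 6, 14, 14])
h = net.pool(F.relu(net.conv2(h)))
print(h.shape)  # torch.Size([1, 16, 5, 5])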

We instantiate the CNN model and the optimizer, and count the number of trainable parameters:

model = CNN().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)  # Define the optimizer
model_parameters = filter(lambda p: p.requires_grad, model.parameters())
params = sum([np.prod(p.size()) for p in model_parameters])
print("Number of parameters: ", params)
Number of parameters:  62006

The CNN has fewer parameters than the DNN!
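
This is because a convolutional layer shares its 5x5 kernels across all spatial positions, so its parameter count depends only on the kernel size and the number of channels. The total can again be checked by hand:

# Parameter count of the small CNN: in_channels * out_channels * k * k weights
# plus out_channels biases for each convolution, then the fully connected layers.
cnn_params = (
    3 * 6 * 5 * 5 + 6          # conv1
    + 6 * 16 * 5 * 5 + 16      # conv2
    + 16 * 5 * 5 * 120 + 120   # fc1
    + 120 * 84 + 84            # fc2
    + 84 * 10 + 10             # fc3
)
print(cnn_params)  # 62006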

We train the model using Adam:

model = CNN().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
train_n_test(model, optimizer, 30, criterion, "cifar_cnn.pt")
Epoch: 1 	Training Loss: 1.659091 	Validation Loss: 1.426273
Validation loss decreased (inf --> 1.426273).  Saving model ...
Epoch: 2 	Training Loss: 1.348434 	Validation Loss: 1.260200
Validation loss decreased (1.426273 --> 1.260200).  Saving model ...
Epoch: 3 	Training Loss: 1.211291 	Validation Loss: 1.190875
Validation loss decreased (1.260200 --> 1.190875).  Saving model ...
Epoch: 4 	Training Loss: 1.124590 	Validation Loss: 1.120725
Validation loss decreased (1.190875 --> 1.120725).  Saving model ...
Epoch: 5 	Training Loss: 1.057546 	Validation Loss: 1.075490
Validation loss decreased (1.120725 --> 1.075490).  Saving model ...
Epoch: 6 	Training Loss: 1.002188 	Validation Loss: 1.051245
Validation loss decreased (1.075490 --> 1.051245).  Saving model ...
Epoch: 7 	Training Loss: 0.956595 	Validation Loss: 1.016638
Validation loss decreased (1.051245 --> 1.016638).  Saving model ...
Epoch: 8 	Training Loss: 0.913452 	Validation Loss: 1.027900
Epoch: 9 	Training Loss: 0.881005 	Validation Loss: 1.008607
Validation loss decreased (1.016638 --> 1.008607).  Saving model ...
Epoch: 10 	Training Loss: 0.849762 	Validation Loss: 1.013531
Epoch: 11 	Training Loss: 0.815056 	Validation Loss: 1.006732
Validation loss decreased (1.008607 --> 1.006732).  Saving model ...
Epoch: 12 	Training Loss: 0.789146 	Validation Loss: 0.995275
Validation loss decreased (1.006732 --> 0.995275).  Saving model ...
Epoch: 13 	Training Loss: 0.764450 	Validation Loss: 1.000758
Epoch: 14 	Training Loss: 0.739543 	Validation Loss: 1.006464
Epoch: 15 	Training Loss: 0.713389 	Validation Loss: 1.030374
Epoch: 16 	Training Loss: 0.690429 	Validation Loss: 1.067279
Epoch: 17 	Training Loss: 0.669459 	Validation Loss: 1.065921
Epoch: 18 	Training Loss: 0.647894 	Validation Loss: 1.072320
Epoch: 19 	Training Loss: 0.630330 	Validation Loss: 1.102415
Epoch: 20 	Training Loss: 0.606180 	Validation Loss: 1.125398
Epoch: 21 	Training Loss: 0.594327 	Validation Loss: 1.186328
Epoch: 22 	Training Loss: 0.578217 	Validation Loss: 1.166590
Epoch: 23 	Training Loss: 0.556354 	Validation Loss: 1.161449
Epoch: 24 	Training Loss: 0.543732 	Validation Loss: 1.207963
Epoch: 25 	Training Loss: 0.526854 	Validation Loss: 1.209673
Epoch: 26 	Training Loss: 0.509426 	Validation Loss: 1.229373
Epoch: 27 	Training Loss: 0.498067 	Validation Loss: 1.277879
Epoch: 28 	Training Loss: 0.481433 	Validation Loss: 1.369175
Epoch: 29 	Training Loss: 0.471013 	Validation Loss: 1.357426
Epoch: 30 	Training Loss: 0.460048 	Validation Loss: 1.385186
plot("cifar_cnn.pt")

What can we observe?

We now consider a larger and wider model, with optional batch normalization and dropout:

class LargeCNN(nn.Module):
    def __init__(self, batch_norm=False, dropout=False):
        super(LargeCNN, self).__init__()
        self.conv_layer1 = nn.Sequential(
            # Conv Layer block 1
            nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32) if batch_norm else nn.Identity(),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )
        self.conv_layer2 = nn.Sequential(
            # Conv Layer block 2
            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128) if batch_norm else nn.Identity(),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Dropout2d(p=0.15) if dropout else nn.Identity(),
        )
        self.conv_layer3 = nn.Sequential(
            # Conv Layer block 3
            nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, padding=1),
            nn.BatchNorm2d(256) if batch_norm else nn.Identity(),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )

        self.fc_layer = nn.Sequential(
            nn.Dropout(p=0.15) if dropout else nn.Identity(),
            nn.Linear(4096, 1024),
            nn.ReLU(inplace=True),
            nn.Linear(1024, 512),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.1),
            nn.Linear(512, 10),
        )

    def conv_out(self, x, c):
        return self.conv_layer1(x)[:, c, :, :]

    def forward(self, x):
        """Perform forward."""

        # conv layers
        x = self.conv_layer1(x)
        x = self.conv_layer2(x)
        x = self.conv_layer3(x)

        # flatten
        x = x.view(x.size(0), -1)

        # fc layer
        x = self.fc_layer(x)
        return x
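
Here the three 2x2 max-pools reduce the spatial resolution from 32 to 16, 8 and finally 4, so the flattened feature vector has 256 * 4 * 4 = 4096 entries, matching nn.Linear(4096, 1024). An illustrative check:

# Shape check for the convolutional trunk of LargeCNN.
net = LargeCNN()
dummy = torch.randn(1, 3, 32, 32)
h = net.conv_layer3(net.conv_layer2(net.conv_layer1(dummy)))
print(h.shape)               # torch.Size([1, 256, 4, 4])
print(h.view(1, -1).shape)   # torch.Size([1, 4096])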

We instantiate the large CNN model and the optimizer, and count the number of trainable parameters:

model = LargeCNN(batch_norm=False, dropout=False).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
model_parameters = filter(lambda p: p.requires_grad, model.parameters())
params = sum([np.prod(p.size()) for p in model_parameters])
print("Number of parameters: ", params)
Number of parameters:  5851338
train_n_test(model, optimizer, 30, criterion, "cifar_large_cnn.pt")
Epoch: 1 	Training Loss: 1.599818 	Validation Loss: 1.204596
Validation loss decreased (inf --> 1.204596).  Saving model ...
Epoch: 2 	Training Loss: 0.997227 	Validation Loss: 0.885379
Validation loss decreased (1.204596 --> 0.885379).  Saving model ...
Epoch: 3 	Training Loss: 0.749767 	Validation Loss: 0.766493
Validation loss decreased (0.885379 --> 0.766493).  Saving model ...
Epoch: 4 	Training Loss: 0.608492 	Validation Loss: 0.732680
Validation loss decreased (0.766493 --> 0.732680).  Saving model ...
Epoch: 5 	Training Loss: 0.495372 	Validation Loss: 0.702470
Validation loss decreased (0.732680 --> 0.702470).  Saving model ...
Epoch: 6 	Training Loss: 0.392042 	Validation Loss: 0.738384
Epoch: 7 	Training Loss: 0.308529 	Validation Loss: 0.800675
Epoch: 8 	Training Loss: 0.238643 	Validation Loss: 0.824499
Epoch: 9 	Training Loss: 0.197688 	Validation Loss: 0.917873
Epoch: 10 	Training Loss: 0.163779 	Validation Loss: 0.975063
Epoch: 11 	Training Loss: 0.152591 	Validation Loss: 1.097368
Epoch: 12 	Training Loss: 0.123191 	Validation Loss: 1.182138
Epoch: 13 	Training Loss: 0.113065 	Validation Loss: 1.235141
Epoch: 14 	Training Loss: 0.118290 	Validation Loss: 1.209514
Epoch: 15 	Training Loss: 0.101396 	Validation Loss: 1.232565
Epoch: 16 	Training Loss: 0.099923 	Validation Loss: 1.191586
Epoch: 17 	Training Loss: 0.095028 	Validation Loss: 1.341417
Epoch: 18 	Training Loss: 0.090412 	Validation Loss: 1.343380
Epoch: 19 	Training Loss: 0.086220 	Validation Loss: 1.343565
Epoch: 20 	Training Loss: 0.084747 	Validation Loss: 1.468028
Epoch: 21 	Training Loss: 0.100639 	Validation Loss: 1.442234
Epoch: 22 	Training Loss: 0.080292 	Validation Loss: 1.556585
Epoch: 23 	Training Loss: 0.090976 	Validation Loss: 1.438859
Epoch: 24 	Training Loss: 0.083141 	Validation Loss: 1.533127
Epoch: 25 	Training Loss: 0.083212 	Validation Loss: 1.566200
Epoch: 26 	Training Loss: 0.078648 	Validation Loss: 1.532702
Epoch: 27 	Training Loss: 0.084718 	Validation Loss: 1.466748
Epoch: 28 	Training Loss: 0.075996 	Validation Loss: 1.584655
Epoch: 29 	Training Loss: 0.074190 	Validation Loss: 1.621887
Epoch: 30 	Training Loss: 0.074171 	Validation Loss: 1.659567
plot("cifar_large_cnn.pt")

If we use batch normalization, we can stabilize the training:

model = LargeCNN(batch_norm=True, dropout=False).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
train_n_test(model, optimizer, 30, criterion, "cifar_large_cnn_bn.pt")
Epoch: 1 	Training Loss: 1.377527 	Validation Loss: 1.151627
Validation loss decreased (inf --> 1.151627).  Saving model ...
Epoch: 2 	Training Loss: 0.894417 	Validation Loss: 0.934475
Validation loss decreased (1.151627 --> 0.934475).  Saving model ...
Epoch: 3 	Training Loss: 0.721287 	Validation Loss: 0.741299
Validation loss decreased (0.934475 --> 0.741299).  Saving model ...
Epoch: 4 	Training Loss: 0.601858 	Validation Loss: 0.736946
Validation loss decreased (0.741299 --> 0.736946).  Saving model ...
Epoch: 5 	Training Loss: 0.519130 	Validation Loss: 0.658583
Validation loss decreased (0.736946 --> 0.658583).  Saving model ...
Epoch: 6 	Training Loss: 0.438963 	Validation Loss: 0.646755
Validation loss decreased (0.658583 --> 0.646755).  Saving model ...
Epoch: 7 	Training Loss: 0.375962 	Validation Loss: 0.639045
Validation loss decreased (0.646755 --> 0.639045).  Saving model ...
Epoch: 8 	Training Loss: 0.320418 	Validation Loss: 0.737986
Epoch: 9 	Training Loss: 0.276126 	Validation Loss: 0.721208
Epoch: 10 	Training Loss: 0.230144 	Validation Loss: 0.750623
Epoch: 11 	Training Loss: 0.198359 	Validation Loss: 0.772792
Epoch: 12 	Training Loss: 0.177253 	Validation Loss: 0.872785
Epoch: 13 	Training Loss: 0.153021 	Validation Loss: 0.852632
Epoch: 14 	Training Loss: 0.136633 	Validation Loss: 0.835774
Epoch: 15 	Training Loss: 0.128613 	Validation Loss: 0.943488
Epoch: 16 	Training Loss: 0.113854 	Validation Loss: 0.984285
Epoch: 17 	Training Loss: 0.113466 	Validation Loss: 0.868559
Epoch: 18 	Training Loss: 0.091723 	Validation Loss: 0.952815
Epoch: 19 	Training Loss: 0.094924 	Validation Loss: 1.086942
Epoch: 20 	Training Loss: 0.087029 	Validation Loss: 1.085407
Epoch: 21 	Training Loss: 0.086064 	Validation Loss: 1.054228
Epoch: 22 	Training Loss: 0.071511 	Validation Loss: 1.157580
Epoch: 23 	Training Loss: 0.078778 	Validation Loss: 1.205795
Epoch: 24 	Training Loss: 0.074825 	Validation Loss: 1.140467
Epoch: 25 	Training Loss: 0.068187 	Validation Loss: 1.145609
Epoch: 26 	Training Loss: 0.069380 	Validation Loss: 1.043554
Epoch: 27 	Training Loss: 0.061416 	Validation Loss: 1.212794
Epoch: 28 	Training Loss: 0.066645 	Validation Loss: 1.144456
Epoch: 29 	Training Loss: 0.062893 	Validation Loss: 1.150404
Epoch: 30 	Training Loss: 0.059742 	Validation Loss: 1.169297
plot("cifar_large_cnn_bn.pt")

Dropout can also help to avoid overfitting:

model = LargeCNN(batch_norm=True, dropout=True).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
train_n_test(model, optimizer, 30, criterion, "cifar_large_cnn_bn_dropout.pt")
Epoch: 1 	Training Loss: 1.434861 	Validation Loss: 1.226824
Validation loss decreased (inf --> 1.226824).  Saving model ...
Epoch: 2 	Training Loss: 1.004135 	Validation Loss: 0.917697
Validation loss decreased (1.226824 --> 0.917697).  Saving model ...
Epoch: 3 	Training Loss: 0.816937 	Validation Loss: 0.844841
Validation loss decreased (0.917697 --> 0.844841).  Saving model ...
Epoch: 4 	Training Loss: 0.708206 	Validation Loss: 0.739159
Validation loss decreased (0.844841 --> 0.739159).  Saving model ...
Epoch: 5 	Training Loss: 0.627765 	Validation Loss: 0.668541
Validation loss decreased (0.739159 --> 0.668541).  Saving model ...
Epoch: 6 	Training Loss: 0.567008 	Validation Loss: 0.608081
Validation loss decreased (0.668541 --> 0.608081).  Saving model ...
Epoch: 7 	Training Loss: 0.509372 	Validation Loss: 0.669559
Epoch: 8 	Training Loss: 0.468125 	Validation Loss: 0.591877
Validation loss decreased (0.608081 --> 0.591877).  Saving model ...
Epoch: 9 	Training Loss: 0.419433 	Validation Loss: 0.595471
Epoch: 10 	Training Loss: 0.381236 	Validation Loss: 0.614316
Epoch: 11 	Training Loss: 0.357222 	Validation Loss: 0.590553
Validation loss decreased (0.591877 --> 0.590553).  Saving model ...
Epoch: 12 	Training Loss: 0.328914 	Validation Loss: 0.616229
Epoch: 13 	Training Loss: 0.305198 	Validation Loss: 0.588595
Validation loss decreased (0.590553 --> 0.588595).  Saving model ...
Epoch: 14 	Training Loss: 0.292277 	Validation Loss: 0.583493
Validation loss decreased (0.588595 --> 0.583493).  Saving model ...
Epoch: 15 	Training Loss: 0.259140 	Validation Loss: 0.612815
Epoch: 16 	Training Loss: 0.243930 	Validation Loss: 0.642086
Epoch: 17 	Training Loss: 0.228200 	Validation Loss: 0.593261
Epoch: 18 	Training Loss: 0.218168 	Validation Loss: 0.592078
Epoch: 19 	Training Loss: 0.203393 	Validation Loss: 0.623051
Epoch: 20 	Training Loss: 0.191121 	Validation Loss: 0.625495
Epoch: 21 	Training Loss: 0.185056 	Validation Loss: 0.667155
Epoch: 22 	Training Loss: 0.174955 	Validation Loss: 0.679312
Epoch: 23 	Training Loss: 0.169246 	Validation Loss: 0.672484
Epoch: 24 	Training Loss: 0.158391 	Validation Loss: 0.649721
Epoch: 25 	Training Loss: 0.161155 	Validation Loss: 0.685120
Epoch: 26 	Training Loss: 0.147813 	Validation Loss: 0.688835
Epoch: 27 	Training Loss: 0.148170 	Validation Loss: 0.652558
Epoch: 28 	Training Loss: 0.136856 	Validation Loss: 0.724329
Epoch: 29 	Training Loss: 0.138581 	Validation Loss: 0.697110
Epoch: 30 	Training Loss: 0.135540 	Validation Loss: 0.716323
plot("cifar_large_cnn_bn_dropout.pt")
plot_accuracies(
    [
        "cifar_large_cnn.pt",
        "cifar_large_cnn_bn.pt",
        "cifar_large_cnn_bn_dropout.pt",
    ],
    ["Large CNN", "Large CNN BN", "Large CNN BN Dropout"],
)
plot_accuracies(
    ["adam_cifar_fc.pt", "cifar_cnn.pt", "cifar_large_cnn_bn_dropout.pt"],
    ["FC", "CNN", "Large CNN"],
)