# Dropout Regularization Demo

This notebook demonstrates the effectiveness of dropout regularization in preventing overfitting.
We'll train two identical neural networks on MNIST - one with dropout and one without.

## Setup and Imports

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import matplotlib.pyplot as plt
import numpy as np

# Select the compute device: CUDA when a GPU is usable, CPU otherwise
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

## Define the Neural Network

Our SimpleNet has three fully connected layers, with a ReLU activation after each of the first two.
Dropout is applied after each hidden layer and is disabled by setting `dropout_rate` to 0.

In [None]:
class SimpleNet(nn.Module):
    """Three-layer fully connected classifier with optional dropout.

    Each of the two hidden layers (512 and 256 units) is followed by ReLU
    and then dropout. The output layer returns raw logits; no softmax is
    applied.

    Args:
        dropout_rate: Probability passed to ``nn.Dropout``. With the default
            of ``0.0`` no units are dropped.
        input_dim: Number of features per sample after flattening.
            Defaults to 784.
        num_classes: Number of output logits. Defaults to 10.
    """

    def __init__(self, dropout_rate=0.0, input_dim=784, num_classes=10):
        super().__init__()
        # Remembered so forward() flattens to the width fc1 expects
        self.input_dim = input_dim
        self.fc1 = nn.Linear(input_dim, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, num_classes)
        # A single dropout module is reused after both hidden layers
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        """Flatten ``x`` to ``(-1, input_dim)`` and return the class logits."""
        # reshape (unlike view) also accepts non-contiguous inputs, copying
        # only when a view is not possible
        x = x.reshape(-1, self.input_dim)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x

# Sanity check: build one dropout-enabled instance and print its layer summary
model_test = SimpleNet(dropout_rate=0.5)
print("Model architecture:\n" + str(model_test))

## Load MNIST Dataset

In [None]:
# Preprocessing steps, applied in order to every image: tensor conversion,
# then normalization with a fixed mean/std (presumably the MNIST
# training-set statistics; confirm)
preprocessing_steps = [
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
]
transform = transforms.Compose(preprocessing_steps)

# Both splits share the same root directory and preprocessing; only the
# training split requests a download
mnist_root = './data'
train_dataset = datasets.MNIST(mnist_root, train=True, download=True, transform=transform)
test_dataset = datasets.MNIST(mnist_root, train=False, transform=transform)

# Shuffle only the training batches; the held-out split is read in order
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=128)
val_loader = DataLoader(test_dataset, shuffle=False, batch_size=1000)

print(f"Training samples: {len(train_dataset)}")
print(f"Test samples: {len(test_dataset)}")

## Visualize Sample Data

In [None]:
# Plot the first ten training images in a 2x5 grid, each titled with its label
fig, axes = plt.subplots(2, 5, figsize=(12, 6))
for i, ax in enumerate(axes.ravel()):
    img, label = train_dataset[i]
    # squeeze() drops the size-1 dimensions so imshow receives a 2-D array
    ax.imshow(img.squeeze(), cmap='gray')
    ax.set_title(f'Label: {label}')
    ax.axis('off')
plt.tight_layout()
plt.show()

## Training Function

In [None]:
def train_model(model, train_loader, val_loader, epochs=20, lr=0.001, model_name="Model"):
    """Train ``model`` with Adam and cross-entropy, evaluating after every epoch.

    Args:
        model: ``nn.Module`` mapping a batch of inputs to class logits. It is
            trained in place on whatever device its parameters already live on.
        train_loader: Iterable of ``(data, target)`` batches used for training.
        val_loader: Iterable of ``(data, target)`` batches used for evaluation.
        epochs: Number of passes over ``train_loader``.
        lr: Learning rate for the Adam optimizer.
        model_name: Label printed in the progress header.

    Returns:
        dict with keys ``'train_losses'``, ``'train_accuracies'``,
        ``'val_losses'`` and ``'val_accuracies'``; each value is a list with
        one entry per epoch. Losses are per-sample means and accuracies are
        percentages.

    Raises:
        ValueError: If either loader yields no samples during an epoch.
    """
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    # Follow the model's own device instead of a module-level global, so
    # batches always land where the weights are
    model_device = next(model.parameters()).device

    train_losses = []
    train_accuracies = []
    val_losses = []
    val_accuracies = []

    print(f"Training {model_name}...")
    print("-" * 50)

    for epoch in range(epochs):
        # Training phase. These metrics come from the training-mode forward
        # pass, so any dropout in the model is active while they are measured.
        model.train()
        train_loss_sum = 0.0
        train_correct = 0
        train_total = 0

        for data, target in train_loader:
            data, target = data.to(model_device), target.to(model_device)

            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            batch_size = target.size(0)
            # criterion returns the batch mean; weight it by the batch size so
            # a short final batch does not skew the epoch average
            train_loss_sum += loss.item() * batch_size
            train_correct += (output.argmax(dim=1) == target).sum().item()
            train_total += batch_size

        if train_total == 0:
            raise ValueError("train_loader yielded no samples")

        # Validation phase: eval mode and no gradient tracking
        model.eval()
        val_loss_sum = 0.0
        val_correct = 0
        val_total = 0

        with torch.no_grad():
            for data, target in val_loader:
                data, target = data.to(model_device), target.to(model_device)
                output = model(data)
                batch_size = target.size(0)
                val_loss_sum += criterion(output, target).item() * batch_size
                val_correct += (output.argmax(dim=1) == target).sum().item()
                val_total += batch_size

        if val_total == 0:
            raise ValueError("val_loader yielded no samples")

        # Per-sample averages for this epoch
        train_loss = train_loss_sum / train_total
        val_loss = val_loss_sum / val_total
        train_acc = 100. * train_correct / train_total
        val_acc = 100. * val_correct / val_total

        train_losses.append(train_loss)
        train_accuracies.append(train_acc)
        val_losses.append(val_loss)
        val_accuracies.append(val_acc)

        # Report every fifth epoch (starting with the first) and the last one
        if epoch % 5 == 0 or epoch == epochs - 1:
            print(f'Epoch {epoch+1:2d}: Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.1f}%, '
                  f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.1f}%')

    return {
        'train_losses': train_losses,
        'train_accuracies': train_accuracies,
        'val_losses': val_losses,
        'val_accuracies': val_accuracies
    }

## Train Model WITHOUT Dropout

In [None]:
# Seed the RNG right before construction so the initial weights (and the
# random draws that follow) are reproducible from run to run
torch.manual_seed(42)
model_no_dropout = SimpleNet(dropout_rate=0.0).to(device)
history_no_dropout = train_model(
    model_no_dropout, 
    train_loader, 
    val_loader, 
    epochs=20, 
    model_name="WITHOUT Dropout"
)

## Train Model WITH Dropout

In [None]:
# Seed the RNG right before construction so the initial weights (and the
# random draws that follow) are reproducible from run to run
torch.manual_seed(42)
model_with_dropout = SimpleNet(dropout_rate=0.5).to(device)
history_with_dropout = train_model(
    model_with_dropout, 
    train_loader, 
    val_loader, 
    epochs=20, 
    model_name="WITH Dropout (p=0.5)"
)

## Compare Results

In [None]:
banner = "=" * 60
print(banner)
print("FINAL RESULTS COMPARISON")
print(banner)

# (heading, history) pairs in print order; the second heading carries the
# blank separator line between the two sections
for section_title, run_history in (
    ("Without Dropout:", history_no_dropout),
    ("\nWith Dropout:", history_with_dropout),
):
    # Only the last epoch's accuracies are reported
    final_train_acc = run_history['train_accuracies'][-1]
    final_val_acc = run_history['val_accuracies'][-1]
    print(section_title)
    print(f"  Final Train Acc: {final_train_acc:.1f}%")
    print(f"  Final Val Acc: {final_val_acc:.1f}%")
    print(f"  Overfitting Gap: {final_train_acc - final_val_acc:.1f}%")

## Visualize Training Progress

In [None]:
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))

# 1-based epoch numbers for the x-axis
epochs = range(1, len(history_no_dropout['train_losses']) + 1)

# One row per panel: (axis, history key, panel title, y-axis label)
panels = (
    (ax1, 'train_losses', 'Training Loss', 'Loss'),
    (ax2, 'val_losses', 'Validation Loss', 'Loss'),
    (ax3, 'train_accuracies', 'Training Accuracy', 'Accuracy (%)'),
    (ax4, 'val_accuracies', 'Validation Accuracy', 'Accuracy (%)'),
)

# Every panel overlays the same metric from both runs: blue without dropout,
# red with dropout
for panel_ax, metric, panel_title, y_label in panels:
    panel_ax.plot(epochs, history_no_dropout[metric], 'b-', label='No Dropout', linewidth=2)
    panel_ax.plot(epochs, history_with_dropout[metric], 'r-', label='With Dropout', linewidth=2)
    panel_ax.set_title(panel_title, fontsize=14, fontweight='bold')
    panel_ax.set_xlabel('Epoch')
    panel_ax.set_ylabel(y_label)
    panel_ax.legend()
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
# Save the figure to disk before displaying it
plt.savefig('dropout_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

## Test Sample Predictions

In [None]:
banner = "=" * 50
print(banner)
print("SAMPLE PREDICTIONS")
print(banner)

# Inference only: eval mode and no gradient tracking
model_with_dropout.eval()
with torch.no_grad():
    # Pull the first batch from the loader and move both tensors to the device
    data_iter = iter(val_loader)
    images, labels = (tensor.to(device) for tensor in next(data_iter))

    # Classify the first 10 samples of that batch
    outputs = model_with_dropout(images[:10])
    predictions = torch.argmax(outputs, dim=1)
    first_labels = labels[:10]

    print("Actual:    ", first_labels.cpu().numpy())
    print("Predicted: ", predictions.cpu().numpy())
    print("Correct:   ", predictions.eq(first_labels).cpu().numpy())

## Visualize Predictions

In [None]:
# Draw the first ten images of the batch held in `images`/`labels`, each
# titled with its true and predicted label
fig, axes = plt.subplots(2, 5, figsize=(15, 8))
model_with_dropout.eval()

with torch.no_grad():
    for i, ax in enumerate(axes.ravel()):
        img = images[i:i+1]  # slice rather than index to keep the batch dimension
        label = labels[i].item()
        logits = model_with_dropout(img)
        pred = logits.argmax(dim=1).item()

        # Green title for a correct prediction, red for a miss
        if pred == label:
            color = 'green'
        else:
            color = 'red'

        ax.imshow(img.cpu().squeeze(), cmap='gray')
        ax.set_title(f'True: {label}, Pred: {pred}', color=color, fontweight='bold')
        ax.axis('off')

plt.tight_layout()
plt.show()

## Key Takeaways

1. **Without Dropout**: The model typically overfits quickly, reaching near-perfect training accuracy while validation accuracy lags behind
2. **With Dropout**: The model generalizes better, with training and validation accuracies staying closer together
3. **Overfitting Gap**: Dropout reduces the gap between training and validation performance. Note that training accuracy is measured with dropout active while validation accuracy is measured with it disabled, so part of the smaller gap comes from that difference
4. **Final Performance**: The dropout model often achieves better validation accuracy despite lower training accuracy