class: center, middle, inverse, title-slide

# Lec 23 - pytorch - optim & nn

## Statistical Computing and Computation

### Sta 663 | Spring 2022
### Dr. Colin Rundel

---
exclude: true

```python
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import scipy
import statsmodels.api as sm
import statsmodels.formula.api as smf
import torch

import os
import math

plt.rcParams['figure.dpi'] = 200

np.set_printoptions(
  edgeitems=30, linewidth=200,
  precision=5, suppress=True
  #formatter=dict(float=lambda x: "%.5g" % x)
)

pd.set_option("display.width", 130)
pd.set_option("display.max_columns", 10)
pd.set_option("display.precision", 6)
```

```r
knitr::opts_chunk$set(
  fig.align="center",
  cache=FALSE
)

library(lme4)
```

```
## Loading required package: Matrix
```

```r
local({
  hook_err_old <- knitr::knit_hooks$get("error")  # save the old hook
  knitr::knit_hooks$set(error = function(x, options) {
    # now do whatever you want to do with x, and pass
    # the new x to the old hook
    x = sub("## \n## Detailed traceback:\n.*$", "", x)
    x = sub("Error in py_call_impl\\(.*?\\)\\: ", "", x)
    #x = stringr::str_wrap(x, width = 100)
    hook_err_old(x, options)
  })

  hook_warn_old <- knitr::knit_hooks$get("warning")  # save the old hook
  knitr::knit_hooks$set(warning = function(x, options) {
    x = sub("<string>:1: ", "", x)
    #x = stringr::str_wrap(x, width = 100)
    hook_warn_old(x, options)
  })

  hook_msg_old <- knitr::knit_hooks$get("output")  # save the old hook
  knitr::knit_hooks$set(output = function(x, options) {
    x = stringr::str_replace(x, "(## ).* ([A-Za-z]+Warning:)", "\\1\\2")
    x = stringr::str_split(x, "\n")[[1]]
    #x = stringr::str_wrap(x, width = 120, exdent = 3)
    x = stringr::str_remove_all(x, "\r")
    x = stringi::stri_wrap(x, width=120, exdent = 3, normalize=FALSE)
    x = paste(x, collapse="\n")
    #x = stringr::str_wrap(x, width = 100)
    hook_msg_old(x, options)
  })
})
```

---
class: center, middle

# Demo 2 - Using a model

---

## A sample model

```python
class Model(torch.nn.Module):
    def __init__(self, X, y, beta=None):
        super().__init__()
        self.X = X
        self.y = y

        if beta is None:
            beta = torch.zeros(X.shape[1])
        beta.requires_grad = True
        self.beta = torch.nn.Parameter(beta)

    def forward(self, X):
        return X @ self.beta

    def fit(self, opt, n=1000, loss_fn = torch.nn.MSELoss()):
        losses = []
        for i in range(n):
            loss = loss_fn(self(self.X).squeeze(), self.y.squeeze())
            loss.backward()
            opt.step()
            opt.zero_grad()
            losses.append(loss.item())

        return losses
```

---

## Fitting

.pull-left[
```python
x = torch.linspace(-math.pi, math.pi, 200)
y = torch.sin(x)

X = torch.vstack((
  torch.ones_like(x),
  x,
  x**2,
  x**3
)).T

m = Model(X, y)
opt = torch.optim.SGD(m.parameters(), lr=1e-3)
losses = m.fit(opt, n=10000)

m.beta.detach()
```

```
## tensor([-1.0707e-08, 8.5455e-01, 3.9629e-09, -9.2817e-02])
```
]

.pull-right[
```python
plt.figure(figsize=(8,6), layout="constrained")
plt.plot(losses)
plt.show()
```

<img src="Lec23_files/figure-html/unnamed-chunk-3-1.png" width="768" style="display: block; margin: auto;" />
]

---

## Learning rate and convergence

.pull-left[
```python
plt.figure(figsize=(8,6), layout="constrained")

for lr in [1e-3, 1e-4, 1e-5, 1e-6]:
    m = Model(X, y)
    opt = torch.optim.SGD(m.parameters(), lr=lr)
    losses = m.fit(opt, n=10000)
    plt.plot(losses, label=f"lr = {lr}")

plt.legend()
plt.show()
```
]

.pull-right[
<img src="Lec23_files/figure-html/unnamed-chunk-4-3.png" width="768" style="display: block; margin: auto;" />
]
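---

## Aside - learning rate schedules

A fixed learning rate trades off speed against stability - another common option is to start large and decay the rate as training progresses. Below is a minimal sketch (not from the original demo) using `torch.optim.lr_scheduler.StepLR`, which multiplies the learning rate by `gamma` every `step_size` optimizer steps; `Model`, `X`, and `y` are assumed to be as defined above.

```python
m = Model(X, y)
opt = torch.optim.SGD(m.parameters(), lr=1e-3)

# halve the learning rate every 2500 steps
sched = torch.optim.lr_scheduler.StepLR(opt, step_size=2500, gamma=0.5)

losses = []
for i in range(10000):
    loss = torch.nn.MSELoss()(m(X).squeeze(), y.squeeze())
    loss.backward()
    opt.step()
    opt.zero_grad()
    sched.step()  # advance the schedule once per optimizer step
    losses.append(loss.item())
```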
---

## Momentum and convergence

.pull-left[
```python
plt.figure(figsize=(8,6), layout="constrained")

for mt in [0, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99]:
    m = Model(X, y)
    opt = torch.optim.SGD(m.parameters(), lr=1e-4, momentum=mt)
    losses = m.fit(opt, n=10000)
    plt.plot(losses, label=f"momentum = {mt}")

plt.legend()
plt.show()
```
]

.pull-right[
<img src="Lec23_files/figure-html/unnamed-chunk-5-5.png" width="768" style="display: block; margin: auto;" />
]

---

## Optimizers and convergence

.pull-left[
```python
plt.figure(figsize=(8,6), layout="constrained")

opts = (torch.optim.SGD,
        torch.optim.Adam,
        torch.optim.Adagrad)

for opt_fn in opts:
    m = Model(X, y)
    opt = opt_fn(m.parameters(), lr=1e-4)
    losses = m.fit(opt, n=10000)
    plt.plot(losses, label=f"opt = {opt_fn}")

plt.legend()
plt.show()
```
]

.pull-right[
<img src="Lec23_files/figure-html/unnamed-chunk-6-7.png" width="768" style="display: block; margin: auto;" />
]

---
class: center, middle

## MNIST & Logistic models

---

## MNIST handwritten digits - simplified

```python
from sklearn.datasets import load_digits
digits = load_digits()
```

.pull-left[ .small[
```python
X = digits.data
X.shape
```

```
## (1797, 64)
```

```python
X[0:3]
```

```
## array([[ 0., 0., 5., 13., 9., 1., 0., 0., 0., 0., 13., 15., 10., 15., 5., 0., 0., 3., 15., 2., 0., 11., 8., 0., 0., 4., 12., 0., 0., 8., 8., 0., 0., 5., 8., 0., 0., 9.,
##         8., 0., 0., 4., 11., 0., 1., 12., 7., 0., 0., 2., 14., 5., 10., 12., 0., 0., 0., 0., 6., 13., 10., 0., 0., 0.],
##        [ 0., 0., 0., 12., 13., 5., 0., 0., 0., 0., 0., 11., 16., 9., 0., 0., 0., 0., 3., 15., 16., 6., 0., 0., 0., 7., 15., 16., 16., 2., 0., 0., 0., 0., 1., 16., 16., 3.,
##         0., 0., 0., 0., 1., 16., 16., 6., 0., 0., 0., 0., 1., 16., 16., 6., 0., 0., 0., 0., 0., 11., 16., 10., 0., 0.],
##        [ 0., 0., 0., 4., 15., 12., 0., 0., 0., 0., 3., 16., 15., 14., 0., 0., 0., 0., 8., 13., 8., 16., 0., 0., 0., 0., 1., 6., 15., 11., 0., 0., 0., 1., 8., 13., 15., 1.,
##         0., 0., 0., 9., 16., 16., 5., 0., 0., 0., 0., 3., 13., 16., 16., 11., 5., 0., 0., 0., 0., 3., 11., 16., 9., 0.]])
```
] ]

.pull-right[ .small[
```python
y = digits.target
y.shape
```

```
## (1797,)
```

```python
y[0:10]
```

```
## array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
```
] ]

---

## Example digits

<img src="Lec23_files/figure-html/unnamed-chunk-10-9.png" width="85%" style="display: block; margin: auto;" />

---

## Test train split

```python
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
  X, y, test_size=0.20, shuffle=True, random_state=1234
)
```

.pull-left[
```python
X_train.shape
```

```
## (1437, 64)
```

```python
y_train.shape
```

```
## (1437,)
```

```python
X_test.shape
```

```
## (360, 64)
```

```python
y_test.shape
```

```
## (360,)
```
]

--

.pull-right[
```python
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lr = LogisticRegression(penalty='none').fit(X_train, y_train)

accuracy_score(y_train, lr.predict(X_train))
```

```
## 1.0
```

```python
accuracy_score(y_test, lr.predict(X_test))
```

```
## 0.9583333333333334
```
]

---

## As Tensors

.pull-left[
```python
X_train = torch.from_numpy(X_train).float()
y_train = torch.from_numpy(y_train)
X_test = torch.from_numpy(X_test).float()
y_test = torch.from_numpy(y_test)
```
]

.pull-right[
```python
X_train.shape
```

```
## torch.Size([1437, 64])
```

```python
y_train.shape
```

```
## torch.Size([1437])
```

```python
X_test.shape
```

```
## torch.Size([360, 64])
```

```python
y_test.shape
```

```
## torch.Size([360])
```
]

---

## PyTorch Model

```python
class mnist_model(torch.nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.beta = torch.nn.Parameter(
          torch.randn(input_dim, output_dim, requires_grad=True)
        )
        self.intercept = torch.nn.Parameter(
          torch.randn(output_dim, requires_grad=True)
        )

    def forward(self, X):
        return (X @ self.beta + self.intercept).squeeze()

    def fit(self, X_train, y_train, X_test, y_test, lr=0.001, n=1000):
        opt = torch.optim.SGD(self.parameters(), lr=lr, momentum=0.9)
        losses = []

        for i in range(n):
            opt.zero_grad()
            loss = torch.nn.CrossEntropyLoss()(self(X_train), y_train)
            loss.backward()
            opt.step()
            losses.append(loss.item())

        return losses
```

---

## Cross entropy loss

```python
m = mnist_model(64, 10)
l = m.fit(X_train, y_train, X_test, y_test)
```

```python
plt.figure(figsize=(8,6), layout="constrained")
plt.plot(l)
plt.show()
```

<img src="Lec23_files/figure-html/unnamed-chunk-18-11.png" width="45%" style="display: block; margin: auto;" />
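---

## Aside - what `CrossEntropyLoss()` computes

`torch.nn.CrossEntropyLoss()` expects raw scores (logits) and integer class labels - internally it applies a log softmax to the scores and then averages the negative log probability of each observation's true class. A minimal sketch of that equivalence (the logits and labels here are made up):

```python
logits = torch.tensor([[2.0, 0.5, -1.0],
                       [0.1, 1.5,  0.3]])  # 2 observations, 3 classes
y      = torch.tensor([0, 2])              # true class indices

# built-in version
torch.nn.CrossEntropyLoss()(logits, y)

# by hand - mean of -log(softmax probability of the true class)
probs = torch.softmax(logits, dim=1)
-torch.log(probs[torch.arange(2), y]).mean()
```

Both values should agree up to floating point error.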
---

## Out of sample accuracy

.pull-left[
```python
m(X_test)
```

```
## tensor([[-60.8759, -17.2256, -9.1371, ..., 34.9462, -32.5647, -27.0246],
##         [ -1.5938, 72.1707, -43.8905, ..., -12.6269, 41.7249, 62.3089],
##         [-68.0310, -44.0990, -16.7110, ..., 59.5446, -23.9124, -29.6518],
##         ...,
##         [ 53.7474, 35.9603, -1.8724, ..., -59.9388, 33.7026, 30.7364],
##         [-63.3765, 11.4854, -83.6133, ..., 37.6233, -4.3290, 12.5134],
##         [-34.7887, -6.8412, 0.2186, ..., -16.7075, 24.0493, 2.7638]],
##        grad_fn=<SqueezeBackward0>)
```

```python
val, index = torch.max(m(X_test), dim=1)
```

```python
index
```

```
## tensor([7, 1, 7, 6, 0, 2, 6, 3, 6, 3, 7, 8, 7, 9, 4, 3, 1, 7, 8, 4, 0, 3, 9, 2,
##         3, 6, 6, 0, 5, 4, 1, 8, 1, 2, 3, 2, 7, 6, 4, 8, 6, 4, 4, 0, 9, 1, 8, 5,
##         4, 1, 4, 1, 7, 6, 8, 2, 9, 9, 8, 0, 8, 3, 1, 8, 8, 1, 3, 9, 1, 3, 9, 6,
##         9, 5, 2, 1, 9, 2, 1, 3, 8, 7, 3, 3, 1, 8, 7, 5, 1, 2, 6, 1, 9, 1, 6, 4,
##         5, 2, 2, 4, 5, 4, 7, 6, 5, 7, 2, 4, 1, 0, 7, 6, 1, 2, 9, 5, 2, 5, 0, 3,
##         2, 7, 6, 4, 3, 2, 1, 1, 6, 4, 6, 2, 7, 4, 7, 5, 0, 9, 1, 0, 5, 6, 7, 6,
##         3, 8, 3, 2, 0, 4, 0, 8, 5, 4, 6, 1, 1, 1, 6, 8, 7, 9, 0, 7, 9, 5, 4, 1,
##         3, 8, 6, 4, 7, 1, 5, 7, 4, 7, 4, 3, 2, 2, 1, 1, 4, 4, 3, 5, 5, 9, 4, 5,
##         5, 9, 3, 9, 0, 1, 2, 0, 8, 2, 8, 9, 2, 4, 6, 8, 3, 3, 1, 0, 8, 1, 8, 3,
##         6, 8, 7, 1, 8, 0, 4, 9, 7, 0, 5, 5, 6, 1, 3, 0, 5, 8, 2, 0, 9, 5, 6, 7,
##         8, 4, 1, 0, 5, 2, 5, 1, 6, 1, 7, 1, 2, 6, 4, 4, 6, 3, 3, 3, 2, 6, 5, 2,
##         9, 6, 7, 0, 1, 0, 4, 3, 1, 2, 7, 9, 8, 5, 9, 5, 7, 0, 0, 8, 4, 9, 4, 0,
##         7, 7, 3, 5, 3, 5, 3, 9, 7, 9, 6, 2, 7, 4, 8, 9, 1, 7, 9, 8, 5, 0, 8, 0,
##         1, 7, 0, 9, 5, 5, 9, 6, 1, 2, 3, 3, 6, 3, 2, 9, 3, 0, 3, 4, 1, 6, 1, 8,
##         5, 0, 9, 2, 7, 2, 3, 5, 2, 6, 3, 4, 1, 5, 0, 5, 4, 6, 3, 2, 5, 0, 7, 3])
```
]

.pull-right[
```python
(index == y_test).sum()
```

```
## tensor(324)
```

```python
(index == y_test).sum() / len(y_test)
```

```
## tensor(0.9000)
```
]
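---

## Aside - `argmax()`

Since we only need the index of the largest score (and not the value itself), `torch.argmax()` is a slightly tidier way to get the predicted classes - the following sketch should reproduce the accuracy above:

```python
index = torch.argmax(m(X_test), dim=1)
(index == y_test).float().mean()
```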
---

## Calculating Accuracy

.small[
```python
class mnist_model(torch.nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.beta = torch.nn.Parameter(
          torch.randn(input_dim, output_dim, requires_grad=True)
        )
        self.intercept = torch.nn.Parameter(
          torch.randn(output_dim, requires_grad=True)
        )

    def forward(self, X):
        return (X @ self.beta + self.intercept).squeeze()

    def fit(self, X_train, y_train, X_test, y_test, lr=0.001, n=1000, acc_step=10):
        opt = torch.optim.SGD(self.parameters(), lr=lr, momentum=0.9)
        losses, train_acc, test_acc = [], [], []

        for i in range(n):
            opt.zero_grad()
            loss = torch.nn.CrossEntropyLoss()(self(X_train), y_train)
            loss.backward()
            opt.step()
            losses.append(loss.item())

            if (i+1) % acc_step == 0:
                val, train_pred = torch.max(self(X_train), dim=1)
                val, test_pred  = torch.max(self(X_test), dim=1)

                train_acc.append( (train_pred == y_train).sum() / len(y_train) )
                test_acc.append( (test_pred == y_test).sum() / len(y_test) )

        return (losses, train_acc, test_acc)
```
]

---

## Performance

```python
loss, train_acc, test_acc = mnist_model(64, 10).fit(
  X_train, y_train, X_test, y_test, acc_step=10, n=3000
)
```

```python
plt.figure(figsize=(8,6), layout="constrained")
plt.plot(train_acc, label="train accuracy")
plt.plot(test_acc, label="test accuracy")
plt.legend()
plt.show()
```

<img src="Lec23_files/figure-html/unnamed-chunk-24-13.png" width="45%" style="display: block; margin: auto;" />

---

## NN Layers

```python
class mnist_nn_model(torch.nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.linear = torch.nn.Linear(input_dim, output_dim)

    def forward(self, X):
        return self.linear(X)

    def fit(self, X_train, y_train, X_test, y_test, lr=0.001, n=1000, acc_step=10):
        opt = torch.optim.SGD(self.parameters(), lr=lr, momentum=0.9)
        losses, train_acc, test_acc = [], [], []

        for i in range(n):
            opt.zero_grad()
            loss = torch.nn.CrossEntropyLoss()(self(X_train), y_train)
            loss.backward()
            opt.step()
            losses.append(loss.item())

            if (i+1) % acc_step == 0:
                val, train_pred = torch.max(self(X_train), dim=1)
                val, test_pred  = torch.max(self(X_test), dim=1)

                train_acc.append( (train_pred == y_train).sum() / len(y_train) )
                test_acc.append( (test_pred == y_test).sum() / len(y_test) )

        return (losses, train_acc, test_acc)
```

---

## Linear layer parameters

```python
m = mnist_nn_model(64, 10)

m.parameters()
```

```
## <generator object Module.parameters at 0x2a5a1cba0>
```

```python
len(list(m.parameters()))
```

```
## 2
```

```python
list(m.parameters())[0].shape
```

```
## torch.Size([10, 64])
```

```python
list(m.parameters())[1].shape
```

```
## torch.Size([10])
```

--

Applies a linear transform to the incoming data:

`$$y = x A^T + b$$`
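---

## Aside - checking the linear transform

We can confirm the `$y = x A^T + b$` formula directly - the two parameters above are the layer's `weight` attribute (`$A$`, shape `[10, 64]`) and its `bias` (`$b$`). A quick sketch, assuming `m` and `X_train` are as defined above:

```python
W, b = m.linear.weight, m.linear.bias

# the layer and the explicit affine transform should agree
torch.allclose(m(X_train), X_train @ W.T + b)
```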
---

## Performance

```python
loss, train_acc, test_acc = m.fit(X_train, y_train, X_test, y_test, n=1500)
```

<img src="Lec23_files/figure-html/unnamed-chunk-28-15.png" width="1152" style="display: block; margin: auto;" />

---
class: center, middle

## Feedforward Neural Network

---

## FNN Model

.small[
```python
class mnist_fnn_model(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim,
                 nl_step = torch.nn.ReLU(), seed=1234):
        super().__init__()
        self.l1 = torch.nn.Linear(input_dim, hidden_dim)
        self.nl = nl_step
        self.l2 = torch.nn.Linear(hidden_dim, output_dim)

    def forward(self, X):
        out = self.l1(X)
        out = self.nl(out)
        out = self.l2(out)
        return out

    def fit(self, X_train, y_train, X_test, y_test, lr=0.001, n=1000, acc_step=10):
        opt = torch.optim.SGD(self.parameters(), lr=lr, momentum=0.9)
        losses, train_acc, test_acc = [], [], []

        for i in range(n):
            opt.zero_grad()
            loss = torch.nn.CrossEntropyLoss()(self(X_train), y_train)
            loss.backward()
            opt.step()
            losses.append(loss.item())

            if (i+1) % acc_step == 0:
                val, train_pred = torch.max(self(X_train), dim=1)
                val, test_pred  = torch.max(self(X_test), dim=1)

                train_acc.append( (train_pred == y_train).sum() / len(y_train) )
                test_acc.append( (test_pred == y_test).sum() / len(y_test) )

        return (losses, train_acc, test_acc)
```
]

---

## Model parameters

```python
m = mnist_fnn_model(64,64,10)
len(list(m.parameters()))
```

```
## 4
```

```python
for i, p in enumerate(m.parameters()):
    print("Param", i, p.shape)
```

```
## Param 0 torch.Size([64, 64])
## Param 1 torch.Size([64])
## Param 2 torch.Size([10, 64])
## Param 3 torch.Size([10])
```

---

## Performance - ReLU

```python
loss, train_acc, test_acc = mnist_fnn_model(64,64,10).fit(
  X_train, y_train, X_test, y_test, n=2000
)

test_acc[-5:]
```

```
## [tensor(0.9750), tensor(0.9750), tensor(0.9750), tensor(0.9750), tensor(0.9750)]
```

<img src="Lec23_files/figure-html/unnamed-chunk-32-17.png" width="75%" style="display: block; margin: auto;" />

---

## Non-linear activation functions

.pull-left[ .small[
`$$\text{Tanh}(x) = \frac{\exp(x)-\exp(-x)}{\exp(x) + \exp(-x)}$$`

<img src="imgs/torch_Tanh.png" width="55%" style="display: block; margin: auto;" />

`$$\text{Sigmoid}(x) = \frac{1}{1+\exp(-x)}$$`

<img src="imgs/torch_Sigmoid.png" width="55%" style="display: block; margin: auto;" />
] ]

.pull-right[ .small[
`$$\text{ReLU}(x) = \max(0,x)$$`

<img src="imgs/torch_ReLU.png" width="55%" style="display: block; margin: auto;" />
] ]
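---

## Aside - activations by hand

The module versions (`torch.nn.ReLU()`, etc.) are just callable wrappers around these elementwise formulas, so we can check them directly. A small sketch (the input values are made up):

```python
z = torch.tensor([-2.0, -0.5, 0.0, 0.5, 2.0])

torch.allclose( torch.nn.Tanh()(z),    (torch.exp(z) - torch.exp(-z)) / (torch.exp(z) + torch.exp(-z)) )
torch.allclose( torch.nn.Sigmoid()(z), 1 / (1 + torch.exp(-z)) )
torch.allclose( torch.nn.ReLU()(z),    torch.clamp(z, min=0) )
```

All three comparisons should return `True`.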
---

## Performance - tanh

```python
loss, train_acc, test_acc = mnist_fnn_model(64,64,10, nl_step=torch.nn.Tanh()).fit(
  X_train, y_train, X_test, y_test, n=2000
)

test_acc[-5:]
```

```
## [tensor(0.9694), tensor(0.9694), tensor(0.9694), tensor(0.9694), tensor(0.9694)]
```

<img src="Lec23_files/figure-html/unnamed-chunk-37-1.png" width="75%" style="display: block; margin: auto;" />

---

## Performance - Sigmoid

```python
loss, train_acc, test_acc = mnist_fnn_model(64,64,10, nl_step=torch.nn.Sigmoid()).fit(
  X_train, y_train, X_test, y_test, n=2000
)

test_acc[-5:]
```

```
## [tensor(0.9556), tensor(0.9556), tensor(0.9556), tensor(0.9556), tensor(0.9556)]
```

<img src="Lec23_files/figure-html/unnamed-chunk-39-3.png" width="75%" style="display: block; margin: auto;" />

---

## Multilayer FNN Model

.small[
```python
class mnist_fnn2_model(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim,
                 nl_step = torch.nn.ReLU(), seed=1234):
        super().__init__()
        self.l1 = torch.nn.Linear(input_dim, hidden_dim)
        self.nl = nl_step
        self.l2 = torch.nn.Linear(hidden_dim, hidden_dim)
        self.l3 = torch.nn.Linear(hidden_dim, output_dim)

    def forward(self, X):
        out = self.l1(X)
        out = self.nl(out)
        out = self.l2(out)
        out = self.nl(out)
        out = self.l3(out)
        return out

    def fit(self, X_train, y_train, X_test, y_test, lr=0.001, n=1000, acc_step=10):
        loss_fn = torch.nn.CrossEntropyLoss()
        opt = torch.optim.SGD(self.parameters(), lr=lr, momentum=0.9)
        losses, train_acc, test_acc = [], [], []

        for i in range(n):
            opt.zero_grad()
            loss = loss_fn(self(X_train), y_train)
            loss.backward()
            opt.step()
            losses.append(loss.item())

            if (i+1) % acc_step == 0:
                val, train_pred = torch.max(self(X_train), dim=1)
                val, test_pred  = torch.max(self(X_test), dim=1)

                train_acc.append( (train_pred == y_train).sum() / len(y_train) )
                test_acc.append( (test_pred == y_test).sum() / len(y_test) )

        return (losses, train_acc, test_acc)
```
]

---

## Performance

```python
loss, train_acc, test_acc = mnist_fnn2_model(64,64,10, nl_step=torch.nn.ReLU()).fit(
  X_train, y_train, X_test, y_test, n=1000
)

test_acc[-5:]
```

```
## [tensor(0.9611), tensor(0.9611), tensor(0.9611), tensor(0.9611), tensor(0.9611)]
```

<img src="Lec23_files/figure-html/unnamed-chunk-42-5.png" width="75%" style="display: block; margin: auto;" />

---
class: center, middle

## Convolutional NN

---

## 2d convolutions

.footnote[[Source](https://towardsdatascience.com/intuitively-understanding-convolutions-for-deep-learning-1f6f42faee1)]

.pull-left[
<img src="imgs/tds_2dconv.gif" style="display: block; margin: auto;" />
]

--

.pull-right[
<img src="imgs/tds_2dconv2.gif" style="display: block; margin: auto;" />
]

---

## `nn.Conv2d()`

```python
cv = torch.nn.Conv2d(
  in_channels=1, out_channels=4,
  kernel_size=3,
  stride=1, padding=1
)
```

--

```python
list(cv.parameters())
```

```
## [Parameter containing:
## tensor([[[[ 0.3185, -0.1251,  0.0127],
##           [ 0.2319,  0.1078,  0.0828],
##           [-0.2651,  0.1484, -0.0063]]],
## 
## 
##         [[[-0.3191,  0.1816, -0.0369],
##           [ 0.0180,  0.0299, -0.0892],
##           [ 0.2278, -0.2236,  0.2685]]],
## 
## 
##         [[[ 0.0027, -0.3022,  0.0216],
##           [-0.0097, -0.1375,  0.0558],
##           [ 0.0894,  0.1148, -0.2137]]],
## 
## 
##         [[[-0.0667,  0.0374, -0.1312],
##           [ 0.1726,  0.1751, -0.2822],
##           [-0.1157, -0.1920, -0.1563]]]], requires_grad=True), Parameter containing:
## tensor([ 0.1464, -0.3314,  0.2575,  0.2789], requires_grad=True)]
```
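---

## Aside - counting `Conv2d()` parameters

The shapes above follow directly from the layer definition: one `kernel_size` × `kernel_size` kernel per (output channel, input channel) pair, plus one bias per output channel. A quick sanity check:

```python
# weights: out_channels * in_channels * 3 * 3 = 4 * 1 * 3 * 3 = 36
# biases:  out_channels                       = 4
sum(p.numel() for p in cv.parameters())  # should be 40
```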
---

## Applying `Conv2d()`

```python
X_train[[0]]
```

```
## tensor([[ 0., 0., 0., 10., 11., 0., 0., 0., 0., 0., 9., 16., 6., 0.,
##          0., 0., 0., 0., 15., 13., 0., 0., 0., 0., 0., 0., 14., 10.,
##          0., 0., 0., 0., 0., 1., 15., 12., 8., 2., 0., 0., 0., 0.,
##          12., 16., 16., 16., 10., 1., 0., 0., 7., 16., 12., 12., 16., 4.,
##          0., 0., 0., 9., 15., 12., 5., 0.]])
```

```python
X_train[[0]].shape
```

```
## torch.Size([1, 64])
```

--

```python
cv(X_train[[0]])
```

```
## RuntimeError: Expected 3D (unbatched) or 4D (batched) input to conv2d, but got input of size: [1, 64]
```

--

```python
cv(X_train[[0]].view(1,8,8))
```

```
## tensor([[[ 1.4635e-01, 8.9731e-02, 2.2091e+00, 2.0856e+00, 3.0008e-01,
##           1.1069e+00, 1.4635e-01, 1.4635e-01],
##          [ 1.4635e-01, 7.9740e-01, 4.7118e+00, 1.2956e+00, 2.8663e+00,
##           5.0414e+00, 1.4635e-01, 1.4635e-01],
##          [ 1.4635e-01, 1.4148e+00, 3.9304e+00, 3.7392e+00, 4.8559e+00,
##           2.0574e+00, 1.4635e-01, 1.4635e-01],
##          [ 1.4006e-01, 1.5501e+00, 2.6560e+00, 5.3757e+00, 4.5996e+00,
##           -1.6774e+00, -3.8376e-01, 1.4635e-01],
##          [ 2.2918e-01, 1.5985e+00, 3.0432e+00, 7.8808e+00, 5.1742e+00,
##           2.8690e-01, -2.1535e+00, -2.3558e+00],
##          [ 1.5903e-01, 1.1613e+00, 2.2963e+00, 9.7990e+00, 7.2166e+00,
##           7.2061e+00, 4.8221e+00, -1.0744e+00],
##          [ 1.4635e-01, 8.7829e-01, 8.7060e-01, 7.7516e+00, 9.2051e+00,
##           6.5414e+00, 6.4035e+00, 6.0225e+00],
##          [ 1.4635e-01, 2.3512e-01, 2.1870e-01, 2.7383e+00, 8.5907e+00,
##           7.8556e+00, 5.3387e+00, 5.9015e+00]],
## 
##         [[-3.3139e-01, 2.0855e+00, 1.0611e+00, -9.2929e-01, 2.4813e+00,
##           1.2338e+00, -3.3139e-01, -3.3139e-01],
##          [-3.3139e-01, 2.8943e+00, -1.7214e+00, 1.6945e+00, 1.9052e+00,
##           -3.7329e+00, -3.3139e-01, -3.3139e-01],
##          [-3.3139e-01, 1.7583e+00, -4.4349e-01, 1.0937e+00, -1.8343e+00,
##           -2.2458e+00, -3.3139e-01, -3.3139e-01],
##          [-6.2847e-02, 1.6707e+00, 1.5358e+00, 6.7726e-01, -2.8171e+00,
##           1.0439e+00, 1.2424e-01, -3.3139e-01],
##          [-4.2056e-01, 1.0664e+00, 2.8519e+00, 3.8648e-01, 1.1191e+00,
##           2.6252e+00, 1.3820e+00, 1.7231e+00],
##          [-3.6833e-01, 1.0592e-01, 3.2942e+00, -2.7258e+00, 7.4348e-01,
##           1.7019e+00, -2.4103e-01, 2.6293e+00],
##          [-3.3139e-01, -1.3988e+00, 2.4566e+00, -2.9491e-01, -1.6257e+00,
##           -1.6749e+00, -1.7032e+00, -1.7933e+00],
##          [-3.3139e-01, -5.8998e-01, -4.5368e-01, -1.1705e+00, -4.1593e+00,
##           -2.3883e+00, -1.0363e+00, -4.6198e+00]],
## 
##         [[ 2.5754e-01, -1.6656e+00, -1.5697e+00, 8.5662e-01, 7.6729e-01,
##           6.8691e-01, 2.5754e-01, 2.5754e-01],
##          [ 2.5754e-01, -2.4453e+00, -9.2579e-01, -1.6456e+00, -2.8589e+00,
##           2.2836e-01, 2.5754e-01, 2.5754e-01],
##          [ 2.5754e-01, -1.7021e+00, -3.9824e+00, -3.9580e+00, -7.4595e-01,
##           2.7345e-01, 2.5754e-01, 2.5754e-01],
##          [ 4.3848e-02, -1.7271e+00, -6.1138e+00, -4.1333e+00, 1.7588e+00,
##           1.2023e+00, 4.3632e-01, 2.5754e-01],
##          [ 3.1336e-01, -1.3042e+00, -7.2006e+00, -4.5856e+00, -9.7218e-01,
##           1.0355e+00, 2.6030e+00, 1.2663e+00],
##          [ 2.7915e-01, -5.4638e-01, -7.3856e+00, -4.6808e+00, -3.3033e+00,
##           -3.0910e+00, 8.4359e-01, 1.9124e+00],
##          [ 2.5754e-01, 9.0769e-01, -5.0158e+00, -7.9702e+00, -5.3626e+00,
##           -3.5424e+00, -3.1469e+00, -2.7673e-01],
##          [ 2.5754e-01, 4.0883e-01, -1.0099e+00, -4.7002e+00, -4.5473e+00,
##           -4.5081e+00, -5.2640e+00, -9.5759e-01]],
## 
##         [[ 2.7892e-01, -1.1277e+00, -6.7712e+00, -6.1245e+00, 9.2750e-01,
##           1.4826e+00, 2.7892e-01, 2.7892e-01],
##          [ 2.7892e-01, -4.6052e+00, -8.8830e+00, -2.3599e+00, 2.3306e+00,
##           5.8103e-01, 2.7892e-01, 2.7892e-01],
##          [ 2.7892e-01, -7.3229e+00, -6.7756e+00, 8.1508e-01, 5.2251e-01,
##           -1.2101e-01, 2.7892e-01, 2.7892e-01],
##          [ 1.2263e-01, -8.1761e+00, -6.1066e+00, -1.3577e+00, -2.0990e+00,
##           -1.0308e+00, 4.7451e-02, 2.7892e-01],
##          [-3.2851e-03, -7.4911e+00, -5.9007e+00, -4.8088e+00, -4.9038e+00,
##           -4.4762e+00, -3.3035e+00, -1.0704e+00],
##          [ 1.4772e-01, -6.1321e+00, -7.0593e+00, -6.7212e+00, -5.4672e+00,
##           -3.6314e+00, -7.0955e-01, -4.3996e-01],
##          [ 2.7892e-01, -3.2708e+00, -6.0676e+00, -5.4703e+00, -6.6090e+00,
##           -6.6654e+00, 8.5040e-01, 2.5325e+00],
##          [ 2.7892e-01, -6.3944e-01, -4.0984e+00, -3.8209e+00, -1.1199e+00,
##           1.1073e+00, 2.4985e+00, 2.2468e-01]]], grad_fn=<SqueezeBackward1>)
```

---

## Pooling

```python
x = torch.tensor(
  [[[0,0,0,0],
    [0,1,2,0],
    [0,3,4,0],
    [0,0,0,0]]],
  dtype=torch.float
)
x.shape
```

```
## torch.Size([1, 4, 4])
```

.pull-left[
```python
p = torch.nn.MaxPool2d(kernel_size=2, stride=1)
p(x)
```

```
## tensor([[[1., 2., 2.],
##          [3., 4., 4.],
##          [3., 4., 4.]]])
```

```python
p = torch.nn.MaxPool2d(kernel_size=3, stride=1, padding=1)
p(x)
```

```
## tensor([[[1., 2., 2., 2.],
##          [3., 4., 4., 4.],
##          [3., 4., 4., 4.],
##          [3., 4., 4., 4.]]])
```
]

.pull-right[
```python
p = torch.nn.AvgPool2d(kernel_size=2)
p(x)
```

```
## tensor([[[0.2500, 0.5000],
##          [0.7500, 1.0000]]])
```

```python
p = torch.nn.AvgPool2d(kernel_size=2, padding=1)
p(x)
```

```
## tensor([[[0.0000, 0.0000, 0.0000],
##          [0.0000, 2.5000, 0.0000],
##          [0.0000, 0.0000, 0.0000]]])
```
]

---

## Convolutional model

```python
class mnist_conv_model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.cnn = torch.nn.Conv2d(
          in_channels=1, out_channels=8,
          kernel_size=3, stride=1, padding=1
        )
        self.relu = torch.nn.ReLU()
        self.pool = torch.nn.MaxPool2d(kernel_size=2)
        self.lin = torch.nn.Linear(8 * 4 * 4, 10)

    def forward(self, X):
        out = self.cnn(X.view(-1, 1, 8, 8))      # [n, 64] -> [n, 1, 8, 8] -> [n, 8, 8, 8]
        out = self.relu(out)
        out = self.pool(out)                     # [n, 8, 8, 8] -> [n, 8, 4, 4]
        out = self.lin(out.view(-1, 8 * 4 * 4))  # flatten to [n, 128] -> [n, 10]
        return out

    def fit(self, X_train, y_train, X_test, y_test, lr=0.001, n=1000, acc_step=10):
        opt = torch.optim.SGD(self.parameters(), lr=lr, momentum=0.9)
        losses, train_acc, test_acc = [], [], []

        for i in range(n):
            opt.zero_grad()
            loss = torch.nn.CrossEntropyLoss()(self(X_train), y_train)
            loss.backward()
            opt.step()
            losses.append(loss.item())

            if (i+1) % acc_step == 0:
                val, train_pred = torch.max(self(X_train), dim=1)
                val, test_pred  = torch.max(self(X_test), dim=1)

                train_acc.append( (train_pred == y_train).sum() / len(y_train) )
                test_acc.append( (test_pred == y_test).sum() / len(y_test) )

        return (losses, train_acc, test_acc)
```

---

## Performance

```python
loss, train_acc, test_acc = mnist_conv_model().fit(
  X_train, y_train, X_test, y_test, n=1000
)

test_acc[-5:]
```

```
## [tensor(0.9667), tensor(0.9667), tensor(0.9667), tensor(0.9667), tensor(0.9667)]
```

<img src="Lec23_files/figure-html/unnamed-chunk-55-1.png" width="75%" style="display: block; margin: auto;" />
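---

## Aside - predicting a single digit

A fitted model can be used directly for prediction. Below is a minimal sketch (not from the original demo) - it refits a `mnist_conv_model` and predicts the class of the first test digit; the index `0` is arbitrary.

```python
m = mnist_conv_model()
loss, train_acc, test_acc = m.fit(X_train, y_train, X_test, y_test, n=1000)

with torch.no_grad():  # no gradients needed for prediction
    pred = torch.argmax(m(X_test[[0]]), dim=1)

pred, y_test[0]  # predicted vs true class
```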