Simple practical examples to give you a good understanding of how all these NN/AI things really work

Backpropagation (backward propagation of errors) is a widely used algorithm for training feedforward neural networks. It computes the gradient of the loss function with respect to the weights of the network. The main idea is to break a big function into small parts and combine the partial derivatives of those parts, using the chain rule, to obtain the derivative of the whole function.

So training the model is basically solving this equation:

\begin{equation} loss(model(X), Target) = 0 \end{equation}

And because solving this directly can be a very hard task, here come backpropagation and gradient descent (updating the weights by a small amount based on the gradient, so that we move in the direction of loss minimization).
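To make the update rule concrete, here is a minimal sketch of a single gradient-descent step on one weight (the names w, grad and lr are just illustrative, not part of any library):

lr = 0.01          # learning rate: how big a step we take
w = 0.5            # current weight value
grad = 2.0         # dLoss/dw, as computed by backpropagation
w = w - lr * grad  # step against the gradient to decrease the loss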

All of this is based on the simple differentiation chain rule:

\begin{equation} \frac{df}{dx} = \frac{df}{dy} \cdot \frac{dy}{dx} \end{equation}
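For instance, taking $f(y) = y^2$ with $y = 3x$ (so $f = 9x^2$):

\begin{equation} \frac{df}{dx} = \frac{df}{dy} \cdot \frac{dy}{dx} = 2y \cdot 3 = 18x \end{equation}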

For example, take a simple function with 2 nodes (operations): a summation node q = x + y feeding a multiplication node f = q * z.

Let's say we have 3 variables x = -2, y = 5, z = -4. Then q = x + y = 3, the result is f = q * z = -12, and our target for training is -13. A simple loss = f - target = 1.

Now we need to propagate our error (1) back to x, y and z. To do this we need the partial derivatives of every function (operation):

\begin{align*}
\text{Multiplication:}\quad & \frac{\partial (q \cdot z)}{\partial q} = z, \quad \frac{\partial (q \cdot z)}{\partial z} = q \\
\text{Summation:}\quad & \frac{\partial (x + y)}{\partial x} = 1, \quad \frac{\partial (x + y)}{\partial y} = 1
\end{align*}

And our chain is:

\begin{align*}
\frac{df}{dx} &= 1 \cdot (z \cdot \text{loss}) = 1 \cdot (-4) \cdot 1 = -4 \\
\frac{df}{dy} &= 1 \cdot (z \cdot \text{loss}) = 1 \cdot (-4) \cdot 1 = -4 \\
\frac{df}{dz} &= q \cdot \text{loss} = 3 \cdot 1 = 3
\end{align*}
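To double-check the arithmetic, here is a minimal sketch in plain Python that replays this backward pass by hand (the variable names are just illustrative):

x, y, z = -2., 5., -4.
q = x + y            # summation node: q = 3
f = q * z            # multiplication node: f = -12
loss_grad = 1.       # the error we propagate back: loss = f - target = 1

df_dz = q * loss_grad        # 3.0
df_dq = z * loss_grad        # -4.0
df_dx = 1. * df_dq           # -4.0 (summation passes the gradient through)
df_dy = 1. * df_dq           # -4.0
print(df_dx, df_dy, df_dz)   # -4.0 -4.0 3.0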

Let's build a simple Autograd example

Here we will build a simple automatic differentiation example from the formulas above.

import numpy as np

class Tensor:
    def __init__(self, data, requires_grad=False):
        self.data = data
        if not isinstance(data, np.ndarray):
            self.data = np.array(data)
        # whether to run backpropagation or not
        self.requires_grad = requires_grad
        # tensor gradient
        self._grad = None
        # operation if this tensor was used in it
        self._grad_fn = None

    @property
    def shape(self):
        return self.data.shape

    @property
    def grad_fn(self):
        if not self.requires_grad:
            raise Exception('This tensor is not backpropagated')
        return self._grad_fn

    @property
    def grad(self):
        return self._grad

    def backward(self, grad=None):
        if not self.grad_fn:
            return False

        if grad is None and self._grad is None:
            # in case this is the last (loss) tensor
            grad = self.__class__(1., requires_grad=False)
        elif self.grad is not None:
            grad = self._grad

        if not self.requires_grad:
            raise Exception('This tensor is not backpropagated')

        self.grad_fn.backward(grad)
        return True

    def __str__(self):
        return f'Tensor({str(self.data)})'

    def add_grad(self, grad):
        if self._grad is None:
            self._grad = grad
        else:
            self._grad += grad

    def __add__(self, o):
        if self.data is not None:
            self.data += o.data
            return self
        self.data = o.data
        return self


class Op:
    def forward(self):
        raise NotImplementedError

    def backward(self, grad):
        raise NotImplementedError

    def __call__(self, *args):
        self.out = self.forward(*args)
        self.out._grad_fn = self
        return self.out


class AddOp(Op):
    '''Summation operation with 2 tensors'''

    def forward(self, x: Tensor, y: Tensor):
        self.x = x
        self.y = y
        # the created tensor should be backpropagated if at least one
        # of the inputs is backpropagated
        requires_grad = x.requires_grad or y.requires_grad
        return Tensor(x.data + y.data, requires_grad=requires_grad)

    def backward(self, grad):
        if self.x.requires_grad:
            # as we have a matrix operation, one of the parameters can
            # have a partial shape; in such a scenario we need to sum
            # gradient values over the missing axis
            if self.x.shape != grad.shape:
                axis = np.argmax(np.abs(np.array(self.x.shape) - np.array(grad.shape)))
                self.x.add_grad(Tensor(grad.data.sum(axis=axis, keepdims=True)))
            else:
                self.x.add_grad(grad)
            if self.x.grad_fn:
                self.x.backward()
        if self.y.requires_grad:
            if self.y.shape != grad.shape:
                axis = np.argmax(np.abs(np.array(self.y.shape) - np.array(grad.shape)))
                self.y.add_grad(Tensor(grad.data.sum(axis=axis, keepdims=True)))
            else:
                self.y.add_grad(grad)
            if self.y.grad_fn:
                self.y.backward()


class MulOp(Op):
    '''Multiplication operation with 2 tensors'''

    def forward(self, x: Tensor, y: Tensor):
        self.x = x
        self.y = y
        requires_grad = x.requires_grad or y.requires_grad
        return Tensor(x.data * y.data, requires_grad=requires_grad)

    def backward(self, grad):
        if self.x.requires_grad:
            # d(x*y)/dx = y, so scale the incoming gradient by y
            self.x.add_grad(Tensor(grad.data * self.y.data, False))
            if self.x.grad_fn:
                self.x.backward()
        if self.y.requires_grad:
            # d(x*y)/dy = x
            self.y.add_grad(Tensor(grad.data * self.x.data, False))
            if self.y.grad_fn:
                self.y.backward()

Now let's see what happens:

x = np.random.random((3,3))
a = np.random.random((3,3))
b = np.random.random((3,))
loss = np.random.random((3,3))

_x = Tensor(x, requires_grad=True)
_a = Tensor(a, requires_grad=True)
_b = Tensor(b, requires_grad=True)

# here we use the parameter _a 2 times in different operations.
# that means that the gradient will be calculated twice for _a,
# and thus we need to sum the two gradients to get the resulting
# gradient for _a
y = MulOp()(
    AddOp()(
        MulOp()(
            _x,
            _a
        ),
        _b
    ),
    _a
)

print('y = ', y)
print()
print('y.grad_fn = ', y.grad_fn)
print()
y.backward(Tensor(loss))
print('_x.grad = ', _x.grad)
print()
print('_a.grad = ', _a.grad)
print()
print('_b.grad = ', _b.grad)
print()
print('y.grad = ', y.grad)

y =  Tensor([[0.45119641 0.25401614 0.36177802]
 [0.2445061  0.22747291 0.41594346]
 [0.35465603 0.26046455 1.26796941]])

y.grad_fn =  <__main__.MulOp object at 0x...>

_x.grad =  Tensor([[0.08710775 0.00380129 0.14389783]
 [0.09573756 0.05524222 0.07022905]
 [0.03903106 0.09778539 0.79086884]])

_a.grad =  Tensor([[0.29088065 0.03449309 0.90931509]
 [0.44720764 0.85727129 0.14349873]
 [0.05794243 0.58307861 1.63786025]])

_b.grad =  Tensor([[0.45979957 0.48953863 1.26903921]])

y.grad =  None

Checking with PyTorch Autograd

import torch

_x = torch.tensor(x, requires_grad=True)
_a = torch.tensor(a, requires_grad=True)
_b = torch.tensor(b, requires_grad=True)

y = (_a*_x + _b) * _a

print('y = ', y)
print()
print('y.grad_fn = ', y.grad_fn)
print()
y.backward(torch.tensor(loss))
print('_x.grad = ', _x.grad)
print()
print('_a.grad = ', _a.grad)
print()
print('_b.grad = ', _b.grad)
print()
print('y.grad = ', y.grad)

y =  tensor([[0.4512, 0.2540, 0.3618],
        [0.2445, 0.2275, 0.4159],
        [0.3547, 0.2605, 1.2680]], dtype=torch.float64, grad_fn=<MulBackward0>)

y.grad_fn =  <MulBackward0 object at 0x...>

_x.grad =  tensor([[0.0871, 0.0038, 0.1439],
        [0.0957, 0.0552, 0.0702],
        [0.0390, 0.0978, 0.7909]], dtype=torch.float64)

_a.grad =  tensor([[0.2909, 0.0345, 0.9093],
        [0.4472, 0.8573, 0.1435],
        [0.0579, 0.5831, 1.6379]], dtype=torch.float64)

_b.grad =  tensor([0.4598, 0.4895, 1.2690], dtype=torch.float64)

y.grad =  None

As you can see, we got the same gradients as in our simple autograd solution.
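If you keep the gradients from the custom run before rebinding _x, _a and _b to PyTorch tensors, you can also verify the match programmatically. A minimal sketch, assuming custom_x_grad, custom_a_grad and custom_b_grad are hypothetical names for the saved .grad.data arrays of our Tensor objects:

# hypothetical: these arrays were saved right after our custom backward pass,
# e.g. custom_x_grad = _x.grad.data, before _x was rebound to a torch tensor
print(np.allclose(custom_x_grad, _x.grad.numpy()))          # True
print(np.allclose(custom_a_grad, _a.grad.numpy()))          # True
# our AddOp kept the summed axis (keepdims=True), so flatten before comparing
print(np.allclose(custom_b_grad.ravel(), _b.grad.numpy()))  # True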

Let's build Neural Networks and solve 2 simple tasks

For simplicity we will not use autograd here and will derive the gradient of each layer by hand.

class Layer:
    def forward(self):
        raise NotImplementedError

    def backward(self, grad):
        raise NotImplementedError

    def __call__(self, *args):
        return self.forward(*args)


class Sigmoid(Layer):
    def forward(self, x):
        # keep the output: the sigmoid derivative is s*(1-s), where s is the output
        self.out = 1/(1+np.exp(-x))
        return self.out

    def backward(self, grad):
        grad_input = self.out*(1-self.out) * grad
        return grad_input


class Relu(Layer):
    def forward(self, x):
        self.x = x
        return np.maximum(np.zeros_like(x), x)

    def backward(self, grad):
        # the gradient passes through only where the input was positive
        grad_input = (self.x > 0) * grad
        return grad_input


class SoftmaxCrossentropyWithLogits(Layer):
    def forward(self, x, y):
        self.x = x
        self.y = y
        # subtract the row max for numerical stability; it does not change the softmax
        exps = np.exp(x - x.max(axis=-1, keepdims=True))
        self.softmax = exps / np.sum(exps, axis=-1, keepdims=True)
        probs = self.softmax[np.arange(x.shape[0]), y]
        log_likelihood = -np.log(probs)
        loss = np.sum(log_likelihood) / x.shape[0]
        return loss

    def backward(self, grad=None):
        batch = self.x.shape[0]
        # d_loss / d_logits = softmax - one_hot(y), averaged over the batch
        grad = self.softmax
        grad[np.arange(batch), self.y] -= 1
        grad = grad/batch
        return grad


class MSE(Layer):
    def forward(self, x, y):
        self.x = x
        self.y = y
        return ((x - y)**2) / (self.x.shape[0]*2)

    def backward(self, grad=None):
        # loss = (x - y)**2 / (2n)
        # d_loss / dx = 2*(x - y) / (2n) = (x - y) / n
        return (self.x - self.y) / self.x.shape[0]


class Linear(Layer):
    def __init__(self, input, output, lr=0.0001):
        self.A = 2*np.random.random((input, output)) - 1
        self.b = 2*np.random.random((output,)) - 1
        self.lr = lr

    def forward(self, x):
        self.x = x
        return np.dot(x, self.A) + self.b

    def backward(self, grad):
        # d_layer / db = 1, so the bias gradient is the incoming
        # gradient summed over the batch
        b_grad = grad.mean(axis=0)*self.x.shape[0]
        # d_layer / dA = x
        A_grad = np.dot(self.x.T, grad)
        # compute df / dx = df / d_layer * d_layer / dx
        # (df / d_layer == grad), before the weights change
        grad_input = np.dot(grad, self.A.T)
        # as this layer has some weights, we update them using gradient descent
        self.A -= A_grad * self.lr
        self.b -= b_grad * self.lr
        return grad_input
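Before using these layers, it is worth sanity-checking one of the hand-derived gradients. Below is a minimal finite-difference sketch for Linear (my own addition, not part of the original layers); lr=0 keeps backward() from modifying the weights, and we check only the gradient with respect to the input x:

np.random.seed(0)
layer = Linear(3, 2, lr=0.0)   # lr=0 so the weights stay fixed during backward
x = np.random.random((1, 3))

out = layer(x)
grad_x = layer.backward(np.ones_like(out))   # dL/dx for L = out.sum()

eps = 1e-6
num_grad = np.zeros_like(x)
for i in range(x.shape[1]):
    x_pos = x.copy(); x_pos[0, i] += eps
    x_neg = x.copy(); x_neg[0, i] -= eps
    num_grad[0, i] = (layer(x_pos).sum() - layer(x_neg).sum()) / (2*eps)

print(np.allclose(grad_x, num_grad, atol=1e-5))   # expected: True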

Running a simple point interpolation

class Model(Layer):
    def __init__(self, lr=0.0001):
        self.lr = lr
        self.layers = [
            Linear(3, 15, lr=self.lr),
            Relu(),
            Linear(15, 1, lr=self.lr)
        ]

    def forward(self, x):
        for l in self.layers:
            x = l(x)
        return x

    def backward(self, grad):
        for l in self.layers[::-1]:
            grad = l.backward(grad)
        return grad

mm = Model()

def yf(x1, x2, x3):
    return np.array([2*x1 + 3*x2 + 4*x3 + 5], dtype=np.float32)

loss = MSE()

for i in range(20000):
    x1 = np.random.random()*30
    x2 = np.random.random()*20
    x3 = np.random.random()*11
    y = mm(np.array([[x1, x2, x3]]))
    err = loss(y, yf(x1, x2, x3))
    yb = loss.backward()
    yb = mm.backward(yb)
    if i % 100 == 0:
        print(err)
        print('VAL Target:', yf(1,2,3),
              'Res:', mm(np.array([[1,2,3]])),
              'Loss:', loss(yf(1,2,3), mm(np.array([[1,2,3]]))))
        print('---------')

...
---------
[[0.02075696]]
VAL Target: [25.] Res: [[23.61075629]] Loss: [[0.96499904]]
---------
[[0.24674584]]
VAL Target: [25.] Res: [[23.67659836]] Loss: [[0.87569595]]
---------
[[0.10659916]]
VAL Target: [25.] Res: [[23.60292783]] Loss: [[0.97590533]]
---------
[[0.68391024]]
VAL Target: [25.] Res: [[23.60518007]] Loss: [[0.97276132]]
---------
[[0.07534666]]
VAL Target: [25.] Res: [[23.61488175]] Loss: [[0.95927629]]
---------

Classifying MNIST with our custom network

Now let's solve a classification task on the MNIST dataset. We will use some PyTorch utilities to make data loading simpler.

from torchvision.datasets import MNIST
from torchvision import transforms
from torch.utils.data import DataLoader

class Model(Layer):
    def __init__(self, lr=0.00001):
        self.lr = lr
        self.layers = [
            Linear(784, 100, lr=self.lr),
            Relu(),
            Linear(100, 200, lr=self.lr),
            Relu(),
            Linear(200, 10, lr=self.lr)
        ]

    def forward(self, x):
        for l in self.layers:
            x = l(x)
        return x

    def backward(self, grad):
        for l in self.layers[::-1]:
            grad = l.backward(grad)
        return grad

simple = transforms.Compose([
    transforms.ToTensor(),  # converts to [0,1] interval
])
ds = MNIST('./mnist', download=True, transform=simple)
ld = DataLoader(ds, batch_size=2, pin_memory=True, drop_last=True)

mm = Model()
loss = SoftmaxCrossentropyWithLogits()
_loss_avg = 0

for e in range(5):
    for i, (img, label) in enumerate(ld):
        x = img.view(2, -1).numpy()
        res = mm(x)
        _loss = loss(res, label.numpy())
        _loss_avg += _loss.mean()  # running loss mean
        grad = loss.backward(1)
        mm.backward(grad)
        if i % 100 == 0:
            print(_loss_avg/100)
            _loss_avg = 0
            print('---------')

import matplotlib.pyplot as plt

for i in range(10):
    img, target = ds[i]
    plt.imshow(img[0])
    plt.show()
    x = img.view(1, -1).numpy()
    res = mm(x)[0]
    pred = np.argmax(res)
    print(f'target: {target} predicted: {pred}')
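Looking at individual digits is nice, but a rough accuracy number tells more. Here is a small sketch (my own addition; note it evaluates on the first 1000 training images, not a held-out test set):

# rough accuracy estimate, reusing ds and mm from above
correct = 0
n = 1000
for i in range(n):
    img, target = ds[i]
    res = mm(img.view(1, -1).numpy())[0]
    correct += int(np.argmax(res) == target)
print(f'accuracy on first {n} train images: {correct/n:.3f}')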

And that's all. Don't forget that you can run this code in Google Colab by clicking the "Open in Colab" button.