diff --git a/lab_3/assignment3.ipynb b/lab_3/assignment3.ipynb new file mode 100644 index 0000000..a0eed34 --- /dev/null +++ b/lab_3/assignment3.ipynb @@ -0,0 +1,1871 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Лабораторная работа 3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1) Полносвязная нейронная сеть ( Fully-Connected Neural Network)\n", + "\n", + "2) Нормализация по мини-батчам (Batch normalization)\n", + "\n", + "3) Dropout\n", + "\n", + "4) Сверточные нейронные сети (Convolutional Networks)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Лабораторные работы можно выполнять с использованием сервиса Google Colaboratory (https://medium.com/deep-learning-turkey/google-colab-free-gpu-tutorial-e113627b9f5d) или на локальном компьютере. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Полносвязная нейронная сеть" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "В данной лабораторной работе необходимо будет реализовать полносвязную нейронную сеть, используя модульный подход. Для каждого слоя реализации прямого и обратного проходов алгоритма обратного распространения ошибки будут иметь следующий вид:\n", + "\n", + "```python\n", + "def layer_forward(x, w):\n", + " \"\"\" Receive inputs x and weights w \"\"\"\n", + " # Do some computations ...\n", + " z = # ... some intermediate value\n", + " # Do some more computations ...\n", + " out = # the output\n", + " \n", + " cache = (x, w, z, out) # Values we need to compute gradients\n", + " \n", + " return out, cache\n", + "```\n", + "\n", + "\n", + "\n", + "```python\n", + "def layer_backward(dout, cache):\n", + " \"\"\"\n", + " Receive dout (derivative of loss with respect to outputs) and cache,\n", + " and compute derivative with respect to inputs.\n", + " \"\"\"\n", + " # Unpack cache values\n", + " x, w, z, out = cache\n", + " \n", + " # Use values in cache to compute derivatives\n", + " dx = # Derivative of loss with respect to x\n", + " dw = # Derivative of loss with respect to w\n", + " \n", + " return dx, dw\n", + "```\n", + "\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=========== You can safely ignore the message below if you are NOT working on ConvolutionalNetworks.ipynb ===========\n", + "\tYou will need to compile a Cython extension for a portion of this assignment.\n", + "\tThe instructions to do this will be given in a section of the notebook below.\n", + "\tThere will be an option for Colab users and another for Jupyter (local) users.\n" + ] + } + ], + "source": [ + "from __future__ import print_function\n", + "import time\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "from scripts.classifiers.fc_net import *\n", + "\n", + "from scripts.gradient_check import eval_numerical_gradient, eval_numerical_gradient_array\n", + "from scripts.solver import Solver\n", + "from scripts.classifiers.cnn import *\n", + "from scripts.layers import *\n", + "from scripts.fast_layers import *\n", + "\n", + "\n", + "%matplotlib inline\n", + "plt.rcParams['figure.figsize'] = (10.0, 8.0) \n", + "plt.rcParams['image.interpolation'] = 'nearest'\n", + "plt.rcParams['image.cmap'] = 'gray'\n", + "\n", + "# for auto-reloading external modules\n", + 
"# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython\n", + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "def rel_error(x, y):\n", + " \"\"\" returns relative error \"\"\"\n", + " return np.max(np.abs(x - y) / (np.maximum(1e-8, np.abs(x) + np.abs(y))))\n", + "def print_mean_std(x,axis=0):\n", + " print(' means: ', x.mean(axis=axis))\n", + " print(' stds: ', x.std(axis=axis))\n", + " print() " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Загрузите данные из предыдущей лабораторной работы. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Для полносвязного слоя реализуйте прямой проход (метод affine_forward в scripts/layers.py). Протестируйте свою реализацию. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "num_inputs = 2\n", + "input_shape = (4, 5, 6)\n", + "output_dim = 3\n", + "\n", + "input_size = num_inputs * np.prod(input_shape)\n", + "weight_size = output_dim * np.prod(input_shape)\n", + "\n", + "x = np.linspace(-0.1, 0.5, num=input_size).reshape(num_inputs, *input_shape)\n", + "w = np.linspace(-0.2, 0.3, num=weight_size).reshape(np.prod(input_shape), output_dim)\n", + "b = np.linspace(-0.3, 0.1, num=output_dim)\n", + "\n", + "out, _ = affine_forward(x, w, b)\n", + "correct_out = np.array([[ 1.49834967, 1.70660132, 1.91485297],\n", + " [ 3.25553199, 3.5141327, 3.77273342]])\n", + "\n", + "\n", + "print('Testing affine_forward function:')\n", + "print('difference: ', rel_error(out, correct_out))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Для полносвязного слоя реализуйте обратный проход (метод affine_backward в scripts/layers.py). Протестируйте свою реализацию. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.random.seed(231)\n", + "x = np.random.randn(10, 2, 3)\n", + "w = np.random.randn(6, 5)\n", + "b = np.random.randn(5)\n", + "dout = np.random.randn(10, 5)\n", + "\n", + "dx_num = eval_numerical_gradient_array(lambda x: affine_forward(x, w, b)[0], x, dout)\n", + "dw_num = eval_numerical_gradient_array(lambda w: affine_forward(x, w, b)[0], w, dout)\n", + "db_num = eval_numerical_gradient_array(lambda b: affine_forward(x, w, b)[0], b, dout)\n", + "\n", + "_, cache = affine_forward(x, w, b)\n", + "dx, dw, db = affine_backward(dout, cache)\n", + "\n", + "print('Testing affine_backward function:')\n", + "print('dx error: ', rel_error(dx_num, dx))\n", + "print('dw error: ', rel_error(dw_num, dw))\n", + "print('db error: ', rel_error(db_num, db))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Реализуйте прямой проход для слоя активации ReLU (relu_forward) и протестируйте его." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "x = np.linspace(-0.5, 0.5, num=12).reshape(3, 4)\n", + "\n", + "out, _ = relu_forward(x)\n", + "correct_out = np.array([[ 0., 0., 0., 0., ],\n", + " [ 0., 0., 0.04545455, 0.13636364,],\n", + " [ 0.22727273, 0.31818182, 0.40909091, 0.5, ]])\n", + "\n", + "# Compare your output with ours. 
The error should be on the order of e-8\n", + "print('Testing relu_forward function:')\n", + "print('difference: ', rel_error(out, correct_out))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Реализуйте обратный проход для слоя активации ReLU (relu_backward ) и протестируйте его." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.random.seed(231)\n", + "x = np.random.randn(10, 10)\n", + "dout = np.random.randn(*x.shape)\n", + "\n", + "dx_num = eval_numerical_gradient_array(lambda x: relu_forward(x)[0], x, dout)\n", + "\n", + "_, cache = relu_forward(x)\n", + "dx = relu_backward(dout, cache)\n", + "\n", + "# The error should be on the order of e-12\n", + "print('Testing relu_backward function:')\n", + "print('dx error: ', rel_error(dx_num, dx))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "В скрипте /layer_utils.py приведены реализации прямого и обратного проходов для часто используемых комбинаций слоев. Например, за полносвязным слоем часто следует слой активации. Ознакомьтесь с функциями affine_relu_forward и affine_relu_backward, запустите код ниже и убедитесь, что ошибка порядка e-10 или ниже. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from scripts.layer_utils import affine_relu_forward, affine_relu_backward\n", + "np.random.seed(231)\n", + "x = np.random.randn(2, 3, 4)\n", + "w = np.random.randn(12, 10)\n", + "b = np.random.randn(10)\n", + "dout = np.random.randn(2, 10)\n", + "\n", + "out, cache = affine_relu_forward(x, w, b)\n", + "dx, dw, db = affine_relu_backward(dout, cache)\n", + "\n", + "dx_num = eval_numerical_gradient_array(lambda x: affine_relu_forward(x, w, b)[0], x, dout)\n", + "dw_num = eval_numerical_gradient_array(lambda w: affine_relu_forward(x, w, b)[0], w, dout)\n", + "db_num = eval_numerical_gradient_array(lambda b: affine_relu_forward(x, w, b)[0], b, dout)\n", + "\n", + "# Relative error should be around e-10 or less\n", + "print('Testing affine_relu_forward and affine_relu_backward:')\n", + "print('dx error: ', rel_error(dx_num, dx))\n", + "print('dw error: ', rel_error(dw_num, dw))\n", + "print('db error: ', rel_error(db_num, db))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Реализуйте двухслойную полносвязную сеть - класс TwoLayerNet в scripts/classifiers/fc_net.py . Проверьте свою реализацию, запустив код ниже. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.random.seed(231)\n", + "N, D, H, C = 3, 5, 50, 7\n", + "X = np.random.randn(N, D)\n", + "y = np.random.randint(C, size=N)\n", + "\n", + "std = 1e-3\n", + "model = TwoLayerNet(input_dim=D, hidden_dim=H, num_classes=C, weight_scale=std)\n", + "\n", + "print('Testing initialization ... ')\n", + "W1_std = abs(model.params['W1'].std() - std)\n", + "b1 = model.params['b1']\n", + "W2_std = abs(model.params['W2'].std() - std)\n", + "b2 = model.params['b2']\n", + "assert W1_std < std / 10, 'First layer weights do not seem right'\n", + "assert np.all(b1 == 0), 'First layer biases do not seem right'\n", + "assert W2_std < std / 10, 'Second layer weights do not seem right'\n", + "assert np.all(b2 == 0), 'Second layer biases do not seem right'\n", + "\n", + "print('Testing test-time forward pass ... 
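Возможный набросок relu_forward/relu_backward: поэлементный максимум с нулём на прямом проходе, а на обратном градиент проходит только там, где вход был положительным.

```python
import numpy as np

def relu_forward(x):
    out = np.maximum(0, x)
    cache = x                    # для обратного прохода достаточно самого входа
    return out, cache

def relu_backward(dout, cache):
    x = cache
    dx = dout * (x > 0)          # градиент обнуляется там, где ReLU была неактивна
    return dx
```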
')\n", + "model.params['W1'] = np.linspace(-0.7, 0.3, num=D*H).reshape(D, H)\n", + "model.params['b1'] = np.linspace(-0.1, 0.9, num=H)\n", + "model.params['W2'] = np.linspace(-0.3, 0.4, num=H*C).reshape(H, C)\n", + "model.params['b2'] = np.linspace(-0.9, 0.1, num=C)\n", + "X = np.linspace(-5.5, 4.5, num=N*D).reshape(D, N).T\n", + "scores = model.loss(X)\n", + "correct_scores = np.asarray(\n", + " [[11.53165108, 12.2917344, 13.05181771, 13.81190102, 14.57198434, 15.33206765, 16.09215096],\n", + " [12.05769098, 12.74614105, 13.43459113, 14.1230412, 14.81149128, 15.49994135, 16.18839143],\n", + " [12.58373087, 13.20054771, 13.81736455, 14.43418138, 15.05099822, 15.66781506, 16.2846319 ]])\n", + "scores_diff = np.abs(scores - correct_scores).sum()\n", + "assert scores_diff < 1e-6, 'Problem with test-time forward pass'\n", + "\n", + "print('Testing training loss (no regularization)')\n", + "y = np.asarray([0, 5, 1])\n", + "loss, grads = model.loss(X, y)\n", + "correct_loss = 3.4702243556\n", + "assert abs(loss - correct_loss) < 1e-10, 'Problem with training-time loss'\n", + "\n", + "model.reg = 1.0\n", + "loss, grads = model.loss(X, y)\n", + "correct_loss = 26.5948426952\n", + "assert abs(loss - correct_loss) < 1e-10, 'Problem with regularization loss'\n", + "\n", + "# Errors should be around e-7 or less\n", + "for reg in [0.0, 0.7]:\n", + " print('Running numeric gradient check with reg = ', reg)\n", + " model.reg = reg\n", + " loss, grads = model.loss(X, y)\n", + "\n", + " for name in sorted(grads):\n", + " f = lambda _: model.loss(X, y)[0]\n", + " grad_num = eval_numerical_gradient(f, model.params[name], verbose=False)\n", + " print('%s relative error: %.2e' % (name, rel_error(grad_num, grads[name])))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Ознакомьтесь с API для обучения и тестирования моделей в scripts/solver.py . Используйте экземпляр класса Solver для обучения двухслойной полносвязной сети. Необходимо достичь минимум 50% верно классифицированных объектов на валидационном наборе. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = TwoLayerNet()\n", + "solver = None\n", + "\n", + "##############################################################################\n", + "# TODO: Use a Solver instance to train a TwoLayerNet that achieves at least #\n", + "# 50% accuracy on the validation set. 
#\n", + "##############################################################################\n", + "# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****\n", + "\n", + "pass\n", + "\n", + "# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****\n", + "##############################################################################\n", + "# END OF YOUR CODE #\n", + "##############################################################################" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.subplot(2, 1, 1)\n", + "plt.title('Training loss')\n", + "plt.plot(solver.loss_history, 'o')\n", + "plt.xlabel('Iteration')\n", + "\n", + "plt.subplot(2, 1, 2)\n", + "plt.title('Accuracy')\n", + "plt.plot(solver.train_acc_history, '-o', label='train')\n", + "plt.plot(solver.val_acc_history, '-o', label='val')\n", + "plt.plot([0.5] * len(solver.val_acc_history), 'k--')\n", + "plt.xlabel('Epoch')\n", + "plt.legend(loc='lower right')\n", + "plt.gcf().set_size_inches(15, 12)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Теперь реализуйте полносвязную сеть с произвольным числом скрытых слоев. Ознакомьтесь с классом FullyConnectedNet в scripts/classifiers/fc_net.py . Реализуйте инициализацию, прямой и обратный проходы." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.random.seed(231)\n", + "N, D, H1, H2, C = 2, 15, 20, 30, 10\n", + "X = np.random.randn(N, D)\n", + "y = np.random.randint(C, size=(N,))\n", + "\n", + "for reg in [0, 3.14]:\n", + " print('Running check with reg = ', reg)\n", + " model = FullyConnectedNet([H1, H2], input_dim=D, num_classes=C,\n", + " reg=reg, weight_scale=5e-2, dtype=np.float64)\n", + "\n", + " loss, grads = model.loss(X, y)\n", + " print('Initial loss: ', loss)\n", + " \n", + " # Most of the errors should be on the order of e-7 or smaller. \n", + " # NOTE: It is fine however to see an error for W2 on the order of e-5\n", + " # for the check when reg = 0.0\n", + " for name in sorted(grads):\n", + " f = lambda _: model.loss(X, y)[0]\n", + " grad_num = eval_numerical_gradient(f, model.params[name], verbose=False, h=1e-5)\n", + " print('%s relative error: %.2e' % (name, rel_error(grad_num, grads[name])))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Попробуйте добиться эффекта переобучения на небольшом наборе изображений (например, 50). Используйте трехслойную сеть со 100 нейронами на каждом скрытом слое. Попробуйте переобучить сеть, достигнув 100 % accuracy за 20 эпох. Для этого поэкспериментируйте с параметрами weight_scale и learning_rate. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# TODO: Use a three-layer Net to overfit 50 training examples by \n", + "# tweaking just the learning rate and initialization scale.\n", + "\n", + "num_train = 50\n", + "small_data = {\n", + " 'X_train': data['X_train'][:num_train],\n", + " 'y_train': data['y_train'][:num_train],\n", + " 'X_val': data['X_val'],\n", + " 'y_val': data['y_val'],\n", + "}\n", + "\n", + "weight_scale = 1e-2 # Experiment with this!\n", + "learning_rate = 1e-4 # Experiment with this!\n", + "model = FullyConnectedNet([100, 100],\n", + " weight_scale=weight_scale, dtype=np.float64)\n", + "solver = Solver(model, small_data,\n", + " print_every=10, num_epochs=20, batch_size=25,\n", + " update_rule='sgd',\n", + " optim_config={\n", + " 'learning_rate': learning_rate,\n", + " }\n", + " )\n", + "solver.train()\n", + "\n", + "plt.plot(solver.loss_history, 'o')\n", + "plt.title('Training loss history')\n", + "plt.xlabel('Iteration')\n", + "plt.ylabel('Training loss')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Повторите эксперимент, описанный выше, для пятислойной сети." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# TODO: Use a five-layer Net to overfit 50 training examples by \n", + "# tweaking just the learning rate and initialization scale.\n", + "\n", + "num_train = 50\n", + "small_data = {\n", + " 'X_train': data['X_train'][:num_train],\n", + " 'y_train': data['y_train'][:num_train],\n", + " 'X_val': data['X_val'],\n", + " 'y_val': data['y_val'],\n", + "}\n", + "\n", + "learning_rate = 2e-3 # Experiment with this!\n", + "weight_scale = 1e-5 # Experiment with this!\n", + "model = FullyConnectedNet([100, 100, 100, 100],\n", + " weight_scale=weight_scale, dtype=np.float64)\n", + "solver = Solver(model, small_data,\n", + " print_every=10, num_epochs=20, batch_size=25,\n", + " update_rule='sgd',\n", + " optim_config={\n", + " 'learning_rate': learning_rate,\n", + " }\n", + " )\n", + "solver.train()\n", + "\n", + "plt.plot(solver.loss_history, 'o')\n", + "plt.title('Training loss history')\n", + "plt.xlabel('Iteration')\n", + "plt.ylabel('Training loss')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Сделайте выводы по проведенному эксперименту. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Ранее обновление весов проходило по правилу SGD. Теперь попробуйте реализовать стохастический градиентный спуск с импульсом (SGD+momentum). http://cs231n.github.io/neural-networks-3/#sgd Реализуйте sgd_momentum в scripts/optim.py и запустите проверку. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from scripts.optim import sgd_momentum\n", + "\n", + "N, D = 4, 5\n", + "w = np.linspace(-0.4, 0.6, num=N*D).reshape(N, D)\n", + "dw = np.linspace(-0.6, 0.4, num=N*D).reshape(N, D)\n", + "v = np.linspace(0.6, 0.9, num=N*D).reshape(N, D)\n", + "\n", + "config = {'learning_rate': 1e-3, 'velocity': v}\n", + "next_w, _ = sgd_momentum(w, dw, config=config)\n", + "\n", + "expected_next_w = np.asarray([\n", + " [ 0.1406, 0.20738947, 0.27417895, 0.34096842, 0.40775789],\n", + " [ 0.47454737, 0.54133684, 0.60812632, 0.67491579, 0.74170526],\n", + " [ 0.80849474, 0.87528421, 0.94207368, 1.00886316, 1.07565263],\n", + " [ 1.14244211, 1.20923158, 1.27602105, 1.34281053, 1.4096 ]])\n", + "expected_velocity = np.asarray([\n", + " [ 0.5406, 0.55475789, 0.56891579, 0.58307368, 0.59723158],\n", + " [ 0.61138947, 0.62554737, 0.63970526, 0.65386316, 0.66802105],\n", + " [ 0.68217895, 0.69633684, 0.71049474, 0.72465263, 0.73881053],\n", + " [ 0.75296842, 0.76712632, 0.78128421, 0.79544211, 0.8096 ]])\n", + "\n", + "# Should see relative errors around e-8 or less\n", + "print('next_w error: ', rel_error(next_w, expected_next_w))\n", + "print('velocity error: ', rel_error(expected_velocity, config['velocity']))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Сравните результаты обучения шестислойной сети, обученной классическим градиентным спуском и адаптивным алгоритмом с импульсом. Какой алгоритм сходится быстрее." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "num_train = 4000\n", + "small_data = {\n", + " 'X_train': data['X_train'][:num_train],\n", + " 'y_train': data['y_train'][:num_train],\n", + " 'X_val': data['X_val'],\n", + " 'y_val': data['y_val'],\n", + "}\n", + "\n", + "solvers = {}\n", + "\n", + "for update_rule in ['sgd', 'sgd_momentum']:\n", + " print('running with ', update_rule)\n", + " model = FullyConnectedNet([100, 100, 100, 100, 100], weight_scale=5e-2)\n", + "\n", + " solver = Solver(model, small_data,\n", + " num_epochs=5, batch_size=100,\n", + " update_rule=update_rule,\n", + " optim_config={\n", + " 'learning_rate': 5e-3,\n", + " },\n", + " verbose=True)\n", + " solvers[update_rule] = solver\n", + " solver.train()\n", + " print()\n", + "\n", + "plt.subplot(3, 1, 1)\n", + "plt.title('Training loss')\n", + "plt.xlabel('Iteration')\n", + "\n", + "plt.subplot(3, 1, 2)\n", + "plt.title('Training accuracy')\n", + "plt.xlabel('Epoch')\n", + "\n", + "plt.subplot(3, 1, 3)\n", + "plt.title('Validation accuracy')\n", + "plt.xlabel('Epoch')\n", + "\n", + "for update_rule, solver in solvers.items():\n", + " plt.subplot(3, 1, 1)\n", + " plt.plot(solver.loss_history, 'o', label=\"loss_%s\" % update_rule)\n", + " \n", + " plt.subplot(3, 1, 2)\n", + " plt.plot(solver.train_acc_history, '-o', label=\"train_acc_%s\" % update_rule)\n", + "\n", + " plt.subplot(3, 1, 3)\n", + " plt.plot(solver.val_acc_history, '-o', label=\"val_acc_%s\" % update_rule)\n", + " \n", + "for i in [1, 2, 3]:\n", + " plt.subplot(3, 1, i)\n", + " plt.legend(loc='upper center', ncol=4)\n", + "plt.gcf().set_size_inches(15, 15)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Реализуйте алгоритмы RMSProp [1] and Adam [2] с коррекцией смещения - методы rmsprop и adam . \n", + "\n", + "\n", + "[1] Tijmen Tieleman and Geoffrey Hinton. 
\"Lecture 6.5-rmsprop: Divide the gradient by a running average of its recent magnitude.\" COURSERA: Neural Networks for Machine Learning 4 (2012).\n", + "\n", + "[2] Diederik Kingma and Jimmy Ba, \"Adam: A Method for Stochastic Optimization\", ICLR 2015." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Test RMSProp implementation\n", + "from scripts.optim import rmsprop\n", + "\n", + "N, D = 4, 5\n", + "w = np.linspace(-0.4, 0.6, num=N*D).reshape(N, D)\n", + "dw = np.linspace(-0.6, 0.4, num=N*D).reshape(N, D)\n", + "cache = np.linspace(0.6, 0.9, num=N*D).reshape(N, D)\n", + "\n", + "config = {'learning_rate': 1e-2, 'cache': cache}\n", + "next_w, _ = rmsprop(w, dw, config=config)\n", + "\n", + "expected_next_w = np.asarray([\n", + " [-0.39223849, -0.34037513, -0.28849239, -0.23659121, -0.18467247],\n", + " [-0.132737, -0.08078555, -0.02881884, 0.02316247, 0.07515774],\n", + " [ 0.12716641, 0.17918792, 0.23122175, 0.28326742, 0.33532447],\n", + " [ 0.38739248, 0.43947102, 0.49155973, 0.54365823, 0.59576619]])\n", + "expected_cache = np.asarray([\n", + " [ 0.5976, 0.6126277, 0.6277108, 0.64284931, 0.65804321],\n", + " [ 0.67329252, 0.68859723, 0.70395734, 0.71937285, 0.73484377],\n", + " [ 0.75037008, 0.7659518, 0.78158892, 0.79728144, 0.81302936],\n", + " [ 0.82883269, 0.84469141, 0.86060554, 0.87657507, 0.8926 ]])\n", + "\n", + "# You should see relative errors around e-7 or less\n", + "print('next_w error: ', rel_error(expected_next_w, next_w))\n", + "print('cache error: ', rel_error(expected_cache, config['cache']))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Test Adam implementation\n", + "from scripts.optim import adam\n", + "\n", + "N, D = 4, 5\n", + "w = np.linspace(-0.4, 0.6, num=N*D).reshape(N, D)\n", + "dw = np.linspace(-0.6, 0.4, num=N*D).reshape(N, D)\n", + "m = np.linspace(0.6, 0.9, num=N*D).reshape(N, D)\n", + "v = np.linspace(0.7, 0.5, num=N*D).reshape(N, D)\n", + "\n", + "config = {'learning_rate': 1e-2, 'm': m, 'v': v, 't': 5}\n", + "next_w, _ = adam(w, dw, config=config)\n", + "\n", + "expected_next_w = np.asarray([\n", + " [-0.40094747, -0.34836187, -0.29577703, -0.24319299, -0.19060977],\n", + " [-0.1380274, -0.08544591, -0.03286534, 0.01971428, 0.0722929],\n", + " [ 0.1248705, 0.17744702, 0.23002243, 0.28259667, 0.33516969],\n", + " [ 0.38774145, 0.44031188, 0.49288093, 0.54544852, 0.59801459]])\n", + "expected_v = np.asarray([\n", + " [ 0.69966, 0.68908382, 0.67851319, 0.66794809, 0.65738853,],\n", + " [ 0.64683452, 0.63628604, 0.6257431, 0.61520571, 0.60467385,],\n", + " [ 0.59414753, 0.58362676, 0.57311152, 0.56260183, 0.55209767,],\n", + " [ 0.54159906, 0.53110598, 0.52061845, 0.51013645, 0.49966, ]])\n", + "expected_m = np.asarray([\n", + " [ 0.48, 0.49947368, 0.51894737, 0.53842105, 0.55789474],\n", + " [ 0.57736842, 0.59684211, 0.61631579, 0.63578947, 0.65526316],\n", + " [ 0.67473684, 0.69421053, 0.71368421, 0.73315789, 0.75263158],\n", + " [ 0.77210526, 0.79157895, 0.81105263, 0.83052632, 0.85 ]])\n", + "\n", + "# You should see relative errors around e-7 or less\n", + "print('next_w error: ', rel_error(expected_next_w, next_w))\n", + "print('v error: ', rel_error(expected_v, config['v']))\n", + "print('m error: ', rel_error(expected_m, config['m']))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Обучите пару глубоких сетей с испольованием RMSProp и Adam алгоритмов обновления весов и сравните 
результаты обучения." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Получите лучшую полносвязную сеть для классификации вашего набора данных. На наборе CIFAR-10 необходимо получить accuracy не ниже 50 % на валидационном наборе." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "best_model = None\n", + "################################################################################\n", + "# TODO: Train the best FullyConnectedNet that you can on CIFAR-10. You might #\n", + "# find batch/layer normalization and dropout useful. Store your best model in #\n", + "# the best_model variable. #\n", + "################################################################################\n", + "# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****\n", + "\n", + "pass\n", + "\n", + "# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****\n", + "################################################################################\n", + "# END OF YOUR CODE #\n", + "################################################################################" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Получите оценку accuracy для валидационной и тестовой выборок. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "y_test_pred = np.argmax(best_model.loss(data['X_test']), axis=1)\n", + "y_val_pred = np.argmax(best_model.loss(data['X_val']), axis=1)\n", + "print('Validation set accuracy: ', (y_val_pred == data['y_val']).mean())\n", + "print('Test set accuracy: ', (y_test_pred == data['y_test']).mean())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Нормализация по мини-батчам\n", + "\n", + "Идея нормализации по мини-батчам предложена в работе [1]\n", + "\n", + "[1] Sergey Ioffe and Christian Szegedy, \"Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift\", ICML 2015." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Реализуйте прямой проход для слоя батч-нормализации - функция batchnorm_forward в scripts/layers.py . 
Проверьте свою реализацию, запустив следующий код:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Check the training-time forward pass by checking means and variances\n", + "# of features both before and after batch normalization \n", + "\n", + "# Simulate the forward pass for a two-layer network\n", + "np.random.seed(231)\n", + "N, D1, D2, D3 = 200, 50, 60, 3\n", + "X = np.random.randn(N, D1)\n", + "W1 = np.random.randn(D1, D2)\n", + "W2 = np.random.randn(D2, D3)\n", + "a = np.maximum(0, X.dot(W1)).dot(W2)\n", + "\n", + "print('Before batch normalization:')\n", + "print_mean_std(a,axis=0)\n", + "\n", + "gamma = np.ones((D3,))\n", + "beta = np.zeros((D3,))\n", + "# Means should be close to zero and stds close to one\n", + "print('After batch normalization (gamma=1, beta=0)')\n", + "a_norm, _ = batchnorm_forward(a, gamma, beta, {'mode': 'train'})\n", + "print_mean_std(a_norm,axis=0)\n", + "\n", + "gamma = np.asarray([1.0, 2.0, 3.0])\n", + "beta = np.asarray([11.0, 12.0, 13.0])\n", + "# Now means should be close to beta and stds close to gamma\n", + "print('After batch normalization (gamma=', gamma, ', beta=', beta, ')')\n", + "a_norm, _ = batchnorm_forward(a, gamma, beta, {'mode': 'train'})\n", + "print_mean_std(a_norm,axis=0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Check the test-time forward pass by running the training-time\n", + "# forward pass many times to warm up the running averages, and then\n", + "# checking the means and variances of activations after a test-time\n", + "# forward pass.\n", + "\n", + "np.random.seed(231)\n", + "N, D1, D2, D3 = 200, 50, 60, 3\n", + "W1 = np.random.randn(D1, D2)\n", + "W2 = np.random.randn(D2, D3)\n", + "\n", + "bn_param = {'mode': 'train'}\n", + "gamma = np.ones(D3)\n", + "beta = np.zeros(D3)\n", + "\n", + "for t in range(50):\n", + " X = np.random.randn(N, D1)\n", + " a = np.maximum(0, X.dot(W1)).dot(W2)\n", + " batchnorm_forward(a, gamma, beta, bn_param)\n", + "\n", + "bn_param['mode'] = 'test'\n", + "X = np.random.randn(N, D1)\n", + "a = np.maximum(0, X.dot(W1)).dot(W2)\n", + "a_norm, _ = batchnorm_forward(a, gamma, beta, bn_param)\n", + "\n", + "# Means should be close to zero and stds close to one, but will be\n", + "# noisier than training-time forward passes.\n", + "print('After batch normalization (test-time):')\n", + "print_mean_std(a_norm,axis=0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Реализуйте обратный проход в функции batchnorm_backward." 
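Возможный набросок batchnorm_forward/batchnorm_backward: на обучении нормируем по статистикам мини-батча и обновляем бегущие среднее и дисперсию, на тесте используем бегущие статистики; обратный проход записан в компактной форме через нормированные значения x_hat.

```python
import numpy as np

def batchnorm_forward(x, gamma, beta, bn_param):
    mode = bn_param['mode']
    eps = bn_param.get('eps', 1e-5)
    momentum = bn_param.get('momentum', 0.9)
    N, D = x.shape
    running_mean = bn_param.get('running_mean', np.zeros(D, dtype=x.dtype))
    running_var = bn_param.get('running_var', np.zeros(D, dtype=x.dtype))

    if mode == 'train':
        mu = x.mean(axis=0)
        var = x.var(axis=0)
        x_hat = (x - mu) / np.sqrt(var + eps)
        out = gamma * x_hat + beta
        cache = (x_hat, gamma, var, eps)
        running_mean = momentum * running_mean + (1 - momentum) * mu
        running_var = momentum * running_var + (1 - momentum) * var
    else:  # 'test'
        x_hat = (x - running_mean) / np.sqrt(running_var + eps)
        out = gamma * x_hat + beta
        cache = None

    bn_param['running_mean'] = running_mean
    bn_param['running_var'] = running_var
    return out, cache


def batchnorm_backward(dout, cache):
    x_hat, gamma, var, eps = cache
    dbeta = dout.sum(axis=0)
    dgamma = (dout * x_hat).sum(axis=0)
    dx_hat = dout * gamma
    # компактная форма: dx = (dx_hat - mean(dx_hat) - x_hat * mean(dx_hat * x_hat)) / sqrt(var + eps)
    dx = (dx_hat - dx_hat.mean(axis=0)
          - x_hat * (dx_hat * x_hat).mean(axis=0)) / np.sqrt(var + eps)
    return dx, dgamma, dbeta
```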
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Gradient check batchnorm backward pass\n", + "np.random.seed(231)\n", + "N, D = 4, 5\n", + "x = 5 * np.random.randn(N, D) + 12\n", + "gamma = np.random.randn(D)\n", + "beta = np.random.randn(D)\n", + "dout = np.random.randn(N, D)\n", + "\n", + "bn_param = {'mode': 'train'}\n", + "fx = lambda x: batchnorm_forward(x, gamma, beta, bn_param)[0]\n", + "fg = lambda a: batchnorm_forward(x, a, beta, bn_param)[0]\n", + "fb = lambda b: batchnorm_forward(x, gamma, b, bn_param)[0]\n", + "\n", + "dx_num = eval_numerical_gradient_array(fx, x, dout)\n", + "da_num = eval_numerical_gradient_array(fg, gamma.copy(), dout)\n", + "db_num = eval_numerical_gradient_array(fb, beta.copy(), dout)\n", + "\n", + "_, cache = batchnorm_forward(x, gamma, beta, bn_param)\n", + "dx, dgamma, dbeta = batchnorm_backward(dout, cache)\n", + "#You should expect to see relative errors between 1e-13 and 1e-8\n", + "print('dx error: ', rel_error(dx_num, dx))\n", + "print('dgamma error: ', rel_error(da_num, dgamma))\n", + "print('dbeta error: ', rel_error(db_num, dbeta))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Измените реализацию класса FullyConnectedNet, добавив батч-нормализацию. \n", + "Если флаг normalization == \"batchnorm\", то вам необходимо вставить слой батч-нормализации перед каждым слоем активации ReLU, кроме выхода сети. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.random.seed(231)\n", + "N, D, H1, H2, C = 2, 15, 20, 30, 10\n", + "X = np.random.randn(N, D)\n", + "y = np.random.randint(C, size=(N,))\n", + "\n", + "# You should expect losses between 1e-4~1e-10 for W, \n", + "# losses between 1e-08~1e-10 for b,\n", + "# and losses between 1e-08~1e-09 for beta and gammas.\n", + "for reg in [0, 3.14]:\n", + " print('Running check with reg = ', reg)\n", + " model = FullyConnectedNet([H1, H2], input_dim=D, num_classes=C,\n", + " reg=reg, weight_scale=5e-2, dtype=np.float64,\n", + " normalization='batchnorm')\n", + "\n", + " loss, grads = model.loss(X, y)\n", + " print('Initial loss: ', loss)\n", + "\n", + " for name in sorted(grads):\n", + " f = lambda _: model.loss(X, y)[0]\n", + " grad_num = eval_numerical_gradient(f, model.params[name], verbose=False, h=1e-5)\n", + " print('%s relative error: %.2e' % (name, rel_error(grad_num, grads[name])))\n", + " if reg == 0: print()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Обучите 6-ти слойную сеть на наборе из 1000 изображений с батч-нормализацией и без нее" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.random.seed(231)\n", + "# Try training a very deep net with batchnorm\n", + "hidden_dims = [100, 100, 100, 100, 100]\n", + "\n", + "num_train = 1000\n", + "small_data = {\n", + " 'X_train': data['X_train'][:num_train],\n", + " 'y_train': data['y_train'][:num_train],\n", + " 'X_val': data['X_val'],\n", + " 'y_val': data['y_val'],\n", + "}\n", + "\n", + "weight_scale = 2e-2\n", + "bn_model = FullyConnectedNet(hidden_dims, weight_scale=weight_scale, normalization='batchnorm')\n", + "model = FullyConnectedNet(hidden_dims, weight_scale=weight_scale, normalization=None)\n", + "\n", + "print('Solver with batch norm:')\n", + "bn_solver = Solver(bn_model, small_data,\n", + " num_epochs=10, batch_size=50,\n", + " update_rule='adam',\n", + " optim_config={\n", + " 
'learning_rate': 1e-3,\n", + " },\n", + " verbose=True,print_every=20)\n", + "bn_solver.train()\n", + "\n", + "print('\\nSolver without batch norm:')\n", + "solver = Solver(model, small_data,\n", + " num_epochs=10, batch_size=50,\n", + " update_rule='adam',\n", + " optim_config={\n", + " 'learning_rate': 1e-3,\n", + " },\n", + " verbose=True, print_every=20)\n", + "solver.train()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Визуализируйте процесс обучения для двух сетей. Увеличилась ли скорость сходимости в случае с батч-нормализацией? Сделайте выводы. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_training_history(title, label, baseline, bn_solvers, plot_fn, bl_marker='.', bn_marker='.', labels=None):\n", + " \"\"\"utility function for plotting training history\"\"\"\n", + " plt.title(title)\n", + " plt.xlabel(label)\n", + " bn_plots = [plot_fn(bn_solver) for bn_solver in bn_solvers]\n", + " bl_plot = plot_fn(baseline)\n", + " num_bn = len(bn_plots)\n", + " for i in range(num_bn):\n", + " label='with_norm'\n", + " if labels is not None:\n", + " label += str(labels[i])\n", + " plt.plot(bn_plots[i], bn_marker, label=label)\n", + " label='baseline'\n", + " if labels is not None:\n", + " label += str(labels[0])\n", + " plt.plot(bl_plot, bl_marker, label=label)\n", + " plt.legend(loc='lower center', ncol=num_bn+1) \n", + "\n", + " \n", + "plt.subplot(3, 1, 1)\n", + "plot_training_history('Training loss','Iteration', solver, [bn_solver], \\\n", + " lambda x: x.loss_history, bl_marker='o', bn_marker='o')\n", + "plt.subplot(3, 1, 2)\n", + "plot_training_history('Training accuracy','Epoch', solver, [bn_solver], \\\n", + " lambda x: x.train_acc_history, bl_marker='-o', bn_marker='-o')\n", + "plt.subplot(3, 1, 3)\n", + "plot_training_history('Validation accuracy','Epoch', solver, [bn_solver], \\\n", + " lambda x: x.val_acc_history, bl_marker='-o', bn_marker='-o')\n", + "\n", + "plt.gcf().set_size_inches(15, 15)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Обучите 6-тислойную сеть с батч-нормализацией и без нее, используя разные размеры батча. Визуализируйте графики обучения. Сделайте выводы по результатам эксперимента. 
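К заданию о добавлении батч-нормализации в FullyConnectedNet: удобно завести вспомогательную связку affine -> batchnorm -> ReLU по аналогии с affine_relu_forward. Имена affine_bn_relu_forward/affine_bn_relu_backward здесь условные (проверьте, нет ли уже подходящей функции в scripts/layer_utils.py):

```python
from scripts.layers import (affine_forward, affine_backward,
                            batchnorm_forward, batchnorm_backward,
                            relu_forward, relu_backward)

def affine_bn_relu_forward(x, w, b, gamma, beta, bn_param):
    a, fc_cache = affine_forward(x, w, b)
    a_bn, bn_cache = batchnorm_forward(a, gamma, beta, bn_param)
    out, relu_cache = relu_forward(a_bn)
    return out, (fc_cache, bn_cache, relu_cache)

def affine_bn_relu_backward(dout, cache):
    fc_cache, bn_cache, relu_cache = cache
    da_bn = relu_backward(dout, relu_cache)
    da, dgamma, dbeta = batchnorm_backward(da_bn, bn_cache)
    dx, dw, db = affine_backward(da, fc_cache)
    return dx, dw, db, dgamma, dbeta
```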
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def run_batchsize_experiments(normalization_mode):\n", + " np.random.seed(231)\n", + " # Try training a very deep net with batchnorm\n", + " hidden_dims = [100, 100, 100, 100, 100]\n", + " num_train = 1000\n", + " small_data = {\n", + " 'X_train': data['X_train'][:num_train],\n", + " 'y_train': data['y_train'][:num_train],\n", + " 'X_val': data['X_val'],\n", + " 'y_val': data['y_val'],\n", + " }\n", + " n_epochs=10\n", + " weight_scale = 2e-2\n", + " batch_sizes = [5,10,50]\n", + " lr = 10**(-3.5)\n", + " solver_bsize = batch_sizes[0]\n", + "\n", + " print('No normalization: batch size = ',solver_bsize)\n", + " model = FullyConnectedNet(hidden_dims, weight_scale=weight_scale, normalization=None)\n", + " solver = Solver(model, small_data,\n", + " num_epochs=n_epochs, batch_size=solver_bsize,\n", + " update_rule='adam',\n", + " optim_config={\n", + " 'learning_rate': lr,\n", + " },\n", + " verbose=False)\n", + " solver.train()\n", + " \n", + " bn_solvers = []\n", + " for i in range(len(batch_sizes)):\n", + " b_size=batch_sizes[i]\n", + " print('Normalization: batch size = ',b_size)\n", + " bn_model = FullyConnectedNet(hidden_dims, weight_scale=weight_scale, normalization=normalization_mode)\n", + " bn_solver = Solver(bn_model, small_data,\n", + " num_epochs=n_epochs, batch_size=b_size,\n", + " update_rule='adam',\n", + " optim_config={\n", + " 'learning_rate': lr,\n", + " },\n", + " verbose=False)\n", + " bn_solver.train()\n", + " bn_solvers.append(bn_solver)\n", + " \n", + " return bn_solvers, solver, batch_sizes\n", + "\n", + "batch_sizes = [5,10,50]\n", + "bn_solvers_bsize, solver_bsize, batch_sizes = run_batchsize_experiments('batchnorm')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.subplot(2, 1, 1)\n", + "plot_training_history('Training accuracy (Batch Normalization)','Epoch', solver_bsize, bn_solvers_bsize, \\\n", + " lambda x: x.train_acc_history, bl_marker='-^', bn_marker='-o', labels=batch_sizes)\n", + "plt.subplot(2, 1, 2)\n", + "plot_training_history('Validation accuracy (Batch Normalization)','Epoch', solver_bsize, bn_solvers_bsize, \\\n", + " lambda x: x.val_acc_history, bl_marker='-^', bn_marker='-o', labels=batch_sizes)\n", + "\n", + "plt.gcf().set_size_inches(15, 10)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Dropout" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Реализуйте прямой проход для dropout-слоя в scripts/layers.py\n", + "\n", + "http://cs231n.github.io/neural-networks-2/#reg" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.random.seed(231)\n", + "x = np.random.randn(500, 500) + 10\n", + "\n", + "for p in [0.25, 0.4, 0.7]:\n", + " out, _ = dropout_forward(x, {'mode': 'train', 'p': p})\n", + " out_test, _ = dropout_forward(x, {'mode': 'test', 'p': p})\n", + "\n", + " print('Running tests with p = ', p)\n", + " print('Mean of input: ', x.mean())\n", + " print('Mean of train-time output: ', out.mean())\n", + " print('Mean of test-time output: ', out_test.mean())\n", + " print('Fraction of train-time output set to zero: ', (out == 0).mean())\n", + " print('Fraction of test-time output set to zero: ', (out_test == 0).mean())\n", + " print()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Реализуйте обратный 
проход для dropout-слоя" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.random.seed(231)\n", + "x = np.random.randn(10, 10) + 10\n", + "dout = np.random.randn(*x.shape)\n", + "\n", + "dropout_param = {'mode': 'train', 'p': 0.2, 'seed': 123}\n", + "out, cache = dropout_forward(x, dropout_param)\n", + "dx = dropout_backward(dout, cache)\n", + "dx_num = eval_numerical_gradient_array(lambda xx: dropout_forward(xx, dropout_param)[0], x, dout)\n", + "\n", + "# Error should be around e-10 or less\n", + "print('dx relative error: ', rel_error(dx, dx_num))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Добавьте в реализацию класса FullyConnectedNet поддержку dropout. Если параметр dropout != 1, то добавьте в модель dropout-слой после каждого слоя активации. Проверьте свою реализацию" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.random.seed(231)\n", + "N, D, H1, H2, C = 2, 15, 20, 30, 10\n", + "X = np.random.randn(N, D)\n", + "y = np.random.randint(C, size=(N,))\n", + "\n", + "for dropout in [1, 0.75, 0.5]:\n", + " print('Running check with dropout = ', dropout)\n", + " model = FullyConnectedNet([H1, H2], input_dim=D, num_classes=C,\n", + " weight_scale=5e-2, dtype=np.float64,\n", + " dropout=dropout, seed=123)\n", + "\n", + " loss, grads = model.loss(X, y)\n", + " print('Initial loss: ', loss)\n", + " \n", + " # Relative errors should be around e-6 or less; Note that it's fine\n", + " # if for dropout=1 you have W2 error be on the order of e-5.\n", + " for name in sorted(grads):\n", + " f = lambda _: model.loss(X, y)[0]\n", + " grad_num = eval_numerical_gradient(f, model.params[name], verbose=False, h=1e-5)\n", + " print('%s relative error: %.2e' % (name, rel_error(grad_num, grads[name])))\n", + " print()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Обучите две двухслойные сети с dropout-слоем (вероятность отсева 0,25) и без на наборе из 500 изображений. Визуализируйте графики обучения. 
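Возможный набросок inverted dropout для scripts/layers.py. Здесь p трактуется как вероятность сохранить нейрон (тогда dropout=1 означает отсутствие dropout, как в проверках этого раздела); сверьтесь с соглашением, принятым в вашем шаблоне.

```python
import numpy as np

def dropout_forward(x, dropout_param):
    p, mode = dropout_param['p'], dropout_param['mode']
    if 'seed' in dropout_param:
        np.random.seed(dropout_param['seed'])

    if mode == 'train':
        mask = (np.random.rand(*x.shape) < p) / p   # inverted dropout: масштабируем уже на обучении
        out = x * mask
    else:                                           # 'test'
        mask = None
        out = x
    cache = (dropout_param, mask)
    return out, cache

def dropout_backward(dout, cache):
    dropout_param, mask = cache
    if dropout_param['mode'] == 'train':
        return dout * mask
    return dout
```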
Сделайте выводы по результатам эксперимента" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Train two identical nets, one with dropout and one without\n", + "np.random.seed(231)\n", + "num_train = 500\n", + "small_data = {\n", + " 'X_train': data['X_train'][:num_train],\n", + " 'y_train': data['y_train'][:num_train],\n", + " 'X_val': data['X_val'],\n", + " 'y_val': data['y_val'],\n", + "}\n", + "\n", + "solvers = {}\n", + "dropout_choices = [1, 0.25]\n", + "for dropout in dropout_choices:\n", + " model = FullyConnectedNet([500], dropout=dropout)\n", + " print(dropout)\n", + "\n", + " solver = Solver(model, small_data,\n", + " num_epochs=25, batch_size=100,\n", + " update_rule='adam',\n", + " optim_config={\n", + " 'learning_rate': 5e-4,\n", + " },\n", + " verbose=True, print_every=100)\n", + " solver.train()\n", + " solvers[dropout] = solver\n", + " print()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Plot train and validation accuracies of the two models\n", + "\n", + "train_accs = []\n", + "val_accs = []\n", + "for dropout in dropout_choices:\n", + " solver = solvers[dropout]\n", + " train_accs.append(solver.train_acc_history[-1])\n", + " val_accs.append(solver.val_acc_history[-1])\n", + "\n", + "plt.subplot(3, 1, 1)\n", + "for dropout in dropout_choices:\n", + " plt.plot(solvers[dropout].train_acc_history, 'o', label='%.2f dropout' % dropout)\n", + "plt.title('Train accuracy')\n", + "plt.xlabel('Epoch')\n", + "plt.ylabel('Accuracy')\n", + "plt.legend(ncol=2, loc='lower right')\n", + " \n", + "plt.subplot(3, 1, 2)\n", + "for dropout in dropout_choices:\n", + " plt.plot(solvers[dropout].val_acc_history, 'o', label='%.2f dropout' % dropout)\n", + "plt.title('Val accuracy')\n", + "plt.xlabel('Epoch')\n", + "plt.ylabel('Accuracy')\n", + "plt.legend(ncol=2, loc='lower right')\n", + "\n", + "plt.gcf().set_size_inches(15, 15)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Сверточные нейронные сети (CNN)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Реализуйте прямой проход для сверточного слоя - функция conv_forward_naive в scripts/layers.py юПроверьте свою реализацию, запустив код ниже " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "x_shape = (2, 3, 4, 4)\n", + "w_shape = (3, 3, 4, 4)\n", + "x = np.linspace(-0.1, 0.5, num=np.prod(x_shape)).reshape(x_shape)\n", + "w = np.linspace(-0.2, 0.3, num=np.prod(w_shape)).reshape(w_shape)\n", + "b = np.linspace(-0.1, 0.2, num=3)\n", + "\n", + "conv_param = {'stride': 2, 'pad': 1}\n", + "out, _ = conv_forward_naive(x, w, b, conv_param)\n", + "correct_out = np.array([[[[-0.08759809, -0.10987781],\n", + " [-0.18387192, -0.2109216 ]],\n", + " [[ 0.21027089, 0.21661097],\n", + " [ 0.22847626, 0.23004637]],\n", + " [[ 0.50813986, 0.54309974],\n", + " [ 0.64082444, 0.67101435]]],\n", + " [[[-0.98053589, -1.03143541],\n", + " [-1.19128892, -1.24695841]],\n", + " [[ 0.69108355, 0.66880383],\n", + " [ 0.59480972, 0.56776003]],\n", + " [[ 2.36270298, 2.36904306],\n", + " [ 2.38090835, 2.38247847]]]])\n", + "\n", + "# Compare your output to ours; difference should be around e-8\n", + "print('Testing conv_forward_naive')\n", + "print('difference: ', rel_error(out, correct_out))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Реализуйте обратный проход - 
функция conv_backward_naive в scripts/layers.py" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.random.seed(231)\n", + "x = np.random.randn(4, 3, 5, 5)\n", + "w = np.random.randn(2, 3, 3, 3)\n", + "b = np.random.randn(2,)\n", + "dout = np.random.randn(4, 2, 5, 5)\n", + "conv_param = {'stride': 1, 'pad': 1}\n", + "\n", + "dx_num = eval_numerical_gradient_array(lambda x: conv_forward_naive(x, w, b, conv_param)[0], x, dout)\n", + "dw_num = eval_numerical_gradient_array(lambda w: conv_forward_naive(x, w, b, conv_param)[0], w, dout)\n", + "db_num = eval_numerical_gradient_array(lambda b: conv_forward_naive(x, w, b, conv_param)[0], b, dout)\n", + "\n", + "out, cache = conv_forward_naive(x, w, b, conv_param)\n", + "dx, dw, db = conv_backward_naive(dout, cache)\n", + "\n", + "# Your errors should be around e-8 or less.\n", + "print('Testing conv_backward_naive function')\n", + "print('dx error: ', rel_error(dx, dx_num))\n", + "print('dw error: ', rel_error(dw, dw_num))\n", + "print('db error: ', rel_error(db, db_num))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Реализуйте прямой проход для max-pooling слоя -функция max_pool_forward_naive в scripts/layers.py" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "x_shape = (2, 3, 4, 4)\n", + "x = np.linspace(-0.3, 0.4, num=np.prod(x_shape)).reshape(x_shape)\n", + "pool_param = {'pool_width': 2, 'pool_height': 2, 'stride': 2}\n", + "\n", + "out, _ = max_pool_forward_naive(x, pool_param)\n", + "\n", + "correct_out = np.array([[[[-0.26315789, -0.24842105],\n", + " [-0.20421053, -0.18947368]],\n", + " [[-0.14526316, -0.13052632],\n", + " [-0.08631579, -0.07157895]],\n", + " [[-0.02736842, -0.01263158],\n", + " [ 0.03157895, 0.04631579]]],\n", + " [[[ 0.09052632, 0.10526316],\n", + " [ 0.14947368, 0.16421053]],\n", + " [[ 0.20842105, 0.22315789],\n", + " [ 0.26736842, 0.28210526]],\n", + " [[ 0.32631579, 0.34105263],\n", + " [ 0.38526316, 0.4 ]]]])\n", + "\n", + "# Compare your output with ours. Difference should be on the order of e-8.\n", + "print('Testing max_pool_forward_naive function:')\n", + "print('difference: ', rel_error(out, correct_out))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Реализуйте обратный проход для max-pooling слоя в max_pool_backward_naive . " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.random.seed(231)\n", + "x = np.random.randn(3, 2, 8, 8)\n", + "dout = np.random.randn(3, 2, 4, 4)\n", + "pool_param = {'pool_height': 2, 'pool_width': 2, 'stride': 2}\n", + "\n", + "dx_num = eval_numerical_gradient_array(lambda x: max_pool_forward_naive(x, pool_param)[0], x, dout)\n", + "\n", + "out, cache = max_pool_forward_naive(x, pool_param)\n", + "dx = max_pool_backward_naive(dout, cache)\n", + "\n", + "# Your error should be on the order of e-12\n", + "print('Testing max_pool_backward_naive function:')\n", + "print('dx error: ', rel_error(dx, dx_num))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "В скрипте scripts/fast_layers.py представлены быстрые реализации слоев свертки и пуллинга, написанных с использованием Cython. 
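К заданиям про conv_forward_naive/conv_backward_naive выше: возможный «наивный» набросок вложенными циклами (медленный, но прозрачный); обратный проход накапливает градиенты по тем же окнам, что и прямой.

```python
import numpy as np

def conv_forward_naive(x, w, b, conv_param):
    N, C, H, W = x.shape
    F, _, HH, WW = w.shape
    stride, pad = conv_param['stride'], conv_param['pad']
    H_out = 1 + (H + 2 * pad - HH) // stride
    W_out = 1 + (W + 2 * pad - WW) // stride

    x_pad = np.pad(x, ((0, 0), (0, 0), (pad, pad), (pad, pad)), mode='constant')
    out = np.zeros((N, F, H_out, W_out))
    for n in range(N):
        for f in range(F):
            for i in range(H_out):
                for j in range(W_out):
                    window = x_pad[n, :, i*stride:i*stride+HH, j*stride:j*stride+WW]
                    out[n, f, i, j] = np.sum(window * w[f]) + b[f]
    return out, (x, w, b, conv_param)

def conv_backward_naive(dout, cache):
    x, w, b, conv_param = cache
    N, C, H, W = x.shape
    F, _, HH, WW = w.shape
    stride, pad = conv_param['stride'], conv_param['pad']
    _, _, H_out, W_out = dout.shape

    x_pad = np.pad(x, ((0, 0), (0, 0), (pad, pad), (pad, pad)), mode='constant')
    dx_pad, dw = np.zeros_like(x_pad), np.zeros_like(w)
    db = dout.sum(axis=(0, 2, 3))
    for n in range(N):
        for f in range(F):
            for i in range(H_out):
                for j in range(W_out):
                    window = x_pad[n, :, i*stride:i*stride+HH, j*stride:j*stride+WW]
                    dw[f] += window * dout[n, f, i, j]
                    dx_pad[n, :, i*stride:i*stride+HH, j*stride:j*stride+WW] += w[f] * dout[n, f, i, j]
    dx = dx_pad[:, :, pad:pad+H, pad:pad+W]   # отбрасываем паддинг
    return dx, dw, db
```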
\n", + "\n", + "Для компиляции выполните следующую команду в директории scripts\n", + "\n", + "```bash\n", + "python setup.py build_ext --inplace\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Сравните ваши реализации слоев свертки и пуллинга с быстрыми реализациями." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Rel errors should be around e-9 or less\n", + "from scripts.fast_layers import conv_forward_fast, conv_backward_fast\n", + "from time import time\n", + "np.random.seed(231)\n", + "x = np.random.randn(100, 3, 31, 31)\n", + "w = np.random.randn(25, 3, 3, 3)\n", + "b = np.random.randn(25,)\n", + "dout = np.random.randn(100, 25, 16, 16)\n", + "conv_param = {'stride': 2, 'pad': 1}\n", + "\n", + "t0 = time()\n", + "out_naive, cache_naive = conv_forward_naive(x, w, b, conv_param)\n", + "t1 = time()\n", + "out_fast, cache_fast = conv_forward_fast(x, w, b, conv_param)\n", + "t2 = time()\n", + "\n", + "print('Testing conv_forward_fast:')\n", + "print('Naive: %fs' % (t1 - t0))\n", + "print('Fast: %fs' % (t2 - t1))\n", + "print('Speedup: %fx' % ((t1 - t0) / (t2 - t1)))\n", + "print('Difference: ', rel_error(out_naive, out_fast))\n", + "\n", + "t0 = time()\n", + "dx_naive, dw_naive, db_naive = conv_backward_naive(dout, cache_naive)\n", + "t1 = time()\n", + "dx_fast, dw_fast, db_fast = conv_backward_fast(dout, cache_fast)\n", + "t2 = time()\n", + "\n", + "print('\\nTesting conv_backward_fast:')\n", + "print('Naive: %fs' % (t1 - t0))\n", + "print('Fast: %fs' % (t2 - t1))\n", + "print('Speedup: %fx' % ((t1 - t0) / (t2 - t1)))\n", + "print('dx difference: ', rel_error(dx_naive, dx_fast))\n", + "print('dw difference: ', rel_error(dw_naive, dw_fast))\n", + "print('db difference: ', rel_error(db_naive, db_fast))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Relative errors should be close to 0.0\n", + "from scripts.fast_layers import max_pool_forward_fast, max_pool_backward_fast\n", + "np.random.seed(231)\n", + "x = np.random.randn(100, 3, 32, 32)\n", + "dout = np.random.randn(100, 3, 16, 16)\n", + "pool_param = {'pool_height': 2, 'pool_width': 2, 'stride': 2}\n", + "\n", + "t0 = time()\n", + "out_naive, cache_naive = max_pool_forward_naive(x, pool_param)\n", + "t1 = time()\n", + "out_fast, cache_fast = max_pool_forward_fast(x, pool_param)\n", + "t2 = time()\n", + "\n", + "print('Testing pool_forward_fast:')\n", + "print('Naive: %fs' % (t1 - t0))\n", + "print('fast: %fs' % (t2 - t1))\n", + "print('speedup: %fx' % ((t1 - t0) / (t2 - t1)))\n", + "print('difference: ', rel_error(out_naive, out_fast))\n", + "\n", + "t0 = time()\n", + "dx_naive = max_pool_backward_naive(dout, cache_naive)\n", + "t1 = time()\n", + "dx_fast = max_pool_backward_fast(dout, cache_fast)\n", + "t2 = time()\n", + "\n", + "print('\\nTesting pool_backward_fast:')\n", + "print('Naive: %fs' % (t1 - t0))\n", + "print('fast: %fs' % (t2 - t1))\n", + "print('speedup: %fx' % ((t1 - t0) / (t2 - t1)))\n", + "print('dx difference: ', rel_error(dx_naive, dx_fast))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "В layer_utils.py вы можете найти часто используемые комбинации слоев, используемых в сверточных сетях. 
Ознакомьтесь с ними и запустите код ниже для проверки их работы" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from scripts.layer_utils import conv_relu_pool_forward, conv_relu_pool_backward\n", + "np.random.seed(231)\n", + "x = np.random.randn(2, 3, 16, 16)\n", + "w = np.random.randn(3, 3, 3, 3)\n", + "b = np.random.randn(3,)\n", + "dout = np.random.randn(2, 3, 8, 8)\n", + "conv_param = {'stride': 1, 'pad': 1}\n", + "pool_param = {'pool_height': 2, 'pool_width': 2, 'stride': 2}\n", + "\n", + "out, cache = conv_relu_pool_forward(x, w, b, conv_param, pool_param)\n", + "dx, dw, db = conv_relu_pool_backward(dout, cache)\n", + "\n", + "dx_num = eval_numerical_gradient_array(lambda x: conv_relu_pool_forward(x, w, b, conv_param, pool_param)[0], x, dout)\n", + "dw_num = eval_numerical_gradient_array(lambda w: conv_relu_pool_forward(x, w, b, conv_param, pool_param)[0], w, dout)\n", + "db_num = eval_numerical_gradient_array(lambda b: conv_relu_pool_forward(x, w, b, conv_param, pool_param)[0], b, dout)\n", + "\n", + "# Relative errors should be around e-8 or less\n", + "print('Testing conv_relu_pool')\n", + "print('dx error: ', rel_error(dx_num, dx))\n", + "print('dw error: ', rel_error(dw_num, dw))\n", + "print('db error: ', rel_error(db_num, db))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from scripts.layer_utils import conv_relu_forward, conv_relu_backward\n", + "np.random.seed(231)\n", + "x = np.random.randn(2, 3, 8, 8)\n", + "w = np.random.randn(3, 3, 3, 3)\n", + "b = np.random.randn(3,)\n", + "dout = np.random.randn(2, 3, 8, 8)\n", + "conv_param = {'stride': 1, 'pad': 1}\n", + "\n", + "out, cache = conv_relu_forward(x, w, b, conv_param)\n", + "dx, dw, db = conv_relu_backward(dout, cache)\n", + "\n", + "dx_num = eval_numerical_gradient_array(lambda x: conv_relu_forward(x, w, b, conv_param)[0], x, dout)\n", + "dw_num = eval_numerical_gradient_array(lambda w: conv_relu_forward(x, w, b, conv_param)[0], w, dout)\n", + "db_num = eval_numerical_gradient_array(lambda b: conv_relu_forward(x, w, b, conv_param)[0], b, dout)\n", + "\n", + "# Relative errors should be around e-8 or less\n", + "print('Testing conv_relu:')\n", + "print('dx error: ', rel_error(dx_num, dx))\n", + "print('dw error: ', rel_error(dw_num, dw))\n", + "print('db error: ', rel_error(db_num, db))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Напишите реализацию класса ThreeLayerConvNet в scripts/classifiers/cnn.py . Вы можете использовать готовые реализации слоев и их комбинаций." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Проверьте вашу реализацию. Ожидается, что значение функции потерь softmax будет порядка `log(C)` для `C` классов для случая без регуляризации. В случае регуляризации значение функции потерь должно немного возрасти. 
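Быстрая численная прикидка к этой проверке: при случайной инициализации softmax даёт примерно равномерное распределение по классам, поэтому потеря близка к -log(1/C) = log(C); для C = 10 это около 2.3.

```python
import numpy as np

C = 10                 # число классов
print(np.log(C))       # ~2.3026 — ожидаемый порядок начальной потери без регуляризации
```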
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = ThreeLayerConvNet()\n", + "\n", + "N = 50\n", + "X = np.random.randn(N, 3, 32, 32)\n", + "y = np.random.randint(10, size=N)\n", + "\n", + "loss, grads = model.loss(X, y)\n", + "print('Initial loss (no regularization): ', loss)\n", + "\n", + "model.reg = 0.5\n", + "loss, grads = model.loss(X, y)\n", + "print('Initial loss (with regularization): ', loss)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Проверьте реализацию обратного прохода" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "num_inputs = 2\n", + "input_dim = (3, 16, 16)\n", + "reg = 0.0\n", + "num_classes = 10\n", + "np.random.seed(231)\n", + "X = np.random.randn(num_inputs, *input_dim)\n", + "y = np.random.randint(num_classes, size=num_inputs)\n", + "\n", + "model = ThreeLayerConvNet(num_filters=3, filter_size=3,\n", + " input_dim=input_dim, hidden_dim=7,\n", + " dtype=np.float64)\n", + "loss, grads = model.loss(X, y)\n", + "# Errors should be small, but correct implementations may have\n", + "# relative errors up to the order of e-2\n", + "for param_name in sorted(grads):\n", + " f = lambda _: model.loss(X, y)[0]\n", + " param_grad_num = eval_numerical_gradient(f, model.params[param_name], verbose=False, h=1e-6)\n", + " e = rel_error(param_grad_num, grads[param_name])\n", + " print('%s max relative error: %e' % (param_name, rel_error(param_grad_num, grads[param_name])))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Попробуйте добиться эффекта переобучения. Обучите модель на небольшом наборе данных.Сравните значения accuracy на обучающих данных и на валидационных. 
Визуализируйте графики обучения " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.random.seed(231)\n", + "\n", + "num_train = 100\n", + "small_data = {\n", + " 'X_train': data['X_train'][:num_train],\n", + " 'y_train': data['y_train'][:num_train],\n", + " 'X_val': data['X_val'],\n", + " 'y_val': data['y_val'],\n", + "}\n", + "\n", + "model = ThreeLayerConvNet(weight_scale=1e-2)\n", + "\n", + "solver = Solver(model, small_data,\n", + " num_epochs=15, batch_size=50,\n", + " update_rule='adam',\n", + " optim_config={\n", + " 'learning_rate': 1e-3,\n", + " },\n", + " verbose=True, print_every=1)\n", + "solver.train()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Print final training accuracy\n", + "print(\n", + " \"Small data training accuracy:\",\n", + " solver.check_accuracy(small_data['X_train'], small_data['y_train'])\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Print final validation accuracy\n", + "print(\n", + " \"Small data validation accuracy:\",\n", + " solver.check_accuracy(small_data['X_val'], small_data['y_val'])\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.subplot(2, 1, 1)\n", + "plt.plot(solver.loss_history, 'o')\n", + "plt.xlabel('iteration')\n", + "plt.ylabel('loss')\n", + "\n", + "plt.subplot(2, 1, 2)\n", + "plt.plot(solver.train_acc_history, '-o')\n", + "plt.plot(solver.val_acc_history, '-o')\n", + "plt.legend(['train', 'val'], loc='upper left')\n", + "plt.xlabel('epoch')\n", + "plt.ylabel('accuracy')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Обучите сеть на полном наборе данных. 
Выведите accuracy на обучающей и валидационной выборках" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = ThreeLayerConvNet(weight_scale=0.001, hidden_dim=500, reg=0.001)\n", + "\n", + "solver = Solver(model, data,\n", + " num_epochs=1, batch_size=50,\n", + " update_rule='adam',\n", + " optim_config={\n", + " 'learning_rate': 1e-3,\n", + " },\n", + " verbose=True, print_every=20)\n", + "solver.train()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Print final training accuracy\n", + "print(\n", + " \"Full data training accuracy:\",\n", + " solver.check_accuracy(small_data['X_train'], small_data['y_train'])\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Print final validation accuracy\n", + "print(\n", + " \"Full data validation accuracy:\",\n", + " solver.check_accuracy(data['X_val'], data['y_val'])\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Визуализируйте фильтры на первом слое обученной сети" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from scripts.vis_utils import visualize_grid\n", + "\n", + "grid = visualize_grid(model.params['W1'].transpose(0, 2, 3, 1))\n", + "plt.imshow(grid.astype('uint8'))\n", + "plt.axis('off')\n", + "plt.gcf().set_size_inches(5, 5)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/lab_3/scripts/__init__.py b/lab_3/scripts/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lab_3/scripts/classifiers/__init__.py b/lab_3/scripts/classifiers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lab_3/scripts/classifiers/cnn.py b/lab_3/scripts/classifiers/cnn.py new file mode 100644 index 0000000..08894d6 --- /dev/null +++ b/lab_3/scripts/classifiers/cnn.py @@ -0,0 +1,135 @@ +from builtins import object +import numpy as np + +from ..layers import * +from ..fast_layers import * +from ..layer_utils import * + + +class ThreeLayerConvNet(object): + """ + A three-layer convolutional network with the following architecture: + + conv - relu - 2x2 max pool - affine - relu - affine - softmax + + The network operates on minibatches of data that have shape (N, C, H, W) + consisting of N images, each with height H and width W and with C input + channels. + """ + + def __init__( + self, + input_dim=(3, 32, 32), + num_filters=32, + filter_size=7, + hidden_dim=100, + num_classes=10, + weight_scale=1e-3, + reg=0.0, + dtype=np.float32, + ): + """ + Initialize a new network. 
+ + Inputs: + - input_dim: Tuple (C, H, W) giving size of input data + - num_filters: Number of filters to use in the convolutional layer + - filter_size: Width/height of filters to use in the convolutional layer + - hidden_dim: Number of units to use in the fully-connected hidden layer + - num_classes: Number of scores to produce from the final affine layer. + - weight_scale: Scalar giving standard deviation for random initialization + of weights. + - reg: Scalar giving L2 regularization strength + - dtype: numpy datatype to use for computation. + """ + self.params = {} + self.reg = reg + self.dtype = dtype + + ############################################################################ + # TODO: Initialize weights and biases for the three-layer convolutional # + # network. Weights should be initialized from a Gaussian centered at 0.0 # + # with standard deviation equal to weight_scale; biases should be # + # initialized to zero. All weights and biases should be stored in the # + # dictionary self.params. Store weights and biases for the convolutional # + # layer using the keys 'W1' and 'b1'; use keys 'W2' and 'b2' for the # + # weights and biases of the hidden affine layer, and keys 'W3' and 'b3' # + # for the weights and biases of the output affine layer. # + # # + # IMPORTANT: For this assignment, you can assume that the padding # + # and stride of the first convolutional layer are chosen so that # + # **the width and height of the input are preserved**. Take a look at # + # the start of the loss() function to see how that happens. # + ############################################################################ + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ############################################################################ + # END OF YOUR CODE # + ############################################################################ + + for k, v in self.params.items(): + self.params[k] = v.astype(dtype) + + def loss(self, X, y=None): + """ + Evaluate loss and gradient for the three-layer convolutional network. + + Input / output: Same API as TwoLayerNet in fc_net.py. + """ + W1, b1 = self.params["W1"], self.params["b1"] + W2, b2 = self.params["W2"], self.params["b2"] + W3, b3 = self.params["W3"], self.params["b3"] + + # pass conv_param to the forward pass for the convolutional layer + # Padding and stride chosen to preserve the input spatial size + filter_size = W1.shape[2] + conv_param = {"stride": 1, "pad": (filter_size - 1) // 2} + + # pass pool_param to the forward pass for the max-pooling layer + pool_param = {"pool_height": 2, "pool_width": 2, "stride": 2} + + scores = None + ############################################################################ + # TODO: Implement the forward pass for the three-layer convolutional net, # + # computing the class scores for X and storing them in the scores # + # variable. # + # # + # Remember you can use the functions defined in cs231n/fast_layers.py and # + # cs231n/layer_utils.py in your implementation (already imported). 
# + ############################################################################ + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ############################################################################ + # END OF YOUR CODE # + ############################################################################ + + if y is None: + return scores + + loss, grads = 0, {} + ############################################################################ + # TODO: Implement the backward pass for the three-layer convolutional net, # + # storing the loss and gradients in the loss and grads variables. Compute # + # data loss using softmax, and make sure that grads[k] holds the gradients # + # for self.params[k]. Don't forget to add L2 regularization! # + # # + # NOTE: To ensure that your implementation matches ours and you pass the # + # automated tests, make sure that your L2 regularization includes a factor # + # of 0.5 to simplify the expression for the gradient. # + ############################################################################ + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ############################################################################ + # END OF YOUR CODE # + ############################################################################ + + return loss, grads diff --git a/lab_3/scripts/classifiers/fc_net.py b/lab_3/scripts/classifiers/fc_net.py new file mode 100644 index 0000000..8b71b0f --- /dev/null +++ b/lab_3/scripts/classifiers/fc_net.py @@ -0,0 +1,291 @@ +from builtins import range +from builtins import object +import numpy as np + +from ..layers import * +from ..layer_utils import * + + +class TwoLayerNet(object): + """ + A two-layer fully-connected neural network with ReLU nonlinearity and + softmax loss that uses a modular layer design. We assume an input dimension + of D, a hidden dimension of H, and perform classification over C classes. + + The architecure should be affine - relu - affine - softmax. + + Note that this class does not implement gradient descent; instead, it + will interact with a separate Solver object that is responsible for running + optimization. + + The learnable parameters of the model are stored in the dictionary + self.params that maps parameter names to numpy arrays. + """ + + def __init__( + self, + input_dim=3 * 32 * 32, + hidden_dim=100, + num_classes=10, + weight_scale=1e-3, + reg=0.0, + ): + """ + Initialize a new network. + + Inputs: + - input_dim: An integer giving the size of the input + - hidden_dim: An integer giving the size of the hidden layer + - num_classes: An integer giving the number of classes to classify + - weight_scale: Scalar giving the standard deviation for random + initialization of the weights. + - reg: Scalar giving L2 regularization strength. + """ + self.params = {} + self.reg = reg + + ############################################################################ + # TODO: Initialize the weights and biases of the two-layer net. Weights # + # should be initialized from a Gaussian centered at 0.0 with # + # standard deviation equal to weight_scale, and biases should be # + # initialized to zero. All weights and biases should be stored in the # + # dictionary self.params, with first layer weights # + # and biases using the keys 'W1' and 'b1' and second layer # + # weights and biases using the keys 'W2' and 'b2'. 
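For reference, here is one way the TODO blocks of `ThreeLayerConvNet` above could be filled in. This is a sketch rather than the reference solution; it assumes the sandwich helpers from `scripts/layer_utils.py` (shown later in this diff) and a `softmax_loss` function in `scripts/layers.py`:

```python
# __init__: draw weights from N(0, weight_scale), set biases to zero.
C, H, W = input_dim
self.params['W1'] = weight_scale * np.random.randn(num_filters, C, filter_size, filter_size)
self.params['b1'] = np.zeros(num_filters)
# The conv layer preserves H x W (pad = (filter_size - 1) // 2, stride 1),
# so after 2x2 max pooling the feature map is num_filters x H/2 x W/2.
self.params['W2'] = weight_scale * np.random.randn(num_filters * (H // 2) * (W // 2), hidden_dim)
self.params['b2'] = np.zeros(hidden_dim)
self.params['W3'] = weight_scale * np.random.randn(hidden_dim, num_classes)
self.params['b3'] = np.zeros(num_classes)

# loss(): forward pass, conv - relu - 2x2 max pool - affine - relu - affine.
out1, cache1 = conv_relu_pool_forward(X, W1, b1, conv_param, pool_param)
out2, cache2 = affine_relu_forward(out1, W2, b2)
scores, cache3 = affine_forward(out2, W3, b3)

# loss(): backward pass, softmax data loss plus 0.5 * reg * L2 penalty.
loss, dscores = softmax_loss(scores, y)
loss += 0.5 * self.reg * (np.sum(W1 * W1) + np.sum(W2 * W2) + np.sum(W3 * W3))
dout2, grads['W3'], grads['b3'] = affine_backward(dscores, cache3)
dout1, grads['W2'], grads['b2'] = affine_relu_backward(dout2, cache2)
dX, grads['W1'], grads['b1'] = conv_relu_pool_backward(dout1, cache1)
grads['W1'] += self.reg * W1
grads['W2'] += self.reg * W2
grads['W3'] += self.reg * W3
```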
# + ############################################################################ + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ############################################################################ + # END OF YOUR CODE # + ############################################################################ + + def loss(self, X, y=None): + """ + Compute loss and gradient for a minibatch of data. + + Inputs: + - X: Array of input data of shape (N, d_1, ..., d_k) + - y: Array of labels, of shape (N,). y[i] gives the label for X[i]. + + Returns: + If y is None, then run a test-time forward pass of the model and return: + - scores: Array of shape (N, C) giving classification scores, where + scores[i, c] is the classification score for X[i] and class c. + + If y is not None, then run a training-time forward and backward pass and + return a tuple of: + - loss: Scalar value giving the loss + - grads: Dictionary with the same keys as self.params, mapping parameter + names to gradients of the loss with respect to those parameters. + """ + scores = None + ############################################################################ + # TODO: Implement the forward pass for the two-layer net, computing the # + # class scores for X and storing them in the scores variable. # + ############################################################################ + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ############################################################################ + # END OF YOUR CODE # + ############################################################################ + + # If y is None then we are in test mode so just return scores + if y is None: + return scores + + loss, grads = 0, {} + ############################################################################ + # TODO: Implement the backward pass for the two-layer net. Store the loss # + # in the loss variable and gradients in the grads dictionary. Compute data # + # loss using softmax, and make sure that grads[k] holds the gradients for # + # self.params[k]. Don't forget to add L2 regularization! # + # # + # NOTE: To ensure that your implementation matches ours and you pass the # + # automated tests, make sure that your L2 regularization includes a factor # + # of 0.5 to simplify the expression for the gradient. # + ############################################################################ + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ############################################################################ + # END OF YOUR CODE # + ############################################################################ + + return loss, grads + + +class FullyConnectedNet(object): + """ + A fully-connected neural network with an arbitrary number of hidden layers, + ReLU nonlinearities, and a softmax loss function. This will also implement + dropout and batch/layer normalization as options. For a network with L layers, + the architecture will be + + {affine - [batch/layer norm] - relu - [dropout]} x (L - 1) - affine - softmax + + where batch/layer normalization and dropout are optional, and the {...} block is + repeated L - 1 times. 
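Likewise, a hedged sketch of the `TwoLayerNet` TODOs above (affine - relu - affine - softmax), again assuming `affine_relu_forward` / `affine_relu_backward` from `scripts/layer_utils.py` and `softmax_loss` from `scripts/layers.py`:

```python
# __init__
self.params['W1'] = weight_scale * np.random.randn(input_dim, hidden_dim)
self.params['b1'] = np.zeros(hidden_dim)
self.params['W2'] = weight_scale * np.random.randn(hidden_dim, num_classes)
self.params['b2'] = np.zeros(num_classes)

# loss(): forward
h, cache1 = affine_relu_forward(X, self.params['W1'], self.params['b1'])
scores, cache2 = affine_forward(h, self.params['W2'], self.params['b2'])

# loss(): backward (0.5 factor on the L2 term, as the comment above requires)
loss, dscores = softmax_loss(scores, y)
loss += 0.5 * self.reg * (np.sum(self.params['W1'] ** 2) + np.sum(self.params['W2'] ** 2))
dh, grads['W2'], grads['b2'] = affine_backward(dscores, cache2)
dx, grads['W1'], grads['b1'] = affine_relu_backward(dh, cache1)
grads['W1'] += self.reg * self.params['W1']
grads['W2'] += self.reg * self.params['W2']
```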
+ + Similar to the TwoLayerNet above, learnable parameters are stored in the + self.params dictionary and will be learned using the Solver class. + """ + + def __init__( + self, + hidden_dims, + input_dim=3 * 32 * 32, + num_classes=10, + dropout=1, + normalization=None, + reg=0.0, + weight_scale=1e-2, + dtype=np.float32, + seed=None, + ): + """ + Initialize a new FullyConnectedNet. + + Inputs: + - hidden_dims: A list of integers giving the size of each hidden layer. + - input_dim: An integer giving the size of the input. + - num_classes: An integer giving the number of classes to classify. + - dropout: Scalar between 0 and 1 giving dropout strength. If dropout=1 then + the network should not use dropout at all. + - normalization: What type of normalization the network should use. Valid values + are "batchnorm", "layernorm", or None for no normalization (the default). + - reg: Scalar giving L2 regularization strength. + - weight_scale: Scalar giving the standard deviation for random + initialization of the weights. + - dtype: A numpy datatype object; all computations will be performed using + this datatype. float32 is faster but less accurate, so you should use + float64 for numeric gradient checking. + - seed: If not None, then pass this random seed to the dropout layers. This + will make the dropout layers deteriminstic so we can gradient check the + model. + """ + self.normalization = normalization + self.use_dropout = dropout != 1 + self.reg = reg + self.num_layers = 1 + len(hidden_dims) + self.dtype = dtype + self.params = {} + + ############################################################################ + # TODO: Initialize the parameters of the network, storing all values in # + # the self.params dictionary. Store weights and biases for the first layer # + # in W1 and b1; for the second layer use W2 and b2, etc. Weights should be # + # initialized from a normal distribution centered at 0 with standard # + # deviation equal to weight_scale. Biases should be initialized to zero. # + # # + # When using batch normalization, store scale and shift parameters for the # + # first layer in gamma1 and beta1; for the second layer use gamma2 and # + # beta2, etc. Scale parameters should be initialized to ones and shift # + # parameters should be initialized to zeros. # + ############################################################################ + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ############################################################################ + # END OF YOUR CODE # + ############################################################################ + + # When using dropout we need to pass a dropout_param dictionary to each + # dropout layer so that the layer knows the dropout probability and the mode + # (train / test). You can pass the same dropout_param to each dropout layer. + self.dropout_param = {} + if self.use_dropout: + self.dropout_param = {"mode": "train", "p": dropout} + if seed is not None: + self.dropout_param["seed"] = seed + + # With batch normalization we need to keep track of running means and + # variances, so we need to pass a special bn_param object to each batch + # normalization layer. You should pass self.bn_params[0] to the forward pass + # of the first batch normalization layer, self.bn_params[1] to the forward + # pass of the second batch normalization layer, etc. 
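One possible fill-in for the parameter-initialization TODO above, chaining the layer sizes as input_dim, hidden_dims..., num_classes and creating gamma/beta only for the hidden layers when normalization is enabled (a sketch, not the reference solution):

```python
dims = [input_dim] + list(hidden_dims) + [num_classes]
for i in range(self.num_layers):
    self.params['W%d' % (i + 1)] = weight_scale * np.random.randn(dims[i], dims[i + 1])
    self.params['b%d' % (i + 1)] = np.zeros(dims[i + 1])
    # The last (output) affine layer has no scale/shift parameters.
    if self.normalization is not None and i < self.num_layers - 1:
        self.params['gamma%d' % (i + 1)] = np.ones(dims[i + 1])
        self.params['beta%d' % (i + 1)] = np.zeros(dims[i + 1])
```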
+ self.bn_params = [] + if self.normalization == "batchnorm": + self.bn_params = [{"mode": "train"} for i in range(self.num_layers - 1)] + if self.normalization == "layernorm": + self.bn_params = [{} for i in range(self.num_layers - 1)] + + # Cast all parameters to the correct datatype + for k, v in self.params.items(): + self.params[k] = v.astype(dtype) + + def loss(self, X, y=None): + """ + Compute loss and gradient for the fully-connected net. + + Input / output: Same as TwoLayerNet above. + """ + X = X.astype(self.dtype) + mode = "test" if y is None else "train" + + # Set train/test mode for batchnorm params and dropout param since they + # behave differently during training and testing. + if self.use_dropout: + self.dropout_param["mode"] = mode + if self.normalization == "batchnorm": + for bn_param in self.bn_params: + bn_param["mode"] = mode + scores = None + ############################################################################ + # TODO: Implement the forward pass for the fully-connected net, computing # + # the class scores for X and storing them in the scores variable. # + # # + # When using dropout, you'll need to pass self.dropout_param to each # + # dropout forward pass. # + # # + # When using batch normalization, you'll need to pass self.bn_params[0] to # + # the forward pass for the first batch normalization layer, pass # + # self.bn_params[1] to the forward pass for the second batch normalization # + # layer, etc. # + ############################################################################ + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ############################################################################ + # END OF YOUR CODE # + ############################################################################ + + # If test mode return early + if mode == "test": + return scores + + loss, grads = 0.0, {} + ############################################################################ + # TODO: Implement the backward pass for the fully-connected net. Store the # + # loss in the loss variable and gradients in the grads dictionary. Compute # + # data loss using softmax, and make sure that grads[k] holds the gradients # + # for self.params[k]. Don't forget to add L2 regularization! # + # # + # When using batch/layer normalization, you don't need to regularize the scale # + # and shift parameters. # + # # + # NOTE: To ensure that your implementation matches ours and you pass the # + # automated tests, make sure that your L2 regularization includes a factor # + # of 0.5 to simplify the expression for the gradient. 
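A simplified sketch of the forward/backward TODOs for the plain case (no normalization, no dropout), to make the intended layer chaining concrete; the optional batch/layer norm and dropout branches follow the same pattern, inserting the corresponding `*_forward` / `*_backward` calls with `self.bn_params[i - 1]` and `self.dropout_param`:

```python
# Forward: {affine - relu} x (L - 1) - affine
caches = {}
h = X
for i in range(1, self.num_layers):
    h, caches[i] = affine_relu_forward(h, self.params['W%d' % i], self.params['b%d' % i])
L = self.num_layers
scores, caches[L] = affine_forward(h, self.params['W%d' % L], self.params['b%d' % L])

# Backward: softmax data loss plus 0.5 * reg * sum of squared weights
loss, dout = softmax_loss(scores, y)
for i in range(1, L + 1):
    loss += 0.5 * self.reg * np.sum(self.params['W%d' % i] ** 2)
dout, dW, db = affine_backward(dout, caches[L])
grads['W%d' % L] = dW + self.reg * self.params['W%d' % L]
grads['b%d' % L] = db
for i in range(L - 1, 0, -1):
    dout, dW, db = affine_relu_backward(dout, caches[i])
    grads['W%d' % i] = dW + self.reg * self.params['W%d' % i]
    grads['b%d' % i] = db
```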
# + ############################################################################ + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ############################################################################ + # END OF YOUR CODE # + ############################################################################ + + return loss, grads diff --git a/lab_3/scripts/data_utils.py b/lab_3/scripts/data_utils.py new file mode 100644 index 0000000..e88cfec --- /dev/null +++ b/lab_3/scripts/data_utils.py @@ -0,0 +1,270 @@ +from __future__ import print_function + +from builtins import range +from six.moves import cPickle as pickle +import numpy as np +import os +from imageio import imread +import platform + + +def load_pickle(f): + version = platform.python_version_tuple() + if version[0] == "2": + return pickle.load(f) + elif version[0] == "3": + return pickle.load(f, encoding="latin1") + raise ValueError("invalid python version: {}".format(version)) + + +def load_CIFAR_batch(filename): + """ load single batch of cifar """ + with open(filename, "rb") as f: + datadict = load_pickle(f) + X = datadict["data"] + Y = datadict["labels"] + X = X.reshape(10000, 3, 32, 32).transpose(0, 2, 3, 1).astype("float") + Y = np.array(Y) + return X, Y + + +def load_CIFAR10(ROOT): + """ load all of cifar """ + xs = [] + ys = [] + for b in range(1, 6): + f = os.path.join(ROOT, "data_batch_%d" % (b,)) + X, Y = load_CIFAR_batch(f) + xs.append(X) + ys.append(Y) + Xtr = np.concatenate(xs) + Ytr = np.concatenate(ys) + del X, Y + Xte, Yte = load_CIFAR_batch(os.path.join(ROOT, "test_batch")) + return Xtr, Ytr, Xte, Yte + + +def get_CIFAR10_data( + num_training=49000, num_validation=1000, num_test=1000, subtract_mean=True +): + """ + Load the CIFAR-10 dataset from disk and perform preprocessing to prepare + it for classifiers. These are the same steps as we used for the SVM, but + condensed to a single function. + """ + # Load the raw CIFAR-10 data + cifar10_dir = os.path.join( + os.path.dirname(__file__), "datasets/cifar-10-batches-py" + ) + X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir) + + # Subsample the data + mask = list(range(num_training, num_training + num_validation)) + X_val = X_train[mask] + y_val = y_train[mask] + mask = list(range(num_training)) + X_train = X_train[mask] + y_train = y_train[mask] + mask = list(range(num_test)) + X_test = X_test[mask] + y_test = y_test[mask] + + # Normalize the data: subtract the mean image + if subtract_mean: + mean_image = np.mean(X_train, axis=0) + X_train -= mean_image + X_val -= mean_image + X_test -= mean_image + + # Transpose so that channels come first + X_train = X_train.transpose(0, 3, 1, 2).copy() + X_val = X_val.transpose(0, 3, 1, 2).copy() + X_test = X_test.transpose(0, 3, 1, 2).copy() + + # Package data into a dictionary + return { + "X_train": X_train, + "y_train": y_train, + "X_val": X_val, + "y_val": y_val, + "X_test": X_test, + "y_test": y_test, + } + + +def load_tiny_imagenet(path, dtype=np.float32, subtract_mean=True): + """ + Load TinyImageNet. Each of TinyImageNet-100-A, TinyImageNet-100-B, and + TinyImageNet-200 have the same directory structure, so this can be used + to load any of them. + + Inputs: + - path: String giving path to the directory to load. + - dtype: numpy datatype used to load the data. + - subtract_mean: Whether to subtract the mean training image. 
+ + Returns: A dictionary with the following entries: + - class_names: A list where class_names[i] is a list of strings giving the + WordNet names for class i in the loaded dataset. + - X_train: (N_tr, 3, 64, 64) array of training images + - y_train: (N_tr,) array of training labels + - X_val: (N_val, 3, 64, 64) array of validation images + - y_val: (N_val,) array of validation labels + - X_test: (N_test, 3, 64, 64) array of testing images. + - y_test: (N_test,) array of test labels; if test labels are not available + (such as in student code) then y_test will be None. + - mean_image: (3, 64, 64) array giving mean training image + """ + # First load wnids + with open(os.path.join(path, "wnids.txt"), "r") as f: + wnids = [x.strip() for x in f] + + # Map wnids to integer labels + wnid_to_label = {wnid: i for i, wnid in enumerate(wnids)} + + # Use words.txt to get names for each class + with open(os.path.join(path, "words.txt"), "r") as f: + wnid_to_words = dict(line.split("\t") for line in f) + for wnid, words in wnid_to_words.items(): + wnid_to_words[wnid] = [w.strip() for w in words.split(",")] + class_names = [wnid_to_words[wnid] for wnid in wnids] + + # Next load training data. + X_train = [] + y_train = [] + for i, wnid in enumerate(wnids): + if (i + 1) % 20 == 0: + print("loading training data for synset %d / %d" % (i + 1, len(wnids))) + # To figure out the filenames we need to open the boxes file + boxes_file = os.path.join(path, "train", wnid, "%s_boxes.txt" % wnid) + with open(boxes_file, "r") as f: + filenames = [x.split("\t")[0] for x in f] + num_images = len(filenames) + + X_train_block = np.zeros((num_images, 3, 64, 64), dtype=dtype) + y_train_block = wnid_to_label[wnid] * np.ones(num_images, dtype=np.int64) + for j, img_file in enumerate(filenames): + img_file = os.path.join(path, "train", wnid, "images", img_file) + img = imread(img_file) + if img.ndim == 2: + ## grayscale file + img.shape = (64, 64, 1) + X_train_block[j] = img.transpose(2, 0, 1) + X_train.append(X_train_block) + y_train.append(y_train_block) + + # We need to concatenate all training data + X_train = np.concatenate(X_train, axis=0) + y_train = np.concatenate(y_train, axis=0) + + # Next load validation data + with open(os.path.join(path, "val", "val_annotations.txt"), "r") as f: + img_files = [] + val_wnids = [] + for line in f: + img_file, wnid = line.split("\t")[:2] + img_files.append(img_file) + val_wnids.append(wnid) + num_val = len(img_files) + y_val = np.array([wnid_to_label[wnid] for wnid in val_wnids]) + X_val = np.zeros((num_val, 3, 64, 64), dtype=dtype) + for i, img_file in enumerate(img_files): + img_file = os.path.join(path, "val", "images", img_file) + img = imread(img_file) + if img.ndim == 2: + img.shape = (64, 64, 1) + X_val[i] = img.transpose(2, 0, 1) + + # Next load test images + # Students won't have test labels, so we need to iterate over files in the + # images directory. 
+ img_files = os.listdir(os.path.join(path, "test", "images")) + X_test = np.zeros((len(img_files), 3, 64, 64), dtype=dtype) + for i, img_file in enumerate(img_files): + img_file = os.path.join(path, "test", "images", img_file) + img = imread(img_file) + if img.ndim == 2: + img.shape = (64, 64, 1) + X_test[i] = img.transpose(2, 0, 1) + + y_test = None + y_test_file = os.path.join(path, "test", "test_annotations.txt") + if os.path.isfile(y_test_file): + with open(y_test_file, "r") as f: + img_file_to_wnid = {} + for line in f: + line = line.split("\t") + img_file_to_wnid[line[0]] = line[1] + y_test = [wnid_to_label[img_file_to_wnid[img_file]] for img_file in img_files] + y_test = np.array(y_test) + + mean_image = X_train.mean(axis=0) + if subtract_mean: + X_train -= mean_image[None] + X_val -= mean_image[None] + X_test -= mean_image[None] + + return { + "class_names": class_names, + "X_train": X_train, + "y_train": y_train, + "X_val": X_val, + "y_val": y_val, + "X_test": X_test, + "y_test": y_test, + "class_names": class_names, + "mean_image": mean_image, + } + + +def load_models(models_dir): + """ + Load saved models from disk. This will attempt to unpickle all files in a + directory; any files that give errors on unpickling (such as README.txt) + will be skipped. + + Inputs: + - models_dir: String giving the path to a directory containing model files. + Each model file is a pickled dictionary with a 'model' field. + + Returns: + A dictionary mapping model file names to models. + """ + models = {} + for model_file in os.listdir(models_dir): + with open(os.path.join(models_dir, model_file), "rb") as f: + try: + models[model_file] = load_pickle(f)["model"] + except pickle.UnpicklingError: + continue + return models + + +def load_imagenet_val(num=None): + """Load a handful of validation images from ImageNet. + + Inputs: + - num: Number of images to load (max of 25) + + Returns: + - X: numpy array with shape [num, 224, 224, 3] + - y: numpy array of integer image labels, shape [num] + - class_names: dict mapping integer label to class name + """ + imagenet_fn = os.path.join( + os.path.dirname(__file__), "datasets/imagenet_val_25.npz" + ) + if not os.path.isfile(imagenet_fn): + print("file %s not found" % imagenet_fn) + print("Run the following:") + print("cd cs231n/datasets") + print("bash get_imagenet_val.sh") + assert False, "Need to download imagenet_val_25.npz" + f = np.load(imagenet_fn) + X = f["X"] + y = f["y"] + class_names = f["label_map"].item() + if num is not None: + X = X[:num] + y = y[:num] + return X, y, class_names diff --git a/lab_3/scripts/datasets/get_datasets.sh b/lab_3/scripts/datasets/get_datasets.sh new file mode 100644 index 0000000..06d4b3c --- /dev/null +++ b/lab_3/scripts/datasets/get_datasets.sh @@ -0,0 +1,5 @@ +if [ ! 
-d "cifar-10-batches-py" ]; then + wget http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz -O cifar-10-python.tar.gz + tar -xzvf cifar-10-python.tar.gz + rm cifar-10-python.tar.gz +fi diff --git a/lab_3/scripts/fast_layers.py b/lab_3/scripts/fast_layers.py new file mode 100644 index 0000000..46797f2 --- /dev/null +++ b/lab_3/scripts/fast_layers.py @@ -0,0 +1,283 @@ +from __future__ import print_function +import numpy as np + +try: + from .im2col_cython import col2im_cython, im2col_cython + from .im2col_cython import col2im_6d_cython +except ImportError: + print("""=========== You can safely ignore the message below if you are NOT working on ConvolutionalNetworks.ipynb ===========""") + print("\tYou will need to compile a Cython extension for a portion of this assignment.") + print("\tThe instructions to do this will be given in a section of the notebook below.") + print("\tThere will be an option for Colab users and another for Jupyter (local) users.") + +from .im2col import * + + +def conv_forward_im2col(x, w, b, conv_param): + """ + A fast implementation of the forward pass for a convolutional layer + based on im2col and col2im. + """ + N, C, H, W = x.shape + num_filters, _, filter_height, filter_width = w.shape + stride, pad = conv_param["stride"], conv_param["pad"] + + # Check dimensions + assert (W + 2 * pad - filter_width) % stride == 0, "width does not work" + assert (H + 2 * pad - filter_height) % stride == 0, "height does not work" + + # Create output + out_height = (H + 2 * pad - filter_height) // stride + 1 + out_width = (W + 2 * pad - filter_width) // stride + 1 + out = np.zeros((N, num_filters, out_height, out_width), dtype=x.dtype) + + # x_cols = im2col_indices(x, w.shape[2], w.shape[3], pad, stride) + x_cols = im2col_cython(x, w.shape[2], w.shape[3], pad, stride) + res = w.reshape((w.shape[0], -1)).dot(x_cols) + b.reshape(-1, 1) + + out = res.reshape(w.shape[0], out.shape[2], out.shape[3], x.shape[0]) + out = out.transpose(3, 0, 1, 2) + + cache = (x, w, b, conv_param, x_cols) + return out, cache + + +def conv_forward_strides(x, w, b, conv_param): + N, C, H, W = x.shape + F, _, HH, WW = w.shape + stride, pad = conv_param["stride"], conv_param["pad"] + + # Check dimensions + # assert (W + 2 * pad - WW) % stride == 0, 'width does not work' + # assert (H + 2 * pad - HH) % stride == 0, 'height does not work' + + # Pad the input + p = pad + x_padded = np.pad(x, ((0, 0), (0, 0), (p, p), (p, p)), mode="constant") + + # Figure out output dimensions + H += 2 * pad + W += 2 * pad + out_h = (H - HH) // stride + 1 + out_w = (W - WW) // stride + 1 + + # Perform an im2col operation by picking clever strides + shape = (C, HH, WW, N, out_h, out_w) + strides = (H * W, W, 1, C * H * W, stride * W, stride) + strides = x.itemsize * np.array(strides) + x_stride = np.lib.stride_tricks.as_strided(x_padded, shape=shape, strides=strides) + x_cols = np.ascontiguousarray(x_stride) + x_cols.shape = (C * HH * WW, N * out_h * out_w) + + # Now all our convolutions are a big matrix multiply + res = w.reshape(F, -1).dot(x_cols) + b.reshape(-1, 1) + + # Reshape the output + res.shape = (F, N, out_h, out_w) + out = res.transpose(1, 0, 2, 3) + + # Be nice and return a contiguous array + # The old version of conv_forward_fast doesn't do this, so for a fair + # comparison we won't either + out = np.ascontiguousarray(out) + + cache = (x, w, b, conv_param, x_cols) + return out, cache + + +def conv_backward_strides(dout, cache): + x, w, b, conv_param, x_cols = cache + stride, pad = conv_param["stride"], 
conv_param["pad"] + + N, C, H, W = x.shape + F, _, HH, WW = w.shape + _, _, out_h, out_w = dout.shape + + db = np.sum(dout, axis=(0, 2, 3)) + + dout_reshaped = dout.transpose(1, 0, 2, 3).reshape(F, -1) + dw = dout_reshaped.dot(x_cols.T).reshape(w.shape) + + dx_cols = w.reshape(F, -1).T.dot(dout_reshaped) + dx_cols.shape = (C, HH, WW, N, out_h, out_w) + dx = col2im_6d_cython(dx_cols, N, C, H, W, HH, WW, pad, stride) + + return dx, dw, db + + +def conv_backward_im2col(dout, cache): + """ + A fast implementation of the backward pass for a convolutional layer + based on im2col and col2im. + """ + x, w, b, conv_param, x_cols = cache + stride, pad = conv_param["stride"], conv_param["pad"] + + db = np.sum(dout, axis=(0, 2, 3)) + + num_filters, _, filter_height, filter_width = w.shape + dout_reshaped = dout.transpose(1, 2, 3, 0).reshape(num_filters, -1) + dw = dout_reshaped.dot(x_cols.T).reshape(w.shape) + + dx_cols = w.reshape(num_filters, -1).T.dot(dout_reshaped) + # dx = col2im_indices(dx_cols, x.shape, filter_height, filter_width, pad, stride) + dx = col2im_cython( + dx_cols, + x.shape[0], + x.shape[1], + x.shape[2], + x.shape[3], + filter_height, + filter_width, + pad, + stride, + ) + + return dx, dw, db + + +conv_forward_fast = conv_forward_strides +conv_backward_fast = conv_backward_strides + + +def max_pool_forward_fast(x, pool_param): + """ + A fast implementation of the forward pass for a max pooling layer. + + This chooses between the reshape method and the im2col method. If the pooling + regions are square and tile the input image, then we can use the reshape + method which is very fast. Otherwise we fall back on the im2col method, which + is not much faster than the naive method. + """ + N, C, H, W = x.shape + pool_height, pool_width = pool_param["pool_height"], pool_param["pool_width"] + stride = pool_param["stride"] + + same_size = pool_height == pool_width == stride + tiles = H % pool_height == 0 and W % pool_width == 0 + if same_size and tiles: + out, reshape_cache = max_pool_forward_reshape(x, pool_param) + cache = ("reshape", reshape_cache) + else: + out, im2col_cache = max_pool_forward_im2col(x, pool_param) + cache = ("im2col", im2col_cache) + return out, cache + + +def max_pool_backward_fast(dout, cache): + """ + A fast implementation of the backward pass for a max pooling layer. + + This switches between the reshape method an the im2col method depending on + which method was used to generate the cache. + """ + method, real_cache = cache + if method == "reshape": + return max_pool_backward_reshape(dout, real_cache) + elif method == "im2col": + return max_pool_backward_im2col(dout, real_cache) + else: + raise ValueError('Unrecognized method "%s"' % method) + + +def max_pool_forward_reshape(x, pool_param): + """ + A fast implementation of the forward pass for the max pooling layer that uses + some clever reshaping. + + This can only be used for square pooling regions that tile the input. 
+ """ + N, C, H, W = x.shape + pool_height, pool_width = pool_param["pool_height"], pool_param["pool_width"] + stride = pool_param["stride"] + assert pool_height == pool_width == stride, "Invalid pool params" + assert H % pool_height == 0 + assert W % pool_height == 0 + x_reshaped = x.reshape( + N, C, H // pool_height, pool_height, W // pool_width, pool_width + ) + out = x_reshaped.max(axis=3).max(axis=4) + + cache = (x, x_reshaped, out) + return out, cache + + +def max_pool_backward_reshape(dout, cache): + """ + A fast implementation of the backward pass for the max pooling layer that + uses some clever broadcasting and reshaping. + + This can only be used if the forward pass was computed using + max_pool_forward_reshape. + + NOTE: If there are multiple argmaxes, this method will assign gradient to + ALL argmax elements of the input rather than picking one. In this case the + gradient will actually be incorrect. However this is unlikely to occur in + practice, so it shouldn't matter much. One possible solution is to split the + upstream gradient equally among all argmax elements; this should result in a + valid subgradient. You can make this happen by uncommenting the line below; + however this results in a significant performance penalty (about 40% slower) + and is unlikely to matter in practice so we don't do it. + """ + x, x_reshaped, out = cache + + dx_reshaped = np.zeros_like(x_reshaped) + out_newaxis = out[:, :, :, np.newaxis, :, np.newaxis] + mask = x_reshaped == out_newaxis + dout_newaxis = dout[:, :, :, np.newaxis, :, np.newaxis] + dout_broadcast, _ = np.broadcast_arrays(dout_newaxis, dx_reshaped) + dx_reshaped[mask] = dout_broadcast[mask] + dx_reshaped /= np.sum(mask, axis=(3, 5), keepdims=True) + dx = dx_reshaped.reshape(x.shape) + + return dx + + +def max_pool_forward_im2col(x, pool_param): + """ + An implementation of the forward pass for max pooling based on im2col. + + This isn't much faster than the naive version, so it should be avoided if + possible. + """ + N, C, H, W = x.shape + pool_height, pool_width = pool_param["pool_height"], pool_param["pool_width"] + stride = pool_param["stride"] + + assert (H - pool_height) % stride == 0, "Invalid height" + assert (W - pool_width) % stride == 0, "Invalid width" + + out_height = (H - pool_height) // stride + 1 + out_width = (W - pool_width) // stride + 1 + + x_split = x.reshape(N * C, 1, H, W) + x_cols = im2col(x_split, pool_height, pool_width, padding=0, stride=stride) + x_cols_argmax = np.argmax(x_cols, axis=0) + x_cols_max = x_cols[x_cols_argmax, np.arange(x_cols.shape[1])] + out = x_cols_max.reshape(out_height, out_width, N, C).transpose(2, 3, 0, 1) + + cache = (x, x_cols, x_cols_argmax, pool_param) + return out, cache + + +def max_pool_backward_im2col(dout, cache): + """ + An implementation of the backward pass for max pooling based on im2col. + + This isn't much faster than the naive version, so it should be avoided if + possible. 
+ """ + x, x_cols, x_cols_argmax, pool_param = cache + N, C, H, W = x.shape + pool_height, pool_width = pool_param["pool_height"], pool_param["pool_width"] + stride = pool_param["stride"] + + dout_reshaped = dout.transpose(2, 3, 0, 1).flatten() + dx_cols = np.zeros_like(x_cols) + dx_cols[x_cols_argmax, np.arange(dx_cols.shape[1])] = dout_reshaped + dx = col2im_indices( + dx_cols, (N * C, 1, H, W), pool_height, pool_width, padding=0, stride=stride + ) + dx = dx.reshape(x.shape) + + return dx diff --git a/lab_3/scripts/gradient_check.py b/lab_3/scripts/gradient_check.py new file mode 100644 index 0000000..901c307 --- /dev/null +++ b/lab_3/scripts/gradient_check.py @@ -0,0 +1,133 @@ +from __future__ import print_function +from builtins import range +from past.builtins import xrange + +import numpy as np +from random import randrange + + +def eval_numerical_gradient(f, x, verbose=True, h=0.00001): + """ + a naive implementation of numerical gradient of f at x + - f should be a function that takes a single argument + - x is the point (numpy array) to evaluate the gradient at + """ + + fx = f(x) # evaluate function value at original point + grad = np.zeros_like(x) + # iterate over all indexes in x + it = np.nditer(x, flags=["multi_index"], op_flags=["readwrite"]) + while not it.finished: + + # evaluate function at x+h + ix = it.multi_index + oldval = x[ix] + x[ix] = oldval + h # increment by h + fxph = f(x) # evalute f(x + h) + x[ix] = oldval - h + fxmh = f(x) # evaluate f(x - h) + x[ix] = oldval # restore + + # compute the partial derivative with centered formula + grad[ix] = (fxph - fxmh) / (2 * h) # the slope + if verbose: + print(ix, grad[ix]) + it.iternext() # step to next dimension + + return grad + + +def eval_numerical_gradient_array(f, x, df, h=1e-5): + """ + Evaluate a numeric gradient for a function that accepts a numpy + array and returns a numpy array. + """ + grad = np.zeros_like(x) + it = np.nditer(x, flags=["multi_index"], op_flags=["readwrite"]) + while not it.finished: + ix = it.multi_index + + oldval = x[ix] + x[ix] = oldval + h + pos = f(x).copy() + x[ix] = oldval - h + neg = f(x).copy() + x[ix] = oldval + + grad[ix] = np.sum((pos - neg) * df) / (2 * h) + it.iternext() + return grad + + +def eval_numerical_gradient_blobs(f, inputs, output, h=1e-5): + """ + Compute numeric gradients for a function that operates on input + and output blobs. + + We assume that f accepts several input blobs as arguments, followed by a + blob where outputs will be written. For example, f might be called like: + + f(x, w, out) + + where x and w are input Blobs, and the result of f will be written to out. 
+ + Inputs: + - f: function + - inputs: tuple of input blobs + - output: output blob + - h: step size + """ + numeric_diffs = [] + for input_blob in inputs: + diff = np.zeros_like(input_blob.diffs) + it = np.nditer(input_blob.vals, flags=["multi_index"], op_flags=["readwrite"]) + while not it.finished: + idx = it.multi_index + orig = input_blob.vals[idx] + + input_blob.vals[idx] = orig + h + f(*(inputs + (output,))) + pos = np.copy(output.vals) + input_blob.vals[idx] = orig - h + f(*(inputs + (output,))) + neg = np.copy(output.vals) + input_blob.vals[idx] = orig + + diff[idx] = np.sum((pos - neg) * output.diffs) / (2.0 * h) + + it.iternext() + numeric_diffs.append(diff) + return numeric_diffs + + +def eval_numerical_gradient_net(net, inputs, output, h=1e-5): + return eval_numerical_gradient_blobs( + lambda *args: net.forward(), inputs, output, h=h + ) + + +def grad_check_sparse(f, x, analytic_grad, num_checks=10, h=1e-5): + """ + sample a few random elements and only return numerical + in this dimensions. + """ + + for i in range(num_checks): + ix = tuple([randrange(m) for m in x.shape]) + + oldval = x[ix] + x[ix] = oldval + h # increment by h + fxph = f(x) # evaluate f(x + h) + x[ix] = oldval - h # increment by h + fxmh = f(x) # evaluate f(x - h) + x[ix] = oldval # reset + + grad_numerical = (fxph - fxmh) / (2 * h) + grad_analytic = analytic_grad[ix] + rel_error = abs(grad_numerical - grad_analytic) / ( + abs(grad_numerical) + abs(grad_analytic) + ) + print( + "numerical: %f analytic: %f, relative error: %e" + % (grad_numerical, grad_analytic, rel_error) + ) diff --git a/lab_3/scripts/im2col.py b/lab_3/scripts/im2col.py new file mode 100644 index 0000000..e1fc034 --- /dev/null +++ b/lab_3/scripts/im2col.py @@ -0,0 +1,58 @@ +from builtins import range +import numpy as np + + +def get_im2col_indices(x_shape, field_height, field_width, padding=1, stride=1): + # First figure out what the size of the output should be + N, C, H, W = x_shape + assert (H + 2 * padding - field_height) % stride == 0 + assert (W + 2 * padding - field_height) % stride == 0 + out_height = (H + 2 * padding - field_height) / stride + 1 + out_width = (W + 2 * padding - field_width) / stride + 1 + + i0 = np.repeat(np.arange(field_height), field_width) + i0 = np.tile(i0, C) + i1 = stride * np.repeat(np.arange(out_height), out_width) + j0 = np.tile(np.arange(field_width), field_height * C) + j1 = stride * np.tile(np.arange(out_width), out_height) + i = i0.reshape(-1, 1) + i1.reshape(1, -1) + j = j0.reshape(-1, 1) + j1.reshape(1, -1) + + k = np.repeat(np.arange(C), field_height * field_width).reshape(-1, 1) + + return (k, i, j) + + +def im2col_indices(x, field_height, field_width, padding=1, stride=1): + """ An implementation of im2col based on some fancy indexing """ + # Zero-pad the input + p = padding + x_padded = np.pad(x, ((0, 0), (0, 0), (p, p), (p, p)), mode="constant") + + k, i, j = get_im2col_indices(x.shape, field_height, field_width, padding, stride) + + cols = x_padded[:, k, i, j] + C = x.shape[1] + cols = cols.transpose(1, 2, 0).reshape(field_height * field_width * C, -1) + return cols + + +def col2im_indices(cols, x_shape, field_height=3, field_width=3, padding=1, stride=1): + """ An implementation of col2im based on fancy indexing and np.add.at """ + N, C, H, W = x_shape + H_padded, W_padded = H + 2 * padding, W + 2 * padding + x_padded = np.zeros((N, C, H_padded, W_padded), dtype=cols.dtype) + k, i, j = get_im2col_indices(x_shape, field_height, field_width, padding, stride) + cols_reshaped = cols.reshape(C * 
field_height * field_width, -1, N) + cols_reshaped = cols_reshaped.transpose(2, 0, 1) + np.add.at(x_padded, (slice(None), k, i, j), cols_reshaped) + if padding == 0: + return x_padded + return x_padded[:, :, padding:-padding, padding:-padding] + + +# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + +pass + +# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** diff --git a/lab_3/scripts/im2col_cython.pyx b/lab_3/scripts/im2col_cython.pyx new file mode 100644 index 0000000..d6e33c6 --- /dev/null +++ b/lab_3/scripts/im2col_cython.pyx @@ -0,0 +1,121 @@ +import numpy as np +cimport numpy as np +cimport cython + +# DTYPE = np.float64 +# ctypedef np.float64_t DTYPE_t + +ctypedef fused DTYPE_t: + np.float32_t + np.float64_t + +def im2col_cython(np.ndarray[DTYPE_t, ndim=4] x, int field_height, + int field_width, int padding, int stride): + cdef int N = x.shape[0] + cdef int C = x.shape[1] + cdef int H = x.shape[2] + cdef int W = x.shape[3] + + cdef int HH = (H + 2 * padding - field_height) / stride + 1 + cdef int WW = (W + 2 * padding - field_width) / stride + 1 + + cdef int p = padding + cdef np.ndarray[DTYPE_t, ndim=4] x_padded = np.pad(x, + ((0, 0), (0, 0), (p, p), (p, p)), mode='constant') + + cdef np.ndarray[DTYPE_t, ndim=2] cols = np.zeros( + (C * field_height * field_width, N * HH * WW), + dtype=x.dtype) + + # Moving the inner loop to a C function with no bounds checking works, but does + # not seem to help performance in any measurable way. + + im2col_cython_inner(cols, x_padded, N, C, H, W, HH, WW, + field_height, field_width, padding, stride) + return cols + + +@cython.boundscheck(False) +cdef int im2col_cython_inner(np.ndarray[DTYPE_t, ndim=2] cols, + np.ndarray[DTYPE_t, ndim=4] x_padded, + int N, int C, int H, int W, int HH, int WW, + int field_height, int field_width, int padding, int stride) except? -1: + cdef int c, ii, jj, row, yy, xx, i, col + + for c in range(C): + for yy in range(HH): + for xx in range(WW): + for ii in range(field_height): + for jj in range(field_width): + row = c * field_width * field_height + ii * field_height + jj + for i in range(N): + col = yy * WW * N + xx * N + i + cols[row, col] = x_padded[i, c, stride * yy + ii, stride * xx + jj] + + + +def col2im_cython(np.ndarray[DTYPE_t, ndim=2] cols, int N, int C, int H, int W, + int field_height, int field_width, int padding, int stride): + cdef np.ndarray x = np.empty((N, C, H, W), dtype=cols.dtype) + cdef int HH = (H + 2 * padding - field_height) / stride + 1 + cdef int WW = (W + 2 * padding - field_width) / stride + 1 + cdef np.ndarray[DTYPE_t, ndim=4] x_padded = np.zeros((N, C, H + 2 * padding, W + 2 * padding), + dtype=cols.dtype) + + # Moving the inner loop to a C-function with no bounds checking improves + # performance quite a bit for col2im. + col2im_cython_inner(cols, x_padded, N, C, H, W, HH, WW, + field_height, field_width, padding, stride) + if padding > 0: + return x_padded[:, :, padding:-padding, padding:-padding] + return x_padded + + +@cython.boundscheck(False) +cdef int col2im_cython_inner(np.ndarray[DTYPE_t, ndim=2] cols, + np.ndarray[DTYPE_t, ndim=4] x_padded, + int N, int C, int H, int W, int HH, int WW, + int field_height, int field_width, int padding, int stride) except? 
-1: + cdef int c, ii, jj, row, yy, xx, i, col + + for c in range(C): + for ii in range(field_height): + for jj in range(field_width): + row = c * field_width * field_height + ii * field_height + jj + for yy in range(HH): + for xx in range(WW): + for i in range(N): + col = yy * WW * N + xx * N + i + x_padded[i, c, stride * yy + ii, stride * xx + jj] += cols[row, col] + + +@cython.boundscheck(False) +@cython.wraparound(False) +cdef col2im_6d_cython_inner(np.ndarray[DTYPE_t, ndim=6] cols, + np.ndarray[DTYPE_t, ndim=4] x_padded, + int N, int C, int H, int W, int HH, int WW, + int out_h, int out_w, int pad, int stride): + + cdef int c, hh, ww, n, h, w + for n in range(N): + for c in range(C): + for hh in range(HH): + for ww in range(WW): + for h in range(out_h): + for w in range(out_w): + x_padded[n, c, stride * h + hh, stride * w + ww] += cols[c, hh, ww, n, h, w] + + +def col2im_6d_cython(np.ndarray[DTYPE_t, ndim=6] cols, int N, int C, int H, int W, + int HH, int WW, int pad, int stride): + cdef np.ndarray x = np.empty((N, C, H, W), dtype=cols.dtype) + cdef int out_h = (H + 2 * pad - HH) / stride + 1 + cdef int out_w = (W + 2 * pad - WW) / stride + 1 + cdef np.ndarray[DTYPE_t, ndim=4] x_padded = np.zeros((N, C, H + 2 * pad, W + 2 * pad), + dtype=cols.dtype) + + col2im_6d_cython_inner(cols, x_padded, N, C, H, W, HH, WW, out_h, out_w, pad, stride) + + if pad > 0: + return x_padded[:, :, pad:-pad, pad:-pad] + return x_padded diff --git a/lab_3/scripts/layer_utils.py b/lab_3/scripts/layer_utils.py new file mode 100644 index 0000000..c055e28 --- /dev/null +++ b/lab_3/scripts/layer_utils.py @@ -0,0 +1,110 @@ +# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + +pass + +# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** +from .layers import * +from .fast_layers import * + + +def affine_relu_forward(x, w, b): + """ + Convenience layer that perorms an affine transform followed by a ReLU + + Inputs: + - x: Input to the affine layer + - w, b: Weights for the affine layer + + Returns a tuple of: + - out: Output from the ReLU + - cache: Object to give to the backward pass + """ + a, fc_cache = affine_forward(x, w, b) + out, relu_cache = relu_forward(a) + cache = (fc_cache, relu_cache) + return out, cache + + +def affine_relu_backward(dout, cache): + """ + Backward pass for the affine-relu convenience layer + """ + fc_cache, relu_cache = cache + da = relu_backward(dout, relu_cache) + dx, dw, db = affine_backward(da, fc_cache) + return dx, dw, db + + +def conv_relu_forward(x, w, b, conv_param): + """ + A convenience layer that performs a convolution followed by a ReLU. + + Inputs: + - x: Input to the convolutional layer + - w, b, conv_param: Weights and parameters for the convolutional layer + + Returns a tuple of: + - out: Output from the ReLU + - cache: Object to give to the backward pass + """ + a, conv_cache = conv_forward_fast(x, w, b, conv_param) + out, relu_cache = relu_forward(a) + cache = (conv_cache, relu_cache) + return out, cache + + +def conv_relu_backward(dout, cache): + """ + Backward pass for the conv-relu convenience layer. 
+ """ + conv_cache, relu_cache = cache + da = relu_backward(dout, relu_cache) + dx, dw, db = conv_backward_fast(da, conv_cache) + return dx, dw, db + + +def conv_bn_relu_forward(x, w, b, gamma, beta, conv_param, bn_param): + a, conv_cache = conv_forward_fast(x, w, b, conv_param) + an, bn_cache = spatial_batchnorm_forward(a, gamma, beta, bn_param) + out, relu_cache = relu_forward(an) + cache = (conv_cache, bn_cache, relu_cache) + return out, cache + + +def conv_bn_relu_backward(dout, cache): + conv_cache, bn_cache, relu_cache = cache + dan = relu_backward(dout, relu_cache) + da, dgamma, dbeta = spatial_batchnorm_backward(dan, bn_cache) + dx, dw, db = conv_backward_fast(da, conv_cache) + return dx, dw, db, dgamma, dbeta + + +def conv_relu_pool_forward(x, w, b, conv_param, pool_param): + """ + Convenience layer that performs a convolution, a ReLU, and a pool. + + Inputs: + - x: Input to the convolutional layer + - w, b, conv_param: Weights and parameters for the convolutional layer + - pool_param: Parameters for the pooling layer + + Returns a tuple of: + - out: Output from the pooling layer + - cache: Object to give to the backward pass + """ + a, conv_cache = conv_forward_fast(x, w, b, conv_param) + s, relu_cache = relu_forward(a) + out, pool_cache = max_pool_forward_fast(s, pool_param) + cache = (conv_cache, relu_cache, pool_cache) + return out, cache + + +def conv_relu_pool_backward(dout, cache): + """ + Backward pass for the conv-relu-pool convenience layer + """ + conv_cache, relu_cache, pool_cache = cache + ds = max_pool_backward_fast(dout, pool_cache) + da = relu_backward(ds, relu_cache) + dx, dw, db = conv_backward_fast(da, conv_cache) + return dx, dw, db diff --git a/lab_3/scripts/layers.py b/lab_3/scripts/layers.py new file mode 100644 index 0000000..678931d --- /dev/null +++ b/lab_3/scripts/layers.py @@ -0,0 +1,696 @@ +from builtins import range +import numpy as np + + + +def affine_forward(x, w, b): + """ + Computes the forward pass for an affine (fully-connected) layer. + + The input x has shape (N, d_1, ..., d_k) and contains a minibatch of N + examples, where each example x[i] has shape (d_1, ..., d_k). We will + reshape each input into a vector of dimension D = d_1 * ... * d_k, and + then transform it to an output vector of dimension M. + + Inputs: + - x: A numpy array containing input data, of shape (N, d_1, ..., d_k) + - w: A numpy array of weights, of shape (D, M) + - b: A numpy array of biases, of shape (M,) + + Returns a tuple of: + - out: output, of shape (N, M) + - cache: (x, w, b) + """ + out = None + ########################################################################### + # TODO: Implement the affine forward pass. Store the result in out. You # + # will need to reshape the input into rows. # + ########################################################################### + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ########################################################################### + # END OF YOUR CODE # + ########################################################################### + cache = (x, w, b) + return out, cache + + +def affine_backward(dout, cache): + """ + Computes the backward pass for an affine layer. + + Inputs: + - dout: Upstream derivative, of shape (N, M) + - cache: Tuple of: + - x: Input data, of shape (N, d_1, ... 
d_k) + - w: Weights, of shape (D, M) + - b: Biases, of shape (M,) + + Returns a tuple of: + - dx: Gradient with respect to x, of shape (N, d1, ..., d_k) + - dw: Gradient with respect to w, of shape (D, M) + - db: Gradient with respect to b, of shape (M,) + """ + x, w, b = cache + dx, dw, db = None, None, None + ########################################################################### + # TODO: Implement the affine backward pass. # + ########################################################################### + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ########################################################################### + # END OF YOUR CODE # + ########################################################################### + return dx, dw, db + + +def relu_forward(x): + """ + Computes the forward pass for a layer of rectified linear units (ReLUs). + + Input: + - x: Inputs, of any shape + + Returns a tuple of: + - out: Output, of the same shape as x + - cache: x + """ + out = None + ########################################################################### + # TODO: Implement the ReLU forward pass. # + ########################################################################### + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ########################################################################### + # END OF YOUR CODE # + ########################################################################### + cache = x + return out, cache + + +def relu_backward(dout, cache): + """ + Computes the backward pass for a layer of rectified linear units (ReLUs). + + Input: + - dout: Upstream derivatives, of any shape + - cache: Input x, of same shape as dout + + Returns: + - dx: Gradient with respect to x + """ + dx, x = None, cache + ########################################################################### + # TODO: Implement the ReLU backward pass. # + ########################################################################### + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ########################################################################### + # END OF YOUR CODE # + ########################################################################### + return dx + + +def batchnorm_forward(x, gamma, beta, bn_param): + """ + Forward pass for batch normalization. + + During training the sample mean and (uncorrected) sample variance are + computed from minibatch statistics and used to normalize the incoming data. + During training we also keep an exponentially decaying running mean of the + mean and variance of each feature, and these averages are used to normalize + data at test-time. + + At each timestep we update the running averages for mean and variance using + an exponential decay based on the momentum parameter: + + running_mean = momentum * running_mean + (1 - momentum) * sample_mean + running_var = momentum * running_var + (1 - momentum) * sample_var + + Note that the batch normalization paper suggests a different test-time + behavior: they compute sample mean and variance for each feature using a + large number of training images rather than using a running average. 
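For reference, hedged sketches of the affine and ReLU TODO bodies earlier in `layers.py`; the affine layer flattens each example into a row, as its docstring describes:

```python
# affine_forward
out = x.reshape(x.shape[0], -1).dot(w) + b

# affine_backward
dx = dout.dot(w.T).reshape(x.shape)
dw = x.reshape(x.shape[0], -1).T.dot(dout)
db = dout.sum(axis=0)

# relu_forward
out = np.maximum(0, x)

# relu_backward
dx = dout * (x > 0)
```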
For + this implementation we have chosen to use running averages instead since + they do not require an additional estimation step; the torch7 + implementation of batch normalization also uses running averages. + + Input: + - x: Data of shape (N, D) + - gamma: Scale parameter of shape (D,) + - beta: Shift paremeter of shape (D,) + - bn_param: Dictionary with the following keys: + - mode: 'train' or 'test'; required + - eps: Constant for numeric stability + - momentum: Constant for running mean / variance. + - running_mean: Array of shape (D,) giving running mean of features + - running_var Array of shape (D,) giving running variance of features + + Returns a tuple of: + - out: of shape (N, D) + - cache: A tuple of values needed in the backward pass + """ + mode = bn_param["mode"] + eps = bn_param.get("eps", 1e-5) + momentum = bn_param.get("momentum", 0.9) + + N, D = x.shape + running_mean = bn_param.get("running_mean", np.zeros(D, dtype=x.dtype)) + running_var = bn_param.get("running_var", np.zeros(D, dtype=x.dtype)) + + out, cache = None, None + if mode == "train": + ####################################################################### + # TODO: Implement the training-time forward pass for batch norm. # + # Use minibatch statistics to compute the mean and variance, use # + # these statistics to normalize the incoming data, and scale and # + # shift the normalized data using gamma and beta. # + # # + # You should store the output in the variable out. Any intermediates # + # that you need for the backward pass should be stored in the cache # + # variable. # + # # + # You should also use your computed sample mean and variance together # + # with the momentum variable to update the running mean and running # + # variance, storing your result in the running_mean and running_var # + # variables. # + # # + # Note that though you should be keeping track of the running # + # variance, you should normalize the data based on the standard # + # deviation (square root of variance) instead! # + # Referencing the original paper (https://arxiv.org/abs/1502.03167) # + # might prove to be helpful. # + ####################################################################### + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ####################################################################### + # END OF YOUR CODE # + ####################################################################### + elif mode == "test": + ####################################################################### + # TODO: Implement the test-time forward pass for batch normalization. # + # Use the running mean and variance to normalize the incoming data, # + # then scale and shift the normalized data using gamma and beta. # + # Store the result in the out variable. 
# + ####################################################################### + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ####################################################################### + # END OF YOUR CODE # + ####################################################################### + else: + raise ValueError('Invalid forward batchnorm mode "%s"' % mode) + + # Store the updated running means back into bn_param + bn_param["running_mean"] = running_mean + bn_param["running_var"] = running_var + + return out, cache + + +def batchnorm_backward(dout, cache): + """ + Backward pass for batch normalization. + + For this implementation, you should write out a computation graph for + batch normalization on paper and propagate gradients backward through + intermediate nodes. + + Inputs: + - dout: Upstream derivatives, of shape (N, D) + - cache: Variable of intermediates from batchnorm_forward. + + Returns a tuple of: + - dx: Gradient with respect to inputs x, of shape (N, D) + - dgamma: Gradient with respect to scale parameter gamma, of shape (D,) + - dbeta: Gradient with respect to shift parameter beta, of shape (D,) + """ + dx, dgamma, dbeta = None, None, None + ########################################################################### + # TODO: Implement the backward pass for batch normalization. Store the # + # results in the dx, dgamma, and dbeta variables. # + # Referencing the original paper (https://arxiv.org/abs/1502.03167) # + # might prove to be helpful. # + ########################################################################### + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ########################################################################### + # END OF YOUR CODE # + ########################################################################### + + return dx, dgamma, dbeta + + + + + + + + + + +def dropout_forward(x, dropout_param): + """ + Performs the forward pass for (inverted) dropout. + + Inputs: + - x: Input data, of any shape + - dropout_param: A dictionary with the following keys: + - p: Dropout parameter. We keep each neuron output with probability p. + - mode: 'test' or 'train'. If the mode is train, then perform dropout; + if the mode is test, then just return the input. + - seed: Seed for the random number generator. Passing seed makes this + function deterministic, which is needed for gradient checking but not + in real networks. + + Outputs: + - out: Array of the same shape as x. + - cache: tuple (dropout_param, mask). In training mode, mask is the dropout + mask that was used to multiply the input; in test mode, mask is None. + + NOTE: Please implement **inverted** dropout, not the vanilla version of dropout. + See http://cs231n.github.io/neural-networks-2/#reg for more details. + + NOTE 2: Keep in mind that p is the probability of **keep** a neuron + output; this might be contrary to some sources, where it is referred to + as the probability of dropping a neuron output. + """ + p, mode = dropout_param["p"], dropout_param["mode"] + if "seed" in dropout_param: + np.random.seed(dropout_param["seed"]) + + mask = None + out = None + + if mode == "train": + ####################################################################### + # TODO: Implement training phase forward pass for inverted dropout. # + # Store the dropout mask in the mask variable. 
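A compact sketch of the batch-normalization passes described above (illustrative only: the cache layout is an assumption of this sketch, and the running averages would be updated with exactly the momentum formula from the docstring, as shown in the comments):

```python
import numpy as np

def batchnorm_forward_train_sketch(x, gamma, beta, eps=1e-5):
    # Normalize with minibatch statistics, then scale and shift.
    mu = x.mean(axis=0)
    var = x.var(axis=0)
    x_hat = (x - mu) / np.sqrt(var + eps)
    out = gamma * x_hat + beta
    # running_mean = momentum * running_mean + (1 - momentum) * mu
    # running_var  = momentum * running_var  + (1 - momentum) * var
    return out, (x_hat, gamma, var, eps)

def batchnorm_forward_test_sketch(x, gamma, beta, running_mean, running_var, eps=1e-5):
    # At test time only the running statistics are used.
    return gamma * (x - running_mean) / np.sqrt(running_var + eps) + beta

def batchnorm_backward_sketch(dout, cache):
    x_hat, gamma, var, eps = cache
    N = dout.shape[0]
    dbeta = dout.sum(axis=0)
    dgamma = (dout * x_hat).sum(axis=0)
    dx_hat = dout * gamma
    inv_std = 1.0 / np.sqrt(var + eps)
    # Closed form obtained by backpropagating through the normalization graph.
    dx = inv_std / N * (N * dx_hat - dx_hat.sum(axis=0)
                        - x_hat * (dx_hat * x_hat).sum(axis=0))
    return dx, dgamma, dbeta
```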
# + ####################################################################### + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ####################################################################### + # END OF YOUR CODE # + ####################################################################### + elif mode == "test": + ####################################################################### + # TODO: Implement the test phase forward pass for inverted dropout. # + ####################################################################### + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ####################################################################### + # END OF YOUR CODE # + ####################################################################### + + cache = (dropout_param, mask) + out = out.astype(x.dtype, copy=False) + + return out, cache + + +def dropout_backward(dout, cache): + """ + Perform the backward pass for (inverted) dropout. + + Inputs: + - dout: Upstream derivatives, of any shape + - cache: (dropout_param, mask) from dropout_forward. + """ + dropout_param, mask = cache + mode = dropout_param["mode"] + + dx = None + if mode == "train": + ####################################################################### + # TODO: Implement training phase backward pass for inverted dropout # + ####################################################################### + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ####################################################################### + # END OF YOUR CODE # + ####################################################################### + elif mode == "test": + dx = dout + return dx + + +def conv_forward_naive(x, w, b, conv_param): + """ + A naive implementation of the forward pass for a convolutional layer. + + The input consists of N data points, each with C channels, height H and + width W. We convolve each input with F different filters, where each filter + spans all C channels and has height HH and width WW. + + Input: + - x: Input data of shape (N, C, H, W) + - w: Filter weights of shape (F, C, HH, WW) + - b: Biases, of shape (F,) + - conv_param: A dictionary with the following keys: + - 'stride': The number of pixels between adjacent receptive fields in the + horizontal and vertical directions. + - 'pad': The number of pixels that will be used to zero-pad the input. + + + During padding, 'pad' zeros should be placed symmetrically (i.e equally on both sides) + along the height and width axes of the input. Be careful not to modfiy the original + input x directly. + + Returns a tuple of: + - out: Output data, of shape (N, F, H', W') where H' and W' are given by + H' = 1 + (H + 2 * pad - HH) / stride + W' = 1 + (W + 2 * pad - WW) / stride + - cache: (x, w, b, conv_param) + """ + out = None + ########################################################################### + # TODO: Implement the convolutional forward pass. # + # Hint: you can use the function np.pad for padding. 
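The inverted-dropout behaviour described above fits in a few lines; a sketch under the same keep-probability convention (not the reference solution, and the return signature is simplified here):

```python
import numpy as np

def dropout_forward_sketch(x, p, mode, seed=None):
    # Inverted dropout: scale the mask by 1/p at train time so the test-time
    # pass is just the identity.
    if seed is not None:
        np.random.seed(seed)
    if mode == "train":
        mask = (np.random.rand(*x.shape) < p) / p
        out = x * mask
    else:  # "test"
        mask = None
        out = x
    return out, mask

def dropout_backward_sketch(dout, mask, mode):
    # The gradient is masked (and rescaled) exactly like the forward activations.
    return dout * mask if mode == "train" else dout
```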
# + ########################################################################### + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ########################################################################### + # END OF YOUR CODE # + ########################################################################### + cache = (x, w, b, conv_param) + return out, cache + + +def conv_backward_naive(dout, cache): + """ + A naive implementation of the backward pass for a convolutional layer. + + Inputs: + - dout: Upstream derivatives. + - cache: A tuple of (x, w, b, conv_param) as in conv_forward_naive + + Returns a tuple of: + - dx: Gradient with respect to x + - dw: Gradient with respect to w + - db: Gradient with respect to b + """ + dx, dw, db = None, None, None + ########################################################################### + # TODO: Implement the convolutional backward pass. # + ########################################################################### + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ########################################################################### + # END OF YOUR CODE # + ########################################################################### + return dx, dw, db + + +def max_pool_forward_naive(x, pool_param): + """ + A naive implementation of the forward pass for a max-pooling layer. + + Inputs: + - x: Input data, of shape (N, C, H, W) + - pool_param: dictionary with the following keys: + - 'pool_height': The height of each pooling region + - 'pool_width': The width of each pooling region + - 'stride': The distance between adjacent pooling regions + + No padding is necessary here. Output size is given by + + Returns a tuple of: + - out: Output data, of shape (N, C, H', W') where H' and W' are given by + H' = 1 + (H - pool_height) / stride + W' = 1 + (W - pool_width) / stride + - cache: (x, pool_param) + """ + out = None + ########################################################################### + # TODO: Implement the max-pooling forward pass # + ########################################################################### + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ########################################################################### + # END OF YOUR CODE # + ########################################################################### + cache = (x, pool_param) + return out, cache + + +def max_pool_backward_naive(dout, cache): + """ + A naive implementation of the backward pass for a max-pooling layer. + + Inputs: + - dout: Upstream derivatives + - cache: A tuple of (x, pool_param) as in the forward pass. 
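As a reference point for the output-size formulas above, a direct nested-loop forward pass might look like the sketch below (deliberately slow and illustrative; the assignment's fast Cython path is what is used in practice):

```python
import numpy as np

def conv_forward_naive_sketch(x, w, b, conv_param):
    # H' = 1 + (H + 2*pad - HH) / stride, W' = 1 + (W + 2*pad - WW) / stride
    stride, pad = conv_param["stride"], conv_param["pad"]
    N, C, H, W = x.shape
    F, _, HH, WW = w.shape
    H_out = 1 + (H + 2 * pad - HH) // stride
    W_out = 1 + (W + 2 * pad - WW) // stride
    x_pad = np.pad(x, ((0, 0), (0, 0), (pad, pad), (pad, pad)), mode="constant")
    out = np.zeros((N, F, H_out, W_out))
    for n in range(N):                      # each image
        for f in range(F):                  # each filter
            for i in range(H_out):          # each output row
                for j in range(W_out):      # each output column
                    hs, ws = i * stride, j * stride
                    window = x_pad[n, :, hs:hs + HH, ws:ws + WW]
                    out[n, f, i, j] = np.sum(window * w[f]) + b[f]
    return out, (x, w, b, conv_param)
```

The backward pass walks the same windows, accumulating `dw[f] += window * dout[n, f, i, j]`, `dx_pad[n, :, hs:hs+HH, ws:ws+WW] += w[f] * dout[n, f, i, j]`, and `db[f] += dout[n, f, i, j]`, then crops the padding off `dx_pad`.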
+ + Returns: + - dx: Gradient with respect to x + """ + dx = None + ########################################################################### + # TODO: Implement the max-pooling backward pass # + ########################################################################### + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ########################################################################### + # END OF YOUR CODE # + ########################################################################### + return dx + + +def spatial_batchnorm_forward(x, gamma, beta, bn_param): + """ + Computes the forward pass for spatial batch normalization. + + Inputs: + - x: Input data of shape (N, C, H, W) + - gamma: Scale parameter, of shape (C,) + - beta: Shift parameter, of shape (C,) + - bn_param: Dictionary with the following keys: + - mode: 'train' or 'test'; required + - eps: Constant for numeric stability + - momentum: Constant for running mean / variance. momentum=0 means that + old information is discarded completely at every time step, while + momentum=1 means that new information is never incorporated. The + default of momentum=0.9 should work well in most situations. + - running_mean: Array of shape (D,) giving running mean of features + - running_var Array of shape (D,) giving running variance of features + + Returns a tuple of: + - out: Output data, of shape (N, C, H, W) + - cache: Values needed for the backward pass + """ + out, cache = None, None + + ########################################################################### + # TODO: Implement the forward pass for spatial batch normalization. # + # # + # HINT: You can implement spatial batch normalization by calling the # + # vanilla version of batch normalization you implemented above. # + # Your implementation should be very short; ours is less than five lines. # + ########################################################################### + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ########################################################################### + # END OF YOUR CODE # + ########################################################################### + + return out, cache + + +def spatial_batchnorm_backward(dout, cache): + """ + Computes the backward pass for spatial batch normalization. + + Inputs: + - dout: Upstream derivatives, of shape (N, C, H, W) + - cache: Values from the forward pass + + Returns a tuple of: + - dx: Gradient with respect to inputs, of shape (N, C, H, W) + - dgamma: Gradient with respect to scale parameter, of shape (C,) + - dbeta: Gradient with respect to shift parameter, of shape (C,) + """ + dx, dgamma, dbeta = None, None, None + + ########################################################################### + # TODO: Implement the backward pass for spatial batch normalization. # + # # + # HINT: You can implement spatial batch normalization by calling the # + # vanilla version of batch normalization you implemented above. # + # Your implementation should be very short; ours is less than five lines. 
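A sketch of the naive max-pooling forward pass, plus the reshape trick hinted at for spatial batch normalization (names are illustrative, not the reference code):

```python
import numpy as np

def max_pool_forward_naive_sketch(x, pool_param):
    ph, pw = pool_param["pool_height"], pool_param["pool_width"]
    stride = pool_param["stride"]
    N, C, H, W = x.shape
    H_out = 1 + (H - ph) // stride
    W_out = 1 + (W - pw) // stride
    out = np.zeros((N, C, H_out, W_out))
    for i in range(H_out):
        for j in range(W_out):
            window = x[:, :, i * stride:i * stride + ph, j * stride:j * stride + pw]
            out[:, :, i, j] = window.max(axis=(2, 3))
    return out, (x, pool_param)

# The pooling backward pass routes each upstream gradient to the arg-max
# position inside its window.
#
# Spatial batchnorm can delegate to the vanilla version by folding N, H, W
# into a single "batch" axis so statistics are computed per channel:
#   N, C, H, W = x.shape
#   x_flat = x.transpose(0, 2, 3, 1).reshape(-1, C)
#   out_flat, cache = batchnorm_forward(x_flat, gamma, beta, bn_param)
#   out = out_flat.reshape(N, H, W, C).transpose(0, 3, 1, 2)
```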
# + ########################################################################### + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ########################################################################### + # END OF YOUR CODE # + ########################################################################### + + return dx, dgamma, dbeta + + +def spatial_groupnorm_forward(x, gamma, beta, G, gn_param): + """ + Computes the forward pass for spatial group normalization. + In contrast to layer normalization, group normalization splits each entry + in the data into G contiguous pieces, which it then normalizes independently. + Per feature shifting and scaling are then applied to the data, in a manner identical to that of batch normalization and layer normalization. + + Inputs: + - x: Input data of shape (N, C, H, W) + - gamma: Scale parameter, of shape (C,) + - beta: Shift parameter, of shape (C,) + - G: Integer mumber of groups to split into, should be a divisor of C + - gn_param: Dictionary with the following keys: + - eps: Constant for numeric stability + + Returns a tuple of: + - out: Output data, of shape (N, C, H, W) + - cache: Values needed for the backward pass + """ + out, cache = None, None + eps = gn_param.get("eps", 1e-5) + ########################################################################### + # TODO: Implement the forward pass for spatial group normalization. # + # This will be extremely similar to the layer norm implementation. # + # In particular, think about how you could transform the matrix so that # + # the bulk of the code is similar to both train-time batch normalization # + # and layer normalization! # + ########################################################################### + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ########################################################################### + # END OF YOUR CODE # + ########################################################################### + return out, cache + + +def spatial_groupnorm_backward(dout, cache): + """ + Computes the backward pass for spatial group normalization. + + Inputs: + - dout: Upstream derivatives, of shape (N, C, H, W) + - cache: Values from the forward pass + + Returns a tuple of: + - dx: Gradient with respect to inputs, of shape (N, C, H, W) + - dgamma: Gradient with respect to scale parameter, of shape (C,) + - dbeta: Gradient with respect to shift parameter, of shape (C,) + """ + dx, dgamma, dbeta = None, None, None + + ########################################################################### + # TODO: Implement the backward pass for spatial group normalization. # + # This will be extremely similar to the layer norm implementation. # + ########################################################################### + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ########################################################################### + # END OF YOUR CODE # + ########################################################################### + return dx, dgamma, dbeta + + +def svm_loss(x, y): + """ + Computes the loss and gradient using for multiclass SVM classification. + + Inputs: + - x: Input data, of shape (N, C) where x[i, j] is the score for the jth + class for the ith input. 
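For spatial group normalization, one way to realize the "split the channels into G groups" idea from the docstring is sketched below (it assumes gamma and beta broadcast per channel, as stated above; not the reference implementation):

```python
import numpy as np

def spatial_groupnorm_forward_sketch(x, gamma, beta, G, eps=1e-5):
    # Normalize each of the G channel groups of every sample independently,
    # then apply the per-channel scale and shift.
    N, C, H, W = x.shape
    xg = x.reshape(N, G, C // G, H, W)
    mu = xg.mean(axis=(2, 3, 4), keepdims=True)
    var = xg.var(axis=(2, 3, 4), keepdims=True)
    x_hat = ((xg - mu) / np.sqrt(var + eps)).reshape(N, C, H, W)
    out = gamma.reshape(1, C, 1, 1) * x_hat + beta.reshape(1, C, 1, 1)
    return out, (x_hat, gamma, var, eps, G)
```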
+ - y: Vector of labels, of shape (N,) where y[i] is the label for x[i] and + 0 <= y[i] < C + + Returns a tuple of: + - loss: Scalar giving the loss + - dx: Gradient of the loss with respect to x + """ + N = x.shape[0] + correct_class_scores = x[np.arange(N), y] + margins = np.maximum(0, x - correct_class_scores[:, np.newaxis] + 1.0) + margins[np.arange(N), y] = 0 + loss = np.sum(margins) / N + num_pos = np.sum(margins > 0, axis=1) + dx = np.zeros_like(x) + dx[margins > 0] = 1 + dx[np.arange(N), y] -= num_pos + dx /= N + return loss, dx + + +def softmax_loss(x, y): + """ + Computes the loss and gradient for softmax classification. + + Inputs: + - x: Input data, of shape (N, C) where x[i, j] is the score for the jth + class for the ith input. + - y: Vector of labels, of shape (N,) where y[i] is the label for x[i] and + 0 <= y[i] < C + + Returns a tuple of: + - loss: Scalar giving the loss + - dx: Gradient of the loss with respect to x + """ + shifted_logits = x - np.max(x, axis=1, keepdims=True) + Z = np.sum(np.exp(shifted_logits), axis=1, keepdims=True) + log_probs = shifted_logits - np.log(Z) + probs = np.exp(log_probs) + N = x.shape[0] + loss = -np.sum(log_probs[np.arange(N), y]) / N + dx = probs.copy() + dx[np.arange(N), y] -= 1 + dx /= N + return loss, dx diff --git a/lab_3/scripts/optim.py b/lab_3/scripts/optim.py new file mode 100644 index 0000000..2194b35 --- /dev/null +++ b/lab_3/scripts/optim.py @@ -0,0 +1,162 @@ +import numpy as np + +""" +This file implements various first-order update rules that are commonly used +for training neural networks. Each update rule accepts current weights and the +gradient of the loss with respect to those weights and produces the next set of +weights. Each update rule has the same interface: + +def update(w, dw, config=None): + +Inputs: + - w: A numpy array giving the current weights. + - dw: A numpy array of the same shape as w giving the gradient of the + loss with respect to w. + - config: A dictionary containing hyperparameter values such as learning + rate, momentum, etc. If the update rule requires caching values over many + iterations, then config will also hold these cached values. + +Returns: + - next_w: The next point after the update. + - config: The config dictionary to be passed to the next iteration of the + update rule. + +NOTE: For most update rules, the default learning rate will probably not +perform well; however the default values of the other hyperparameters should +work well for a variety of different problems. + +For efficiency, update rules may perform in-place updates, mutating w and +setting next_w equal to w. +""" + + +def sgd(w, dw, config=None): + """ + Performs vanilla stochastic gradient descent. + + config format: + - learning_rate: Scalar learning rate. + """ + if config is None: + config = {} + config.setdefault("learning_rate", 1e-2) + + w -= config["learning_rate"] * dw + return w, config + + +def sgd_momentum(w, dw, config=None): + """ + Performs stochastic gradient descent with momentum. + + config format: + - learning_rate: Scalar learning rate. + - momentum: Scalar between 0 and 1 giving the momentum value. + Setting momentum = 0 reduces to sgd. + - velocity: A numpy array of the same shape as w and dw used to store a + moving average of the gradients. 
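Since svm_loss and softmax_loss above are already implemented, they make a handy target for a quick finite-difference sanity check; a minimal, self-contained sketch (the printed tolerance is indicative, not a graded threshold):

```python
import numpy as np
from scripts.layers import softmax_loss

np.random.seed(0)
x = np.random.randn(5, 4)
y = np.random.randint(4, size=5)
loss, dx = softmax_loss(x, y)

# Central finite differences on every entry of x.
h = 1e-6
dx_num = np.zeros_like(x)
for idx in np.ndindex(*x.shape):
    old = x[idx]
    x[idx] = old + h
    loss_plus, _ = softmax_loss(x, y)
    x[idx] = old - h
    loss_minus, _ = softmax_loss(x, y)
    x[idx] = old
    dx_num[idx] = (loss_plus - loss_minus) / (2 * h)

print("max abs diff:", np.max(np.abs(dx - dx_num)))  # expected to be tiny (~1e-9)
```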
+ """ + if config is None: + config = {} + config.setdefault("learning_rate", 1e-2) + config.setdefault("momentum", 0.9) + v = config.get("velocity", np.zeros_like(w)) + + next_w = None + ########################################################################### + # TODO: Implement the momentum update formula. Store the updated value in # + # the next_w variable. You should also use and update the velocity v. # + ########################################################################### + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ########################################################################### + # END OF YOUR CODE # + ########################################################################### + config["velocity"] = v + + return next_w, config + + +def rmsprop(w, dw, config=None): + """ + Uses the RMSProp update rule, which uses a moving average of squared + gradient values to set adaptive per-parameter learning rates. + + config format: + - learning_rate: Scalar learning rate. + - decay_rate: Scalar between 0 and 1 giving the decay rate for the squared + gradient cache. + - epsilon: Small scalar used for smoothing to avoid dividing by zero. + - cache: Moving average of second moments of gradients. + """ + if config is None: + config = {} + config.setdefault("learning_rate", 1e-2) + config.setdefault("decay_rate", 0.99) + config.setdefault("epsilon", 1e-8) + config.setdefault("cache", np.zeros_like(w)) + + next_w = None + ########################################################################### + # TODO: Implement the RMSprop update formula, storing the next value of w # + # in the next_w variable. Don't forget to update cache value stored in # + # config['cache']. # + ########################################################################### + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ########################################################################### + # END OF YOUR CODE # + ########################################################################### + + return next_w, config + + +def adam(w, dw, config=None): + """ + Uses the Adam update rule, which incorporates moving averages of both the + gradient and its square and a bias correction term. + + config format: + - learning_rate: Scalar learning rate. + - beta1: Decay rate for moving average of first moment of gradient. + - beta2: Decay rate for moving average of second moment of gradient. + - epsilon: Small scalar used for smoothing to avoid dividing by zero. + - m: Moving average of gradient. + - v: Moving average of squared gradient. + - t: Iteration number. + """ + if config is None: + config = {} + config.setdefault("learning_rate", 1e-3) + config.setdefault("beta1", 0.9) + config.setdefault("beta2", 0.999) + config.setdefault("epsilon", 1e-8) + config.setdefault("m", np.zeros_like(w)) + config.setdefault("v", np.zeros_like(w)) + config.setdefault("t", 0) + + next_w = None + ########################################################################### + # TODO: Implement the Adam update formula, storing the next value of w in # + # the next_w variable. Don't forget to update the m, v, and t variables # + # stored in config. # + # # + # NOTE: In order to match the reference output, please modify t _before_ # + # using it in any calculations. 
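Sketches of the momentum and RMSProp updates whose TODOs appear above (config keys follow the docstrings; these are illustrative, not the reference implementations):

```python
import numpy as np

def sgd_momentum_sketch(w, dw, config):
    # v accumulates an exponentially weighted history of past gradients.
    v = config.get("velocity", np.zeros_like(w))
    v = config["momentum"] * v - config["learning_rate"] * dw
    next_w = w + v
    config["velocity"] = v
    return next_w, config

def rmsprop_sketch(w, dw, config):
    # Per-parameter step sizes from a moving average of squared gradients.
    config["cache"] = (config["decay_rate"] * config["cache"]
                       + (1 - config["decay_rate"]) * dw ** 2)
    next_w = w - (config["learning_rate"] * dw
                  / (np.sqrt(config["cache"]) + config["epsilon"]))
    return next_w, config
```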
# + ########################################################################### + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ########################################################################### + # END OF YOUR CODE # + ########################################################################### + + return next_w, config diff --git a/lab_3/scripts/setup.py b/lab_3/scripts/setup.py new file mode 100644 index 0000000..569bf77 --- /dev/null +++ b/lab_3/scripts/setup.py @@ -0,0 +1,12 @@ +from distutils.core import setup +from distutils.extension import Extension +from Cython.Build import cythonize +import numpy + +extensions = [ + Extension( + "im2col_cython", ["im2col_cython.pyx"], include_dirs=[numpy.get_include()] + ), +] + +setup(ext_modules=cythonize(extensions),) diff --git a/lab_3/scripts/solver.py b/lab_3/scripts/solver.py new file mode 100644 index 0000000..f797e21 --- /dev/null +++ b/lab_3/scripts/solver.py @@ -0,0 +1,309 @@ +from __future__ import print_function, division +from future import standard_library + +standard_library.install_aliases() +from builtins import range +from builtins import object +import os +import pickle as pickle + +import numpy as np + +from scripts import optim + + +class Solver(object): + """ + A Solver encapsulates all the logic necessary for training classification + models. The Solver performs stochastic gradient descent using different + update rules defined in optim.py. + + The solver accepts both training and validataion data and labels so it can + periodically check classification accuracy on both training and validation + data to watch out for overfitting. + + To train a model, you will first construct a Solver instance, passing the + model, dataset, and various options (learning rate, batch size, etc) to the + constructor. You will then call the train() method to run the optimization + procedure and train the model. + + After the train() method returns, model.params will contain the parameters + that performed best on the validation set over the course of training. + In addition, the instance variable solver.loss_history will contain a list + of all losses encountered during training and the instance variables + solver.train_acc_history and solver.val_acc_history will be lists of the + accuracies of the model on the training and validation set at each epoch. + + Example usage might look something like this: + + data = { + 'X_train': # training data + 'y_train': # training labels + 'X_val': # validation data + 'y_val': # validation labels + } + model = MyAwesomeModel(hidden_size=100, reg=10) + solver = Solver(model, data, + update_rule='sgd', + optim_config={ + 'learning_rate': 1e-3, + }, + lr_decay=0.95, + num_epochs=10, batch_size=100, + print_every=100) + solver.train() + + + A Solver works on a model object that must conform to the following API: + + - model.params must be a dictionary mapping string parameter names to numpy + arrays containing parameter values. + + - model.loss(X, y) must be a function that computes training-time loss and + gradients, and test-time classification scores, with the following inputs + and outputs: + + Inputs: + - X: Array giving a minibatch of input data of shape (N, d_1, ..., d_k) + - y: Array of labels, of shape (N,) giving labels for X where y[i] is the + label for X[i]. 
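The Adam rule completes the same pattern; a sketch with t incremented before use, as the note above asks (again illustrative, not the reference solution):

```python
import numpy as np

def adam_sketch(w, dw, config):
    config["t"] += 1  # increment the step counter before it is used
    lr, b1, b2, eps = (config["learning_rate"], config["beta1"],
                       config["beta2"], config["epsilon"])
    config["m"] = b1 * config["m"] + (1 - b1) * dw
    config["v"] = b2 * config["v"] + (1 - b2) * dw ** 2
    m_hat = config["m"] / (1 - b1 ** config["t"])   # bias-corrected first moment
    v_hat = config["v"] / (1 - b2 ** config["t"])   # bias-corrected second moment
    next_w = w - lr * m_hat / (np.sqrt(v_hat) + eps)
    return next_w, config
```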
+ + Returns: + If y is None, run a test-time forward pass and return: + - scores: Array of shape (N, C) giving classification scores for X where + scores[i, c] gives the score of class c for X[i]. + + If y is not None, run a training time forward and backward pass and + return a tuple of: + - loss: Scalar giving the loss + - grads: Dictionary with the same keys as self.params mapping parameter + names to gradients of the loss with respect to those parameters. + """ + + def __init__(self, model, data, **kwargs): + """ + Construct a new Solver instance. + + Required arguments: + - model: A model object conforming to the API described above + - data: A dictionary of training and validation data containing: + 'X_train': Array, shape (N_train, d_1, ..., d_k) of training images + 'X_val': Array, shape (N_val, d_1, ..., d_k) of validation images + 'y_train': Array, shape (N_train,) of labels for training images + 'y_val': Array, shape (N_val,) of labels for validation images + + Optional arguments: + - update_rule: A string giving the name of an update rule in optim.py. + Default is 'sgd'. + - optim_config: A dictionary containing hyperparameters that will be + passed to the chosen update rule. Each update rule requires different + hyperparameters (see optim.py) but all update rules require a + 'learning_rate' parameter so that should always be present. + - lr_decay: A scalar for learning rate decay; after each epoch the + learning rate is multiplied by this value. + - batch_size: Size of minibatches used to compute loss and gradient + during training. + - num_epochs: The number of epochs to run for during training. + - print_every: Integer; training losses will be printed every + print_every iterations. + - verbose: Boolean; if set to false then no output will be printed + during training. + - num_train_samples: Number of training samples used to check training + accuracy; default is 1000; set to None to use entire training set. + - num_val_samples: Number of validation samples to use to check val + accuracy; default is None, which uses the entire validation set. + - checkpoint_name: If not None, then save model checkpoints here every + epoch. + """ + self.model = model + self.X_train = data["X_train"] + self.y_train = data["y_train"] + self.X_val = data["X_val"] + self.y_val = data["y_val"] + + # Unpack keyword arguments + self.update_rule = kwargs.pop("update_rule", "sgd") + self.optim_config = kwargs.pop("optim_config", {}) + self.lr_decay = kwargs.pop("lr_decay", 1.0) + self.batch_size = kwargs.pop("batch_size", 100) + self.num_epochs = kwargs.pop("num_epochs", 10) + self.num_train_samples = kwargs.pop("num_train_samples", 1000) + self.num_val_samples = kwargs.pop("num_val_samples", None) + + self.checkpoint_name = kwargs.pop("checkpoint_name", None) + self.print_every = kwargs.pop("print_every", 10) + self.verbose = kwargs.pop("verbose", True) + + # Throw an error if there are extra keyword arguments + if len(kwargs) > 0: + extra = ", ".join('"%s"' % k for k in list(kwargs.keys())) + raise ValueError("Unrecognized arguments %s" % extra) + + # Make sure the update rule exists, then replace the string + # name with the actual function + if not hasattr(optim, self.update_rule): + raise ValueError('Invalid update_rule "%s"' % self.update_rule) + self.update_rule = getattr(optim, self.update_rule) + + self._reset() + + def _reset(self): + """ + Set up some book-keeping variables for optimization. Don't call this + manually. 
+ """ + # Set up some variables for book-keeping + self.epoch = 0 + self.best_val_acc = 0 + self.best_params = {} + self.loss_history = [] + self.train_acc_history = [] + self.val_acc_history = [] + + # Make a deep copy of the optim_config for each parameter + self.optim_configs = {} + for p in self.model.params: + d = {k: v for k, v in self.optim_config.items()} + self.optim_configs[p] = d + + def _step(self): + """ + Make a single gradient update. This is called by train() and should not + be called manually. + """ + # Make a minibatch of training data + num_train = self.X_train.shape[0] + batch_mask = np.random.choice(num_train, self.batch_size) + X_batch = self.X_train[batch_mask] + y_batch = self.y_train[batch_mask] + + # Compute loss and gradient + loss, grads = self.model.loss(X_batch, y_batch) + self.loss_history.append(loss) + + # Perform a parameter update + for p, w in self.model.params.items(): + dw = grads[p] + config = self.optim_configs[p] + next_w, next_config = self.update_rule(w, dw, config) + self.model.params[p] = next_w + self.optim_configs[p] = next_config + + def _save_checkpoint(self): + if self.checkpoint_name is None: + return + checkpoint = { + "model": self.model, + "update_rule": self.update_rule, + "lr_decay": self.lr_decay, + "optim_config": self.optim_config, + "batch_size": self.batch_size, + "num_train_samples": self.num_train_samples, + "num_val_samples": self.num_val_samples, + "epoch": self.epoch, + "loss_history": self.loss_history, + "train_acc_history": self.train_acc_history, + "val_acc_history": self.val_acc_history, + } + filename = "%s_epoch_%d.pkl" % (self.checkpoint_name, self.epoch) + if self.verbose: + print('Saving checkpoint to "%s"' % filename) + with open(filename, "wb") as f: + pickle.dump(checkpoint, f) + + def check_accuracy(self, X, y, num_samples=None, batch_size=100): + """ + Check accuracy of the model on the provided data. + + Inputs: + - X: Array of data, of shape (N, d_1, ..., d_k) + - y: Array of labels, of shape (N,) + - num_samples: If not None, subsample the data and only test the model + on num_samples datapoints. + - batch_size: Split X and y into batches of this size to avoid using + too much memory. + + Returns: + - acc: Scalar giving the fraction of instances that were correctly + classified by the model. + """ + + # Maybe subsample the data + N = X.shape[0] + if num_samples is not None and N > num_samples: + mask = np.random.choice(N, num_samples) + N = num_samples + X = X[mask] + y = y[mask] + + # Compute predictions in batches + num_batches = N // batch_size + if N % batch_size != 0: + num_batches += 1 + y_pred = [] + for i in range(num_batches): + start = i * batch_size + end = (i + 1) * batch_size + scores = self.model.loss(X[start:end]) + y_pred.append(np.argmax(scores, axis=1)) + y_pred = np.hstack(y_pred) + acc = np.mean(y_pred == y) + + return acc + + def train(self): + """ + Run optimization to train the model. + """ + num_train = self.X_train.shape[0] + iterations_per_epoch = max(num_train // self.batch_size, 1) + num_iterations = self.num_epochs * iterations_per_epoch + + for t in range(num_iterations): + self._step() + + # Maybe print training loss + if self.verbose and t % self.print_every == 0: + print( + "(Iteration %d / %d) loss: %f" + % (t + 1, num_iterations, self.loss_history[-1]) + ) + + # At the end of every epoch, increment the epoch counter and decay + # the learning rate. 
+ epoch_end = (t + 1) % iterations_per_epoch == 0 + if epoch_end: + self.epoch += 1 + for k in self.optim_configs: + self.optim_configs[k]["learning_rate"] *= self.lr_decay + + # Check train and val accuracy on the first iteration, the last + # iteration, and at the end of each epoch. + first_it = t == 0 + last_it = t == num_iterations - 1 + if first_it or last_it or epoch_end: + train_acc = self.check_accuracy( + self.X_train, self.y_train, num_samples=self.num_train_samples + ) + val_acc = self.check_accuracy( + self.X_val, self.y_val, num_samples=self.num_val_samples + ) + self.train_acc_history.append(train_acc) + self.val_acc_history.append(val_acc) + self._save_checkpoint() + + if self.verbose: + print( + "(Epoch %d / %d) train acc: %f; val_acc: %f" + % (self.epoch, self.num_epochs, train_acc, val_acc) + ) + + # Keep track of the best model + if val_acc > self.best_val_acc: + self.best_val_acc = val_acc + self.best_params = {} + for k, v in self.model.params.items(): + self.best_params[k] = v.copy() + + # At the end of training swap the best params into the model + self.model.params = self.best_params diff --git a/lab_3/scripts/vis_utils.py b/lab_3/scripts/vis_utils.py new file mode 100644 index 0000000..c1049a0 --- /dev/null +++ b/lab_3/scripts/vis_utils.py @@ -0,0 +1,78 @@ +from builtins import range +from past.builtins import xrange + +from math import sqrt, ceil +import numpy as np + + +def visualize_grid(Xs, ubound=255.0, padding=1): + """ + Reshape a 4D tensor of image data to a grid for easy visualization. + + Inputs: + - Xs: Data of shape (N, H, W, C) + - ubound: Output grid will have values scaled to the range [0, ubound] + - padding: The number of blank pixels between elements of the grid + """ + (N, H, W, C) = Xs.shape + grid_size = int(ceil(sqrt(N))) + grid_height = H * grid_size + padding * (grid_size - 1) + grid_width = W * grid_size + padding * (grid_size - 1) + grid = np.zeros((grid_height, grid_width, C)) + next_idx = 0 + y0, y1 = 0, H + for y in range(grid_size): + x0, x1 = 0, W + for x in range(grid_size): + if next_idx < N: + img = Xs[next_idx] + low, high = np.min(img), np.max(img) + grid[y0:y1, x0:x1] = ubound * (img - low) / (high - low) + # grid[y0:y1, x0:x1] = Xs[next_idx] + next_idx += 1 + x0 += W + padding + x1 += W + padding + y0 += H + padding + y1 += H + padding + # grid_max = np.max(grid) + # grid_min = np.min(grid) + # grid = ubound * (grid - grid_min) / (grid_max - grid_min) + return grid + + +def vis_grid(Xs): + """ visualize a grid of images """ + (N, H, W, C) = Xs.shape + A = int(ceil(sqrt(N))) + G = np.ones((A * H + A, A * W + A, C), Xs.dtype) + G *= np.min(Xs) + n = 0 + for y in range(A): + for x in range(A): + if n < N: + G[y * H + y : (y + 1) * H + y, x * W + x : (x + 1) * W + x, :] = Xs[ + n, :, :, : + ] + n += 1 + # normalize to [0,1] + maxg = G.max() + ming = G.min() + G = (G - ming) / (maxg - ming) + return G + + +def vis_nn(rows): + """ visualize array of arrays of images """ + N = len(rows) + D = len(rows[0]) + H, W, C = rows[0][0].shape + Xs = rows[0][0] + G = np.ones((N * H + N, D * W + D, C), Xs.dtype) + for y in range(N): + for x in range(D): + G[y * H + y : (y + 1) * H + y, x * W + x : (x + 1) * W + x, :] = rows[y][x] + # normalize to [0,1] + maxg = G.max() + ming = G.min() + G = (G - ming) / (maxg - ming) + return G
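A short usage sketch for visualize_grid; random data stands in for learned filters here, so the figure content is purely illustrative:

```python
import numpy as np
import matplotlib.pyplot as plt
from scripts.vis_utils import visualize_grid

# Pack 16 random 8x8 RGB "images" (N, H, W, C) into one grid and display it.
Xs = np.random.rand(16, 8, 8, 3)
grid = visualize_grid(Xs, ubound=1.0, padding=1)
plt.imshow(grid)
plt.axis("off")
plt.show()
```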