diff --git a/lab_3/assignment3.ipynb b/lab_3/assignment3.ipynb new file mode 100644 index 0000000..a0eed34 --- /dev/null +++ b/lab_3/assignment3.ipynb @@ -0,0 +1,1871 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Лабораторная работа 3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1) Полносвязная нейронная сеть ( Fully-Connected Neural Network)\n", + "\n", + "2) Нормализация по мини-батчам (Batch normalization)\n", + "\n", + "3) Dropout\n", + "\n", + "4) Сверточные нейронные сети (Convolutional Networks)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Лабораторные работы можно выполнять с использованием сервиса Google Colaboratory (https://medium.com/deep-learning-turkey/google-colab-free-gpu-tutorial-e113627b9f5d) или на локальном компьютере. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Полносвязная нейронная сеть" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "В данной лабораторной работе необходимо будет реализовать полносвязную нейронную сеть, используя модульный подход. Для каждого слоя реализации прямого и обратного проходов алгоритма обратного распространения ошибки будут иметь следующий вид:\n", + "\n", + "```python\n", + "def layer_forward(x, w):\n", + " \"\"\" Receive inputs x and weights w \"\"\"\n", + " # Do some computations ...\n", + " z = # ... some intermediate value\n", + " # Do some more computations ...\n", + " out = # the output\n", + " \n", + " cache = (x, w, z, out) # Values we need to compute gradients\n", + " \n", + " return out, cache\n", + "```\n", + "\n", + "\n", + "\n", + "```python\n", + "def layer_backward(dout, cache):\n", + " \"\"\"\n", + " Receive dout (derivative of loss with respect to outputs) and cache,\n", + " and compute derivative with respect to inputs.\n", + " \"\"\"\n", + " # Unpack cache values\n", + " x, w, z, out = cache\n", + " \n", + " # Use values in cache to compute derivatives\n", + " dx = # Derivative of loss with respect to x\n", + " dw = # Derivative of loss with respect to w\n", + " \n", + " return dx, dw\n", + "```\n", + "\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=========== You can safely ignore the message below if you are NOT working on ConvolutionalNetworks.ipynb ===========\n", + "\tYou will need to compile a Cython extension for a portion of this assignment.\n", + "\tThe instructions to do this will be given in a section of the notebook below.\n", + "\tThere will be an option for Colab users and another for Jupyter (local) users.\n" + ] + } + ], + "source": [ + "from __future__ import print_function\n", + "import time\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "from scripts.classifiers.fc_net import *\n", + "\n", + "from scripts.gradient_check import eval_numerical_gradient, eval_numerical_gradient_array\n", + "from scripts.solver import Solver\n", + "from scripts.classifiers.cnn import *\n", + "from scripts.layers import *\n", + "from scripts.fast_layers import *\n", + "\n", + "\n", + "%matplotlib inline\n", + "plt.rcParams['figure.figsize'] = (10.0, 8.0) \n", + "plt.rcParams['image.interpolation'] = 'nearest'\n", + "plt.rcParams['image.cmap'] = 'gray'\n", + "\n", + "# for auto-reloading external modules\n", + 
"# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython\n", + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "def rel_error(x, y):\n", + " \"\"\" returns relative error \"\"\"\n", + " return np.max(np.abs(x - y) / (np.maximum(1e-8, np.abs(x) + np.abs(y))))\n", + "def print_mean_std(x,axis=0):\n", + " print(' means: ', x.mean(axis=axis))\n", + " print(' stds: ', x.std(axis=axis))\n", + " print() " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Загрузите данные из предыдущей лабораторной работы. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Для полносвязного слоя реализуйте прямой проход (метод affine_forward в scripts/layers.py). Протестируйте свою реализацию. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "num_inputs = 2\n", + "input_shape = (4, 5, 6)\n", + "output_dim = 3\n", + "\n", + "input_size = num_inputs * np.prod(input_shape)\n", + "weight_size = output_dim * np.prod(input_shape)\n", + "\n", + "x = np.linspace(-0.1, 0.5, num=input_size).reshape(num_inputs, *input_shape)\n", + "w = np.linspace(-0.2, 0.3, num=weight_size).reshape(np.prod(input_shape), output_dim)\n", + "b = np.linspace(-0.3, 0.1, num=output_dim)\n", + "\n", + "out, _ = affine_forward(x, w, b)\n", + "correct_out = np.array([[ 1.49834967, 1.70660132, 1.91485297],\n", + " [ 3.25553199, 3.5141327, 3.77273342]])\n", + "\n", + "\n", + "print('Testing affine_forward function:')\n", + "print('difference: ', rel_error(out, correct_out))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Для полносвязного слоя реализуйте обратный проход (метод affine_backward в scripts/layers.py). Протестируйте свою реализацию. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.random.seed(231)\n", + "x = np.random.randn(10, 2, 3)\n", + "w = np.random.randn(6, 5)\n", + "b = np.random.randn(5)\n", + "dout = np.random.randn(10, 5)\n", + "\n", + "dx_num = eval_numerical_gradient_array(lambda x: affine_forward(x, w, b)[0], x, dout)\n", + "dw_num = eval_numerical_gradient_array(lambda w: affine_forward(x, w, b)[0], w, dout)\n", + "db_num = eval_numerical_gradient_array(lambda b: affine_forward(x, w, b)[0], b, dout)\n", + "\n", + "_, cache = affine_forward(x, w, b)\n", + "dx, dw, db = affine_backward(dout, cache)\n", + "\n", + "print('Testing affine_backward function:')\n", + "print('dx error: ', rel_error(dx_num, dx))\n", + "print('dw error: ', rel_error(dw_num, dw))\n", + "print('db error: ', rel_error(db_num, db))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Реализуйте прямой проход для слоя активации ReLU (relu_forward) и протестируйте его." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "x = np.linspace(-0.5, 0.5, num=12).reshape(3, 4)\n", + "\n", + "out, _ = relu_forward(x)\n", + "correct_out = np.array([[ 0., 0., 0., 0., ],\n", + " [ 0., 0., 0.04545455, 0.13636364,],\n", + " [ 0.22727273, 0.31818182, 0.40909091, 0.5, ]])\n", + "\n", + "# Compare your output with ours. 
The error should be on the order of e-8\n", + "print('Testing relu_forward function:')\n", + "print('difference: ', rel_error(out, correct_out))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Реализуйте обратный проход для слоя активации ReLU (relu_backward ) и протестируйте его." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.random.seed(231)\n", + "x = np.random.randn(10, 10)\n", + "dout = np.random.randn(*x.shape)\n", + "\n", + "dx_num = eval_numerical_gradient_array(lambda x: relu_forward(x)[0], x, dout)\n", + "\n", + "_, cache = relu_forward(x)\n", + "dx = relu_backward(dout, cache)\n", + "\n", + "# The error should be on the order of e-12\n", + "print('Testing relu_backward function:')\n", + "print('dx error: ', rel_error(dx_num, dx))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "В скрипте /layer_utils.py приведены реализации прямого и обратного проходов для часто используемых комбинаций слоев. Например, за полносвязным слоем часто следует слой активации. Ознакомьтесь с функциями affine_relu_forward и affine_relu_backward, запустите код ниже и убедитесь, что ошибка порядка e-10 или ниже. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from scripts.layer_utils import affine_relu_forward, affine_relu_backward\n", + "np.random.seed(231)\n", + "x = np.random.randn(2, 3, 4)\n", + "w = np.random.randn(12, 10)\n", + "b = np.random.randn(10)\n", + "dout = np.random.randn(2, 10)\n", + "\n", + "out, cache = affine_relu_forward(x, w, b)\n", + "dx, dw, db = affine_relu_backward(dout, cache)\n", + "\n", + "dx_num = eval_numerical_gradient_array(lambda x: affine_relu_forward(x, w, b)[0], x, dout)\n", + "dw_num = eval_numerical_gradient_array(lambda w: affine_relu_forward(x, w, b)[0], w, dout)\n", + "db_num = eval_numerical_gradient_array(lambda b: affine_relu_forward(x, w, b)[0], b, dout)\n", + "\n", + "# Relative error should be around e-10 or less\n", + "print('Testing affine_relu_forward and affine_relu_backward:')\n", + "print('dx error: ', rel_error(dx_num, dx))\n", + "print('dw error: ', rel_error(dw_num, dw))\n", + "print('db error: ', rel_error(db_num, db))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Реализуйте двухслойную полносвязную сеть - класс TwoLayerNet в scripts/classifiers/fc_net.py . Проверьте свою реализацию, запустив код ниже. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.random.seed(231)\n", + "N, D, H, C = 3, 5, 50, 7\n", + "X = np.random.randn(N, D)\n", + "y = np.random.randint(C, size=N)\n", + "\n", + "std = 1e-3\n", + "model = TwoLayerNet(input_dim=D, hidden_dim=H, num_classes=C, weight_scale=std)\n", + "\n", + "print('Testing initialization ... ')\n", + "W1_std = abs(model.params['W1'].std() - std)\n", + "b1 = model.params['b1']\n", + "W2_std = abs(model.params['W2'].std() - std)\n", + "b2 = model.params['b2']\n", + "assert W1_std < std / 10, 'First layer weights do not seem right'\n", + "assert np.all(b1 == 0), 'First layer biases do not seem right'\n", + "assert W2_std < std / 10, 'Second layer weights do not seem right'\n", + "assert np.all(b2 == 0), 'Second layer biases do not seem right'\n", + "\n", + "print('Testing test-time forward pass ... 
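Возможный набросок relu_forward/relu_backward: поэлементный максимум с нулём на прямом проходе, а на обратном градиент проходит только там, где вход был положительным.

```python
import numpy as np

def relu_forward(x):
    out = np.maximum(0, x)
    cache = x                    # для обратного прохода достаточно самого входа
    return out, cache

def relu_backward(dout, cache):
    x = cache
    dx = dout * (x > 0)          # градиент обнуляется там, где ReLU была неактивна
    return dx
```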
')\n", + "model.params['W1'] = np.linspace(-0.7, 0.3, num=D*H).reshape(D, H)\n", + "model.params['b1'] = np.linspace(-0.1, 0.9, num=H)\n", + "model.params['W2'] = np.linspace(-0.3, 0.4, num=H*C).reshape(H, C)\n", + "model.params['b2'] = np.linspace(-0.9, 0.1, num=C)\n", + "X = np.linspace(-5.5, 4.5, num=N*D).reshape(D, N).T\n", + "scores = model.loss(X)\n", + "correct_scores = np.asarray(\n", + " [[11.53165108, 12.2917344, 13.05181771, 13.81190102, 14.57198434, 15.33206765, 16.09215096],\n", + " [12.05769098, 12.74614105, 13.43459113, 14.1230412, 14.81149128, 15.49994135, 16.18839143],\n", + " [12.58373087, 13.20054771, 13.81736455, 14.43418138, 15.05099822, 15.66781506, 16.2846319 ]])\n", + "scores_diff = np.abs(scores - correct_scores).sum()\n", + "assert scores_diff < 1e-6, 'Problem with test-time forward pass'\n", + "\n", + "print('Testing training loss (no regularization)')\n", + "y = np.asarray([0, 5, 1])\n", + "loss, grads = model.loss(X, y)\n", + "correct_loss = 3.4702243556\n", + "assert abs(loss - correct_loss) < 1e-10, 'Problem with training-time loss'\n", + "\n", + "model.reg = 1.0\n", + "loss, grads = model.loss(X, y)\n", + "correct_loss = 26.5948426952\n", + "assert abs(loss - correct_loss) < 1e-10, 'Problem with regularization loss'\n", + "\n", + "# Errors should be around e-7 or less\n", + "for reg in [0.0, 0.7]:\n", + " print('Running numeric gradient check with reg = ', reg)\n", + " model.reg = reg\n", + " loss, grads = model.loss(X, y)\n", + "\n", + " for name in sorted(grads):\n", + " f = lambda _: model.loss(X, y)[0]\n", + " grad_num = eval_numerical_gradient(f, model.params[name], verbose=False)\n", + " print('%s relative error: %.2e' % (name, rel_error(grad_num, grads[name])))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Ознакомьтесь с API для обучения и тестирования моделей в scripts/solver.py . Используйте экземпляр класса Solver для обучения двухслойной полносвязной сети. Необходимо достичь минимум 50% верно классифицированных объектов на валидационном наборе. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = TwoLayerNet()\n", + "solver = None\n", + "\n", + "##############################################################################\n", + "# TODO: Use a Solver instance to train a TwoLayerNet that achieves at least #\n", + "# 50% accuracy on the validation set. 
#\n", + "##############################################################################\n", + "# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****\n", + "\n", + "pass\n", + "\n", + "# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****\n", + "##############################################################################\n", + "# END OF YOUR CODE #\n", + "##############################################################################" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.subplot(2, 1, 1)\n", + "plt.title('Training loss')\n", + "plt.plot(solver.loss_history, 'o')\n", + "plt.xlabel('Iteration')\n", + "\n", + "plt.subplot(2, 1, 2)\n", + "plt.title('Accuracy')\n", + "plt.plot(solver.train_acc_history, '-o', label='train')\n", + "plt.plot(solver.val_acc_history, '-o', label='val')\n", + "plt.plot([0.5] * len(solver.val_acc_history), 'k--')\n", + "plt.xlabel('Epoch')\n", + "plt.legend(loc='lower right')\n", + "plt.gcf().set_size_inches(15, 12)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Теперь реализуйте полносвязную сеть с произвольным числом скрытых слоев. Ознакомьтесь с классом FullyConnectedNet в scripts/classifiers/fc_net.py . Реализуйте инициализацию, прямой и обратный проходы." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.random.seed(231)\n", + "N, D, H1, H2, C = 2, 15, 20, 30, 10\n", + "X = np.random.randn(N, D)\n", + "y = np.random.randint(C, size=(N,))\n", + "\n", + "for reg in [0, 3.14]:\n", + " print('Running check with reg = ', reg)\n", + " model = FullyConnectedNet([H1, H2], input_dim=D, num_classes=C,\n", + " reg=reg, weight_scale=5e-2, dtype=np.float64)\n", + "\n", + " loss, grads = model.loss(X, y)\n", + " print('Initial loss: ', loss)\n", + " \n", + " # Most of the errors should be on the order of e-7 or smaller. \n", + " # NOTE: It is fine however to see an error for W2 on the order of e-5\n", + " # for the check when reg = 0.0\n", + " for name in sorted(grads):\n", + " f = lambda _: model.loss(X, y)[0]\n", + " grad_num = eval_numerical_gradient(f, model.params[name], verbose=False, h=1e-5)\n", + " print('%s relative error: %.2e' % (name, rel_error(grad_num, grads[name])))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Попробуйте добиться эффекта переобучения на небольшом наборе изображений (например, 50). Используйте трехслойную сеть со 100 нейронами на каждом скрытом слое. Попробуйте переобучить сеть, достигнув 100 % accuracy за 20 эпох. Для этого поэкспериментируйте с параметрами weight_scale и learning_rate. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# TODO: Use a three-layer Net to overfit 50 training examples by \n", + "# tweaking just the learning rate and initialization scale.\n", + "\n", + "num_train = 50\n", + "small_data = {\n", + " 'X_train': data['X_train'][:num_train],\n", + " 'y_train': data['y_train'][:num_train],\n", + " 'X_val': data['X_val'],\n", + " 'y_val': data['y_val'],\n", + "}\n", + "\n", + "weight_scale = 1e-2 # Experiment with this!\n", + "learning_rate = 1e-4 # Experiment with this!\n", + "model = FullyConnectedNet([100, 100],\n", + " weight_scale=weight_scale, dtype=np.float64)\n", + "solver = Solver(model, small_data,\n", + " print_every=10, num_epochs=20, batch_size=25,\n", + " update_rule='sgd',\n", + " optim_config={\n", + " 'learning_rate': learning_rate,\n", + " }\n", + " )\n", + "solver.train()\n", + "\n", + "plt.plot(solver.loss_history, 'o')\n", + "plt.title('Training loss history')\n", + "plt.xlabel('Iteration')\n", + "plt.ylabel('Training loss')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Повторите эксперимент, описанный выше, для пятислойной сети." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# TODO: Use a five-layer Net to overfit 50 training examples by \n", + "# tweaking just the learning rate and initialization scale.\n", + "\n", + "num_train = 50\n", + "small_data = {\n", + " 'X_train': data['X_train'][:num_train],\n", + " 'y_train': data['y_train'][:num_train],\n", + " 'X_val': data['X_val'],\n", + " 'y_val': data['y_val'],\n", + "}\n", + "\n", + "learning_rate = 2e-3 # Experiment with this!\n", + "weight_scale = 1e-5 # Experiment with this!\n", + "model = FullyConnectedNet([100, 100, 100, 100],\n", + " weight_scale=weight_scale, dtype=np.float64)\n", + "solver = Solver(model, small_data,\n", + " print_every=10, num_epochs=20, batch_size=25,\n", + " update_rule='sgd',\n", + " optim_config={\n", + " 'learning_rate': learning_rate,\n", + " }\n", + " )\n", + "solver.train()\n", + "\n", + "plt.plot(solver.loss_history, 'o')\n", + "plt.title('Training loss history')\n", + "plt.xlabel('Iteration')\n", + "plt.ylabel('Training loss')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Сделайте выводы по проведенному эксперименту. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Ранее обновление весов проходило по правилу SGD. Теперь попробуйте реализовать стохастический градиентный спуск с импульсом (SGD+momentum). http://cs231n.github.io/neural-networks-3/#sgd Реализуйте sgd_momentum в scripts/optim.py и запустите проверку. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from scripts.optim import sgd_momentum\n", + "\n", + "N, D = 4, 5\n", + "w = np.linspace(-0.4, 0.6, num=N*D).reshape(N, D)\n", + "dw = np.linspace(-0.6, 0.4, num=N*D).reshape(N, D)\n", + "v = np.linspace(0.6, 0.9, num=N*D).reshape(N, D)\n", + "\n", + "config = {'learning_rate': 1e-3, 'velocity': v}\n", + "next_w, _ = sgd_momentum(w, dw, config=config)\n", + "\n", + "expected_next_w = np.asarray([\n", + " [ 0.1406, 0.20738947, 0.27417895, 0.34096842, 0.40775789],\n", + " [ 0.47454737, 0.54133684, 0.60812632, 0.67491579, 0.74170526],\n", + " [ 0.80849474, 0.87528421, 0.94207368, 1.00886316, 1.07565263],\n", + " [ 1.14244211, 1.20923158, 1.27602105, 1.34281053, 1.4096 ]])\n", + "expected_velocity = np.asarray([\n", + " [ 0.5406, 0.55475789, 0.56891579, 0.58307368, 0.59723158],\n", + " [ 0.61138947, 0.62554737, 0.63970526, 0.65386316, 0.66802105],\n", + " [ 0.68217895, 0.69633684, 0.71049474, 0.72465263, 0.73881053],\n", + " [ 0.75296842, 0.76712632, 0.78128421, 0.79544211, 0.8096 ]])\n", + "\n", + "# Should see relative errors around e-8 or less\n", + "print('next_w error: ', rel_error(next_w, expected_next_w))\n", + "print('velocity error: ', rel_error(expected_velocity, config['velocity']))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Сравните результаты обучения шестислойной сети, обученной классическим градиентным спуском и адаптивным алгоритмом с импульсом. Какой алгоритм сходится быстрее." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "num_train = 4000\n", + "small_data = {\n", + " 'X_train': data['X_train'][:num_train],\n", + " 'y_train': data['y_train'][:num_train],\n", + " 'X_val': data['X_val'],\n", + " 'y_val': data['y_val'],\n", + "}\n", + "\n", + "solvers = {}\n", + "\n", + "for update_rule in ['sgd', 'sgd_momentum']:\n", + " print('running with ', update_rule)\n", + " model = FullyConnectedNet([100, 100, 100, 100, 100], weight_scale=5e-2)\n", + "\n", + " solver = Solver(model, small_data,\n", + " num_epochs=5, batch_size=100,\n", + " update_rule=update_rule,\n", + " optim_config={\n", + " 'learning_rate': 5e-3,\n", + " },\n", + " verbose=True)\n", + " solvers[update_rule] = solver\n", + " solver.train()\n", + " print()\n", + "\n", + "plt.subplot(3, 1, 1)\n", + "plt.title('Training loss')\n", + "plt.xlabel('Iteration')\n", + "\n", + "plt.subplot(3, 1, 2)\n", + "plt.title('Training accuracy')\n", + "plt.xlabel('Epoch')\n", + "\n", + "plt.subplot(3, 1, 3)\n", + "plt.title('Validation accuracy')\n", + "plt.xlabel('Epoch')\n", + "\n", + "for update_rule, solver in solvers.items():\n", + " plt.subplot(3, 1, 1)\n", + " plt.plot(solver.loss_history, 'o', label=\"loss_%s\" % update_rule)\n", + " \n", + " plt.subplot(3, 1, 2)\n", + " plt.plot(solver.train_acc_history, '-o', label=\"train_acc_%s\" % update_rule)\n", + "\n", + " plt.subplot(3, 1, 3)\n", + " plt.plot(solver.val_acc_history, '-o', label=\"val_acc_%s\" % update_rule)\n", + " \n", + "for i in [1, 2, 3]:\n", + " plt.subplot(3, 1, i)\n", + " plt.legend(loc='upper center', ncol=4)\n", + "plt.gcf().set_size_inches(15, 15)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Реализуйте алгоритмы RMSProp [1] and Adam [2] с коррекцией смещения - методы rmsprop и adam . \n", + "\n", + "\n", + "[1] Tijmen Tieleman and Geoffrey Hinton. 
\"Lecture 6.5-rmsprop: Divide the gradient by a running average of its recent magnitude.\" COURSERA: Neural Networks for Machine Learning 4 (2012).\n", + "\n", + "[2] Diederik Kingma and Jimmy Ba, \"Adam: A Method for Stochastic Optimization\", ICLR 2015." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Test RMSProp implementation\n", + "from scripts.optim import rmsprop\n", + "\n", + "N, D = 4, 5\n", + "w = np.linspace(-0.4, 0.6, num=N*D).reshape(N, D)\n", + "dw = np.linspace(-0.6, 0.4, num=N*D).reshape(N, D)\n", + "cache = np.linspace(0.6, 0.9, num=N*D).reshape(N, D)\n", + "\n", + "config = {'learning_rate': 1e-2, 'cache': cache}\n", + "next_w, _ = rmsprop(w, dw, config=config)\n", + "\n", + "expected_next_w = np.asarray([\n", + " [-0.39223849, -0.34037513, -0.28849239, -0.23659121, -0.18467247],\n", + " [-0.132737, -0.08078555, -0.02881884, 0.02316247, 0.07515774],\n", + " [ 0.12716641, 0.17918792, 0.23122175, 0.28326742, 0.33532447],\n", + " [ 0.38739248, 0.43947102, 0.49155973, 0.54365823, 0.59576619]])\n", + "expected_cache = np.asarray([\n", + " [ 0.5976, 0.6126277, 0.6277108, 0.64284931, 0.65804321],\n", + " [ 0.67329252, 0.68859723, 0.70395734, 0.71937285, 0.73484377],\n", + " [ 0.75037008, 0.7659518, 0.78158892, 0.79728144, 0.81302936],\n", + " [ 0.82883269, 0.84469141, 0.86060554, 0.87657507, 0.8926 ]])\n", + "\n", + "# You should see relative errors around e-7 or less\n", + "print('next_w error: ', rel_error(expected_next_w, next_w))\n", + "print('cache error: ', rel_error(expected_cache, config['cache']))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Test Adam implementation\n", + "from scripts.optim import adam\n", + "\n", + "N, D = 4, 5\n", + "w = np.linspace(-0.4, 0.6, num=N*D).reshape(N, D)\n", + "dw = np.linspace(-0.6, 0.4, num=N*D).reshape(N, D)\n", + "m = np.linspace(0.6, 0.9, num=N*D).reshape(N, D)\n", + "v = np.linspace(0.7, 0.5, num=N*D).reshape(N, D)\n", + "\n", + "config = {'learning_rate': 1e-2, 'm': m, 'v': v, 't': 5}\n", + "next_w, _ = adam(w, dw, config=config)\n", + "\n", + "expected_next_w = np.asarray([\n", + " [-0.40094747, -0.34836187, -0.29577703, -0.24319299, -0.19060977],\n", + " [-0.1380274, -0.08544591, -0.03286534, 0.01971428, 0.0722929],\n", + " [ 0.1248705, 0.17744702, 0.23002243, 0.28259667, 0.33516969],\n", + " [ 0.38774145, 0.44031188, 0.49288093, 0.54544852, 0.59801459]])\n", + "expected_v = np.asarray([\n", + " [ 0.69966, 0.68908382, 0.67851319, 0.66794809, 0.65738853,],\n", + " [ 0.64683452, 0.63628604, 0.6257431, 0.61520571, 0.60467385,],\n", + " [ 0.59414753, 0.58362676, 0.57311152, 0.56260183, 0.55209767,],\n", + " [ 0.54159906, 0.53110598, 0.52061845, 0.51013645, 0.49966, ]])\n", + "expected_m = np.asarray([\n", + " [ 0.48, 0.49947368, 0.51894737, 0.53842105, 0.55789474],\n", + " [ 0.57736842, 0.59684211, 0.61631579, 0.63578947, 0.65526316],\n", + " [ 0.67473684, 0.69421053, 0.71368421, 0.73315789, 0.75263158],\n", + " [ 0.77210526, 0.79157895, 0.81105263, 0.83052632, 0.85 ]])\n", + "\n", + "# You should see relative errors around e-7 or less\n", + "print('next_w error: ', rel_error(expected_next_w, next_w))\n", + "print('v error: ', rel_error(expected_v, config['v']))\n", + "print('m error: ', rel_error(expected_m, config['m']))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Обучите пару глубоких сетей с испольованием RMSProp и Adam алгоритмов обновления весов и сравните 
результаты обучения." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Получите лучшую полносвязную сеть для классификации вашего набора данных. На наборе CIFAR-10 необходимо получить accuracy не ниже 50 % на валидационном наборе." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "best_model = None\n", + "################################################################################\n", + "# TODO: Train the best FullyConnectedNet that you can on CIFAR-10. You might #\n", + "# find batch/layer normalization and dropout useful. Store your best model in #\n", + "# the best_model variable. #\n", + "################################################################################\n", + "# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****\n", + "\n", + "pass\n", + "\n", + "# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****\n", + "################################################################################\n", + "# END OF YOUR CODE #\n", + "################################################################################" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Получите оценку accuracy для валидационной и тестовой выборок. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "y_test_pred = np.argmax(best_model.loss(data['X_test']), axis=1)\n", + "y_val_pred = np.argmax(best_model.loss(data['X_val']), axis=1)\n", + "print('Validation set accuracy: ', (y_val_pred == data['y_val']).mean())\n", + "print('Test set accuracy: ', (y_test_pred == data['y_test']).mean())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Нормализация по мини-батчам\n", + "\n", + "Идея нормализации по мини-батчам предложена в работе [1]\n", + "\n", + "[1] Sergey Ioffe and Christian Szegedy, \"Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift\", ICML 2015." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Реализуйте прямой проход для слоя батч-нормализации - функция batchnorm_forward в scripts/layers.py . 
Проверьте свою реализацию, запустив следующий код:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Check the training-time forward pass by checking means and variances\n", + "# of features both before and after batch normalization \n", + "\n", + "# Simulate the forward pass for a two-layer network\n", + "np.random.seed(231)\n", + "N, D1, D2, D3 = 200, 50, 60, 3\n", + "X = np.random.randn(N, D1)\n", + "W1 = np.random.randn(D1, D2)\n", + "W2 = np.random.randn(D2, D3)\n", + "a = np.maximum(0, X.dot(W1)).dot(W2)\n", + "\n", + "print('Before batch normalization:')\n", + "print_mean_std(a,axis=0)\n", + "\n", + "gamma = np.ones((D3,))\n", + "beta = np.zeros((D3,))\n", + "# Means should be close to zero and stds close to one\n", + "print('After batch normalization (gamma=1, beta=0)')\n", + "a_norm, _ = batchnorm_forward(a, gamma, beta, {'mode': 'train'})\n", + "print_mean_std(a_norm,axis=0)\n", + "\n", + "gamma = np.asarray([1.0, 2.0, 3.0])\n", + "beta = np.asarray([11.0, 12.0, 13.0])\n", + "# Now means should be close to beta and stds close to gamma\n", + "print('After batch normalization (gamma=', gamma, ', beta=', beta, ')')\n", + "a_norm, _ = batchnorm_forward(a, gamma, beta, {'mode': 'train'})\n", + "print_mean_std(a_norm,axis=0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Check the test-time forward pass by running the training-time\n", + "# forward pass many times to warm up the running averages, and then\n", + "# checking the means and variances of activations after a test-time\n", + "# forward pass.\n", + "\n", + "np.random.seed(231)\n", + "N, D1, D2, D3 = 200, 50, 60, 3\n", + "W1 = np.random.randn(D1, D2)\n", + "W2 = np.random.randn(D2, D3)\n", + "\n", + "bn_param = {'mode': 'train'}\n", + "gamma = np.ones(D3)\n", + "beta = np.zeros(D3)\n", + "\n", + "for t in range(50):\n", + " X = np.random.randn(N, D1)\n", + " a = np.maximum(0, X.dot(W1)).dot(W2)\n", + " batchnorm_forward(a, gamma, beta, bn_param)\n", + "\n", + "bn_param['mode'] = 'test'\n", + "X = np.random.randn(N, D1)\n", + "a = np.maximum(0, X.dot(W1)).dot(W2)\n", + "a_norm, _ = batchnorm_forward(a, gamma, beta, bn_param)\n", + "\n", + "# Means should be close to zero and stds close to one, but will be\n", + "# noisier than training-time forward passes.\n", + "print('After batch normalization (test-time):')\n", + "print_mean_std(a_norm,axis=0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Реализуйте обратный проход в функции batchnorm_backward." 
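Возможный набросок batchnorm_forward/batchnorm_backward: на обучении нормируем по статистикам мини-батча и обновляем бегущие среднее и дисперсию, на тесте используем бегущие статистики; обратный проход записан в компактной форме через нормированные значения x_hat.

```python
import numpy as np

def batchnorm_forward(x, gamma, beta, bn_param):
    mode = bn_param['mode']
    eps = bn_param.get('eps', 1e-5)
    momentum = bn_param.get('momentum', 0.9)
    N, D = x.shape
    running_mean = bn_param.get('running_mean', np.zeros(D, dtype=x.dtype))
    running_var = bn_param.get('running_var', np.zeros(D, dtype=x.dtype))

    if mode == 'train':
        mu = x.mean(axis=0)
        var = x.var(axis=0)
        x_hat = (x - mu) / np.sqrt(var + eps)
        out = gamma * x_hat + beta
        cache = (x_hat, gamma, var, eps)
        running_mean = momentum * running_mean + (1 - momentum) * mu
        running_var = momentum * running_var + (1 - momentum) * var
    else:  # 'test'
        x_hat = (x - running_mean) / np.sqrt(running_var + eps)
        out = gamma * x_hat + beta
        cache = None

    bn_param['running_mean'] = running_mean
    bn_param['running_var'] = running_var
    return out, cache


def batchnorm_backward(dout, cache):
    x_hat, gamma, var, eps = cache
    dbeta = dout.sum(axis=0)
    dgamma = (dout * x_hat).sum(axis=0)
    dx_hat = dout * gamma
    # компактная форма: dx = (dx_hat - mean(dx_hat) - x_hat * mean(dx_hat * x_hat)) / sqrt(var + eps)
    dx = (dx_hat - dx_hat.mean(axis=0)
          - x_hat * (dx_hat * x_hat).mean(axis=0)) / np.sqrt(var + eps)
    return dx, dgamma, dbeta
```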
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Gradient check batchnorm backward pass\n", + "np.random.seed(231)\n", + "N, D = 4, 5\n", + "x = 5 * np.random.randn(N, D) + 12\n", + "gamma = np.random.randn(D)\n", + "beta = np.random.randn(D)\n", + "dout = np.random.randn(N, D)\n", + "\n", + "bn_param = {'mode': 'train'}\n", + "fx = lambda x: batchnorm_forward(x, gamma, beta, bn_param)[0]\n", + "fg = lambda a: batchnorm_forward(x, a, beta, bn_param)[0]\n", + "fb = lambda b: batchnorm_forward(x, gamma, b, bn_param)[0]\n", + "\n", + "dx_num = eval_numerical_gradient_array(fx, x, dout)\n", + "da_num = eval_numerical_gradient_array(fg, gamma.copy(), dout)\n", + "db_num = eval_numerical_gradient_array(fb, beta.copy(), dout)\n", + "\n", + "_, cache = batchnorm_forward(x, gamma, beta, bn_param)\n", + "dx, dgamma, dbeta = batchnorm_backward(dout, cache)\n", + "#You should expect to see relative errors between 1e-13 and 1e-8\n", + "print('dx error: ', rel_error(dx_num, dx))\n", + "print('dgamma error: ', rel_error(da_num, dgamma))\n", + "print('dbeta error: ', rel_error(db_num, dbeta))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Измените реализацию класса FullyConnectedNet, добавив батч-нормализацию. \n", + "Если флаг normalization == \"batchnorm\", то вам необходимо вставить слой батч-нормализации перед каждым слоем активации ReLU, кроме выхода сети. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.random.seed(231)\n", + "N, D, H1, H2, C = 2, 15, 20, 30, 10\n", + "X = np.random.randn(N, D)\n", + "y = np.random.randint(C, size=(N,))\n", + "\n", + "# You should expect losses between 1e-4~1e-10 for W, \n", + "# losses between 1e-08~1e-10 for b,\n", + "# and losses between 1e-08~1e-09 for beta and gammas.\n", + "for reg in [0, 3.14]:\n", + " print('Running check with reg = ', reg)\n", + " model = FullyConnectedNet([H1, H2], input_dim=D, num_classes=C,\n", + " reg=reg, weight_scale=5e-2, dtype=np.float64,\n", + " normalization='batchnorm')\n", + "\n", + " loss, grads = model.loss(X, y)\n", + " print('Initial loss: ', loss)\n", + "\n", + " for name in sorted(grads):\n", + " f = lambda _: model.loss(X, y)[0]\n", + " grad_num = eval_numerical_gradient(f, model.params[name], verbose=False, h=1e-5)\n", + " print('%s relative error: %.2e' % (name, rel_error(grad_num, grads[name])))\n", + " if reg == 0: print()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Обучите 6-ти слойную сеть на наборе из 1000 изображений с батч-нормализацией и без нее" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.random.seed(231)\n", + "# Try training a very deep net with batchnorm\n", + "hidden_dims = [100, 100, 100, 100, 100]\n", + "\n", + "num_train = 1000\n", + "small_data = {\n", + " 'X_train': data['X_train'][:num_train],\n", + " 'y_train': data['y_train'][:num_train],\n", + " 'X_val': data['X_val'],\n", + " 'y_val': data['y_val'],\n", + "}\n", + "\n", + "weight_scale = 2e-2\n", + "bn_model = FullyConnectedNet(hidden_dims, weight_scale=weight_scale, normalization='batchnorm')\n", + "model = FullyConnectedNet(hidden_dims, weight_scale=weight_scale, normalization=None)\n", + "\n", + "print('Solver with batch norm:')\n", + "bn_solver = Solver(bn_model, small_data,\n", + " num_epochs=10, batch_size=50,\n", + " update_rule='adam',\n", + " optim_config={\n", + " 
'learning_rate': 1e-3,\n", + " },\n", + " verbose=True,print_every=20)\n", + "bn_solver.train()\n", + "\n", + "print('\\nSolver without batch norm:')\n", + "solver = Solver(model, small_data,\n", + " num_epochs=10, batch_size=50,\n", + " update_rule='adam',\n", + " optim_config={\n", + " 'learning_rate': 1e-3,\n", + " },\n", + " verbose=True, print_every=20)\n", + "solver.train()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Визуализируйте процесс обучения для двух сетей. Увеличилась ли скорость сходимости в случае с батч-нормализацией? Сделайте выводы. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_training_history(title, label, baseline, bn_solvers, plot_fn, bl_marker='.', bn_marker='.', labels=None):\n", + " \"\"\"utility function for plotting training history\"\"\"\n", + " plt.title(title)\n", + " plt.xlabel(label)\n", + " bn_plots = [plot_fn(bn_solver) for bn_solver in bn_solvers]\n", + " bl_plot = plot_fn(baseline)\n", + " num_bn = len(bn_plots)\n", + " for i in range(num_bn):\n", + " label='with_norm'\n", + " if labels is not None:\n", + " label += str(labels[i])\n", + " plt.plot(bn_plots[i], bn_marker, label=label)\n", + " label='baseline'\n", + " if labels is not None:\n", + " label += str(labels[0])\n", + " plt.plot(bl_plot, bl_marker, label=label)\n", + " plt.legend(loc='lower center', ncol=num_bn+1) \n", + "\n", + " \n", + "plt.subplot(3, 1, 1)\n", + "plot_training_history('Training loss','Iteration', solver, [bn_solver], \\\n", + " lambda x: x.loss_history, bl_marker='o', bn_marker='o')\n", + "plt.subplot(3, 1, 2)\n", + "plot_training_history('Training accuracy','Epoch', solver, [bn_solver], \\\n", + " lambda x: x.train_acc_history, bl_marker='-o', bn_marker='-o')\n", + "plt.subplot(3, 1, 3)\n", + "plot_training_history('Validation accuracy','Epoch', solver, [bn_solver], \\\n", + " lambda x: x.val_acc_history, bl_marker='-o', bn_marker='-o')\n", + "\n", + "plt.gcf().set_size_inches(15, 15)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Обучите 6-тислойную сеть с батч-нормализацией и без нее, используя разные размеры батча. Визуализируйте графики обучения. Сделайте выводы по результатам эксперимента. 
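К заданию о добавлении батч-нормализации в FullyConnectedNet: удобно завести вспомогательную связку affine -> batchnorm -> ReLU по аналогии с affine_relu_forward. Имена affine_bn_relu_forward/affine_bn_relu_backward здесь условные (проверьте, нет ли уже подходящей функции в scripts/layer_utils.py):

```python
from scripts.layers import (affine_forward, affine_backward,
                            batchnorm_forward, batchnorm_backward,
                            relu_forward, relu_backward)

def affine_bn_relu_forward(x, w, b, gamma, beta, bn_param):
    a, fc_cache = affine_forward(x, w, b)
    a_bn, bn_cache = batchnorm_forward(a, gamma, beta, bn_param)
    out, relu_cache = relu_forward(a_bn)
    return out, (fc_cache, bn_cache, relu_cache)

def affine_bn_relu_backward(dout, cache):
    fc_cache, bn_cache, relu_cache = cache
    da_bn = relu_backward(dout, relu_cache)
    da, dgamma, dbeta = batchnorm_backward(da_bn, bn_cache)
    dx, dw, db = affine_backward(da, fc_cache)
    return dx, dw, db, dgamma, dbeta
```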
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def run_batchsize_experiments(normalization_mode):\n", + " np.random.seed(231)\n", + " # Try training a very deep net with batchnorm\n", + " hidden_dims = [100, 100, 100, 100, 100]\n", + " num_train = 1000\n", + " small_data = {\n", + " 'X_train': data['X_train'][:num_train],\n", + " 'y_train': data['y_train'][:num_train],\n", + " 'X_val': data['X_val'],\n", + " 'y_val': data['y_val'],\n", + " }\n", + " n_epochs=10\n", + " weight_scale = 2e-2\n", + " batch_sizes = [5,10,50]\n", + " lr = 10**(-3.5)\n", + " solver_bsize = batch_sizes[0]\n", + "\n", + " print('No normalization: batch size = ',solver_bsize)\n", + " model = FullyConnectedNet(hidden_dims, weight_scale=weight_scale, normalization=None)\n", + " solver = Solver(model, small_data,\n", + " num_epochs=n_epochs, batch_size=solver_bsize,\n", + " update_rule='adam',\n", + " optim_config={\n", + " 'learning_rate': lr,\n", + " },\n", + " verbose=False)\n", + " solver.train()\n", + " \n", + " bn_solvers = []\n", + " for i in range(len(batch_sizes)):\n", + " b_size=batch_sizes[i]\n", + " print('Normalization: batch size = ',b_size)\n", + " bn_model = FullyConnectedNet(hidden_dims, weight_scale=weight_scale, normalization=normalization_mode)\n", + " bn_solver = Solver(bn_model, small_data,\n", + " num_epochs=n_epochs, batch_size=b_size,\n", + " update_rule='adam',\n", + " optim_config={\n", + " 'learning_rate': lr,\n", + " },\n", + " verbose=False)\n", + " bn_solver.train()\n", + " bn_solvers.append(bn_solver)\n", + " \n", + " return bn_solvers, solver, batch_sizes\n", + "\n", + "batch_sizes = [5,10,50]\n", + "bn_solvers_bsize, solver_bsize, batch_sizes = run_batchsize_experiments('batchnorm')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.subplot(2, 1, 1)\n", + "plot_training_history('Training accuracy (Batch Normalization)','Epoch', solver_bsize, bn_solvers_bsize, \\\n", + " lambda x: x.train_acc_history, bl_marker='-^', bn_marker='-o', labels=batch_sizes)\n", + "plt.subplot(2, 1, 2)\n", + "plot_training_history('Validation accuracy (Batch Normalization)','Epoch', solver_bsize, bn_solvers_bsize, \\\n", + " lambda x: x.val_acc_history, bl_marker='-^', bn_marker='-o', labels=batch_sizes)\n", + "\n", + "plt.gcf().set_size_inches(15, 10)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Dropout" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Реализуйте прямой проход для dropout-слоя в scripts/layers.py\n", + "\n", + "http://cs231n.github.io/neural-networks-2/#reg" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.random.seed(231)\n", + "x = np.random.randn(500, 500) + 10\n", + "\n", + "for p in [0.25, 0.4, 0.7]:\n", + " out, _ = dropout_forward(x, {'mode': 'train', 'p': p})\n", + " out_test, _ = dropout_forward(x, {'mode': 'test', 'p': p})\n", + "\n", + " print('Running tests with p = ', p)\n", + " print('Mean of input: ', x.mean())\n", + " print('Mean of train-time output: ', out.mean())\n", + " print('Mean of test-time output: ', out_test.mean())\n", + " print('Fraction of train-time output set to zero: ', (out == 0).mean())\n", + " print('Fraction of test-time output set to zero: ', (out_test == 0).mean())\n", + " print()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Реализуйте обратный 
проход для dropout-слоя" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.random.seed(231)\n", + "x = np.random.randn(10, 10) + 10\n", + "dout = np.random.randn(*x.shape)\n", + "\n", + "dropout_param = {'mode': 'train', 'p': 0.2, 'seed': 123}\n", + "out, cache = dropout_forward(x, dropout_param)\n", + "dx = dropout_backward(dout, cache)\n", + "dx_num = eval_numerical_gradient_array(lambda xx: dropout_forward(xx, dropout_param)[0], x, dout)\n", + "\n", + "# Error should be around e-10 or less\n", + "print('dx relative error: ', rel_error(dx, dx_num))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Добавьте в реализацию класса FullyConnectedNet поддержку dropout. Если параметр dropout != 1, то добавьте в модель dropout-слой после каждого слоя активации. Проверьте свою реализацию" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.random.seed(231)\n", + "N, D, H1, H2, C = 2, 15, 20, 30, 10\n", + "X = np.random.randn(N, D)\n", + "y = np.random.randint(C, size=(N,))\n", + "\n", + "for dropout in [1, 0.75, 0.5]:\n", + " print('Running check with dropout = ', dropout)\n", + " model = FullyConnectedNet([H1, H2], input_dim=D, num_classes=C,\n", + " weight_scale=5e-2, dtype=np.float64,\n", + " dropout=dropout, seed=123)\n", + "\n", + " loss, grads = model.loss(X, y)\n", + " print('Initial loss: ', loss)\n", + " \n", + " # Relative errors should be around e-6 or less; Note that it's fine\n", + " # if for dropout=1 you have W2 error be on the order of e-5.\n", + " for name in sorted(grads):\n", + " f = lambda _: model.loss(X, y)[0]\n", + " grad_num = eval_numerical_gradient(f, model.params[name], verbose=False, h=1e-5)\n", + " print('%s relative error: %.2e' % (name, rel_error(grad_num, grads[name])))\n", + " print()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Обучите две двухслойные сети с dropout-слоем (вероятность отсева 0,25) и без на наборе из 500 изображений. Визуализируйте графики обучения. 
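Возможный набросок inverted dropout для scripts/layers.py. Здесь p трактуется как вероятность сохранить нейрон (тогда dropout=1 означает отсутствие dropout, как в проверках этого раздела); сверьтесь с соглашением, принятым в вашем шаблоне.

```python
import numpy as np

def dropout_forward(x, dropout_param):
    p, mode = dropout_param['p'], dropout_param['mode']
    if 'seed' in dropout_param:
        np.random.seed(dropout_param['seed'])

    if mode == 'train':
        mask = (np.random.rand(*x.shape) < p) / p   # inverted dropout: масштабируем уже на обучении
        out = x * mask
    else:                                           # 'test'
        mask = None
        out = x
    cache = (dropout_param, mask)
    return out, cache

def dropout_backward(dout, cache):
    dropout_param, mask = cache
    if dropout_param['mode'] == 'train':
        return dout * mask
    return dout
```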
Сделайте выводы по результатам эксперимента" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Train two identical nets, one with dropout and one without\n", + "np.random.seed(231)\n", + "num_train = 500\n", + "small_data = {\n", + " 'X_train': data['X_train'][:num_train],\n", + " 'y_train': data['y_train'][:num_train],\n", + " 'X_val': data['X_val'],\n", + " 'y_val': data['y_val'],\n", + "}\n", + "\n", + "solvers = {}\n", + "dropout_choices = [1, 0.25]\n", + "for dropout in dropout_choices:\n", + " model = FullyConnectedNet([500], dropout=dropout)\n", + " print(dropout)\n", + "\n", + " solver = Solver(model, small_data,\n", + " num_epochs=25, batch_size=100,\n", + " update_rule='adam',\n", + " optim_config={\n", + " 'learning_rate': 5e-4,\n", + " },\n", + " verbose=True, print_every=100)\n", + " solver.train()\n", + " solvers[dropout] = solver\n", + " print()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Plot train and validation accuracies of the two models\n", + "\n", + "train_accs = []\n", + "val_accs = []\n", + "for dropout in dropout_choices:\n", + " solver = solvers[dropout]\n", + " train_accs.append(solver.train_acc_history[-1])\n", + " val_accs.append(solver.val_acc_history[-1])\n", + "\n", + "plt.subplot(3, 1, 1)\n", + "for dropout in dropout_choices:\n", + " plt.plot(solvers[dropout].train_acc_history, 'o', label='%.2f dropout' % dropout)\n", + "plt.title('Train accuracy')\n", + "plt.xlabel('Epoch')\n", + "plt.ylabel('Accuracy')\n", + "plt.legend(ncol=2, loc='lower right')\n", + " \n", + "plt.subplot(3, 1, 2)\n", + "for dropout in dropout_choices:\n", + " plt.plot(solvers[dropout].val_acc_history, 'o', label='%.2f dropout' % dropout)\n", + "plt.title('Val accuracy')\n", + "plt.xlabel('Epoch')\n", + "plt.ylabel('Accuracy')\n", + "plt.legend(ncol=2, loc='lower right')\n", + "\n", + "plt.gcf().set_size_inches(15, 15)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Сверточные нейронные сети (CNN)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Реализуйте прямой проход для сверточного слоя - функция conv_forward_naive в scripts/layers.py юПроверьте свою реализацию, запустив код ниже " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "x_shape = (2, 3, 4, 4)\n", + "w_shape = (3, 3, 4, 4)\n", + "x = np.linspace(-0.1, 0.5, num=np.prod(x_shape)).reshape(x_shape)\n", + "w = np.linspace(-0.2, 0.3, num=np.prod(w_shape)).reshape(w_shape)\n", + "b = np.linspace(-0.1, 0.2, num=3)\n", + "\n", + "conv_param = {'stride': 2, 'pad': 1}\n", + "out, _ = conv_forward_naive(x, w, b, conv_param)\n", + "correct_out = np.array([[[[-0.08759809, -0.10987781],\n", + " [-0.18387192, -0.2109216 ]],\n", + " [[ 0.21027089, 0.21661097],\n", + " [ 0.22847626, 0.23004637]],\n", + " [[ 0.50813986, 0.54309974],\n", + " [ 0.64082444, 0.67101435]]],\n", + " [[[-0.98053589, -1.03143541],\n", + " [-1.19128892, -1.24695841]],\n", + " [[ 0.69108355, 0.66880383],\n", + " [ 0.59480972, 0.56776003]],\n", + " [[ 2.36270298, 2.36904306],\n", + " [ 2.38090835, 2.38247847]]]])\n", + "\n", + "# Compare your output to ours; difference should be around e-8\n", + "print('Testing conv_forward_naive')\n", + "print('difference: ', rel_error(out, correct_out))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Реализуйте обратный проход - 
функция conv_backward_naive в scripts/layers.py" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.random.seed(231)\n", + "x = np.random.randn(4, 3, 5, 5)\n", + "w = np.random.randn(2, 3, 3, 3)\n", + "b = np.random.randn(2,)\n", + "dout = np.random.randn(4, 2, 5, 5)\n", + "conv_param = {'stride': 1, 'pad': 1}\n", + "\n", + "dx_num = eval_numerical_gradient_array(lambda x: conv_forward_naive(x, w, b, conv_param)[0], x, dout)\n", + "dw_num = eval_numerical_gradient_array(lambda w: conv_forward_naive(x, w, b, conv_param)[0], w, dout)\n", + "db_num = eval_numerical_gradient_array(lambda b: conv_forward_naive(x, w, b, conv_param)[0], b, dout)\n", + "\n", + "out, cache = conv_forward_naive(x, w, b, conv_param)\n", + "dx, dw, db = conv_backward_naive(dout, cache)\n", + "\n", + "# Your errors should be around e-8 or less.\n", + "print('Testing conv_backward_naive function')\n", + "print('dx error: ', rel_error(dx, dx_num))\n", + "print('dw error: ', rel_error(dw, dw_num))\n", + "print('db error: ', rel_error(db, db_num))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Реализуйте прямой проход для max-pooling слоя -функция max_pool_forward_naive в scripts/layers.py" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "x_shape = (2, 3, 4, 4)\n", + "x = np.linspace(-0.3, 0.4, num=np.prod(x_shape)).reshape(x_shape)\n", + "pool_param = {'pool_width': 2, 'pool_height': 2, 'stride': 2}\n", + "\n", + "out, _ = max_pool_forward_naive(x, pool_param)\n", + "\n", + "correct_out = np.array([[[[-0.26315789, -0.24842105],\n", + " [-0.20421053, -0.18947368]],\n", + " [[-0.14526316, -0.13052632],\n", + " [-0.08631579, -0.07157895]],\n", + " [[-0.02736842, -0.01263158],\n", + " [ 0.03157895, 0.04631579]]],\n", + " [[[ 0.09052632, 0.10526316],\n", + " [ 0.14947368, 0.16421053]],\n", + " [[ 0.20842105, 0.22315789],\n", + " [ 0.26736842, 0.28210526]],\n", + " [[ 0.32631579, 0.34105263],\n", + " [ 0.38526316, 0.4 ]]]])\n", + "\n", + "# Compare your output with ours. Difference should be on the order of e-8.\n", + "print('Testing max_pool_forward_naive function:')\n", + "print('difference: ', rel_error(out, correct_out))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Реализуйте обратный проход для max-pooling слоя в max_pool_backward_naive . " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.random.seed(231)\n", + "x = np.random.randn(3, 2, 8, 8)\n", + "dout = np.random.randn(3, 2, 4, 4)\n", + "pool_param = {'pool_height': 2, 'pool_width': 2, 'stride': 2}\n", + "\n", + "dx_num = eval_numerical_gradient_array(lambda x: max_pool_forward_naive(x, pool_param)[0], x, dout)\n", + "\n", + "out, cache = max_pool_forward_naive(x, pool_param)\n", + "dx = max_pool_backward_naive(dout, cache)\n", + "\n", + "# Your error should be on the order of e-12\n", + "print('Testing max_pool_backward_naive function:')\n", + "print('dx error: ', rel_error(dx, dx_num))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "В скрипте scripts/fast_layers.py представлены быстрые реализации слоев свертки и пуллинга, написанных с использованием Cython. 
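К заданиям про conv_forward_naive/conv_backward_naive выше: возможный «наивный» набросок вложенными циклами (медленный, но прозрачный); обратный проход накапливает градиенты по тем же окнам, что и прямой.

```python
import numpy as np

def conv_forward_naive(x, w, b, conv_param):
    N, C, H, W = x.shape
    F, _, HH, WW = w.shape
    stride, pad = conv_param['stride'], conv_param['pad']
    H_out = 1 + (H + 2 * pad - HH) // stride
    W_out = 1 + (W + 2 * pad - WW) // stride

    x_pad = np.pad(x, ((0, 0), (0, 0), (pad, pad), (pad, pad)), mode='constant')
    out = np.zeros((N, F, H_out, W_out))
    for n in range(N):
        for f in range(F):
            for i in range(H_out):
                for j in range(W_out):
                    window = x_pad[n, :, i*stride:i*stride+HH, j*stride:j*stride+WW]
                    out[n, f, i, j] = np.sum(window * w[f]) + b[f]
    return out, (x, w, b, conv_param)

def conv_backward_naive(dout, cache):
    x, w, b, conv_param = cache
    N, C, H, W = x.shape
    F, _, HH, WW = w.shape
    stride, pad = conv_param['stride'], conv_param['pad']
    _, _, H_out, W_out = dout.shape

    x_pad = np.pad(x, ((0, 0), (0, 0), (pad, pad), (pad, pad)), mode='constant')
    dx_pad, dw = np.zeros_like(x_pad), np.zeros_like(w)
    db = dout.sum(axis=(0, 2, 3))
    for n in range(N):
        for f in range(F):
            for i in range(H_out):
                for j in range(W_out):
                    window = x_pad[n, :, i*stride:i*stride+HH, j*stride:j*stride+WW]
                    dw[f] += window * dout[n, f, i, j]
                    dx_pad[n, :, i*stride:i*stride+HH, j*stride:j*stride+WW] += w[f] * dout[n, f, i, j]
    dx = dx_pad[:, :, pad:pad+H, pad:pad+W]   # отбрасываем паддинг
    return dx, dw, db
```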
\n", + "\n", + "Для компиляции выполните следующую команду в директории scripts\n", + "\n", + "```bash\n", + "python setup.py build_ext --inplace\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Сравните ваши реализации слоев свертки и пуллинга с быстрыми реализациями." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Rel errors should be around e-9 or less\n", + "from scripts.fast_layers import conv_forward_fast, conv_backward_fast\n", + "from time import time\n", + "np.random.seed(231)\n", + "x = np.random.randn(100, 3, 31, 31)\n", + "w = np.random.randn(25, 3, 3, 3)\n", + "b = np.random.randn(25,)\n", + "dout = np.random.randn(100, 25, 16, 16)\n", + "conv_param = {'stride': 2, 'pad': 1}\n", + "\n", + "t0 = time()\n", + "out_naive, cache_naive = conv_forward_naive(x, w, b, conv_param)\n", + "t1 = time()\n", + "out_fast, cache_fast = conv_forward_fast(x, w, b, conv_param)\n", + "t2 = time()\n", + "\n", + "print('Testing conv_forward_fast:')\n", + "print('Naive: %fs' % (t1 - t0))\n", + "print('Fast: %fs' % (t2 - t1))\n", + "print('Speedup: %fx' % ((t1 - t0) / (t2 - t1)))\n", + "print('Difference: ', rel_error(out_naive, out_fast))\n", + "\n", + "t0 = time()\n", + "dx_naive, dw_naive, db_naive = conv_backward_naive(dout, cache_naive)\n", + "t1 = time()\n", + "dx_fast, dw_fast, db_fast = conv_backward_fast(dout, cache_fast)\n", + "t2 = time()\n", + "\n", + "print('\\nTesting conv_backward_fast:')\n", + "print('Naive: %fs' % (t1 - t0))\n", + "print('Fast: %fs' % (t2 - t1))\n", + "print('Speedup: %fx' % ((t1 - t0) / (t2 - t1)))\n", + "print('dx difference: ', rel_error(dx_naive, dx_fast))\n", + "print('dw difference: ', rel_error(dw_naive, dw_fast))\n", + "print('db difference: ', rel_error(db_naive, db_fast))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Relative errors should be close to 0.0\n", + "from scripts.fast_layers import max_pool_forward_fast, max_pool_backward_fast\n", + "np.random.seed(231)\n", + "x = np.random.randn(100, 3, 32, 32)\n", + "dout = np.random.randn(100, 3, 16, 16)\n", + "pool_param = {'pool_height': 2, 'pool_width': 2, 'stride': 2}\n", + "\n", + "t0 = time()\n", + "out_naive, cache_naive = max_pool_forward_naive(x, pool_param)\n", + "t1 = time()\n", + "out_fast, cache_fast = max_pool_forward_fast(x, pool_param)\n", + "t2 = time()\n", + "\n", + "print('Testing pool_forward_fast:')\n", + "print('Naive: %fs' % (t1 - t0))\n", + "print('fast: %fs' % (t2 - t1))\n", + "print('speedup: %fx' % ((t1 - t0) / (t2 - t1)))\n", + "print('difference: ', rel_error(out_naive, out_fast))\n", + "\n", + "t0 = time()\n", + "dx_naive = max_pool_backward_naive(dout, cache_naive)\n", + "t1 = time()\n", + "dx_fast = max_pool_backward_fast(dout, cache_fast)\n", + "t2 = time()\n", + "\n", + "print('\\nTesting pool_backward_fast:')\n", + "print('Naive: %fs' % (t1 - t0))\n", + "print('fast: %fs' % (t2 - t1))\n", + "print('speedup: %fx' % ((t1 - t0) / (t2 - t1)))\n", + "print('dx difference: ', rel_error(dx_naive, dx_fast))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "В layer_utils.py вы можете найти часто используемые комбинации слоев, используемых в сверточных сетях. 
Ознакомьтесь с ними и запустите код ниже для проверки их работы" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from scripts.layer_utils import conv_relu_pool_forward, conv_relu_pool_backward\n", + "np.random.seed(231)\n", + "x = np.random.randn(2, 3, 16, 16)\n", + "w = np.random.randn(3, 3, 3, 3)\n", + "b = np.random.randn(3,)\n", + "dout = np.random.randn(2, 3, 8, 8)\n", + "conv_param = {'stride': 1, 'pad': 1}\n", + "pool_param = {'pool_height': 2, 'pool_width': 2, 'stride': 2}\n", + "\n", + "out, cache = conv_relu_pool_forward(x, w, b, conv_param, pool_param)\n", + "dx, dw, db = conv_relu_pool_backward(dout, cache)\n", + "\n", + "dx_num = eval_numerical_gradient_array(lambda x: conv_relu_pool_forward(x, w, b, conv_param, pool_param)[0], x, dout)\n", + "dw_num = eval_numerical_gradient_array(lambda w: conv_relu_pool_forward(x, w, b, conv_param, pool_param)[0], w, dout)\n", + "db_num = eval_numerical_gradient_array(lambda b: conv_relu_pool_forward(x, w, b, conv_param, pool_param)[0], b, dout)\n", + "\n", + "# Relative errors should be around e-8 or less\n", + "print('Testing conv_relu_pool')\n", + "print('dx error: ', rel_error(dx_num, dx))\n", + "print('dw error: ', rel_error(dw_num, dw))\n", + "print('db error: ', rel_error(db_num, db))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from scripts.layer_utils import conv_relu_forward, conv_relu_backward\n", + "np.random.seed(231)\n", + "x = np.random.randn(2, 3, 8, 8)\n", + "w = np.random.randn(3, 3, 3, 3)\n", + "b = np.random.randn(3,)\n", + "dout = np.random.randn(2, 3, 8, 8)\n", + "conv_param = {'stride': 1, 'pad': 1}\n", + "\n", + "out, cache = conv_relu_forward(x, w, b, conv_param)\n", + "dx, dw, db = conv_relu_backward(dout, cache)\n", + "\n", + "dx_num = eval_numerical_gradient_array(lambda x: conv_relu_forward(x, w, b, conv_param)[0], x, dout)\n", + "dw_num = eval_numerical_gradient_array(lambda w: conv_relu_forward(x, w, b, conv_param)[0], w, dout)\n", + "db_num = eval_numerical_gradient_array(lambda b: conv_relu_forward(x, w, b, conv_param)[0], b, dout)\n", + "\n", + "# Relative errors should be around e-8 or less\n", + "print('Testing conv_relu:')\n", + "print('dx error: ', rel_error(dx_num, dx))\n", + "print('dw error: ', rel_error(dw_num, dw))\n", + "print('db error: ', rel_error(db_num, db))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Напишите реализацию класса ThreeLayerConvNet в scripts/classifiers/cnn.py . Вы можете использовать готовые реализации слоев и их комбинаций." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Проверьте вашу реализацию. Ожидается, что значение функции потерь softmax будет порядка `log(C)` для `C` классов для случая без регуляризации. В случае регуляризации значение функции потерь должно немного возрасти. 
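Быстрая численная прикидка к этой проверке: при случайной инициализации softmax даёт примерно равномерное распределение по классам, поэтому потеря близка к -log(1/C) = log(C); для C = 10 это около 2.3.

```python
import numpy as np

C = 10                 # число классов
print(np.log(C))       # ~2.3026 — ожидаемый порядок начальной потери без регуляризации
```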
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = ThreeLayerConvNet()\n", + "\n", + "N = 50\n", + "X = np.random.randn(N, 3, 32, 32)\n", + "y = np.random.randint(10, size=N)\n", + "\n", + "loss, grads = model.loss(X, y)\n", + "print('Initial loss (no regularization): ', loss)\n", + "\n", + "model.reg = 0.5\n", + "loss, grads = model.loss(X, y)\n", + "print('Initial loss (with regularization): ', loss)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Проверьте реализацию обратного прохода" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "num_inputs = 2\n", + "input_dim = (3, 16, 16)\n", + "reg = 0.0\n", + "num_classes = 10\n", + "np.random.seed(231)\n", + "X = np.random.randn(num_inputs, *input_dim)\n", + "y = np.random.randint(num_classes, size=num_inputs)\n", + "\n", + "model = ThreeLayerConvNet(num_filters=3, filter_size=3,\n", + " input_dim=input_dim, hidden_dim=7,\n", + " dtype=np.float64)\n", + "loss, grads = model.loss(X, y)\n", + "# Errors should be small, but correct implementations may have\n", + "# relative errors up to the order of e-2\n", + "for param_name in sorted(grads):\n", + " f = lambda _: model.loss(X, y)[0]\n", + " param_grad_num = eval_numerical_gradient(f, model.params[param_name], verbose=False, h=1e-6)\n", + " e = rel_error(param_grad_num, grads[param_name])\n", + " print('%s max relative error: %e' % (param_name, rel_error(param_grad_num, grads[param_name])))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Попробуйте добиться эффекта переобучения. Обучите модель на небольшом наборе данных.Сравните значения accuracy на обучающих данных и на валидационных. 
Визуализируйте графики обучения " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.random.seed(231)\n", + "\n", + "num_train = 100\n", + "small_data = {\n", + " 'X_train': data['X_train'][:num_train],\n", + " 'y_train': data['y_train'][:num_train],\n", + " 'X_val': data['X_val'],\n", + " 'y_val': data['y_val'],\n", + "}\n", + "\n", + "model = ThreeLayerConvNet(weight_scale=1e-2)\n", + "\n", + "solver = Solver(model, small_data,\n", + " num_epochs=15, batch_size=50,\n", + " update_rule='adam',\n", + " optim_config={\n", + " 'learning_rate': 1e-3,\n", + " },\n", + " verbose=True, print_every=1)\n", + "solver.train()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Print final training accuracy\n", + "print(\n", + " \"Small data training accuracy:\",\n", + " solver.check_accuracy(small_data['X_train'], small_data['y_train'])\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Print final validation accuracy\n", + "print(\n", + " \"Small data validation accuracy:\",\n", + " solver.check_accuracy(small_data['X_val'], small_data['y_val'])\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.subplot(2, 1, 1)\n", + "plt.plot(solver.loss_history, 'o')\n", + "plt.xlabel('iteration')\n", + "plt.ylabel('loss')\n", + "\n", + "plt.subplot(2, 1, 2)\n", + "plt.plot(solver.train_acc_history, '-o')\n", + "plt.plot(solver.val_acc_history, '-o')\n", + "plt.legend(['train', 'val'], loc='upper left')\n", + "plt.xlabel('epoch')\n", + "plt.ylabel('accuracy')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Обучите сеть на полном наборе данных. 
Выведите accuracy на обучающей и валидационной выборках" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = ThreeLayerConvNet(weight_scale=0.001, hidden_dim=500, reg=0.001)\n", + "\n", + "solver = Solver(model, data,\n", + " num_epochs=1, batch_size=50,\n", + " update_rule='adam',\n", + " optim_config={\n", + " 'learning_rate': 1e-3,\n", + " },\n", + " verbose=True, print_every=20)\n", + "solver.train()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Print final training accuracy\n", + "print(\n", + " \"Full data training accuracy:\",\n", + " solver.check_accuracy(small_data['X_train'], small_data['y_train'])\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Print final validation accuracy\n", + "print(\n", + " \"Full data validation accuracy:\",\n", + " solver.check_accuracy(data['X_val'], data['y_val'])\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Визуализируйте фильтры на первом слое обученной сети" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from scripts.vis_utils import visualize_grid\n", + "\n", + "grid = visualize_grid(model.params['W1'].transpose(0, 2, 3, 1))\n", + "plt.imshow(grid.astype('uint8'))\n", + "plt.axis('off')\n", + "plt.gcf().set_size_inches(5, 5)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/lab_3/scripts/__init__.py b/lab_3/scripts/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lab_3/scripts/classifiers/__init__.py b/lab_3/scripts/classifiers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lab_3/scripts/classifiers/cnn.py b/lab_3/scripts/classifiers/cnn.py new file mode 100644 index 0000000..08894d6 --- /dev/null +++ b/lab_3/scripts/classifiers/cnn.py @@ -0,0 +1,135 @@ +from builtins import object +import numpy as np + +from ..layers import * +from ..fast_layers import * +from ..layer_utils import * + + +class ThreeLayerConvNet(object): + """ + A three-layer convolutional network with the following architecture: + + conv - relu - 2x2 max pool - affine - relu - affine - softmax + + The network operates on minibatches of data that have shape (N, C, H, W) + consisting of N images, each with height H and width W and with C input + channels. + """ + + def __init__( + self, + input_dim=(3, 32, 32), + num_filters=32, + filter_size=7, + hidden_dim=100, + num_classes=10, + weight_scale=1e-3, + reg=0.0, + dtype=np.float32, + ): + """ + Initialize a new network. 
+ + Inputs: + - input_dim: Tuple (C, H, W) giving size of input data + - num_filters: Number of filters to use in the convolutional layer + - filter_size: Width/height of filters to use in the convolutional layer + - hidden_dim: Number of units to use in the fully-connected hidden layer + - num_classes: Number of scores to produce from the final affine layer. + - weight_scale: Scalar giving standard deviation for random initialization + of weights. + - reg: Scalar giving L2 regularization strength + - dtype: numpy datatype to use for computation. + """ + self.params = {} + self.reg = reg + self.dtype = dtype + + ############################################################################ + # TODO: Initialize weights and biases for the three-layer convolutional # + # network. Weights should be initialized from a Gaussian centered at 0.0 # + # with standard deviation equal to weight_scale; biases should be # + # initialized to zero. All weights and biases should be stored in the # + # dictionary self.params. Store weights and biases for the convolutional # + # layer using the keys 'W1' and 'b1'; use keys 'W2' and 'b2' for the # + # weights and biases of the hidden affine layer, and keys 'W3' and 'b3' # + # for the weights and biases of the output affine layer. # + # # + # IMPORTANT: For this assignment, you can assume that the padding # + # and stride of the first convolutional layer are chosen so that # + # **the width and height of the input are preserved**. Take a look at # + # the start of the loss() function to see how that happens. # + ############################################################################ + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ############################################################################ + # END OF YOUR CODE # + ############################################################################ + + for k, v in self.params.items(): + self.params[k] = v.astype(dtype) + + def loss(self, X, y=None): + """ + Evaluate loss and gradient for the three-layer convolutional network. + + Input / output: Same API as TwoLayerNet in fc_net.py. + """ + W1, b1 = self.params["W1"], self.params["b1"] + W2, b2 = self.params["W2"], self.params["b2"] + W3, b3 = self.params["W3"], self.params["b3"] + + # pass conv_param to the forward pass for the convolutional layer + # Padding and stride chosen to preserve the input spatial size + filter_size = W1.shape[2] + conv_param = {"stride": 1, "pad": (filter_size - 1) // 2} + + # pass pool_param to the forward pass for the max-pooling layer + pool_param = {"pool_height": 2, "pool_width": 2, "stride": 2} + + scores = None + ############################################################################ + # TODO: Implement the forward pass for the three-layer convolutional net, # + # computing the class scores for X and storing them in the scores # + # variable. # + # # + # Remember you can use the functions defined in cs231n/fast_layers.py and # + # cs231n/layer_utils.py in your implementation (already imported). 
# + ############################################################################ + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ############################################################################ + # END OF YOUR CODE # + ############################################################################ + + if y is None: + return scores + + loss, grads = 0, {} + ############################################################################ + # TODO: Implement the backward pass for the three-layer convolutional net, # + # storing the loss and gradients in the loss and grads variables. Compute # + # data loss using softmax, and make sure that grads[k] holds the gradients # + # for self.params[k]. Don't forget to add L2 regularization! # + # # + # NOTE: To ensure that your implementation matches ours and you pass the # + # automated tests, make sure that your L2 regularization includes a factor # + # of 0.5 to simplify the expression for the gradient. # + ############################################################################ + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ############################################################################ + # END OF YOUR CODE # + ############################################################################ + + return loss, grads diff --git a/lab_3/scripts/classifiers/fc_net.py b/lab_3/scripts/classifiers/fc_net.py new file mode 100644 index 0000000..8b71b0f --- /dev/null +++ b/lab_3/scripts/classifiers/fc_net.py @@ -0,0 +1,291 @@ +from builtins import range +from builtins import object +import numpy as np + +from ..layers import * +from ..layer_utils import * + + +class TwoLayerNet(object): + """ + A two-layer fully-connected neural network with ReLU nonlinearity and + softmax loss that uses a modular layer design. We assume an input dimension + of D, a hidden dimension of H, and perform classification over C classes. + + The architecure should be affine - relu - affine - softmax. + + Note that this class does not implement gradient descent; instead, it + will interact with a separate Solver object that is responsible for running + optimization. + + The learnable parameters of the model are stored in the dictionary + self.params that maps parameter names to numpy arrays. + """ + + def __init__( + self, + input_dim=3 * 32 * 32, + hidden_dim=100, + num_classes=10, + weight_scale=1e-3, + reg=0.0, + ): + """ + Initialize a new network. + + Inputs: + - input_dim: An integer giving the size of the input + - hidden_dim: An integer giving the size of the hidden layer + - num_classes: An integer giving the number of classes to classify + - weight_scale: Scalar giving the standard deviation for random + initialization of the weights. + - reg: Scalar giving L2 regularization strength. + """ + self.params = {} + self.reg = reg + + ############################################################################ + # TODO: Initialize the weights and biases of the two-layer net. Weights # + # should be initialized from a Gaussian centered at 0.0 with # + # standard deviation equal to weight_scale, and biases should be # + # initialized to zero. All weights and biases should be stored in the # + # dictionary self.params, with first layer weights # + # and biases using the keys 'W1' and 'b1' and second layer # + # weights and biases using the keys 'W2' and 'b2'. 
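For reference, here is one way the TODO blocks of `ThreeLayerConvNet` above could be filled in. This is a sketch rather than the reference solution; it assumes the sandwich helpers from `scripts/layer_utils.py` (shown later in this diff) and a `softmax_loss` function in `scripts/layers.py`:

```python
# __init__: draw weights from N(0, weight_scale), set biases to zero.
C, H, W = input_dim
self.params['W1'] = weight_scale * np.random.randn(num_filters, C, filter_size, filter_size)
self.params['b1'] = np.zeros(num_filters)
# The conv layer preserves H x W (pad = (filter_size - 1) // 2, stride 1),
# so after 2x2 max pooling the feature map is num_filters x H/2 x W/2.
self.params['W2'] = weight_scale * np.random.randn(num_filters * (H // 2) * (W // 2), hidden_dim)
self.params['b2'] = np.zeros(hidden_dim)
self.params['W3'] = weight_scale * np.random.randn(hidden_dim, num_classes)
self.params['b3'] = np.zeros(num_classes)

# loss(): forward pass, conv - relu - 2x2 max pool - affine - relu - affine.
out1, cache1 = conv_relu_pool_forward(X, W1, b1, conv_param, pool_param)
out2, cache2 = affine_relu_forward(out1, W2, b2)
scores, cache3 = affine_forward(out2, W3, b3)

# loss(): backward pass, softmax data loss plus 0.5 * reg * L2 penalty.
loss, dscores = softmax_loss(scores, y)
loss += 0.5 * self.reg * (np.sum(W1 * W1) + np.sum(W2 * W2) + np.sum(W3 * W3))
dout2, grads['W3'], grads['b3'] = affine_backward(dscores, cache3)
dout1, grads['W2'], grads['b2'] = affine_relu_backward(dout2, cache2)
dX, grads['W1'], grads['b1'] = conv_relu_pool_backward(dout1, cache1)
grads['W1'] += self.reg * W1
grads['W2'] += self.reg * W2
grads['W3'] += self.reg * W3
```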
# + ############################################################################ + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ############################################################################ + # END OF YOUR CODE # + ############################################################################ + + def loss(self, X, y=None): + """ + Compute loss and gradient for a minibatch of data. + + Inputs: + - X: Array of input data of shape (N, d_1, ..., d_k) + - y: Array of labels, of shape (N,). y[i] gives the label for X[i]. + + Returns: + If y is None, then run a test-time forward pass of the model and return: + - scores: Array of shape (N, C) giving classification scores, where + scores[i, c] is the classification score for X[i] and class c. + + If y is not None, then run a training-time forward and backward pass and + return a tuple of: + - loss: Scalar value giving the loss + - grads: Dictionary with the same keys as self.params, mapping parameter + names to gradients of the loss with respect to those parameters. + """ + scores = None + ############################################################################ + # TODO: Implement the forward pass for the two-layer net, computing the # + # class scores for X and storing them in the scores variable. # + ############################################################################ + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ############################################################################ + # END OF YOUR CODE # + ############################################################################ + + # If y is None then we are in test mode so just return scores + if y is None: + return scores + + loss, grads = 0, {} + ############################################################################ + # TODO: Implement the backward pass for the two-layer net. Store the loss # + # in the loss variable and gradients in the grads dictionary. Compute data # + # loss using softmax, and make sure that grads[k] holds the gradients for # + # self.params[k]. Don't forget to add L2 regularization! # + # # + # NOTE: To ensure that your implementation matches ours and you pass the # + # automated tests, make sure that your L2 regularization includes a factor # + # of 0.5 to simplify the expression for the gradient. # + ############################################################################ + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ############################################################################ + # END OF YOUR CODE # + ############################################################################ + + return loss, grads + + +class FullyConnectedNet(object): + """ + A fully-connected neural network with an arbitrary number of hidden layers, + ReLU nonlinearities, and a softmax loss function. This will also implement + dropout and batch/layer normalization as options. For a network with L layers, + the architecture will be + + {affine - [batch/layer norm] - relu - [dropout]} x (L - 1) - affine - softmax + + where batch/layer normalization and dropout are optional, and the {...} block is + repeated L - 1 times. 
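Likewise, a hedged sketch of the `TwoLayerNet` TODOs above (affine - relu - affine - softmax), again assuming `affine_relu_forward` / `affine_relu_backward` from `scripts/layer_utils.py` and `softmax_loss` from `scripts/layers.py`:

```python
# __init__
self.params['W1'] = weight_scale * np.random.randn(input_dim, hidden_dim)
self.params['b1'] = np.zeros(hidden_dim)
self.params['W2'] = weight_scale * np.random.randn(hidden_dim, num_classes)
self.params['b2'] = np.zeros(num_classes)

# loss(): forward
h, cache1 = affine_relu_forward(X, self.params['W1'], self.params['b1'])
scores, cache2 = affine_forward(h, self.params['W2'], self.params['b2'])

# loss(): backward (0.5 factor on the L2 term, as the comment above requires)
loss, dscores = softmax_loss(scores, y)
loss += 0.5 * self.reg * (np.sum(self.params['W1'] ** 2) + np.sum(self.params['W2'] ** 2))
dh, grads['W2'], grads['b2'] = affine_backward(dscores, cache2)
dx, grads['W1'], grads['b1'] = affine_relu_backward(dh, cache1)
grads['W1'] += self.reg * self.params['W1']
grads['W2'] += self.reg * self.params['W2']
```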
+ + Similar to the TwoLayerNet above, learnable parameters are stored in the + self.params dictionary and will be learned using the Solver class. + """ + + def __init__( + self, + hidden_dims, + input_dim=3 * 32 * 32, + num_classes=10, + dropout=1, + normalization=None, + reg=0.0, + weight_scale=1e-2, + dtype=np.float32, + seed=None, + ): + """ + Initialize a new FullyConnectedNet. + + Inputs: + - hidden_dims: A list of integers giving the size of each hidden layer. + - input_dim: An integer giving the size of the input. + - num_classes: An integer giving the number of classes to classify. + - dropout: Scalar between 0 and 1 giving dropout strength. If dropout=1 then + the network should not use dropout at all. + - normalization: What type of normalization the network should use. Valid values + are "batchnorm", "layernorm", or None for no normalization (the default). + - reg: Scalar giving L2 regularization strength. + - weight_scale: Scalar giving the standard deviation for random + initialization of the weights. + - dtype: A numpy datatype object; all computations will be performed using + this datatype. float32 is faster but less accurate, so you should use + float64 for numeric gradient checking. + - seed: If not None, then pass this random seed to the dropout layers. This + will make the dropout layers deteriminstic so we can gradient check the + model. + """ + self.normalization = normalization + self.use_dropout = dropout != 1 + self.reg = reg + self.num_layers = 1 + len(hidden_dims) + self.dtype = dtype + self.params = {} + + ############################################################################ + # TODO: Initialize the parameters of the network, storing all values in # + # the self.params dictionary. Store weights and biases for the first layer # + # in W1 and b1; for the second layer use W2 and b2, etc. Weights should be # + # initialized from a normal distribution centered at 0 with standard # + # deviation equal to weight_scale. Biases should be initialized to zero. # + # # + # When using batch normalization, store scale and shift parameters for the # + # first layer in gamma1 and beta1; for the second layer use gamma2 and # + # beta2, etc. Scale parameters should be initialized to ones and shift # + # parameters should be initialized to zeros. # + ############################################################################ + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ############################################################################ + # END OF YOUR CODE # + ############################################################################ + + # When using dropout we need to pass a dropout_param dictionary to each + # dropout layer so that the layer knows the dropout probability and the mode + # (train / test). You can pass the same dropout_param to each dropout layer. + self.dropout_param = {} + if self.use_dropout: + self.dropout_param = {"mode": "train", "p": dropout} + if seed is not None: + self.dropout_param["seed"] = seed + + # With batch normalization we need to keep track of running means and + # variances, so we need to pass a special bn_param object to each batch + # normalization layer. You should pass self.bn_params[0] to the forward pass + # of the first batch normalization layer, self.bn_params[1] to the forward + # pass of the second batch normalization layer, etc. 
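One possible fill-in for the parameter-initialization TODO above, chaining the layer sizes as input_dim, hidden_dims..., num_classes and creating gamma/beta only for the hidden layers when normalization is enabled (a sketch, not the reference solution):

```python
dims = [input_dim] + list(hidden_dims) + [num_classes]
for i in range(self.num_layers):
    self.params['W%d' % (i + 1)] = weight_scale * np.random.randn(dims[i], dims[i + 1])
    self.params['b%d' % (i + 1)] = np.zeros(dims[i + 1])
    # The last (output) affine layer has no scale/shift parameters.
    if self.normalization is not None and i < self.num_layers - 1:
        self.params['gamma%d' % (i + 1)] = np.ones(dims[i + 1])
        self.params['beta%d' % (i + 1)] = np.zeros(dims[i + 1])
```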
+ self.bn_params = [] + if self.normalization == "batchnorm": + self.bn_params = [{"mode": "train"} for i in range(self.num_layers - 1)] + if self.normalization == "layernorm": + self.bn_params = [{} for i in range(self.num_layers - 1)] + + # Cast all parameters to the correct datatype + for k, v in self.params.items(): + self.params[k] = v.astype(dtype) + + def loss(self, X, y=None): + """ + Compute loss and gradient for the fully-connected net. + + Input / output: Same as TwoLayerNet above. + """ + X = X.astype(self.dtype) + mode = "test" if y is None else "train" + + # Set train/test mode for batchnorm params and dropout param since they + # behave differently during training and testing. + if self.use_dropout: + self.dropout_param["mode"] = mode + if self.normalization == "batchnorm": + for bn_param in self.bn_params: + bn_param["mode"] = mode + scores = None + ############################################################################ + # TODO: Implement the forward pass for the fully-connected net, computing # + # the class scores for X and storing them in the scores variable. # + # # + # When using dropout, you'll need to pass self.dropout_param to each # + # dropout forward pass. # + # # + # When using batch normalization, you'll need to pass self.bn_params[0] to # + # the forward pass for the first batch normalization layer, pass # + # self.bn_params[1] to the forward pass for the second batch normalization # + # layer, etc. # + ############################################################################ + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ############################################################################ + # END OF YOUR CODE # + ############################################################################ + + # If test mode return early + if mode == "test": + return scores + + loss, grads = 0.0, {} + ############################################################################ + # TODO: Implement the backward pass for the fully-connected net. Store the # + # loss in the loss variable and gradients in the grads dictionary. Compute # + # data loss using softmax, and make sure that grads[k] holds the gradients # + # for self.params[k]. Don't forget to add L2 regularization! # + # # + # When using batch/layer normalization, you don't need to regularize the scale # + # and shift parameters. # + # # + # NOTE: To ensure that your implementation matches ours and you pass the # + # automated tests, make sure that your L2 regularization includes a factor # + # of 0.5 to simplify the expression for the gradient. 
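A simplified sketch of the forward/backward TODOs for the plain case (no normalization, no dropout), to make the intended layer chaining concrete; the optional batch/layer norm and dropout branches follow the same pattern, inserting the corresponding `*_forward` / `*_backward` calls with `self.bn_params[i - 1]` and `self.dropout_param`:

```python
# Forward: {affine - relu} x (L - 1) - affine
caches = {}
h = X
for i in range(1, self.num_layers):
    h, caches[i] = affine_relu_forward(h, self.params['W%d' % i], self.params['b%d' % i])
L = self.num_layers
scores, caches[L] = affine_forward(h, self.params['W%d' % L], self.params['b%d' % L])

# Backward: softmax data loss plus 0.5 * reg * sum of squared weights
loss, dout = softmax_loss(scores, y)
for i in range(1, L + 1):
    loss += 0.5 * self.reg * np.sum(self.params['W%d' % i] ** 2)
dout, dW, db = affine_backward(dout, caches[L])
grads['W%d' % L] = dW + self.reg * self.params['W%d' % L]
grads['b%d' % L] = db
for i in range(L - 1, 0, -1):
    dout, dW, db = affine_relu_backward(dout, caches[i])
    grads['W%d' % i] = dW + self.reg * self.params['W%d' % i]
    grads['b%d' % i] = db
```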
# + ############################################################################ + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ############################################################################ + # END OF YOUR CODE # + ############################################################################ + + return loss, grads diff --git a/lab_3/scripts/data_utils.py b/lab_3/scripts/data_utils.py new file mode 100644 index 0000000..e88cfec --- /dev/null +++ b/lab_3/scripts/data_utils.py @@ -0,0 +1,270 @@ +from __future__ import print_function + +from builtins import range +from six.moves import cPickle as pickle +import numpy as np +import os +from imageio import imread +import platform + + +def load_pickle(f): + version = platform.python_version_tuple() + if version[0] == "2": + return pickle.load(f) + elif version[0] == "3": + return pickle.load(f, encoding="latin1") + raise ValueError("invalid python version: {}".format(version)) + + +def load_CIFAR_batch(filename): + """ load single batch of cifar """ + with open(filename, "rb") as f: + datadict = load_pickle(f) + X = datadict["data"] + Y = datadict["labels"] + X = X.reshape(10000, 3, 32, 32).transpose(0, 2, 3, 1).astype("float") + Y = np.array(Y) + return X, Y + + +def load_CIFAR10(ROOT): + """ load all of cifar """ + xs = [] + ys = [] + for b in range(1, 6): + f = os.path.join(ROOT, "data_batch_%d" % (b,)) + X, Y = load_CIFAR_batch(f) + xs.append(X) + ys.append(Y) + Xtr = np.concatenate(xs) + Ytr = np.concatenate(ys) + del X, Y + Xte, Yte = load_CIFAR_batch(os.path.join(ROOT, "test_batch")) + return Xtr, Ytr, Xte, Yte + + +def get_CIFAR10_data( + num_training=49000, num_validation=1000, num_test=1000, subtract_mean=True +): + """ + Load the CIFAR-10 dataset from disk and perform preprocessing to prepare + it for classifiers. These are the same steps as we used for the SVM, but + condensed to a single function. + """ + # Load the raw CIFAR-10 data + cifar10_dir = os.path.join( + os.path.dirname(__file__), "datasets/cifar-10-batches-py" + ) + X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir) + + # Subsample the data + mask = list(range(num_training, num_training + num_validation)) + X_val = X_train[mask] + y_val = y_train[mask] + mask = list(range(num_training)) + X_train = X_train[mask] + y_train = y_train[mask] + mask = list(range(num_test)) + X_test = X_test[mask] + y_test = y_test[mask] + + # Normalize the data: subtract the mean image + if subtract_mean: + mean_image = np.mean(X_train, axis=0) + X_train -= mean_image + X_val -= mean_image + X_test -= mean_image + + # Transpose so that channels come first + X_train = X_train.transpose(0, 3, 1, 2).copy() + X_val = X_val.transpose(0, 3, 1, 2).copy() + X_test = X_test.transpose(0, 3, 1, 2).copy() + + # Package data into a dictionary + return { + "X_train": X_train, + "y_train": y_train, + "X_val": X_val, + "y_val": y_val, + "X_test": X_test, + "y_test": y_test, + } + + +def load_tiny_imagenet(path, dtype=np.float32, subtract_mean=True): + """ + Load TinyImageNet. Each of TinyImageNet-100-A, TinyImageNet-100-B, and + TinyImageNet-200 have the same directory structure, so this can be used + to load any of them. + + Inputs: + - path: String giving path to the directory to load. + - dtype: numpy datatype used to load the data. + - subtract_mean: Whether to subtract the mean training image. 
+ + Returns: A dictionary with the following entries: + - class_names: A list where class_names[i] is a list of strings giving the + WordNet names for class i in the loaded dataset. + - X_train: (N_tr, 3, 64, 64) array of training images + - y_train: (N_tr,) array of training labels + - X_val: (N_val, 3, 64, 64) array of validation images + - y_val: (N_val,) array of validation labels + - X_test: (N_test, 3, 64, 64) array of testing images. + - y_test: (N_test,) array of test labels; if test labels are not available + (such as in student code) then y_test will be None. + - mean_image: (3, 64, 64) array giving mean training image + """ + # First load wnids + with open(os.path.join(path, "wnids.txt"), "r") as f: + wnids = [x.strip() for x in f] + + # Map wnids to integer labels + wnid_to_label = {wnid: i for i, wnid in enumerate(wnids)} + + # Use words.txt to get names for each class + with open(os.path.join(path, "words.txt"), "r") as f: + wnid_to_words = dict(line.split("\t") for line in f) + for wnid, words in wnid_to_words.items(): + wnid_to_words[wnid] = [w.strip() for w in words.split(",")] + class_names = [wnid_to_words[wnid] for wnid in wnids] + + # Next load training data. + X_train = [] + y_train = [] + for i, wnid in enumerate(wnids): + if (i + 1) % 20 == 0: + print("loading training data for synset %d / %d" % (i + 1, len(wnids))) + # To figure out the filenames we need to open the boxes file + boxes_file = os.path.join(path, "train", wnid, "%s_boxes.txt" % wnid) + with open(boxes_file, "r") as f: + filenames = [x.split("\t")[0] for x in f] + num_images = len(filenames) + + X_train_block = np.zeros((num_images, 3, 64, 64), dtype=dtype) + y_train_block = wnid_to_label[wnid] * np.ones(num_images, dtype=np.int64) + for j, img_file in enumerate(filenames): + img_file = os.path.join(path, "train", wnid, "images", img_file) + img = imread(img_file) + if img.ndim == 2: + ## grayscale file + img.shape = (64, 64, 1) + X_train_block[j] = img.transpose(2, 0, 1) + X_train.append(X_train_block) + y_train.append(y_train_block) + + # We need to concatenate all training data + X_train = np.concatenate(X_train, axis=0) + y_train = np.concatenate(y_train, axis=0) + + # Next load validation data + with open(os.path.join(path, "val", "val_annotations.txt"), "r") as f: + img_files = [] + val_wnids = [] + for line in f: + img_file, wnid = line.split("\t")[:2] + img_files.append(img_file) + val_wnids.append(wnid) + num_val = len(img_files) + y_val = np.array([wnid_to_label[wnid] for wnid in val_wnids]) + X_val = np.zeros((num_val, 3, 64, 64), dtype=dtype) + for i, img_file in enumerate(img_files): + img_file = os.path.join(path, "val", "images", img_file) + img = imread(img_file) + if img.ndim == 2: + img.shape = (64, 64, 1) + X_val[i] = img.transpose(2, 0, 1) + + # Next load test images + # Students won't have test labels, so we need to iterate over files in the + # images directory. 
+ img_files = os.listdir(os.path.join(path, "test", "images")) + X_test = np.zeros((len(img_files), 3, 64, 64), dtype=dtype) + for i, img_file in enumerate(img_files): + img_file = os.path.join(path, "test", "images", img_file) + img = imread(img_file) + if img.ndim == 2: + img.shape = (64, 64, 1) + X_test[i] = img.transpose(2, 0, 1) + + y_test = None + y_test_file = os.path.join(path, "test", "test_annotations.txt") + if os.path.isfile(y_test_file): + with open(y_test_file, "r") as f: + img_file_to_wnid = {} + for line in f: + line = line.split("\t") + img_file_to_wnid[line[0]] = line[1] + y_test = [wnid_to_label[img_file_to_wnid[img_file]] for img_file in img_files] + y_test = np.array(y_test) + + mean_image = X_train.mean(axis=0) + if subtract_mean: + X_train -= mean_image[None] + X_val -= mean_image[None] + X_test -= mean_image[None] + + return { + "class_names": class_names, + "X_train": X_train, + "y_train": y_train, + "X_val": X_val, + "y_val": y_val, + "X_test": X_test, + "y_test": y_test, + "class_names": class_names, + "mean_image": mean_image, + } + + +def load_models(models_dir): + """ + Load saved models from disk. This will attempt to unpickle all files in a + directory; any files that give errors on unpickling (such as README.txt) + will be skipped. + + Inputs: + - models_dir: String giving the path to a directory containing model files. + Each model file is a pickled dictionary with a 'model' field. + + Returns: + A dictionary mapping model file names to models. + """ + models = {} + for model_file in os.listdir(models_dir): + with open(os.path.join(models_dir, model_file), "rb") as f: + try: + models[model_file] = load_pickle(f)["model"] + except pickle.UnpicklingError: + continue + return models + + +def load_imagenet_val(num=None): + """Load a handful of validation images from ImageNet. + + Inputs: + - num: Number of images to load (max of 25) + + Returns: + - X: numpy array with shape [num, 224, 224, 3] + - y: numpy array of integer image labels, shape [num] + - class_names: dict mapping integer label to class name + """ + imagenet_fn = os.path.join( + os.path.dirname(__file__), "datasets/imagenet_val_25.npz" + ) + if not os.path.isfile(imagenet_fn): + print("file %s not found" % imagenet_fn) + print("Run the following:") + print("cd cs231n/datasets") + print("bash get_imagenet_val.sh") + assert False, "Need to download imagenet_val_25.npz" + f = np.load(imagenet_fn) + X = f["X"] + y = f["y"] + class_names = f["label_map"].item() + if num is not None: + X = X[:num] + y = y[:num] + return X, y, class_names diff --git a/lab_3/scripts/datasets/get_datasets.sh b/lab_3/scripts/datasets/get_datasets.sh new file mode 100644 index 0000000..06d4b3c --- /dev/null +++ b/lab_3/scripts/datasets/get_datasets.sh @@ -0,0 +1,5 @@ +if [ ! 
-d "cifar-10-batches-py" ]; then + wget http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz -O cifar-10-python.tar.gz + tar -xzvf cifar-10-python.tar.gz + rm cifar-10-python.tar.gz +fi diff --git a/lab_3/scripts/fast_layers.py b/lab_3/scripts/fast_layers.py new file mode 100644 index 0000000..46797f2 --- /dev/null +++ b/lab_3/scripts/fast_layers.py @@ -0,0 +1,283 @@ +from __future__ import print_function +import numpy as np + +try: + from .im2col_cython import col2im_cython, im2col_cython + from .im2col_cython import col2im_6d_cython +except ImportError: + print("""=========== You can safely ignore the message below if you are NOT working on ConvolutionalNetworks.ipynb ===========""") + print("\tYou will need to compile a Cython extension for a portion of this assignment.") + print("\tThe instructions to do this will be given in a section of the notebook below.") + print("\tThere will be an option for Colab users and another for Jupyter (local) users.") + +from .im2col import * + + +def conv_forward_im2col(x, w, b, conv_param): + """ + A fast implementation of the forward pass for a convolutional layer + based on im2col and col2im. + """ + N, C, H, W = x.shape + num_filters, _, filter_height, filter_width = w.shape + stride, pad = conv_param["stride"], conv_param["pad"] + + # Check dimensions + assert (W + 2 * pad - filter_width) % stride == 0, "width does not work" + assert (H + 2 * pad - filter_height) % stride == 0, "height does not work" + + # Create output + out_height = (H + 2 * pad - filter_height) // stride + 1 + out_width = (W + 2 * pad - filter_width) // stride + 1 + out = np.zeros((N, num_filters, out_height, out_width), dtype=x.dtype) + + # x_cols = im2col_indices(x, w.shape[2], w.shape[3], pad, stride) + x_cols = im2col_cython(x, w.shape[2], w.shape[3], pad, stride) + res = w.reshape((w.shape[0], -1)).dot(x_cols) + b.reshape(-1, 1) + + out = res.reshape(w.shape[0], out.shape[2], out.shape[3], x.shape[0]) + out = out.transpose(3, 0, 1, 2) + + cache = (x, w, b, conv_param, x_cols) + return out, cache + + +def conv_forward_strides(x, w, b, conv_param): + N, C, H, W = x.shape + F, _, HH, WW = w.shape + stride, pad = conv_param["stride"], conv_param["pad"] + + # Check dimensions + # assert (W + 2 * pad - WW) % stride == 0, 'width does not work' + # assert (H + 2 * pad - HH) % stride == 0, 'height does not work' + + # Pad the input + p = pad + x_padded = np.pad(x, ((0, 0), (0, 0), (p, p), (p, p)), mode="constant") + + # Figure out output dimensions + H += 2 * pad + W += 2 * pad + out_h = (H - HH) // stride + 1 + out_w = (W - WW) // stride + 1 + + # Perform an im2col operation by picking clever strides + shape = (C, HH, WW, N, out_h, out_w) + strides = (H * W, W, 1, C * H * W, stride * W, stride) + strides = x.itemsize * np.array(strides) + x_stride = np.lib.stride_tricks.as_strided(x_padded, shape=shape, strides=strides) + x_cols = np.ascontiguousarray(x_stride) + x_cols.shape = (C * HH * WW, N * out_h * out_w) + + # Now all our convolutions are a big matrix multiply + res = w.reshape(F, -1).dot(x_cols) + b.reshape(-1, 1) + + # Reshape the output + res.shape = (F, N, out_h, out_w) + out = res.transpose(1, 0, 2, 3) + + # Be nice and return a contiguous array + # The old version of conv_forward_fast doesn't do this, so for a fair + # comparison we won't either + out = np.ascontiguousarray(out) + + cache = (x, w, b, conv_param, x_cols) + return out, cache + + +def conv_backward_strides(dout, cache): + x, w, b, conv_param, x_cols = cache + stride, pad = conv_param["stride"], 
conv_param["pad"] + + N, C, H, W = x.shape + F, _, HH, WW = w.shape + _, _, out_h, out_w = dout.shape + + db = np.sum(dout, axis=(0, 2, 3)) + + dout_reshaped = dout.transpose(1, 0, 2, 3).reshape(F, -1) + dw = dout_reshaped.dot(x_cols.T).reshape(w.shape) + + dx_cols = w.reshape(F, -1).T.dot(dout_reshaped) + dx_cols.shape = (C, HH, WW, N, out_h, out_w) + dx = col2im_6d_cython(dx_cols, N, C, H, W, HH, WW, pad, stride) + + return dx, dw, db + + +def conv_backward_im2col(dout, cache): + """ + A fast implementation of the backward pass for a convolutional layer + based on im2col and col2im. + """ + x, w, b, conv_param, x_cols = cache + stride, pad = conv_param["stride"], conv_param["pad"] + + db = np.sum(dout, axis=(0, 2, 3)) + + num_filters, _, filter_height, filter_width = w.shape + dout_reshaped = dout.transpose(1, 2, 3, 0).reshape(num_filters, -1) + dw = dout_reshaped.dot(x_cols.T).reshape(w.shape) + + dx_cols = w.reshape(num_filters, -1).T.dot(dout_reshaped) + # dx = col2im_indices(dx_cols, x.shape, filter_height, filter_width, pad, stride) + dx = col2im_cython( + dx_cols, + x.shape[0], + x.shape[1], + x.shape[2], + x.shape[3], + filter_height, + filter_width, + pad, + stride, + ) + + return dx, dw, db + + +conv_forward_fast = conv_forward_strides +conv_backward_fast = conv_backward_strides + + +def max_pool_forward_fast(x, pool_param): + """ + A fast implementation of the forward pass for a max pooling layer. + + This chooses between the reshape method and the im2col method. If the pooling + regions are square and tile the input image, then we can use the reshape + method which is very fast. Otherwise we fall back on the im2col method, which + is not much faster than the naive method. + """ + N, C, H, W = x.shape + pool_height, pool_width = pool_param["pool_height"], pool_param["pool_width"] + stride = pool_param["stride"] + + same_size = pool_height == pool_width == stride + tiles = H % pool_height == 0 and W % pool_width == 0 + if same_size and tiles: + out, reshape_cache = max_pool_forward_reshape(x, pool_param) + cache = ("reshape", reshape_cache) + else: + out, im2col_cache = max_pool_forward_im2col(x, pool_param) + cache = ("im2col", im2col_cache) + return out, cache + + +def max_pool_backward_fast(dout, cache): + """ + A fast implementation of the backward pass for a max pooling layer. + + This switches between the reshape method an the im2col method depending on + which method was used to generate the cache. + """ + method, real_cache = cache + if method == "reshape": + return max_pool_backward_reshape(dout, real_cache) + elif method == "im2col": + return max_pool_backward_im2col(dout, real_cache) + else: + raise ValueError('Unrecognized method "%s"' % method) + + +def max_pool_forward_reshape(x, pool_param): + """ + A fast implementation of the forward pass for the max pooling layer that uses + some clever reshaping. + + This can only be used for square pooling regions that tile the input. 
+ """ + N, C, H, W = x.shape + pool_height, pool_width = pool_param["pool_height"], pool_param["pool_width"] + stride = pool_param["stride"] + assert pool_height == pool_width == stride, "Invalid pool params" + assert H % pool_height == 0 + assert W % pool_height == 0 + x_reshaped = x.reshape( + N, C, H // pool_height, pool_height, W // pool_width, pool_width + ) + out = x_reshaped.max(axis=3).max(axis=4) + + cache = (x, x_reshaped, out) + return out, cache + + +def max_pool_backward_reshape(dout, cache): + """ + A fast implementation of the backward pass for the max pooling layer that + uses some clever broadcasting and reshaping. + + This can only be used if the forward pass was computed using + max_pool_forward_reshape. + + NOTE: If there are multiple argmaxes, this method will assign gradient to + ALL argmax elements of the input rather than picking one. In this case the + gradient will actually be incorrect. However this is unlikely to occur in + practice, so it shouldn't matter much. One possible solution is to split the + upstream gradient equally among all argmax elements; this should result in a + valid subgradient. You can make this happen by uncommenting the line below; + however this results in a significant performance penalty (about 40% slower) + and is unlikely to matter in practice so we don't do it. + """ + x, x_reshaped, out = cache + + dx_reshaped = np.zeros_like(x_reshaped) + out_newaxis = out[:, :, :, np.newaxis, :, np.newaxis] + mask = x_reshaped == out_newaxis + dout_newaxis = dout[:, :, :, np.newaxis, :, np.newaxis] + dout_broadcast, _ = np.broadcast_arrays(dout_newaxis, dx_reshaped) + dx_reshaped[mask] = dout_broadcast[mask] + dx_reshaped /= np.sum(mask, axis=(3, 5), keepdims=True) + dx = dx_reshaped.reshape(x.shape) + + return dx + + +def max_pool_forward_im2col(x, pool_param): + """ + An implementation of the forward pass for max pooling based on im2col. + + This isn't much faster than the naive version, so it should be avoided if + possible. + """ + N, C, H, W = x.shape + pool_height, pool_width = pool_param["pool_height"], pool_param["pool_width"] + stride = pool_param["stride"] + + assert (H - pool_height) % stride == 0, "Invalid height" + assert (W - pool_width) % stride == 0, "Invalid width" + + out_height = (H - pool_height) // stride + 1 + out_width = (W - pool_width) // stride + 1 + + x_split = x.reshape(N * C, 1, H, W) + x_cols = im2col(x_split, pool_height, pool_width, padding=0, stride=stride) + x_cols_argmax = np.argmax(x_cols, axis=0) + x_cols_max = x_cols[x_cols_argmax, np.arange(x_cols.shape[1])] + out = x_cols_max.reshape(out_height, out_width, N, C).transpose(2, 3, 0, 1) + + cache = (x, x_cols, x_cols_argmax, pool_param) + return out, cache + + +def max_pool_backward_im2col(dout, cache): + """ + An implementation of the backward pass for max pooling based on im2col. + + This isn't much faster than the naive version, so it should be avoided if + possible. 
+ """ + x, x_cols, x_cols_argmax, pool_param = cache + N, C, H, W = x.shape + pool_height, pool_width = pool_param["pool_height"], pool_param["pool_width"] + stride = pool_param["stride"] + + dout_reshaped = dout.transpose(2, 3, 0, 1).flatten() + dx_cols = np.zeros_like(x_cols) + dx_cols[x_cols_argmax, np.arange(dx_cols.shape[1])] = dout_reshaped + dx = col2im_indices( + dx_cols, (N * C, 1, H, W), pool_height, pool_width, padding=0, stride=stride + ) + dx = dx.reshape(x.shape) + + return dx diff --git a/lab_3/scripts/gradient_check.py b/lab_3/scripts/gradient_check.py new file mode 100644 index 0000000..901c307 --- /dev/null +++ b/lab_3/scripts/gradient_check.py @@ -0,0 +1,133 @@ +from __future__ import print_function +from builtins import range +from past.builtins import xrange + +import numpy as np +from random import randrange + + +def eval_numerical_gradient(f, x, verbose=True, h=0.00001): + """ + a naive implementation of numerical gradient of f at x + - f should be a function that takes a single argument + - x is the point (numpy array) to evaluate the gradient at + """ + + fx = f(x) # evaluate function value at original point + grad = np.zeros_like(x) + # iterate over all indexes in x + it = np.nditer(x, flags=["multi_index"], op_flags=["readwrite"]) + while not it.finished: + + # evaluate function at x+h + ix = it.multi_index + oldval = x[ix] + x[ix] = oldval + h # increment by h + fxph = f(x) # evalute f(x + h) + x[ix] = oldval - h + fxmh = f(x) # evaluate f(x - h) + x[ix] = oldval # restore + + # compute the partial derivative with centered formula + grad[ix] = (fxph - fxmh) / (2 * h) # the slope + if verbose: + print(ix, grad[ix]) + it.iternext() # step to next dimension + + return grad + + +def eval_numerical_gradient_array(f, x, df, h=1e-5): + """ + Evaluate a numeric gradient for a function that accepts a numpy + array and returns a numpy array. + """ + grad = np.zeros_like(x) + it = np.nditer(x, flags=["multi_index"], op_flags=["readwrite"]) + while not it.finished: + ix = it.multi_index + + oldval = x[ix] + x[ix] = oldval + h + pos = f(x).copy() + x[ix] = oldval - h + neg = f(x).copy() + x[ix] = oldval + + grad[ix] = np.sum((pos - neg) * df) / (2 * h) + it.iternext() + return grad + + +def eval_numerical_gradient_blobs(f, inputs, output, h=1e-5): + """ + Compute numeric gradients for a function that operates on input + and output blobs. + + We assume that f accepts several input blobs as arguments, followed by a + blob where outputs will be written. For example, f might be called like: + + f(x, w, out) + + where x and w are input Blobs, and the result of f will be written to out. 
+ + Inputs: + - f: function + - inputs: tuple of input blobs + - output: output blob + - h: step size + """ + numeric_diffs = [] + for input_blob in inputs: + diff = np.zeros_like(input_blob.diffs) + it = np.nditer(input_blob.vals, flags=["multi_index"], op_flags=["readwrite"]) + while not it.finished: + idx = it.multi_index + orig = input_blob.vals[idx] + + input_blob.vals[idx] = orig + h + f(*(inputs + (output,))) + pos = np.copy(output.vals) + input_blob.vals[idx] = orig - h + f(*(inputs + (output,))) + neg = np.copy(output.vals) + input_blob.vals[idx] = orig + + diff[idx] = np.sum((pos - neg) * output.diffs) / (2.0 * h) + + it.iternext() + numeric_diffs.append(diff) + return numeric_diffs + + +def eval_numerical_gradient_net(net, inputs, output, h=1e-5): + return eval_numerical_gradient_blobs( + lambda *args: net.forward(), inputs, output, h=h + ) + + +def grad_check_sparse(f, x, analytic_grad, num_checks=10, h=1e-5): + """ + sample a few random elements and only return numerical + in this dimensions. + """ + + for i in range(num_checks): + ix = tuple([randrange(m) for m in x.shape]) + + oldval = x[ix] + x[ix] = oldval + h # increment by h + fxph = f(x) # evaluate f(x + h) + x[ix] = oldval - h # increment by h + fxmh = f(x) # evaluate f(x - h) + x[ix] = oldval # reset + + grad_numerical = (fxph - fxmh) / (2 * h) + grad_analytic = analytic_grad[ix] + rel_error = abs(grad_numerical - grad_analytic) / ( + abs(grad_numerical) + abs(grad_analytic) + ) + print( + "numerical: %f analytic: %f, relative error: %e" + % (grad_numerical, grad_analytic, rel_error) + ) diff --git a/lab_3/scripts/im2col.py b/lab_3/scripts/im2col.py new file mode 100644 index 0000000..e1fc034 --- /dev/null +++ b/lab_3/scripts/im2col.py @@ -0,0 +1,58 @@ +from builtins import range +import numpy as np + + +def get_im2col_indices(x_shape, field_height, field_width, padding=1, stride=1): + # First figure out what the size of the output should be + N, C, H, W = x_shape + assert (H + 2 * padding - field_height) % stride == 0 + assert (W + 2 * padding - field_height) % stride == 0 + out_height = (H + 2 * padding - field_height) / stride + 1 + out_width = (W + 2 * padding - field_width) / stride + 1 + + i0 = np.repeat(np.arange(field_height), field_width) + i0 = np.tile(i0, C) + i1 = stride * np.repeat(np.arange(out_height), out_width) + j0 = np.tile(np.arange(field_width), field_height * C) + j1 = stride * np.tile(np.arange(out_width), out_height) + i = i0.reshape(-1, 1) + i1.reshape(1, -1) + j = j0.reshape(-1, 1) + j1.reshape(1, -1) + + k = np.repeat(np.arange(C), field_height * field_width).reshape(-1, 1) + + return (k, i, j) + + +def im2col_indices(x, field_height, field_width, padding=1, stride=1): + """ An implementation of im2col based on some fancy indexing """ + # Zero-pad the input + p = padding + x_padded = np.pad(x, ((0, 0), (0, 0), (p, p), (p, p)), mode="constant") + + k, i, j = get_im2col_indices(x.shape, field_height, field_width, padding, stride) + + cols = x_padded[:, k, i, j] + C = x.shape[1] + cols = cols.transpose(1, 2, 0).reshape(field_height * field_width * C, -1) + return cols + + +def col2im_indices(cols, x_shape, field_height=3, field_width=3, padding=1, stride=1): + """ An implementation of col2im based on fancy indexing and np.add.at """ + N, C, H, W = x_shape + H_padded, W_padded = H + 2 * padding, W + 2 * padding + x_padded = np.zeros((N, C, H_padded, W_padded), dtype=cols.dtype) + k, i, j = get_im2col_indices(x_shape, field_height, field_width, padding, stride) + cols_reshaped = cols.reshape(C * 
field_height * field_width, -1, N) + cols_reshaped = cols_reshaped.transpose(2, 0, 1) + np.add.at(x_padded, (slice(None), k, i, j), cols_reshaped) + if padding == 0: + return x_padded + return x_padded[:, :, padding:-padding, padding:-padding] + + +# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + +pass + +# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** diff --git a/lab_3/scripts/im2col_cython.pyx b/lab_3/scripts/im2col_cython.pyx new file mode 100644 index 0000000..d6e33c6 --- /dev/null +++ b/lab_3/scripts/im2col_cython.pyx @@ -0,0 +1,121 @@ +import numpy as np +cimport numpy as np +cimport cython + +# DTYPE = np.float64 +# ctypedef np.float64_t DTYPE_t + +ctypedef fused DTYPE_t: + np.float32_t + np.float64_t + +def im2col_cython(np.ndarray[DTYPE_t, ndim=4] x, int field_height, + int field_width, int padding, int stride): + cdef int N = x.shape[0] + cdef int C = x.shape[1] + cdef int H = x.shape[2] + cdef int W = x.shape[3] + + cdef int HH = (H + 2 * padding - field_height) / stride + 1 + cdef int WW = (W + 2 * padding - field_width) / stride + 1 + + cdef int p = padding + cdef np.ndarray[DTYPE_t, ndim=4] x_padded = np.pad(x, + ((0, 0), (0, 0), (p, p), (p, p)), mode='constant') + + cdef np.ndarray[DTYPE_t, ndim=2] cols = np.zeros( + (C * field_height * field_width, N * HH * WW), + dtype=x.dtype) + + # Moving the inner loop to a C function with no bounds checking works, but does + # not seem to help performance in any measurable way. + + im2col_cython_inner(cols, x_padded, N, C, H, W, HH, WW, + field_height, field_width, padding, stride) + return cols + + +@cython.boundscheck(False) +cdef int im2col_cython_inner(np.ndarray[DTYPE_t, ndim=2] cols, + np.ndarray[DTYPE_t, ndim=4] x_padded, + int N, int C, int H, int W, int HH, int WW, + int field_height, int field_width, int padding, int stride) except? -1: + cdef int c, ii, jj, row, yy, xx, i, col + + for c in range(C): + for yy in range(HH): + for xx in range(WW): + for ii in range(field_height): + for jj in range(field_width): + row = c * field_width * field_height + ii * field_height + jj + for i in range(N): + col = yy * WW * N + xx * N + i + cols[row, col] = x_padded[i, c, stride * yy + ii, stride * xx + jj] + + + +def col2im_cython(np.ndarray[DTYPE_t, ndim=2] cols, int N, int C, int H, int W, + int field_height, int field_width, int padding, int stride): + cdef np.ndarray x = np.empty((N, C, H, W), dtype=cols.dtype) + cdef int HH = (H + 2 * padding - field_height) / stride + 1 + cdef int WW = (W + 2 * padding - field_width) / stride + 1 + cdef np.ndarray[DTYPE_t, ndim=4] x_padded = np.zeros((N, C, H + 2 * padding, W + 2 * padding), + dtype=cols.dtype) + + # Moving the inner loop to a C-function with no bounds checking improves + # performance quite a bit for col2im. + col2im_cython_inner(cols, x_padded, N, C, H, W, HH, WW, + field_height, field_width, padding, stride) + if padding > 0: + return x_padded[:, :, padding:-padding, padding:-padding] + return x_padded + + +@cython.boundscheck(False) +cdef int col2im_cython_inner(np.ndarray[DTYPE_t, ndim=2] cols, + np.ndarray[DTYPE_t, ndim=4] x_padded, + int N, int C, int H, int W, int HH, int WW, + int field_height, int field_width, int padding, int stride) except? 
-1: + cdef int c, ii, jj, row, yy, xx, i, col + + for c in range(C): + for ii in range(field_height): + for jj in range(field_width): + row = c * field_width * field_height + ii * field_height + jj + for yy in range(HH): + for xx in range(WW): + for i in range(N): + col = yy * WW * N + xx * N + i + x_padded[i, c, stride * yy + ii, stride * xx + jj] += cols[row, col] + + +@cython.boundscheck(False) +@cython.wraparound(False) +cdef col2im_6d_cython_inner(np.ndarray[DTYPE_t, ndim=6] cols, + np.ndarray[DTYPE_t, ndim=4] x_padded, + int N, int C, int H, int W, int HH, int WW, + int out_h, int out_w, int pad, int stride): + + cdef int c, hh, ww, n, h, w + for n in range(N): + for c in range(C): + for hh in range(HH): + for ww in range(WW): + for h in range(out_h): + for w in range(out_w): + x_padded[n, c, stride * h + hh, stride * w + ww] += cols[c, hh, ww, n, h, w] + + +def col2im_6d_cython(np.ndarray[DTYPE_t, ndim=6] cols, int N, int C, int H, int W, + int HH, int WW, int pad, int stride): + cdef np.ndarray x = np.empty((N, C, H, W), dtype=cols.dtype) + cdef int out_h = (H + 2 * pad - HH) / stride + 1 + cdef int out_w = (W + 2 * pad - WW) / stride + 1 + cdef np.ndarray[DTYPE_t, ndim=4] x_padded = np.zeros((N, C, H + 2 * pad, W + 2 * pad), + dtype=cols.dtype) + + col2im_6d_cython_inner(cols, x_padded, N, C, H, W, HH, WW, out_h, out_w, pad, stride) + + if pad > 0: + return x_padded[:, :, pad:-pad, pad:-pad] + return x_padded diff --git a/lab_3/scripts/layer_utils.py b/lab_3/scripts/layer_utils.py new file mode 100644 index 0000000..c055e28 --- /dev/null +++ b/lab_3/scripts/layer_utils.py @@ -0,0 +1,110 @@ +# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + +pass + +# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** +from .layers import * +from .fast_layers import * + + +def affine_relu_forward(x, w, b): + """ + Convenience layer that perorms an affine transform followed by a ReLU + + Inputs: + - x: Input to the affine layer + - w, b: Weights for the affine layer + + Returns a tuple of: + - out: Output from the ReLU + - cache: Object to give to the backward pass + """ + a, fc_cache = affine_forward(x, w, b) + out, relu_cache = relu_forward(a) + cache = (fc_cache, relu_cache) + return out, cache + + +def affine_relu_backward(dout, cache): + """ + Backward pass for the affine-relu convenience layer + """ + fc_cache, relu_cache = cache + da = relu_backward(dout, relu_cache) + dx, dw, db = affine_backward(da, fc_cache) + return dx, dw, db + + +def conv_relu_forward(x, w, b, conv_param): + """ + A convenience layer that performs a convolution followed by a ReLU. + + Inputs: + - x: Input to the convolutional layer + - w, b, conv_param: Weights and parameters for the convolutional layer + + Returns a tuple of: + - out: Output from the ReLU + - cache: Object to give to the backward pass + """ + a, conv_cache = conv_forward_fast(x, w, b, conv_param) + out, relu_cache = relu_forward(a) + cache = (conv_cache, relu_cache) + return out, cache + + +def conv_relu_backward(dout, cache): + """ + Backward pass for the conv-relu convenience layer. 
+ """ + conv_cache, relu_cache = cache + da = relu_backward(dout, relu_cache) + dx, dw, db = conv_backward_fast(da, conv_cache) + return dx, dw, db + + +def conv_bn_relu_forward(x, w, b, gamma, beta, conv_param, bn_param): + a, conv_cache = conv_forward_fast(x, w, b, conv_param) + an, bn_cache = spatial_batchnorm_forward(a, gamma, beta, bn_param) + out, relu_cache = relu_forward(an) + cache = (conv_cache, bn_cache, relu_cache) + return out, cache + + +def conv_bn_relu_backward(dout, cache): + conv_cache, bn_cache, relu_cache = cache + dan = relu_backward(dout, relu_cache) + da, dgamma, dbeta = spatial_batchnorm_backward(dan, bn_cache) + dx, dw, db = conv_backward_fast(da, conv_cache) + return dx, dw, db, dgamma, dbeta + + +def conv_relu_pool_forward(x, w, b, conv_param, pool_param): + """ + Convenience layer that performs a convolution, a ReLU, and a pool. + + Inputs: + - x: Input to the convolutional layer + - w, b, conv_param: Weights and parameters for the convolutional layer + - pool_param: Parameters for the pooling layer + + Returns a tuple of: + - out: Output from the pooling layer + - cache: Object to give to the backward pass + """ + a, conv_cache = conv_forward_fast(x, w, b, conv_param) + s, relu_cache = relu_forward(a) + out, pool_cache = max_pool_forward_fast(s, pool_param) + cache = (conv_cache, relu_cache, pool_cache) + return out, cache + + +def conv_relu_pool_backward(dout, cache): + """ + Backward pass for the conv-relu-pool convenience layer + """ + conv_cache, relu_cache, pool_cache = cache + ds = max_pool_backward_fast(dout, pool_cache) + da = relu_backward(ds, relu_cache) + dx, dw, db = conv_backward_fast(da, conv_cache) + return dx, dw, db diff --git a/lab_3/scripts/layers.py b/lab_3/scripts/layers.py new file mode 100644 index 0000000..678931d --- /dev/null +++ b/lab_3/scripts/layers.py @@ -0,0 +1,696 @@ +from builtins import range +import numpy as np + + + +def affine_forward(x, w, b): + """ + Computes the forward pass for an affine (fully-connected) layer. + + The input x has shape (N, d_1, ..., d_k) and contains a minibatch of N + examples, where each example x[i] has shape (d_1, ..., d_k). We will + reshape each input into a vector of dimension D = d_1 * ... * d_k, and + then transform it to an output vector of dimension M. + + Inputs: + - x: A numpy array containing input data, of shape (N, d_1, ..., d_k) + - w: A numpy array of weights, of shape (D, M) + - b: A numpy array of biases, of shape (M,) + + Returns a tuple of: + - out: output, of shape (N, M) + - cache: (x, w, b) + """ + out = None + ########################################################################### + # TODO: Implement the affine forward pass. Store the result in out. You # + # will need to reshape the input into rows. # + ########################################################################### + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ########################################################################### + # END OF YOUR CODE # + ########################################################################### + cache = (x, w, b) + return out, cache + + +def affine_backward(dout, cache): + """ + Computes the backward pass for an affine layer. + + Inputs: + - dout: Upstream derivative, of shape (N, M) + - cache: Tuple of: + - x: Input data, of shape (N, d_1, ... 
d_k) + - w: Weights, of shape (D, M) + - b: Biases, of shape (M,) + + Returns a tuple of: + - dx: Gradient with respect to x, of shape (N, d1, ..., d_k) + - dw: Gradient with respect to w, of shape (D, M) + - db: Gradient with respect to b, of shape (M,) + """ + x, w, b = cache + dx, dw, db = None, None, None + ########################################################################### + # TODO: Implement the affine backward pass. # + ########################################################################### + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ########################################################################### + # END OF YOUR CODE # + ########################################################################### + return dx, dw, db + + +def relu_forward(x): + """ + Computes the forward pass for a layer of rectified linear units (ReLUs). + + Input: + - x: Inputs, of any shape + + Returns a tuple of: + - out: Output, of the same shape as x + - cache: x + """ + out = None + ########################################################################### + # TODO: Implement the ReLU forward pass. # + ########################################################################### + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ########################################################################### + # END OF YOUR CODE # + ########################################################################### + cache = x + return out, cache + + +def relu_backward(dout, cache): + """ + Computes the backward pass for a layer of rectified linear units (ReLUs). + + Input: + - dout: Upstream derivatives, of any shape + - cache: Input x, of same shape as dout + + Returns: + - dx: Gradient with respect to x + """ + dx, x = None, cache + ########################################################################### + # TODO: Implement the ReLU backward pass. # + ########################################################################### + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ########################################################################### + # END OF YOUR CODE # + ########################################################################### + return dx + + +def batchnorm_forward(x, gamma, beta, bn_param): + """ + Forward pass for batch normalization. + + During training the sample mean and (uncorrected) sample variance are + computed from minibatch statistics and used to normalize the incoming data. + During training we also keep an exponentially decaying running mean of the + mean and variance of each feature, and these averages are used to normalize + data at test-time. + + At each timestep we update the running averages for mean and variance using + an exponential decay based on the momentum parameter: + + running_mean = momentum * running_mean + (1 - momentum) * sample_mean + running_var = momentum * running_var + (1 - momentum) * sample_var + + Note that the batch normalization paper suggests a different test-time + behavior: they compute sample mean and variance for each feature using a + large number of training images rather than using a running average. 
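For reference, hedged sketches of the affine and ReLU TODO bodies earlier in `layers.py`; the affine layer flattens each example into a row, as its docstring describes:

```python
# affine_forward
out = x.reshape(x.shape[0], -1).dot(w) + b

# affine_backward
dx = dout.dot(w.T).reshape(x.shape)
dw = x.reshape(x.shape[0], -1).T.dot(dout)
db = dout.sum(axis=0)

# relu_forward
out = np.maximum(0, x)

# relu_backward
dx = dout * (x > 0)
```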
For + this implementation we have chosen to use running averages instead since + they do not require an additional estimation step; the torch7 + implementation of batch normalization also uses running averages. + + Input: + - x: Data of shape (N, D) + - gamma: Scale parameter of shape (D,) + - beta: Shift paremeter of shape (D,) + - bn_param: Dictionary with the following keys: + - mode: 'train' or 'test'; required + - eps: Constant for numeric stability + - momentum: Constant for running mean / variance. + - running_mean: Array of shape (D,) giving running mean of features + - running_var Array of shape (D,) giving running variance of features + + Returns a tuple of: + - out: of shape (N, D) + - cache: A tuple of values needed in the backward pass + """ + mode = bn_param["mode"] + eps = bn_param.get("eps", 1e-5) + momentum = bn_param.get("momentum", 0.9) + + N, D = x.shape + running_mean = bn_param.get("running_mean", np.zeros(D, dtype=x.dtype)) + running_var = bn_param.get("running_var", np.zeros(D, dtype=x.dtype)) + + out, cache = None, None + if mode == "train": + ####################################################################### + # TODO: Implement the training-time forward pass for batch norm. # + # Use minibatch statistics to compute the mean and variance, use # + # these statistics to normalize the incoming data, and scale and # + # shift the normalized data using gamma and beta. # + # # + # You should store the output in the variable out. Any intermediates # + # that you need for the backward pass should be stored in the cache # + # variable. # + # # + # You should also use your computed sample mean and variance together # + # with the momentum variable to update the running mean and running # + # variance, storing your result in the running_mean and running_var # + # variables. # + # # + # Note that though you should be keeping track of the running # + # variance, you should normalize the data based on the standard # + # deviation (square root of variance) instead! # + # Referencing the original paper (https://arxiv.org/abs/1502.03167) # + # might prove to be helpful. # + ####################################################################### + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ####################################################################### + # END OF YOUR CODE # + ####################################################################### + elif mode == "test": + ####################################################################### + # TODO: Implement the test-time forward pass for batch normalization. # + # Use the running mean and variance to normalize the incoming data, # + # then scale and shift the normalized data using gamma and beta. # + # Store the result in the out variable. 
# + ####################################################################### + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ####################################################################### + # END OF YOUR CODE # + ####################################################################### + else: + raise ValueError('Invalid forward batchnorm mode "%s"' % mode) + + # Store the updated running means back into bn_param + bn_param["running_mean"] = running_mean + bn_param["running_var"] = running_var + + return out, cache + + +def batchnorm_backward(dout, cache): + """ + Backward pass for batch normalization. + + For this implementation, you should write out a computation graph for + batch normalization on paper and propagate gradients backward through + intermediate nodes. + + Inputs: + - dout: Upstream derivatives, of shape (N, D) + - cache: Variable of intermediates from batchnorm_forward. + + Returns a tuple of: + - dx: Gradient with respect to inputs x, of shape (N, D) + - dgamma: Gradient with respect to scale parameter gamma, of shape (D,) + - dbeta: Gradient with respect to shift parameter beta, of shape (D,) + """ + dx, dgamma, dbeta = None, None, None + ########################################################################### + # TODO: Implement the backward pass for batch normalization. Store the # + # results in the dx, dgamma, and dbeta variables. # + # Referencing the original paper (https://arxiv.org/abs/1502.03167) # + # might prove to be helpful. # + ########################################################################### + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ########################################################################### + # END OF YOUR CODE # + ########################################################################### + + return dx, dgamma, dbeta + + + + + + + + + + +def dropout_forward(x, dropout_param): + """ + Performs the forward pass for (inverted) dropout. + + Inputs: + - x: Input data, of any shape + - dropout_param: A dictionary with the following keys: + - p: Dropout parameter. We keep each neuron output with probability p. + - mode: 'test' or 'train'. If the mode is train, then perform dropout; + if the mode is test, then just return the input. + - seed: Seed for the random number generator. Passing seed makes this + function deterministic, which is needed for gradient checking but not + in real networks. + + Outputs: + - out: Array of the same shape as x. + - cache: tuple (dropout_param, mask). In training mode, mask is the dropout + mask that was used to multiply the input; in test mode, mask is None. + + NOTE: Please implement **inverted** dropout, not the vanilla version of dropout. + See http://cs231n.github.io/neural-networks-2/#reg for more details. + + NOTE 2: Keep in mind that p is the probability of **keep** a neuron + output; this might be contrary to some sources, where it is referred to + as the probability of dropping a neuron output. + """ + p, mode = dropout_param["p"], dropout_param["mode"] + if "seed" in dropout_param: + np.random.seed(dropout_param["seed"]) + + mask = None + out = None + + if mode == "train": + ####################################################################### + # TODO: Implement training phase forward pass for inverted dropout. # + # Store the dropout mask in the mask variable. 
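A compact sketch of the batch-normalization passes described above (illustrative only: the cache layout is an assumption of this sketch, and the running averages would be updated with exactly the momentum formula from the docstring, as shown in the comments):

```python
import numpy as np

def batchnorm_forward_train_sketch(x, gamma, beta, eps=1e-5):
    # Normalize with minibatch statistics, then scale and shift.
    mu = x.mean(axis=0)
    var = x.var(axis=0)
    x_hat = (x - mu) / np.sqrt(var + eps)
    out = gamma * x_hat + beta
    # running_mean = momentum * running_mean + (1 - momentum) * mu
    # running_var  = momentum * running_var  + (1 - momentum) * var
    return out, (x_hat, gamma, var, eps)

def batchnorm_forward_test_sketch(x, gamma, beta, running_mean, running_var, eps=1e-5):
    # At test time only the running statistics are used.
    return gamma * (x - running_mean) / np.sqrt(running_var + eps) + beta

def batchnorm_backward_sketch(dout, cache):
    x_hat, gamma, var, eps = cache
    N = dout.shape[0]
    dbeta = dout.sum(axis=0)
    dgamma = (dout * x_hat).sum(axis=0)
    dx_hat = dout * gamma
    inv_std = 1.0 / np.sqrt(var + eps)
    # Closed form obtained by backpropagating through the normalization graph.
    dx = inv_std / N * (N * dx_hat - dx_hat.sum(axis=0)
                        - x_hat * (dx_hat * x_hat).sum(axis=0))
    return dx, dgamma, dbeta
```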
# + ####################################################################### + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ####################################################################### + # END OF YOUR CODE # + ####################################################################### + elif mode == "test": + ####################################################################### + # TODO: Implement the test phase forward pass for inverted dropout. # + ####################################################################### + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ####################################################################### + # END OF YOUR CODE # + ####################################################################### + + cache = (dropout_param, mask) + out = out.astype(x.dtype, copy=False) + + return out, cache + + +def dropout_backward(dout, cache): + """ + Perform the backward pass for (inverted) dropout. + + Inputs: + - dout: Upstream derivatives, of any shape + - cache: (dropout_param, mask) from dropout_forward. + """ + dropout_param, mask = cache + mode = dropout_param["mode"] + + dx = None + if mode == "train": + ####################################################################### + # TODO: Implement training phase backward pass for inverted dropout # + ####################################################################### + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ####################################################################### + # END OF YOUR CODE # + ####################################################################### + elif mode == "test": + dx = dout + return dx + + +def conv_forward_naive(x, w, b, conv_param): + """ + A naive implementation of the forward pass for a convolutional layer. + + The input consists of N data points, each with C channels, height H and + width W. We convolve each input with F different filters, where each filter + spans all C channels and has height HH and width WW. + + Input: + - x: Input data of shape (N, C, H, W) + - w: Filter weights of shape (F, C, HH, WW) + - b: Biases, of shape (F,) + - conv_param: A dictionary with the following keys: + - 'stride': The number of pixels between adjacent receptive fields in the + horizontal and vertical directions. + - 'pad': The number of pixels that will be used to zero-pad the input. + + + During padding, 'pad' zeros should be placed symmetrically (i.e equally on both sides) + along the height and width axes of the input. Be careful not to modfiy the original + input x directly. + + Returns a tuple of: + - out: Output data, of shape (N, F, H', W') where H' and W' are given by + H' = 1 + (H + 2 * pad - HH) / stride + W' = 1 + (W + 2 * pad - WW) / stride + - cache: (x, w, b, conv_param) + """ + out = None + ########################################################################### + # TODO: Implement the convolutional forward pass. # + # Hint: you can use the function np.pad for padding. 
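The inverted-dropout behaviour described above fits in a few lines; a sketch under the same keep-probability convention (not the reference solution, and the return signature is simplified here):

```python
import numpy as np

def dropout_forward_sketch(x, p, mode, seed=None):
    # Inverted dropout: scale the mask by 1/p at train time so the test-time
    # pass is just the identity.
    if seed is not None:
        np.random.seed(seed)
    if mode == "train":
        mask = (np.random.rand(*x.shape) < p) / p
        out = x * mask
    else:  # "test"
        mask = None
        out = x
    return out, mask

def dropout_backward_sketch(dout, mask, mode):
    # The gradient is masked (and rescaled) exactly like the forward activations.
    return dout * mask if mode == "train" else dout
```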
# + ########################################################################### + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ########################################################################### + # END OF YOUR CODE # + ########################################################################### + cache = (x, w, b, conv_param) + return out, cache + + +def conv_backward_naive(dout, cache): + """ + A naive implementation of the backward pass for a convolutional layer. + + Inputs: + - dout: Upstream derivatives. + - cache: A tuple of (x, w, b, conv_param) as in conv_forward_naive + + Returns a tuple of: + - dx: Gradient with respect to x + - dw: Gradient with respect to w + - db: Gradient with respect to b + """ + dx, dw, db = None, None, None + ########################################################################### + # TODO: Implement the convolutional backward pass. # + ########################################################################### + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ########################################################################### + # END OF YOUR CODE # + ########################################################################### + return dx, dw, db + + +def max_pool_forward_naive(x, pool_param): + """ + A naive implementation of the forward pass for a max-pooling layer. + + Inputs: + - x: Input data, of shape (N, C, H, W) + - pool_param: dictionary with the following keys: + - 'pool_height': The height of each pooling region + - 'pool_width': The width of each pooling region + - 'stride': The distance between adjacent pooling regions + + No padding is necessary here. Output size is given by + + Returns a tuple of: + - out: Output data, of shape (N, C, H', W') where H' and W' are given by + H' = 1 + (H - pool_height) / stride + W' = 1 + (W - pool_width) / stride + - cache: (x, pool_param) + """ + out = None + ########################################################################### + # TODO: Implement the max-pooling forward pass # + ########################################################################### + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ########################################################################### + # END OF YOUR CODE # + ########################################################################### + cache = (x, pool_param) + return out, cache + + +def max_pool_backward_naive(dout, cache): + """ + A naive implementation of the backward pass for a max-pooling layer. + + Inputs: + - dout: Upstream derivatives + - cache: A tuple of (x, pool_param) as in the forward pass. 
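As a reference point for the output-size formulas above, a direct nested-loop forward pass might look like the sketch below (deliberately slow and illustrative; the assignment's fast Cython path is what is used in practice):

```python
import numpy as np

def conv_forward_naive_sketch(x, w, b, conv_param):
    # H' = 1 + (H + 2*pad - HH) / stride, W' = 1 + (W + 2*pad - WW) / stride
    stride, pad = conv_param["stride"], conv_param["pad"]
    N, C, H, W = x.shape
    F, _, HH, WW = w.shape
    H_out = 1 + (H + 2 * pad - HH) // stride
    W_out = 1 + (W + 2 * pad - WW) // stride
    x_pad = np.pad(x, ((0, 0), (0, 0), (pad, pad), (pad, pad)), mode="constant")
    out = np.zeros((N, F, H_out, W_out))
    for n in range(N):                      # each image
        for f in range(F):                  # each filter
            for i in range(H_out):          # each output row
                for j in range(W_out):      # each output column
                    hs, ws = i * stride, j * stride
                    window = x_pad[n, :, hs:hs + HH, ws:ws + WW]
                    out[n, f, i, j] = np.sum(window * w[f]) + b[f]
    return out, (x, w, b, conv_param)
```

The backward pass walks the same windows, accumulating `dw[f] += window * dout[n, f, i, j]`, `dx_pad[n, :, hs:hs+HH, ws:ws+WW] += w[f] * dout[n, f, i, j]`, and `db[f] += dout[n, f, i, j]`, then crops the padding off `dx_pad`.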
+ + Returns: + - dx: Gradient with respect to x + """ + dx = None + ########################################################################### + # TODO: Implement the max-pooling backward pass # + ########################################################################### + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ########################################################################### + # END OF YOUR CODE # + ########################################################################### + return dx + + +def spatial_batchnorm_forward(x, gamma, beta, bn_param): + """ + Computes the forward pass for spatial batch normalization. + + Inputs: + - x: Input data of shape (N, C, H, W) + - gamma: Scale parameter, of shape (C,) + - beta: Shift parameter, of shape (C,) + - bn_param: Dictionary with the following keys: + - mode: 'train' or 'test'; required + - eps: Constant for numeric stability + - momentum: Constant for running mean / variance. momentum=0 means that + old information is discarded completely at every time step, while + momentum=1 means that new information is never incorporated. The + default of momentum=0.9 should work well in most situations. + - running_mean: Array of shape (D,) giving running mean of features + - running_var Array of shape (D,) giving running variance of features + + Returns a tuple of: + - out: Output data, of shape (N, C, H, W) + - cache: Values needed for the backward pass + """ + out, cache = None, None + + ########################################################################### + # TODO: Implement the forward pass for spatial batch normalization. # + # # + # HINT: You can implement spatial batch normalization by calling the # + # vanilla version of batch normalization you implemented above. # + # Your implementation should be very short; ours is less than five lines. # + ########################################################################### + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ########################################################################### + # END OF YOUR CODE # + ########################################################################### + + return out, cache + + +def spatial_batchnorm_backward(dout, cache): + """ + Computes the backward pass for spatial batch normalization. + + Inputs: + - dout: Upstream derivatives, of shape (N, C, H, W) + - cache: Values from the forward pass + + Returns a tuple of: + - dx: Gradient with respect to inputs, of shape (N, C, H, W) + - dgamma: Gradient with respect to scale parameter, of shape (C,) + - dbeta: Gradient with respect to shift parameter, of shape (C,) + """ + dx, dgamma, dbeta = None, None, None + + ########################################################################### + # TODO: Implement the backward pass for spatial batch normalization. # + # # + # HINT: You can implement spatial batch normalization by calling the # + # vanilla version of batch normalization you implemented above. # + # Your implementation should be very short; ours is less than five lines. 
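A sketch of the naive max-pooling forward pass, plus the reshape trick hinted at for spatial batch normalization (names are illustrative, not the reference code):

```python
import numpy as np

def max_pool_forward_naive_sketch(x, pool_param):
    ph, pw = pool_param["pool_height"], pool_param["pool_width"]
    stride = pool_param["stride"]
    N, C, H, W = x.shape
    H_out = 1 + (H - ph) // stride
    W_out = 1 + (W - pw) // stride
    out = np.zeros((N, C, H_out, W_out))
    for i in range(H_out):
        for j in range(W_out):
            window = x[:, :, i * stride:i * stride + ph, j * stride:j * stride + pw]
            out[:, :, i, j] = window.max(axis=(2, 3))
    return out, (x, pool_param)

# The pooling backward pass routes each upstream gradient to the arg-max
# position inside its window.
#
# Spatial batchnorm can delegate to the vanilla version by folding N, H, W
# into a single "batch" axis so statistics are computed per channel:
#   N, C, H, W = x.shape
#   x_flat = x.transpose(0, 2, 3, 1).reshape(-1, C)
#   out_flat, cache = batchnorm_forward(x_flat, gamma, beta, bn_param)
#   out = out_flat.reshape(N, H, W, C).transpose(0, 3, 1, 2)
```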
# + ########################################################################### + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ########################################################################### + # END OF YOUR CODE # + ########################################################################### + + return dx, dgamma, dbeta + + +def spatial_groupnorm_forward(x, gamma, beta, G, gn_param): + """ + Computes the forward pass for spatial group normalization. + In contrast to layer normalization, group normalization splits each entry + in the data into G contiguous pieces, which it then normalizes independently. + Per feature shifting and scaling are then applied to the data, in a manner identical to that of batch normalization and layer normalization. + + Inputs: + - x: Input data of shape (N, C, H, W) + - gamma: Scale parameter, of shape (C,) + - beta: Shift parameter, of shape (C,) + - G: Integer mumber of groups to split into, should be a divisor of C + - gn_param: Dictionary with the following keys: + - eps: Constant for numeric stability + + Returns a tuple of: + - out: Output data, of shape (N, C, H, W) + - cache: Values needed for the backward pass + """ + out, cache = None, None + eps = gn_param.get("eps", 1e-5) + ########################################################################### + # TODO: Implement the forward pass for spatial group normalization. # + # This will be extremely similar to the layer norm implementation. # + # In particular, think about how you could transform the matrix so that # + # the bulk of the code is similar to both train-time batch normalization # + # and layer normalization! # + ########################################################################### + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ########################################################################### + # END OF YOUR CODE # + ########################################################################### + return out, cache + + +def spatial_groupnorm_backward(dout, cache): + """ + Computes the backward pass for spatial group normalization. + + Inputs: + - dout: Upstream derivatives, of shape (N, C, H, W) + - cache: Values from the forward pass + + Returns a tuple of: + - dx: Gradient with respect to inputs, of shape (N, C, H, W) + - dgamma: Gradient with respect to scale parameter, of shape (C,) + - dbeta: Gradient with respect to shift parameter, of shape (C,) + """ + dx, dgamma, dbeta = None, None, None + + ########################################################################### + # TODO: Implement the backward pass for spatial group normalization. # + # This will be extremely similar to the layer norm implementation. # + ########################################################################### + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ########################################################################### + # END OF YOUR CODE # + ########################################################################### + return dx, dgamma, dbeta + + +def svm_loss(x, y): + """ + Computes the loss and gradient using for multiclass SVM classification. + + Inputs: + - x: Input data, of shape (N, C) where x[i, j] is the score for the jth + class for the ith input. 
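For spatial group normalization, one way to realize the "split the channels into G groups" idea from the docstring is sketched below (it assumes gamma and beta broadcast per channel, as stated above; not the reference implementation):

```python
import numpy as np

def spatial_groupnorm_forward_sketch(x, gamma, beta, G, eps=1e-5):
    # Normalize each of the G channel groups of every sample independently,
    # then apply the per-channel scale and shift.
    N, C, H, W = x.shape
    xg = x.reshape(N, G, C // G, H, W)
    mu = xg.mean(axis=(2, 3, 4), keepdims=True)
    var = xg.var(axis=(2, 3, 4), keepdims=True)
    x_hat = ((xg - mu) / np.sqrt(var + eps)).reshape(N, C, H, W)
    out = gamma.reshape(1, C, 1, 1) * x_hat + beta.reshape(1, C, 1, 1)
    return out, (x_hat, gamma, var, eps, G)
```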
+ - y: Vector of labels, of shape (N,) where y[i] is the label for x[i] and + 0 <= y[i] < C + + Returns a tuple of: + - loss: Scalar giving the loss + - dx: Gradient of the loss with respect to x + """ + N = x.shape[0] + correct_class_scores = x[np.arange(N), y] + margins = np.maximum(0, x - correct_class_scores[:, np.newaxis] + 1.0) + margins[np.arange(N), y] = 0 + loss = np.sum(margins) / N + num_pos = np.sum(margins > 0, axis=1) + dx = np.zeros_like(x) + dx[margins > 0] = 1 + dx[np.arange(N), y] -= num_pos + dx /= N + return loss, dx + + +def softmax_loss(x, y): + """ + Computes the loss and gradient for softmax classification. + + Inputs: + - x: Input data, of shape (N, C) where x[i, j] is the score for the jth + class for the ith input. + - y: Vector of labels, of shape (N,) where y[i] is the label for x[i] and + 0 <= y[i] < C + + Returns a tuple of: + - loss: Scalar giving the loss + - dx: Gradient of the loss with respect to x + """ + shifted_logits = x - np.max(x, axis=1, keepdims=True) + Z = np.sum(np.exp(shifted_logits), axis=1, keepdims=True) + log_probs = shifted_logits - np.log(Z) + probs = np.exp(log_probs) + N = x.shape[0] + loss = -np.sum(log_probs[np.arange(N), y]) / N + dx = probs.copy() + dx[np.arange(N), y] -= 1 + dx /= N + return loss, dx diff --git a/lab_3/scripts/optim.py b/lab_3/scripts/optim.py new file mode 100644 index 0000000..2194b35 --- /dev/null +++ b/lab_3/scripts/optim.py @@ -0,0 +1,162 @@ +import numpy as np + +""" +This file implements various first-order update rules that are commonly used +for training neural networks. Each update rule accepts current weights and the +gradient of the loss with respect to those weights and produces the next set of +weights. Each update rule has the same interface: + +def update(w, dw, config=None): + +Inputs: + - w: A numpy array giving the current weights. + - dw: A numpy array of the same shape as w giving the gradient of the + loss with respect to w. + - config: A dictionary containing hyperparameter values such as learning + rate, momentum, etc. If the update rule requires caching values over many + iterations, then config will also hold these cached values. + +Returns: + - next_w: The next point after the update. + - config: The config dictionary to be passed to the next iteration of the + update rule. + +NOTE: For most update rules, the default learning rate will probably not +perform well; however the default values of the other hyperparameters should +work well for a variety of different problems. + +For efficiency, update rules may perform in-place updates, mutating w and +setting next_w equal to w. +""" + + +def sgd(w, dw, config=None): + """ + Performs vanilla stochastic gradient descent. + + config format: + - learning_rate: Scalar learning rate. + """ + if config is None: + config = {} + config.setdefault("learning_rate", 1e-2) + + w -= config["learning_rate"] * dw + return w, config + + +def sgd_momentum(w, dw, config=None): + """ + Performs stochastic gradient descent with momentum. + + config format: + - learning_rate: Scalar learning rate. + - momentum: Scalar between 0 and 1 giving the momentum value. + Setting momentum = 0 reduces to sgd. + - velocity: A numpy array of the same shape as w and dw used to store a + moving average of the gradients. 
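Since svm_loss and softmax_loss above are already implemented, they make a handy target for a quick finite-difference sanity check; a minimal, self-contained sketch (the printed tolerance is indicative, not a graded threshold):

```python
import numpy as np
from scripts.layers import softmax_loss

np.random.seed(0)
x = np.random.randn(5, 4)
y = np.random.randint(4, size=5)
loss, dx = softmax_loss(x, y)

# Central finite differences on every entry of x.
h = 1e-6
dx_num = np.zeros_like(x)
for idx in np.ndindex(*x.shape):
    old = x[idx]
    x[idx] = old + h
    loss_plus, _ = softmax_loss(x, y)
    x[idx] = old - h
    loss_minus, _ = softmax_loss(x, y)
    x[idx] = old
    dx_num[idx] = (loss_plus - loss_minus) / (2 * h)

print("max abs diff:", np.max(np.abs(dx - dx_num)))  # expected to be tiny (~1e-9)
```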
+ """ + if config is None: + config = {} + config.setdefault("learning_rate", 1e-2) + config.setdefault("momentum", 0.9) + v = config.get("velocity", np.zeros_like(w)) + + next_w = None + ########################################################################### + # TODO: Implement the momentum update formula. Store the updated value in # + # the next_w variable. You should also use and update the velocity v. # + ########################################################################### + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ########################################################################### + # END OF YOUR CODE # + ########################################################################### + config["velocity"] = v + + return next_w, config + + +def rmsprop(w, dw, config=None): + """ + Uses the RMSProp update rule, which uses a moving average of squared + gradient values to set adaptive per-parameter learning rates. + + config format: + - learning_rate: Scalar learning rate. + - decay_rate: Scalar between 0 and 1 giving the decay rate for the squared + gradient cache. + - epsilon: Small scalar used for smoothing to avoid dividing by zero. + - cache: Moving average of second moments of gradients. + """ + if config is None: + config = {} + config.setdefault("learning_rate", 1e-2) + config.setdefault("decay_rate", 0.99) + config.setdefault("epsilon", 1e-8) + config.setdefault("cache", np.zeros_like(w)) + + next_w = None + ########################################################################### + # TODO: Implement the RMSprop update formula, storing the next value of w # + # in the next_w variable. Don't forget to update cache value stored in # + # config['cache']. # + ########################################################################### + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ########################################################################### + # END OF YOUR CODE # + ########################################################################### + + return next_w, config + + +def adam(w, dw, config=None): + """ + Uses the Adam update rule, which incorporates moving averages of both the + gradient and its square and a bias correction term. + + config format: + - learning_rate: Scalar learning rate. + - beta1: Decay rate for moving average of first moment of gradient. + - beta2: Decay rate for moving average of second moment of gradient. + - epsilon: Small scalar used for smoothing to avoid dividing by zero. + - m: Moving average of gradient. + - v: Moving average of squared gradient. + - t: Iteration number. + """ + if config is None: + config = {} + config.setdefault("learning_rate", 1e-3) + config.setdefault("beta1", 0.9) + config.setdefault("beta2", 0.999) + config.setdefault("epsilon", 1e-8) + config.setdefault("m", np.zeros_like(w)) + config.setdefault("v", np.zeros_like(w)) + config.setdefault("t", 0) + + next_w = None + ########################################################################### + # TODO: Implement the Adam update formula, storing the next value of w in # + # the next_w variable. Don't forget to update the m, v, and t variables # + # stored in config. # + # # + # NOTE: In order to match the reference output, please modify t _before_ # + # using it in any calculations. 
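Sketches of the momentum and RMSProp updates whose TODOs appear above (config keys follow the docstrings; these are illustrative, not the reference implementations):

```python
import numpy as np

def sgd_momentum_sketch(w, dw, config):
    # v accumulates an exponentially weighted history of past gradients.
    v = config.get("velocity", np.zeros_like(w))
    v = config["momentum"] * v - config["learning_rate"] * dw
    next_w = w + v
    config["velocity"] = v
    return next_w, config

def rmsprop_sketch(w, dw, config):
    # Per-parameter step sizes from a moving average of squared gradients.
    config["cache"] = (config["decay_rate"] * config["cache"]
                       + (1 - config["decay_rate"]) * dw ** 2)
    next_w = w - (config["learning_rate"] * dw
                  / (np.sqrt(config["cache"]) + config["epsilon"]))
    return next_w, config
```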
# + ########################################################################### + # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + + pass + + # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** + ########################################################################### + # END OF YOUR CODE # + ########################################################################### + + return next_w, config diff --git a/lab_3/scripts/setup.py b/lab_3/scripts/setup.py new file mode 100644 index 0000000..569bf77 --- /dev/null +++ b/lab_3/scripts/setup.py @@ -0,0 +1,12 @@ +from distutils.core import setup +from distutils.extension import Extension +from Cython.Build import cythonize +import numpy + +extensions = [ + Extension( + "im2col_cython", ["im2col_cython.pyx"], include_dirs=[numpy.get_include()] + ), +] + +setup(ext_modules=cythonize(extensions),) diff --git a/lab_3/scripts/solver.py b/lab_3/scripts/solver.py new file mode 100644 index 0000000..f797e21 --- /dev/null +++ b/lab_3/scripts/solver.py @@ -0,0 +1,309 @@ +from __future__ import print_function, division +from future import standard_library + +standard_library.install_aliases() +from builtins import range +from builtins import object +import os +import pickle as pickle + +import numpy as np + +from scripts import optim + + +class Solver(object): + """ + A Solver encapsulates all the logic necessary for training classification + models. The Solver performs stochastic gradient descent using different + update rules defined in optim.py. + + The solver accepts both training and validataion data and labels so it can + periodically check classification accuracy on both training and validation + data to watch out for overfitting. + + To train a model, you will first construct a Solver instance, passing the + model, dataset, and various options (learning rate, batch size, etc) to the + constructor. You will then call the train() method to run the optimization + procedure and train the model. + + After the train() method returns, model.params will contain the parameters + that performed best on the validation set over the course of training. + In addition, the instance variable solver.loss_history will contain a list + of all losses encountered during training and the instance variables + solver.train_acc_history and solver.val_acc_history will be lists of the + accuracies of the model on the training and validation set at each epoch. + + Example usage might look something like this: + + data = { + 'X_train': # training data + 'y_train': # training labels + 'X_val': # validation data + 'y_val': # validation labels + } + model = MyAwesomeModel(hidden_size=100, reg=10) + solver = Solver(model, data, + update_rule='sgd', + optim_config={ + 'learning_rate': 1e-3, + }, + lr_decay=0.95, + num_epochs=10, batch_size=100, + print_every=100) + solver.train() + + + A Solver works on a model object that must conform to the following API: + + - model.params must be a dictionary mapping string parameter names to numpy + arrays containing parameter values. + + - model.loss(X, y) must be a function that computes training-time loss and + gradients, and test-time classification scores, with the following inputs + and outputs: + + Inputs: + - X: Array giving a minibatch of input data of shape (N, d_1, ..., d_k) + - y: Array of labels, of shape (N,) giving labels for X where y[i] is the + label for X[i]. 
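The Adam rule completes the same pattern; a sketch with t incremented before use, as the note above asks (again illustrative, not the reference solution):

```python
import numpy as np

def adam_sketch(w, dw, config):
    config["t"] += 1  # increment the step counter before it is used
    lr, b1, b2, eps = (config["learning_rate"], config["beta1"],
                       config["beta2"], config["epsilon"])
    config["m"] = b1 * config["m"] + (1 - b1) * dw
    config["v"] = b2 * config["v"] + (1 - b2) * dw ** 2
    m_hat = config["m"] / (1 - b1 ** config["t"])   # bias-corrected first moment
    v_hat = config["v"] / (1 - b2 ** config["t"])   # bias-corrected second moment
    next_w = w - lr * m_hat / (np.sqrt(v_hat) + eps)
    return next_w, config
```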
+ + Returns: + If y is None, run a test-time forward pass and return: + - scores: Array of shape (N, C) giving classification scores for X where + scores[i, c] gives the score of class c for X[i]. + + If y is not None, run a training time forward and backward pass and + return a tuple of: + - loss: Scalar giving the loss + - grads: Dictionary with the same keys as self.params mapping parameter + names to gradients of the loss with respect to those parameters. + """ + + def __init__(self, model, data, **kwargs): + """ + Construct a new Solver instance. + + Required arguments: + - model: A model object conforming to the API described above + - data: A dictionary of training and validation data containing: + 'X_train': Array, shape (N_train, d_1, ..., d_k) of training images + 'X_val': Array, shape (N_val, d_1, ..., d_k) of validation images + 'y_train': Array, shape (N_train,) of labels for training images + 'y_val': Array, shape (N_val,) of labels for validation images + + Optional arguments: + - update_rule: A string giving the name of an update rule in optim.py. + Default is 'sgd'. + - optim_config: A dictionary containing hyperparameters that will be + passed to the chosen update rule. Each update rule requires different + hyperparameters (see optim.py) but all update rules require a + 'learning_rate' parameter so that should always be present. + - lr_decay: A scalar for learning rate decay; after each epoch the + learning rate is multiplied by this value. + - batch_size: Size of minibatches used to compute loss and gradient + during training. + - num_epochs: The number of epochs to run for during training. + - print_every: Integer; training losses will be printed every + print_every iterations. + - verbose: Boolean; if set to false then no output will be printed + during training. + - num_train_samples: Number of training samples used to check training + accuracy; default is 1000; set to None to use entire training set. + - num_val_samples: Number of validation samples to use to check val + accuracy; default is None, which uses the entire validation set. + - checkpoint_name: If not None, then save model checkpoints here every + epoch. + """ + self.model = model + self.X_train = data["X_train"] + self.y_train = data["y_train"] + self.X_val = data["X_val"] + self.y_val = data["y_val"] + + # Unpack keyword arguments + self.update_rule = kwargs.pop("update_rule", "sgd") + self.optim_config = kwargs.pop("optim_config", {}) + self.lr_decay = kwargs.pop("lr_decay", 1.0) + self.batch_size = kwargs.pop("batch_size", 100) + self.num_epochs = kwargs.pop("num_epochs", 10) + self.num_train_samples = kwargs.pop("num_train_samples", 1000) + self.num_val_samples = kwargs.pop("num_val_samples", None) + + self.checkpoint_name = kwargs.pop("checkpoint_name", None) + self.print_every = kwargs.pop("print_every", 10) + self.verbose = kwargs.pop("verbose", True) + + # Throw an error if there are extra keyword arguments + if len(kwargs) > 0: + extra = ", ".join('"%s"' % k for k in list(kwargs.keys())) + raise ValueError("Unrecognized arguments %s" % extra) + + # Make sure the update rule exists, then replace the string + # name with the actual function + if not hasattr(optim, self.update_rule): + raise ValueError('Invalid update_rule "%s"' % self.update_rule) + self.update_rule = getattr(optim, self.update_rule) + + self._reset() + + def _reset(self): + """ + Set up some book-keeping variables for optimization. Don't call this + manually. 
+ """ + # Set up some variables for book-keeping + self.epoch = 0 + self.best_val_acc = 0 + self.best_params = {} + self.loss_history = [] + self.train_acc_history = [] + self.val_acc_history = [] + + # Make a deep copy of the optim_config for each parameter + self.optim_configs = {} + for p in self.model.params: + d = {k: v for k, v in self.optim_config.items()} + self.optim_configs[p] = d + + def _step(self): + """ + Make a single gradient update. This is called by train() and should not + be called manually. + """ + # Make a minibatch of training data + num_train = self.X_train.shape[0] + batch_mask = np.random.choice(num_train, self.batch_size) + X_batch = self.X_train[batch_mask] + y_batch = self.y_train[batch_mask] + + # Compute loss and gradient + loss, grads = self.model.loss(X_batch, y_batch) + self.loss_history.append(loss) + + # Perform a parameter update + for p, w in self.model.params.items(): + dw = grads[p] + config = self.optim_configs[p] + next_w, next_config = self.update_rule(w, dw, config) + self.model.params[p] = next_w + self.optim_configs[p] = next_config + + def _save_checkpoint(self): + if self.checkpoint_name is None: + return + checkpoint = { + "model": self.model, + "update_rule": self.update_rule, + "lr_decay": self.lr_decay, + "optim_config": self.optim_config, + "batch_size": self.batch_size, + "num_train_samples": self.num_train_samples, + "num_val_samples": self.num_val_samples, + "epoch": self.epoch, + "loss_history": self.loss_history, + "train_acc_history": self.train_acc_history, + "val_acc_history": self.val_acc_history, + } + filename = "%s_epoch_%d.pkl" % (self.checkpoint_name, self.epoch) + if self.verbose: + print('Saving checkpoint to "%s"' % filename) + with open(filename, "wb") as f: + pickle.dump(checkpoint, f) + + def check_accuracy(self, X, y, num_samples=None, batch_size=100): + """ + Check accuracy of the model on the provided data. + + Inputs: + - X: Array of data, of shape (N, d_1, ..., d_k) + - y: Array of labels, of shape (N,) + - num_samples: If not None, subsample the data and only test the model + on num_samples datapoints. + - batch_size: Split X and y into batches of this size to avoid using + too much memory. + + Returns: + - acc: Scalar giving the fraction of instances that were correctly + classified by the model. + """ + + # Maybe subsample the data + N = X.shape[0] + if num_samples is not None and N > num_samples: + mask = np.random.choice(N, num_samples) + N = num_samples + X = X[mask] + y = y[mask] + + # Compute predictions in batches + num_batches = N // batch_size + if N % batch_size != 0: + num_batches += 1 + y_pred = [] + for i in range(num_batches): + start = i * batch_size + end = (i + 1) * batch_size + scores = self.model.loss(X[start:end]) + y_pred.append(np.argmax(scores, axis=1)) + y_pred = np.hstack(y_pred) + acc = np.mean(y_pred == y) + + return acc + + def train(self): + """ + Run optimization to train the model. + """ + num_train = self.X_train.shape[0] + iterations_per_epoch = max(num_train // self.batch_size, 1) + num_iterations = self.num_epochs * iterations_per_epoch + + for t in range(num_iterations): + self._step() + + # Maybe print training loss + if self.verbose and t % self.print_every == 0: + print( + "(Iteration %d / %d) loss: %f" + % (t + 1, num_iterations, self.loss_history[-1]) + ) + + # At the end of every epoch, increment the epoch counter and decay + # the learning rate. 
+ epoch_end = (t + 1) % iterations_per_epoch == 0 + if epoch_end: + self.epoch += 1 + for k in self.optim_configs: + self.optim_configs[k]["learning_rate"] *= self.lr_decay + + # Check train and val accuracy on the first iteration, the last + # iteration, and at the end of each epoch. + first_it = t == 0 + last_it = t == num_iterations - 1 + if first_it or last_it or epoch_end: + train_acc = self.check_accuracy( + self.X_train, self.y_train, num_samples=self.num_train_samples + ) + val_acc = self.check_accuracy( + self.X_val, self.y_val, num_samples=self.num_val_samples + ) + self.train_acc_history.append(train_acc) + self.val_acc_history.append(val_acc) + self._save_checkpoint() + + if self.verbose: + print( + "(Epoch %d / %d) train acc: %f; val_acc: %f" + % (self.epoch, self.num_epochs, train_acc, val_acc) + ) + + # Keep track of the best model + if val_acc > self.best_val_acc: + self.best_val_acc = val_acc + self.best_params = {} + for k, v in self.model.params.items(): + self.best_params[k] = v.copy() + + # At the end of training swap the best params into the model + self.model.params = self.best_params diff --git a/lab_3/scripts/vis_utils.py b/lab_3/scripts/vis_utils.py new file mode 100644 index 0000000..c1049a0 --- /dev/null +++ b/lab_3/scripts/vis_utils.py @@ -0,0 +1,78 @@ +from builtins import range +from past.builtins import xrange + +from math import sqrt, ceil +import numpy as np + + +def visualize_grid(Xs, ubound=255.0, padding=1): + """ + Reshape a 4D tensor of image data to a grid for easy visualization. + + Inputs: + - Xs: Data of shape (N, H, W, C) + - ubound: Output grid will have values scaled to the range [0, ubound] + - padding: The number of blank pixels between elements of the grid + """ + (N, H, W, C) = Xs.shape + grid_size = int(ceil(sqrt(N))) + grid_height = H * grid_size + padding * (grid_size - 1) + grid_width = W * grid_size + padding * (grid_size - 1) + grid = np.zeros((grid_height, grid_width, C)) + next_idx = 0 + y0, y1 = 0, H + for y in range(grid_size): + x0, x1 = 0, W + for x in range(grid_size): + if next_idx < N: + img = Xs[next_idx] + low, high = np.min(img), np.max(img) + grid[y0:y1, x0:x1] = ubound * (img - low) / (high - low) + # grid[y0:y1, x0:x1] = Xs[next_idx] + next_idx += 1 + x0 += W + padding + x1 += W + padding + y0 += H + padding + y1 += H + padding + # grid_max = np.max(grid) + # grid_min = np.min(grid) + # grid = ubound * (grid - grid_min) / (grid_max - grid_min) + return grid + + +def vis_grid(Xs): + """ visualize a grid of images """ + (N, H, W, C) = Xs.shape + A = int(ceil(sqrt(N))) + G = np.ones((A * H + A, A * W + A, C), Xs.dtype) + G *= np.min(Xs) + n = 0 + for y in range(A): + for x in range(A): + if n < N: + G[y * H + y : (y + 1) * H + y, x * W + x : (x + 1) * W + x, :] = Xs[ + n, :, :, : + ] + n += 1 + # normalize to [0,1] + maxg = G.max() + ming = G.min() + G = (G - ming) / (maxg - ming) + return G + + +def vis_nn(rows): + """ visualize array of arrays of images """ + N = len(rows) + D = len(rows[0]) + H, W, C = rows[0][0].shape + Xs = rows[0][0] + G = np.ones((N * H + N, D * W + D, C), Xs.dtype) + for y in range(N): + for x in range(D): + G[y * H + y : (y + 1) * H + y, x * W + x : (x + 1) * W + x, :] = rows[y][x] + # normalize to [0,1] + maxg = G.max() + ming = G.min() + G = (G - ming) / (maxg - ming) + return G
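A short usage sketch for visualize_grid; random data stands in for learned filters here, so the figure content is purely illustrative:

```python
import numpy as np
import matplotlib.pyplot as plt
from scripts.vis_utils import visualize_grid

# Pack 16 random 8x8 RGB "images" (N, H, W, C) into one grid and display it.
Xs = np.random.rand(16, 8, 8, 3)
grid = visualize_grid(Xs, ubound=1.0, padding=1)
plt.imshow(grid)
plt.axis("off")
plt.show()
```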