add lab_3

master
Viktoria Kutikova 4 years ago
parent 32c19488ad
commit 6a053ab090

File diff suppressed because it is too large

@@ -0,0 +1,135 @@
from builtins import object
import numpy as np
from ..layers import *
from ..fast_layers import *
from ..layer_utils import *
class ThreeLayerConvNet(object):
"""
A three-layer convolutional network with the following architecture:
conv - relu - 2x2 max pool - affine - relu - affine - softmax
The network operates on minibatches of data that have shape (N, C, H, W)
consisting of N images, each with height H and width W and with C input
channels.
"""
def __init__(
self,
input_dim=(3, 32, 32),
num_filters=32,
filter_size=7,
hidden_dim=100,
num_classes=10,
weight_scale=1e-3,
reg=0.0,
dtype=np.float32,
):
"""
Initialize a new network.
Inputs:
- input_dim: Tuple (C, H, W) giving size of input data
- num_filters: Number of filters to use in the convolutional layer
- filter_size: Width/height of filters to use in the convolutional layer
- hidden_dim: Number of units to use in the fully-connected hidden layer
- num_classes: Number of scores to produce from the final affine layer.
- weight_scale: Scalar giving standard deviation for random initialization
of weights.
- reg: Scalar giving L2 regularization strength
- dtype: numpy datatype to use for computation.
"""
self.params = {}
self.reg = reg
self.dtype = dtype
############################################################################
# TODO: Initialize weights and biases for the three-layer convolutional #
# network. Weights should be initialized from a Gaussian centered at 0.0 #
# with standard deviation equal to weight_scale; biases should be #
# initialized to zero. All weights and biases should be stored in the #
# dictionary self.params. Store weights and biases for the convolutional #
# layer using the keys 'W1' and 'b1'; use keys 'W2' and 'b2' for the #
# weights and biases of the hidden affine layer, and keys 'W3' and 'b3' #
# for the weights and biases of the output affine layer. #
# #
# IMPORTANT: For this assignment, you can assume that the padding #
# and stride of the first convolutional layer are chosen so that #
# **the width and height of the input are preserved**. Take a look at #
# the start of the loss() function to see how that happens. #
############################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
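        # A minimal sketch, not necessarily the reference solution. Assumption:
        # the conv layer preserves H and W (see loss() below) and the 2x2/stride-2
        # pool halves them, so the flattened feature size is
        # num_filters * (H // 2) * (W // 2).
        C, H, W = input_dim
        self.params["W1"] = weight_scale * np.random.randn(
            num_filters, C, filter_size, filter_size
        )
        self.params["b1"] = np.zeros(num_filters)
        self.params["W2"] = weight_scale * np.random.randn(
            num_filters * (H // 2) * (W // 2), hidden_dim
        )
        self.params["b2"] = np.zeros(hidden_dim)
        self.params["W3"] = weight_scale * np.random.randn(hidden_dim, num_classes)
        self.params["b3"] = np.zeros(num_classes)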
pass
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
############################################################################
# END OF YOUR CODE #
############################################################################
for k, v in self.params.items():
self.params[k] = v.astype(dtype)
def loss(self, X, y=None):
"""
Evaluate loss and gradient for the three-layer convolutional network.
Input / output: Same API as TwoLayerNet in fc_net.py.
"""
W1, b1 = self.params["W1"], self.params["b1"]
W2, b2 = self.params["W2"], self.params["b2"]
W3, b3 = self.params["W3"], self.params["b3"]
# pass conv_param to the forward pass for the convolutional layer
# Padding and stride chosen to preserve the input spatial size
filter_size = W1.shape[2]
conv_param = {"stride": 1, "pad": (filter_size - 1) // 2}
# pass pool_param to the forward pass for the max-pooling layer
pool_param = {"pool_height": 2, "pool_width": 2, "stride": 2}
scores = None
############################################################################
# TODO: Implement the forward pass for the three-layer convolutional net, #
# computing the class scores for X and storing them in the scores #
# variable. #
# #
# Remember you can use the functions defined in cs231n/fast_layers.py and #
# cs231n/layer_utils.py in your implementation (already imported). #
############################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
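        # A hedged sketch of conv - relu - 2x2 max pool - affine - relu - affine,
        # assuming the helpers imported from layer_utils.py are implemented.
        # The caches are reused in the backward sketch below.
        out1, cache1 = conv_relu_pool_forward(X, W1, b1, conv_param, pool_param)
        out2, cache2 = affine_relu_forward(out1, W2, b2)
        scores, cache3 = affine_forward(out2, W3, b3)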
pass
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
############################################################################
# END OF YOUR CODE #
############################################################################
if y is None:
return scores
loss, grads = 0, {}
############################################################################
# TODO: Implement the backward pass for the three-layer convolutional net, #
# storing the loss and gradients in the loss and grads variables. Compute #
# data loss using softmax, and make sure that grads[k] holds the gradients #
# for self.params[k]. Don't forget to add L2 regularization! #
# #
# NOTE: To ensure that your implementation matches ours and you pass the #
# automated tests, make sure that your L2 regularization includes a factor #
# of 0.5 to simplify the expression for the gradient. #
############################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
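        # Sketch continued from the forward pass above, assuming softmax_loss and
        # the paired backward helpers from layers.py / layer_utils.py are implemented.
        loss, dscores = softmax_loss(scores, y)
        loss += 0.5 * self.reg * (np.sum(W1 ** 2) + np.sum(W2 ** 2) + np.sum(W3 ** 2))
        dout2, grads["W3"], grads["b3"] = affine_backward(dscores, cache3)
        dout1, grads["W2"], grads["b2"] = affine_relu_backward(dout2, cache2)
        _, grads["W1"], grads["b1"] = conv_relu_pool_backward(dout1, cache1)
        grads["W1"] += self.reg * W1
        grads["W2"] += self.reg * W2
        grads["W3"] += self.reg * W3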
pass
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
############################################################################
# END OF YOUR CODE #
############################################################################
return loss, grads

@@ -0,0 +1,291 @@
from builtins import range
from builtins import object
import numpy as np
from ..layers import *
from ..layer_utils import *
class TwoLayerNet(object):
"""
A two-layer fully-connected neural network with ReLU nonlinearity and
softmax loss that uses a modular layer design. We assume an input dimension
of D, a hidden dimension of H, and perform classification over C classes.
    The architecture should be affine - relu - affine - softmax.
Note that this class does not implement gradient descent; instead, it
will interact with a separate Solver object that is responsible for running
optimization.
The learnable parameters of the model are stored in the dictionary
self.params that maps parameter names to numpy arrays.
"""
def __init__(
self,
input_dim=3 * 32 * 32,
hidden_dim=100,
num_classes=10,
weight_scale=1e-3,
reg=0.0,
):
"""
Initialize a new network.
Inputs:
- input_dim: An integer giving the size of the input
- hidden_dim: An integer giving the size of the hidden layer
- num_classes: An integer giving the number of classes to classify
- weight_scale: Scalar giving the standard deviation for random
initialization of the weights.
- reg: Scalar giving L2 regularization strength.
"""
self.params = {}
self.reg = reg
############################################################################
# TODO: Initialize the weights and biases of the two-layer net. Weights #
# should be initialized from a Gaussian centered at 0.0 with #
# standard deviation equal to weight_scale, and biases should be #
# initialized to zero. All weights and biases should be stored in the #
# dictionary self.params, with first layer weights #
# and biases using the keys 'W1' and 'b1' and second layer #
# weights and biases using the keys 'W2' and 'b2'. #
############################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
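        # A minimal sketch, not necessarily the reference solution:
        # Gaussian(0, weight_scale) weights and zero biases.
        self.params["W1"] = weight_scale * np.random.randn(input_dim, hidden_dim)
        self.params["b1"] = np.zeros(hidden_dim)
        self.params["W2"] = weight_scale * np.random.randn(hidden_dim, num_classes)
        self.params["b2"] = np.zeros(num_classes)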
pass
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
############################################################################
# END OF YOUR CODE #
############################################################################
def loss(self, X, y=None):
"""
Compute loss and gradient for a minibatch of data.
Inputs:
- X: Array of input data of shape (N, d_1, ..., d_k)
- y: Array of labels, of shape (N,). y[i] gives the label for X[i].
Returns:
If y is None, then run a test-time forward pass of the model and return:
- scores: Array of shape (N, C) giving classification scores, where
scores[i, c] is the classification score for X[i] and class c.
If y is not None, then run a training-time forward and backward pass and
return a tuple of:
- loss: Scalar value giving the loss
- grads: Dictionary with the same keys as self.params, mapping parameter
names to gradients of the loss with respect to those parameters.
"""
scores = None
############################################################################
# TODO: Implement the forward pass for the two-layer net, computing the #
# class scores for X and storing them in the scores variable. #
############################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
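        # Sketch of affine - relu - affine, assuming the helpers in
        # layer_utils.py / layers.py are implemented; caches feed the backward pass.
        h, cache1 = affine_relu_forward(X, self.params["W1"], self.params["b1"])
        scores, cache2 = affine_forward(h, self.params["W2"], self.params["b2"])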
pass
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
############################################################################
# END OF YOUR CODE #
############################################################################
# If y is None then we are in test mode so just return scores
if y is None:
return scores
loss, grads = 0, {}
############################################################################
# TODO: Implement the backward pass for the two-layer net. Store the loss #
# in the loss variable and gradients in the grads dictionary. Compute data #
# loss using softmax, and make sure that grads[k] holds the gradients for #
# self.params[k]. Don't forget to add L2 regularization! #
# #
# NOTE: To ensure that your implementation matches ours and you pass the #
# automated tests, make sure that your L2 regularization includes a factor #
# of 0.5 to simplify the expression for the gradient. #
############################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
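        # Sketch continued from the forward pass above.
        W1, W2 = self.params["W1"], self.params["W2"]
        loss, dscores = softmax_loss(scores, y)
        loss += 0.5 * self.reg * (np.sum(W1 ** 2) + np.sum(W2 ** 2))
        dh, grads["W2"], grads["b2"] = affine_backward(dscores, cache2)
        _, grads["W1"], grads["b1"] = affine_relu_backward(dh, cache1)
        grads["W1"] += self.reg * W1
        grads["W2"] += self.reg * W2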
pass
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
############################################################################
# END OF YOUR CODE #
############################################################################
return loss, grads
class FullyConnectedNet(object):
"""
A fully-connected neural network with an arbitrary number of hidden layers,
ReLU nonlinearities, and a softmax loss function. This will also implement
dropout and batch/layer normalization as options. For a network with L layers,
the architecture will be
{affine - [batch/layer norm] - relu - [dropout]} x (L - 1) - affine - softmax
where batch/layer normalization and dropout are optional, and the {...} block is
repeated L - 1 times.
Similar to the TwoLayerNet above, learnable parameters are stored in the
self.params dictionary and will be learned using the Solver class.
"""
def __init__(
self,
hidden_dims,
input_dim=3 * 32 * 32,
num_classes=10,
dropout=1,
normalization=None,
reg=0.0,
weight_scale=1e-2,
dtype=np.float32,
seed=None,
):
"""
Initialize a new FullyConnectedNet.
Inputs:
- hidden_dims: A list of integers giving the size of each hidden layer.
- input_dim: An integer giving the size of the input.
- num_classes: An integer giving the number of classes to classify.
- dropout: Scalar between 0 and 1 giving dropout strength. If dropout=1 then
the network should not use dropout at all.
- normalization: What type of normalization the network should use. Valid values
are "batchnorm", "layernorm", or None for no normalization (the default).
- reg: Scalar giving L2 regularization strength.
- weight_scale: Scalar giving the standard deviation for random
initialization of the weights.
- dtype: A numpy datatype object; all computations will be performed using
this datatype. float32 is faster but less accurate, so you should use
float64 for numeric gradient checking.
- seed: If not None, then pass this random seed to the dropout layers. This
        will make the dropout layers deterministic so we can gradient check the
model.
"""
self.normalization = normalization
self.use_dropout = dropout != 1
self.reg = reg
self.num_layers = 1 + len(hidden_dims)
self.dtype = dtype
self.params = {}
############################################################################
# TODO: Initialize the parameters of the network, storing all values in #
# the self.params dictionary. Store weights and biases for the first layer #
# in W1 and b1; for the second layer use W2 and b2, etc. Weights should be #
# initialized from a normal distribution centered at 0 with standard #
# deviation equal to weight_scale. Biases should be initialized to zero. #
# #
# When using batch normalization, store scale and shift parameters for the #
# first layer in gamma1 and beta1; for the second layer use gamma2 and #
# beta2, etc. Scale parameters should be initialized to ones and shift #
# parameters should be initialized to zeros. #
############################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
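        # A hedged sketch: layer sizes are [input_dim] + hidden_dims + [num_classes];
        # gamma/beta are created only for the hidden layers when normalization is on.
        dims = [input_dim] + hidden_dims + [num_classes]
        for i in range(self.num_layers):
            self.params["W%d" % (i + 1)] = weight_scale * np.random.randn(dims[i], dims[i + 1])
            self.params["b%d" % (i + 1)] = np.zeros(dims[i + 1])
            if self.normalization is not None and i < self.num_layers - 1:
                self.params["gamma%d" % (i + 1)] = np.ones(dims[i + 1])
                self.params["beta%d" % (i + 1)] = np.zeros(dims[i + 1])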
pass
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
############################################################################
# END OF YOUR CODE #
############################################################################
# When using dropout we need to pass a dropout_param dictionary to each
# dropout layer so that the layer knows the dropout probability and the mode
# (train / test). You can pass the same dropout_param to each dropout layer.
self.dropout_param = {}
if self.use_dropout:
self.dropout_param = {"mode": "train", "p": dropout}
if seed is not None:
self.dropout_param["seed"] = seed
# With batch normalization we need to keep track of running means and
# variances, so we need to pass a special bn_param object to each batch
# normalization layer. You should pass self.bn_params[0] to the forward pass
# of the first batch normalization layer, self.bn_params[1] to the forward
# pass of the second batch normalization layer, etc.
self.bn_params = []
if self.normalization == "batchnorm":
self.bn_params = [{"mode": "train"} for i in range(self.num_layers - 1)]
if self.normalization == "layernorm":
self.bn_params = [{} for i in range(self.num_layers - 1)]
# Cast all parameters to the correct datatype
for k, v in self.params.items():
self.params[k] = v.astype(dtype)
def loss(self, X, y=None):
"""
Compute loss and gradient for the fully-connected net.
Input / output: Same as TwoLayerNet above.
"""
X = X.astype(self.dtype)
mode = "test" if y is None else "train"
# Set train/test mode for batchnorm params and dropout param since they
# behave differently during training and testing.
if self.use_dropout:
self.dropout_param["mode"] = mode
if self.normalization == "batchnorm":
for bn_param in self.bn_params:
bn_param["mode"] = mode
scores = None
############################################################################
# TODO: Implement the forward pass for the fully-connected net, computing #
# the class scores for X and storing them in the scores variable. #
# #
# When using dropout, you'll need to pass self.dropout_param to each #
# dropout forward pass. #
# #
# When using batch normalization, you'll need to pass self.bn_params[0] to #
# the forward pass for the first batch normalization layer, pass #
# self.bn_params[1] to the forward pass for the second batch normalization #
# layer, etc. #
############################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
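        # Hedged sketch for the plain configuration only (no normalization, no
        # dropout); the batch/layer norm and dropout branches are omitted here.
        caches = []
        h = X
        for i in range(1, self.num_layers):
            h, c = affine_relu_forward(h, self.params["W%d" % i], self.params["b%d" % i])
            caches.append(c)
        scores, c = affine_forward(
            h, self.params["W%d" % self.num_layers], self.params["b%d" % self.num_layers]
        )
        caches.append(c)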
pass
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
############################################################################
# END OF YOUR CODE #
############################################################################
# If test mode return early
if mode == "test":
return scores
loss, grads = 0.0, {}
############################################################################
# TODO: Implement the backward pass for the fully-connected net. Store the #
# loss in the loss variable and gradients in the grads dictionary. Compute #
# data loss using softmax, and make sure that grads[k] holds the gradients #
# for self.params[k]. Don't forget to add L2 regularization! #
# #
# When using batch/layer normalization, you don't need to regularize the scale #
# and shift parameters. #
# #
# NOTE: To ensure that your implementation matches ours and you pass the #
# automated tests, make sure that your L2 regularization includes a factor #
# of 0.5 to simplify the expression for the gradient. #
############################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
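        # Sketch continued from the plain-case forward pass above.
        loss, dout = softmax_loss(scores, y)
        for i in range(1, self.num_layers + 1):
            loss += 0.5 * self.reg * np.sum(self.params["W%d" % i] ** 2)
        dout, dW, db = affine_backward(dout, caches.pop())
        grads["W%d" % self.num_layers] = dW + self.reg * self.params["W%d" % self.num_layers]
        grads["b%d" % self.num_layers] = db
        for i in range(self.num_layers - 1, 0, -1):
            dout, dW, db = affine_relu_backward(dout, caches.pop())
            grads["W%d" % i] = dW + self.reg * self.params["W%d" % i]
            grads["b%d" % i] = db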
pass
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
############################################################################
# END OF YOUR CODE #
############################################################################
return loss, grads

@@ -0,0 +1,270 @@
from __future__ import print_function
from builtins import range
from six.moves import cPickle as pickle
import numpy as np
import os
from imageio import imread
import platform
def load_pickle(f):
version = platform.python_version_tuple()
if version[0] == "2":
return pickle.load(f)
elif version[0] == "3":
return pickle.load(f, encoding="latin1")
raise ValueError("invalid python version: {}".format(version))
def load_CIFAR_batch(filename):
""" load single batch of cifar """
with open(filename, "rb") as f:
datadict = load_pickle(f)
X = datadict["data"]
Y = datadict["labels"]
X = X.reshape(10000, 3, 32, 32).transpose(0, 2, 3, 1).astype("float")
Y = np.array(Y)
return X, Y
def load_CIFAR10(ROOT):
""" load all of cifar """
xs = []
ys = []
for b in range(1, 6):
f = os.path.join(ROOT, "data_batch_%d" % (b,))
X, Y = load_CIFAR_batch(f)
xs.append(X)
ys.append(Y)
Xtr = np.concatenate(xs)
Ytr = np.concatenate(ys)
del X, Y
Xte, Yte = load_CIFAR_batch(os.path.join(ROOT, "test_batch"))
return Xtr, Ytr, Xte, Yte
def get_CIFAR10_data(
num_training=49000, num_validation=1000, num_test=1000, subtract_mean=True
):
"""
Load the CIFAR-10 dataset from disk and perform preprocessing to prepare
it for classifiers. These are the same steps as we used for the SVM, but
condensed to a single function.
"""
# Load the raw CIFAR-10 data
cifar10_dir = os.path.join(
os.path.dirname(__file__), "datasets/cifar-10-batches-py"
)
X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)
# Subsample the data
mask = list(range(num_training, num_training + num_validation))
X_val = X_train[mask]
y_val = y_train[mask]
mask = list(range(num_training))
X_train = X_train[mask]
y_train = y_train[mask]
mask = list(range(num_test))
X_test = X_test[mask]
y_test = y_test[mask]
# Normalize the data: subtract the mean image
if subtract_mean:
mean_image = np.mean(X_train, axis=0)
X_train -= mean_image
X_val -= mean_image
X_test -= mean_image
# Transpose so that channels come first
X_train = X_train.transpose(0, 3, 1, 2).copy()
X_val = X_val.transpose(0, 3, 1, 2).copy()
X_test = X_test.transpose(0, 3, 1, 2).copy()
# Package data into a dictionary
return {
"X_train": X_train,
"y_train": y_train,
"X_val": X_val,
"y_val": y_val,
"X_test": X_test,
"y_test": y_test,
}
def load_tiny_imagenet(path, dtype=np.float32, subtract_mean=True):
"""
Load TinyImageNet. Each of TinyImageNet-100-A, TinyImageNet-100-B, and
TinyImageNet-200 have the same directory structure, so this can be used
to load any of them.
Inputs:
- path: String giving path to the directory to load.
- dtype: numpy datatype used to load the data.
- subtract_mean: Whether to subtract the mean training image.
Returns: A dictionary with the following entries:
- class_names: A list where class_names[i] is a list of strings giving the
WordNet names for class i in the loaded dataset.
- X_train: (N_tr, 3, 64, 64) array of training images
- y_train: (N_tr,) array of training labels
- X_val: (N_val, 3, 64, 64) array of validation images
- y_val: (N_val,) array of validation labels
- X_test: (N_test, 3, 64, 64) array of testing images.
- y_test: (N_test,) array of test labels; if test labels are not available
(such as in student code) then y_test will be None.
- mean_image: (3, 64, 64) array giving mean training image
"""
# First load wnids
with open(os.path.join(path, "wnids.txt"), "r") as f:
wnids = [x.strip() for x in f]
# Map wnids to integer labels
wnid_to_label = {wnid: i for i, wnid in enumerate(wnids)}
# Use words.txt to get names for each class
with open(os.path.join(path, "words.txt"), "r") as f:
wnid_to_words = dict(line.split("\t") for line in f)
for wnid, words in wnid_to_words.items():
wnid_to_words[wnid] = [w.strip() for w in words.split(",")]
class_names = [wnid_to_words[wnid] for wnid in wnids]
# Next load training data.
X_train = []
y_train = []
for i, wnid in enumerate(wnids):
if (i + 1) % 20 == 0:
print("loading training data for synset %d / %d" % (i + 1, len(wnids)))
# To figure out the filenames we need to open the boxes file
boxes_file = os.path.join(path, "train", wnid, "%s_boxes.txt" % wnid)
with open(boxes_file, "r") as f:
filenames = [x.split("\t")[0] for x in f]
num_images = len(filenames)
X_train_block = np.zeros((num_images, 3, 64, 64), dtype=dtype)
y_train_block = wnid_to_label[wnid] * np.ones(num_images, dtype=np.int64)
for j, img_file in enumerate(filenames):
img_file = os.path.join(path, "train", wnid, "images", img_file)
img = imread(img_file)
if img.ndim == 2:
## grayscale file
img.shape = (64, 64, 1)
X_train_block[j] = img.transpose(2, 0, 1)
X_train.append(X_train_block)
y_train.append(y_train_block)
# We need to concatenate all training data
X_train = np.concatenate(X_train, axis=0)
y_train = np.concatenate(y_train, axis=0)
# Next load validation data
with open(os.path.join(path, "val", "val_annotations.txt"), "r") as f:
img_files = []
val_wnids = []
for line in f:
img_file, wnid = line.split("\t")[:2]
img_files.append(img_file)
val_wnids.append(wnid)
num_val = len(img_files)
y_val = np.array([wnid_to_label[wnid] for wnid in val_wnids])
X_val = np.zeros((num_val, 3, 64, 64), dtype=dtype)
for i, img_file in enumerate(img_files):
img_file = os.path.join(path, "val", "images", img_file)
img = imread(img_file)
if img.ndim == 2:
img.shape = (64, 64, 1)
X_val[i] = img.transpose(2, 0, 1)
# Next load test images
# Students won't have test labels, so we need to iterate over files in the
# images directory.
img_files = os.listdir(os.path.join(path, "test", "images"))
X_test = np.zeros((len(img_files), 3, 64, 64), dtype=dtype)
for i, img_file in enumerate(img_files):
img_file = os.path.join(path, "test", "images", img_file)
img = imread(img_file)
if img.ndim == 2:
img.shape = (64, 64, 1)
X_test[i] = img.transpose(2, 0, 1)
y_test = None
y_test_file = os.path.join(path, "test", "test_annotations.txt")
if os.path.isfile(y_test_file):
with open(y_test_file, "r") as f:
img_file_to_wnid = {}
for line in f:
line = line.split("\t")
img_file_to_wnid[line[0]] = line[1]
y_test = [wnid_to_label[img_file_to_wnid[img_file]] for img_file in img_files]
y_test = np.array(y_test)
mean_image = X_train.mean(axis=0)
if subtract_mean:
X_train -= mean_image[None]
X_val -= mean_image[None]
X_test -= mean_image[None]
return {
"class_names": class_names,
"X_train": X_train,
"y_train": y_train,
"X_val": X_val,
"y_val": y_val,
"X_test": X_test,
"y_test": y_test,
"class_names": class_names,
"mean_image": mean_image,
}
def load_models(models_dir):
"""
Load saved models from disk. This will attempt to unpickle all files in a
directory; any files that give errors on unpickling (such as README.txt)
will be skipped.
Inputs:
- models_dir: String giving the path to a directory containing model files.
Each model file is a pickled dictionary with a 'model' field.
Returns:
A dictionary mapping model file names to models.
"""
models = {}
for model_file in os.listdir(models_dir):
with open(os.path.join(models_dir, model_file), "rb") as f:
try:
models[model_file] = load_pickle(f)["model"]
except pickle.UnpicklingError:
continue
return models
def load_imagenet_val(num=None):
"""Load a handful of validation images from ImageNet.
Inputs:
- num: Number of images to load (max of 25)
Returns:
- X: numpy array with shape [num, 224, 224, 3]
- y: numpy array of integer image labels, shape [num]
- class_names: dict mapping integer label to class name
"""
imagenet_fn = os.path.join(
os.path.dirname(__file__), "datasets/imagenet_val_25.npz"
)
if not os.path.isfile(imagenet_fn):
print("file %s not found" % imagenet_fn)
print("Run the following:")
print("cd cs231n/datasets")
print("bash get_imagenet_val.sh")
assert False, "Need to download imagenet_val_25.npz"
f = np.load(imagenet_fn)
X = f["X"]
y = f["y"]
class_names = f["label_map"].item()
if num is not None:
X = X[:num]
y = y[:num]
return X, y, class_names

@@ -0,0 +1,5 @@
if [ ! -d "cifar-10-batches-py" ]; then
wget http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz -O cifar-10-python.tar.gz
tar -xzvf cifar-10-python.tar.gz
rm cifar-10-python.tar.gz
fi

@@ -0,0 +1,283 @@
from __future__ import print_function
import numpy as np
try:
from .im2col_cython import col2im_cython, im2col_cython
from .im2col_cython import col2im_6d_cython
except ImportError:
print("""=========== You can safely ignore the message below if you are NOT working on ConvolutionalNetworks.ipynb ===========""")
print("\tYou will need to compile a Cython extension for a portion of this assignment.")
print("\tThe instructions to do this will be given in a section of the notebook below.")
print("\tThere will be an option for Colab users and another for Jupyter (local) users.")
from .im2col import *
def conv_forward_im2col(x, w, b, conv_param):
"""
A fast implementation of the forward pass for a convolutional layer
based on im2col and col2im.
"""
N, C, H, W = x.shape
num_filters, _, filter_height, filter_width = w.shape
stride, pad = conv_param["stride"], conv_param["pad"]
# Check dimensions
assert (W + 2 * pad - filter_width) % stride == 0, "width does not work"
assert (H + 2 * pad - filter_height) % stride == 0, "height does not work"
# Create output
out_height = (H + 2 * pad - filter_height) // stride + 1
out_width = (W + 2 * pad - filter_width) // stride + 1
out = np.zeros((N, num_filters, out_height, out_width), dtype=x.dtype)
# x_cols = im2col_indices(x, w.shape[2], w.shape[3], pad, stride)
x_cols = im2col_cython(x, w.shape[2], w.shape[3], pad, stride)
res = w.reshape((w.shape[0], -1)).dot(x_cols) + b.reshape(-1, 1)
out = res.reshape(w.shape[0], out.shape[2], out.shape[3], x.shape[0])
out = out.transpose(3, 0, 1, 2)
cache = (x, w, b, conv_param, x_cols)
return out, cache
def conv_forward_strides(x, w, b, conv_param):
N, C, H, W = x.shape
F, _, HH, WW = w.shape
stride, pad = conv_param["stride"], conv_param["pad"]
# Check dimensions
# assert (W + 2 * pad - WW) % stride == 0, 'width does not work'
# assert (H + 2 * pad - HH) % stride == 0, 'height does not work'
# Pad the input
p = pad
x_padded = np.pad(x, ((0, 0), (0, 0), (p, p), (p, p)), mode="constant")
# Figure out output dimensions
H += 2 * pad
W += 2 * pad
out_h = (H - HH) // stride + 1
out_w = (W - WW) // stride + 1
# Perform an im2col operation by picking clever strides
shape = (C, HH, WW, N, out_h, out_w)
strides = (H * W, W, 1, C * H * W, stride * W, stride)
strides = x.itemsize * np.array(strides)
x_stride = np.lib.stride_tricks.as_strided(x_padded, shape=shape, strides=strides)
x_cols = np.ascontiguousarray(x_stride)
x_cols.shape = (C * HH * WW, N * out_h * out_w)
# Now all our convolutions are a big matrix multiply
res = w.reshape(F, -1).dot(x_cols) + b.reshape(-1, 1)
# Reshape the output
res.shape = (F, N, out_h, out_w)
out = res.transpose(1, 0, 2, 3)
# Be nice and return a contiguous array
# The old version of conv_forward_fast doesn't do this, so for a fair
# comparison we won't either
out = np.ascontiguousarray(out)
cache = (x, w, b, conv_param, x_cols)
return out, cache
def conv_backward_strides(dout, cache):
x, w, b, conv_param, x_cols = cache
stride, pad = conv_param["stride"], conv_param["pad"]
N, C, H, W = x.shape
F, _, HH, WW = w.shape
_, _, out_h, out_w = dout.shape
db = np.sum(dout, axis=(0, 2, 3))
dout_reshaped = dout.transpose(1, 0, 2, 3).reshape(F, -1)
dw = dout_reshaped.dot(x_cols.T).reshape(w.shape)
dx_cols = w.reshape(F, -1).T.dot(dout_reshaped)
dx_cols.shape = (C, HH, WW, N, out_h, out_w)
dx = col2im_6d_cython(dx_cols, N, C, H, W, HH, WW, pad, stride)
return dx, dw, db
def conv_backward_im2col(dout, cache):
"""
A fast implementation of the backward pass for a convolutional layer
based on im2col and col2im.
"""
x, w, b, conv_param, x_cols = cache
stride, pad = conv_param["stride"], conv_param["pad"]
db = np.sum(dout, axis=(0, 2, 3))
num_filters, _, filter_height, filter_width = w.shape
dout_reshaped = dout.transpose(1, 2, 3, 0).reshape(num_filters, -1)
dw = dout_reshaped.dot(x_cols.T).reshape(w.shape)
dx_cols = w.reshape(num_filters, -1).T.dot(dout_reshaped)
# dx = col2im_indices(dx_cols, x.shape, filter_height, filter_width, pad, stride)
dx = col2im_cython(
dx_cols,
x.shape[0],
x.shape[1],
x.shape[2],
x.shape[3],
filter_height,
filter_width,
pad,
stride,
)
return dx, dw, db
conv_forward_fast = conv_forward_strides
conv_backward_fast = conv_backward_strides
def max_pool_forward_fast(x, pool_param):
"""
A fast implementation of the forward pass for a max pooling layer.
This chooses between the reshape method and the im2col method. If the pooling
regions are square and tile the input image, then we can use the reshape
method which is very fast. Otherwise we fall back on the im2col method, which
is not much faster than the naive method.
"""
N, C, H, W = x.shape
pool_height, pool_width = pool_param["pool_height"], pool_param["pool_width"]
stride = pool_param["stride"]
same_size = pool_height == pool_width == stride
tiles = H % pool_height == 0 and W % pool_width == 0
if same_size and tiles:
out, reshape_cache = max_pool_forward_reshape(x, pool_param)
cache = ("reshape", reshape_cache)
else:
out, im2col_cache = max_pool_forward_im2col(x, pool_param)
cache = ("im2col", im2col_cache)
return out, cache
def max_pool_backward_fast(dout, cache):
"""
A fast implementation of the backward pass for a max pooling layer.
    This switches between the reshape method and the im2col method depending on
which method was used to generate the cache.
"""
method, real_cache = cache
if method == "reshape":
return max_pool_backward_reshape(dout, real_cache)
elif method == "im2col":
return max_pool_backward_im2col(dout, real_cache)
else:
raise ValueError('Unrecognized method "%s"' % method)
def max_pool_forward_reshape(x, pool_param):
"""
A fast implementation of the forward pass for the max pooling layer that uses
some clever reshaping.
This can only be used for square pooling regions that tile the input.
"""
N, C, H, W = x.shape
pool_height, pool_width = pool_param["pool_height"], pool_param["pool_width"]
stride = pool_param["stride"]
assert pool_height == pool_width == stride, "Invalid pool params"
assert H % pool_height == 0
assert W % pool_height == 0
x_reshaped = x.reshape(
N, C, H // pool_height, pool_height, W // pool_width, pool_width
)
out = x_reshaped.max(axis=3).max(axis=4)
cache = (x, x_reshaped, out)
return out, cache
def max_pool_backward_reshape(dout, cache):
"""
A fast implementation of the backward pass for the max pooling layer that
uses some clever broadcasting and reshaping.
This can only be used if the forward pass was computed using
max_pool_forward_reshape.
NOTE: If there are multiple argmaxes, this method will assign gradient to
ALL argmax elements of the input rather than picking one. In this case the
gradient will actually be incorrect. However this is unlikely to occur in
practice, so it shouldn't matter much. One possible solution is to split the
upstream gradient equally among all argmax elements; this should result in a
valid subgradient. You can make this happen by uncommenting the line below;
however this results in a significant performance penalty (about 40% slower)
and is unlikely to matter in practice so we don't do it.
"""
x, x_reshaped, out = cache
dx_reshaped = np.zeros_like(x_reshaped)
out_newaxis = out[:, :, :, np.newaxis, :, np.newaxis]
mask = x_reshaped == out_newaxis
dout_newaxis = dout[:, :, :, np.newaxis, :, np.newaxis]
dout_broadcast, _ = np.broadcast_arrays(dout_newaxis, dx_reshaped)
dx_reshaped[mask] = dout_broadcast[mask]
dx_reshaped /= np.sum(mask, axis=(3, 5), keepdims=True)
dx = dx_reshaped.reshape(x.shape)
return dx
def max_pool_forward_im2col(x, pool_param):
"""
An implementation of the forward pass for max pooling based on im2col.
This isn't much faster than the naive version, so it should be avoided if
possible.
"""
N, C, H, W = x.shape
pool_height, pool_width = pool_param["pool_height"], pool_param["pool_width"]
stride = pool_param["stride"]
assert (H - pool_height) % stride == 0, "Invalid height"
assert (W - pool_width) % stride == 0, "Invalid width"
out_height = (H - pool_height) // stride + 1
out_width = (W - pool_width) // stride + 1
x_split = x.reshape(N * C, 1, H, W)
x_cols = im2col(x_split, pool_height, pool_width, padding=0, stride=stride)
x_cols_argmax = np.argmax(x_cols, axis=0)
x_cols_max = x_cols[x_cols_argmax, np.arange(x_cols.shape[1])]
out = x_cols_max.reshape(out_height, out_width, N, C).transpose(2, 3, 0, 1)
cache = (x, x_cols, x_cols_argmax, pool_param)
return out, cache
def max_pool_backward_im2col(dout, cache):
"""
An implementation of the backward pass for max pooling based on im2col.
This isn't much faster than the naive version, so it should be avoided if
possible.
"""
x, x_cols, x_cols_argmax, pool_param = cache
N, C, H, W = x.shape
pool_height, pool_width = pool_param["pool_height"], pool_param["pool_width"]
stride = pool_param["stride"]
dout_reshaped = dout.transpose(2, 3, 0, 1).flatten()
dx_cols = np.zeros_like(x_cols)
dx_cols[x_cols_argmax, np.arange(dx_cols.shape[1])] = dout_reshaped
dx = col2im_indices(
dx_cols, (N * C, 1, H, W), pool_height, pool_width, padding=0, stride=stride
)
dx = dx.reshape(x.shape)
return dx

@@ -0,0 +1,133 @@
from __future__ import print_function
from builtins import range
from past.builtins import xrange
import numpy as np
from random import randrange
def eval_numerical_gradient(f, x, verbose=True, h=0.00001):
"""
a naive implementation of numerical gradient of f at x
- f should be a function that takes a single argument
- x is the point (numpy array) to evaluate the gradient at
"""
fx = f(x) # evaluate function value at original point
grad = np.zeros_like(x)
# iterate over all indexes in x
it = np.nditer(x, flags=["multi_index"], op_flags=["readwrite"])
while not it.finished:
# evaluate function at x+h
ix = it.multi_index
oldval = x[ix]
x[ix] = oldval + h # increment by h
        fxph = f(x)  # evaluate f(x + h)
x[ix] = oldval - h
fxmh = f(x) # evaluate f(x - h)
x[ix] = oldval # restore
# compute the partial derivative with centered formula
grad[ix] = (fxph - fxmh) / (2 * h) # the slope
if verbose:
print(ix, grad[ix])
it.iternext() # step to next dimension
return grad
def eval_numerical_gradient_array(f, x, df, h=1e-5):
"""
Evaluate a numeric gradient for a function that accepts a numpy
array and returns a numpy array.
"""
grad = np.zeros_like(x)
it = np.nditer(x, flags=["multi_index"], op_flags=["readwrite"])
while not it.finished:
ix = it.multi_index
oldval = x[ix]
x[ix] = oldval + h
pos = f(x).copy()
x[ix] = oldval - h
neg = f(x).copy()
x[ix] = oldval
grad[ix] = np.sum((pos - neg) * df) / (2 * h)
it.iternext()
return grad
def eval_numerical_gradient_blobs(f, inputs, output, h=1e-5):
"""
Compute numeric gradients for a function that operates on input
and output blobs.
We assume that f accepts several input blobs as arguments, followed by a
blob where outputs will be written. For example, f might be called like:
f(x, w, out)
where x and w are input Blobs, and the result of f will be written to out.
Inputs:
- f: function
- inputs: tuple of input blobs
- output: output blob
- h: step size
"""
numeric_diffs = []
for input_blob in inputs:
diff = np.zeros_like(input_blob.diffs)
it = np.nditer(input_blob.vals, flags=["multi_index"], op_flags=["readwrite"])
while not it.finished:
idx = it.multi_index
orig = input_blob.vals[idx]
input_blob.vals[idx] = orig + h
f(*(inputs + (output,)))
pos = np.copy(output.vals)
input_blob.vals[idx] = orig - h
f(*(inputs + (output,)))
neg = np.copy(output.vals)
input_blob.vals[idx] = orig
diff[idx] = np.sum((pos - neg) * output.diffs) / (2.0 * h)
it.iternext()
numeric_diffs.append(diff)
return numeric_diffs
def eval_numerical_gradient_net(net, inputs, output, h=1e-5):
return eval_numerical_gradient_blobs(
lambda *args: net.forward(), inputs, output, h=h
)
def grad_check_sparse(f, x, analytic_grad, num_checks=10, h=1e-5):
"""
    sample a few random elements and only return the numerical gradient
    in these dimensions.
"""
for i in range(num_checks):
ix = tuple([randrange(m) for m in x.shape])
oldval = x[ix]
x[ix] = oldval + h # increment by h
fxph = f(x) # evaluate f(x + h)
        x[ix] = oldval - h  # decrement by h
fxmh = f(x) # evaluate f(x - h)
x[ix] = oldval # reset
grad_numerical = (fxph - fxmh) / (2 * h)
grad_analytic = analytic_grad[ix]
rel_error = abs(grad_numerical - grad_analytic) / (
abs(grad_numerical) + abs(grad_analytic)
)
print(
"numerical: %f analytic: %f, relative error: %e"
% (grad_numerical, grad_analytic, rel_error)
)

@@ -0,0 +1,58 @@
from builtins import range
import numpy as np
def get_im2col_indices(x_shape, field_height, field_width, padding=1, stride=1):
# First figure out what the size of the output should be
N, C, H, W = x_shape
assert (H + 2 * padding - field_height) % stride == 0
    assert (W + 2 * padding - field_width) % stride == 0
    out_height = (H + 2 * padding - field_height) // stride + 1
    out_width = (W + 2 * padding - field_width) // stride + 1
i0 = np.repeat(np.arange(field_height), field_width)
i0 = np.tile(i0, C)
i1 = stride * np.repeat(np.arange(out_height), out_width)
j0 = np.tile(np.arange(field_width), field_height * C)
j1 = stride * np.tile(np.arange(out_width), out_height)
i = i0.reshape(-1, 1) + i1.reshape(1, -1)
j = j0.reshape(-1, 1) + j1.reshape(1, -1)
k = np.repeat(np.arange(C), field_height * field_width).reshape(-1, 1)
return (k, i, j)
def im2col_indices(x, field_height, field_width, padding=1, stride=1):
""" An implementation of im2col based on some fancy indexing """
# Zero-pad the input
p = padding
x_padded = np.pad(x, ((0, 0), (0, 0), (p, p), (p, p)), mode="constant")
k, i, j = get_im2col_indices(x.shape, field_height, field_width, padding, stride)
cols = x_padded[:, k, i, j]
C = x.shape[1]
cols = cols.transpose(1, 2, 0).reshape(field_height * field_width * C, -1)
return cols
def col2im_indices(cols, x_shape, field_height=3, field_width=3, padding=1, stride=1):
""" An implementation of col2im based on fancy indexing and np.add.at """
N, C, H, W = x_shape
H_padded, W_padded = H + 2 * padding, W + 2 * padding
x_padded = np.zeros((N, C, H_padded, W_padded), dtype=cols.dtype)
k, i, j = get_im2col_indices(x_shape, field_height, field_width, padding, stride)
cols_reshaped = cols.reshape(C * field_height * field_width, -1, N)
cols_reshaped = cols_reshaped.transpose(2, 0, 1)
np.add.at(x_padded, (slice(None), k, i, j), cols_reshaped)
if padding == 0:
return x_padded
return x_padded[:, :, padding:-padding, padding:-padding]
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
pass
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

@@ -0,0 +1,121 @@
import numpy as np
cimport numpy as np
cimport cython
# DTYPE = np.float64
# ctypedef np.float64_t DTYPE_t
ctypedef fused DTYPE_t:
np.float32_t
np.float64_t
def im2col_cython(np.ndarray[DTYPE_t, ndim=4] x, int field_height,
int field_width, int padding, int stride):
cdef int N = x.shape[0]
cdef int C = x.shape[1]
cdef int H = x.shape[2]
cdef int W = x.shape[3]
cdef int HH = (H + 2 * padding - field_height) / stride + 1
cdef int WW = (W + 2 * padding - field_width) / stride + 1
cdef int p = padding
cdef np.ndarray[DTYPE_t, ndim=4] x_padded = np.pad(x,
((0, 0), (0, 0), (p, p), (p, p)), mode='constant')
cdef np.ndarray[DTYPE_t, ndim=2] cols = np.zeros(
(C * field_height * field_width, N * HH * WW),
dtype=x.dtype)
# Moving the inner loop to a C function with no bounds checking works, but does
# not seem to help performance in any measurable way.
im2col_cython_inner(cols, x_padded, N, C, H, W, HH, WW,
field_height, field_width, padding, stride)
return cols
@cython.boundscheck(False)
cdef int im2col_cython_inner(np.ndarray[DTYPE_t, ndim=2] cols,
np.ndarray[DTYPE_t, ndim=4] x_padded,
int N, int C, int H, int W, int HH, int WW,
int field_height, int field_width, int padding, int stride) except? -1:
cdef int c, ii, jj, row, yy, xx, i, col
for c in range(C):
for yy in range(HH):
for xx in range(WW):
for ii in range(field_height):
for jj in range(field_width):
row = c * field_width * field_height + ii * field_height + jj
for i in range(N):
col = yy * WW * N + xx * N + i
cols[row, col] = x_padded[i, c, stride * yy + ii, stride * xx + jj]
def col2im_cython(np.ndarray[DTYPE_t, ndim=2] cols, int N, int C, int H, int W,
int field_height, int field_width, int padding, int stride):
cdef np.ndarray x = np.empty((N, C, H, W), dtype=cols.dtype)
cdef int HH = (H + 2 * padding - field_height) / stride + 1
cdef int WW = (W + 2 * padding - field_width) / stride + 1
cdef np.ndarray[DTYPE_t, ndim=4] x_padded = np.zeros((N, C, H + 2 * padding, W + 2 * padding),
dtype=cols.dtype)
# Moving the inner loop to a C-function with no bounds checking improves
# performance quite a bit for col2im.
col2im_cython_inner(cols, x_padded, N, C, H, W, HH, WW,
field_height, field_width, padding, stride)
if padding > 0:
return x_padded[:, :, padding:-padding, padding:-padding]
return x_padded
@cython.boundscheck(False)
cdef int col2im_cython_inner(np.ndarray[DTYPE_t, ndim=2] cols,
np.ndarray[DTYPE_t, ndim=4] x_padded,
int N, int C, int H, int W, int HH, int WW,
int field_height, int field_width, int padding, int stride) except? -1:
cdef int c, ii, jj, row, yy, xx, i, col
for c in range(C):
for ii in range(field_height):
for jj in range(field_width):
row = c * field_width * field_height + ii * field_height + jj
for yy in range(HH):
for xx in range(WW):
for i in range(N):
col = yy * WW * N + xx * N + i
x_padded[i, c, stride * yy + ii, stride * xx + jj] += cols[row, col]
@cython.boundscheck(False)
@cython.wraparound(False)
cdef col2im_6d_cython_inner(np.ndarray[DTYPE_t, ndim=6] cols,
np.ndarray[DTYPE_t, ndim=4] x_padded,
int N, int C, int H, int W, int HH, int WW,
int out_h, int out_w, int pad, int stride):
cdef int c, hh, ww, n, h, w
for n in range(N):
for c in range(C):
for hh in range(HH):
for ww in range(WW):
for h in range(out_h):
for w in range(out_w):
x_padded[n, c, stride * h + hh, stride * w + ww] += cols[c, hh, ww, n, h, w]
def col2im_6d_cython(np.ndarray[DTYPE_t, ndim=6] cols, int N, int C, int H, int W,
int HH, int WW, int pad, int stride):
cdef np.ndarray x = np.empty((N, C, H, W), dtype=cols.dtype)
cdef int out_h = (H + 2 * pad - HH) / stride + 1
cdef int out_w = (W + 2 * pad - WW) / stride + 1
cdef np.ndarray[DTYPE_t, ndim=4] x_padded = np.zeros((N, C, H + 2 * pad, W + 2 * pad),
dtype=cols.dtype)
col2im_6d_cython_inner(cols, x_padded, N, C, H, W, HH, WW, out_h, out_w, pad, stride)
if pad > 0:
return x_padded[:, :, pad:-pad, pad:-pad]
return x_padded

@@ -0,0 +1,110 @@
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
pass
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
from .layers import *
from .fast_layers import *
def affine_relu_forward(x, w, b):
"""
    Convenience layer that performs an affine transform followed by a ReLU
Inputs:
- x: Input to the affine layer
- w, b: Weights for the affine layer
Returns a tuple of:
- out: Output from the ReLU
- cache: Object to give to the backward pass
"""
a, fc_cache = affine_forward(x, w, b)
out, relu_cache = relu_forward(a)
cache = (fc_cache, relu_cache)
return out, cache
def affine_relu_backward(dout, cache):
"""
Backward pass for the affine-relu convenience layer
"""
fc_cache, relu_cache = cache
da = relu_backward(dout, relu_cache)
dx, dw, db = affine_backward(da, fc_cache)
return dx, dw, db
def conv_relu_forward(x, w, b, conv_param):
"""
A convenience layer that performs a convolution followed by a ReLU.
Inputs:
- x: Input to the convolutional layer
- w, b, conv_param: Weights and parameters for the convolutional layer
Returns a tuple of:
- out: Output from the ReLU
- cache: Object to give to the backward pass
"""
a, conv_cache = conv_forward_fast(x, w, b, conv_param)
out, relu_cache = relu_forward(a)
cache = (conv_cache, relu_cache)
return out, cache
def conv_relu_backward(dout, cache):
"""
Backward pass for the conv-relu convenience layer.
"""
conv_cache, relu_cache = cache
da = relu_backward(dout, relu_cache)
dx, dw, db = conv_backward_fast(da, conv_cache)
return dx, dw, db
def conv_bn_relu_forward(x, w, b, gamma, beta, conv_param, bn_param):
a, conv_cache = conv_forward_fast(x, w, b, conv_param)
an, bn_cache = spatial_batchnorm_forward(a, gamma, beta, bn_param)
out, relu_cache = relu_forward(an)
cache = (conv_cache, bn_cache, relu_cache)
return out, cache
def conv_bn_relu_backward(dout, cache):
conv_cache, bn_cache, relu_cache = cache
dan = relu_backward(dout, relu_cache)
da, dgamma, dbeta = spatial_batchnorm_backward(dan, bn_cache)
dx, dw, db = conv_backward_fast(da, conv_cache)
return dx, dw, db, dgamma, dbeta
def conv_relu_pool_forward(x, w, b, conv_param, pool_param):
"""
Convenience layer that performs a convolution, a ReLU, and a pool.
Inputs:
- x: Input to the convolutional layer
- w, b, conv_param: Weights and parameters for the convolutional layer
- pool_param: Parameters for the pooling layer
Returns a tuple of:
- out: Output from the pooling layer
- cache: Object to give to the backward pass
"""
a, conv_cache = conv_forward_fast(x, w, b, conv_param)
s, relu_cache = relu_forward(a)
out, pool_cache = max_pool_forward_fast(s, pool_param)
cache = (conv_cache, relu_cache, pool_cache)
return out, cache
def conv_relu_pool_backward(dout, cache):
"""
Backward pass for the conv-relu-pool convenience layer
"""
conv_cache, relu_cache, pool_cache = cache
ds = max_pool_backward_fast(dout, pool_cache)
da = relu_backward(ds, relu_cache)
dx, dw, db = conv_backward_fast(da, conv_cache)
return dx, dw, db

@@ -0,0 +1,696 @@
from builtins import range
import numpy as np
def affine_forward(x, w, b):
"""
Computes the forward pass for an affine (fully-connected) layer.
The input x has shape (N, d_1, ..., d_k) and contains a minibatch of N
examples, where each example x[i] has shape (d_1, ..., d_k). We will
reshape each input into a vector of dimension D = d_1 * ... * d_k, and
then transform it to an output vector of dimension M.
Inputs:
- x: A numpy array containing input data, of shape (N, d_1, ..., d_k)
- w: A numpy array of weights, of shape (D, M)
- b: A numpy array of biases, of shape (M,)
Returns a tuple of:
- out: output, of shape (N, M)
- cache: (x, w, b)
"""
out = None
###########################################################################
# TODO: Implement the affine forward pass. Store the result in out. You #
# will need to reshape the input into rows. #
###########################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
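    # Sketch: flatten each example to a row of length D, then a single matrix multiply.
    out = x.reshape(x.shape[0], -1).dot(w) + b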
pass
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
###########################################################################
# END OF YOUR CODE #
###########################################################################
cache = (x, w, b)
return out, cache
def affine_backward(dout, cache):
"""
Computes the backward pass for an affine layer.
Inputs:
- dout: Upstream derivative, of shape (N, M)
- cache: Tuple of:
- x: Input data, of shape (N, d_1, ... d_k)
- w: Weights, of shape (D, M)
- b: Biases, of shape (M,)
Returns a tuple of:
- dx: Gradient with respect to x, of shape (N, d1, ..., d_k)
- dw: Gradient with respect to w, of shape (D, M)
- db: Gradient with respect to b, of shape (M,)
"""
x, w, b = cache
dx, dw, db = None, None, None
###########################################################################
# TODO: Implement the affine backward pass. #
###########################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
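    # Sketch: gradients of out = x_flat.dot(w) + b, with dx reshaped back to x.shape.
    x_flat = x.reshape(x.shape[0], -1)
    dx = dout.dot(w.T).reshape(x.shape)
    dw = x_flat.T.dot(dout)
    db = np.sum(dout, axis=0)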
pass
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
###########################################################################
# END OF YOUR CODE #
###########################################################################
return dx, dw, db
def relu_forward(x):
"""
Computes the forward pass for a layer of rectified linear units (ReLUs).
Input:
- x: Inputs, of any shape
Returns a tuple of:
- out: Output, of the same shape as x
- cache: x
"""
out = None
###########################################################################
# TODO: Implement the ReLU forward pass. #
###########################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
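    # Sketch: elementwise threshold at zero.
    out = np.maximum(0, x)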
pass
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
###########################################################################
# END OF YOUR CODE #
###########################################################################
cache = x
return out, cache
def relu_backward(dout, cache):
"""
Computes the backward pass for a layer of rectified linear units (ReLUs).
Input:
- dout: Upstream derivatives, of any shape
- cache: Input x, of same shape as dout
Returns:
- dx: Gradient with respect to x
"""
dx, x = None, cache
###########################################################################
# TODO: Implement the ReLU backward pass. #
###########################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
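    # Sketch: pass the upstream gradient only where the input was positive.
    dx = dout * (x > 0)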
pass
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
###########################################################################
# END OF YOUR CODE #
###########################################################################
return dx
def batchnorm_forward(x, gamma, beta, bn_param):
"""
Forward pass for batch normalization.
During training the sample mean and (uncorrected) sample variance are
computed from minibatch statistics and used to normalize the incoming data.
During training we also keep an exponentially decaying running mean of the
mean and variance of each feature, and these averages are used to normalize
data at test-time.
At each timestep we update the running averages for mean and variance using
an exponential decay based on the momentum parameter:
running_mean = momentum * running_mean + (1 - momentum) * sample_mean
running_var = momentum * running_var + (1 - momentum) * sample_var
Note that the batch normalization paper suggests a different test-time
behavior: they compute sample mean and variance for each feature using a
large number of training images rather than using a running average. For
this implementation we have chosen to use running averages instead since
they do not require an additional estimation step; the torch7
implementation of batch normalization also uses running averages.
Input:
- x: Data of shape (N, D)
- gamma: Scale parameter of shape (D,)
    - beta: Shift parameter of shape (D,)
- bn_param: Dictionary with the following keys:
- mode: 'train' or 'test'; required
- eps: Constant for numeric stability
- momentum: Constant for running mean / variance.
- running_mean: Array of shape (D,) giving running mean of features
    - running_var: Array of shape (D,) giving running variance of features
Returns a tuple of:
- out: of shape (N, D)
- cache: A tuple of values needed in the backward pass
"""
mode = bn_param["mode"]
eps = bn_param.get("eps", 1e-5)
momentum = bn_param.get("momentum", 0.9)
N, D = x.shape
running_mean = bn_param.get("running_mean", np.zeros(D, dtype=x.dtype))
running_var = bn_param.get("running_var", np.zeros(D, dtype=x.dtype))
out, cache = None, None
if mode == "train":
#######################################################################
# TODO: Implement the training-time forward pass for batch norm. #
# Use minibatch statistics to compute the mean and variance, use #
# these statistics to normalize the incoming data, and scale and #
# shift the normalized data using gamma and beta. #
# #
# You should store the output in the variable out. Any intermediates #
# that you need for the backward pass should be stored in the cache #
# variable. #
# #
# You should also use your computed sample mean and variance together #
# with the momentum variable to update the running mean and running #
# variance, storing your result in the running_mean and running_var #
# variables. #
# #
# Note that though you should be keeping track of the running #
# variance, you should normalize the data based on the standard #
# deviation (square root of variance) instead! #
# Referencing the original paper (https://arxiv.org/abs/1502.03167) #
# might prove to be helpful. #
#######################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
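        # Hedged sketch (one possible decomposition): normalize with the minibatch
        # statistics, scale and shift, and update the running averages.
        sample_mean = x.mean(axis=0)
        sample_var = x.var(axis=0)
        x_hat = (x - sample_mean) / np.sqrt(sample_var + eps)
        out = gamma * x_hat + beta
        cache = (x, x_hat, sample_mean, sample_var, gamma, eps)
        running_mean = momentum * running_mean + (1 - momentum) * sample_mean
        running_var = momentum * running_var + (1 - momentum) * sample_var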
pass
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
#######################################################################
# END OF YOUR CODE #
#######################################################################
elif mode == "test":
#######################################################################
# TODO: Implement the test-time forward pass for batch normalization. #
# Use the running mean and variance to normalize the incoming data, #
# then scale and shift the normalized data using gamma and beta. #
# Store the result in the out variable. #
#######################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
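        # Sketch: normalize with the running statistics accumulated during training.
        x_hat = (x - running_mean) / np.sqrt(running_var + eps)
        out = gamma * x_hat + beta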
pass
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
#######################################################################
# END OF YOUR CODE #
#######################################################################
else:
raise ValueError('Invalid forward batchnorm mode "%s"' % mode)
# Store the updated running means back into bn_param
bn_param["running_mean"] = running_mean
bn_param["running_var"] = running_var
return out, cache
def batchnorm_backward(dout, cache):
"""
Backward pass for batch normalization.
For this implementation, you should write out a computation graph for
batch normalization on paper and propagate gradients backward through
intermediate nodes.
Inputs:
- dout: Upstream derivatives, of shape (N, D)
- cache: Variable of intermediates from batchnorm_forward.
Returns a tuple of:
- dx: Gradient with respect to inputs x, of shape (N, D)
- dgamma: Gradient with respect to scale parameter gamma, of shape (D,)
- dbeta: Gradient with respect to shift parameter beta, of shape (D,)
"""
dx, dgamma, dbeta = None, None, None
###########################################################################
# TODO: Implement the backward pass for batch normalization. Store the #
# results in the dx, dgamma, and dbeta variables. #
# Referencing the original paper (https://arxiv.org/abs/1502.03167) #
# might prove to be helpful. #
###########################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
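    # Hedged sketch assuming the cache layout used in the training-time forward
    # sketch above: (x, x_hat, sample_mean, sample_var, gamma, eps).
    x, x_hat, mean, var, gamma, eps = cache
    N = dout.shape[0]
    std = np.sqrt(var + eps)
    dbeta = np.sum(dout, axis=0)
    dgamma = np.sum(dout * x_hat, axis=0)
    dx_hat = dout * gamma
    dvar = np.sum(dx_hat * (x - mean) * -0.5 * std ** -3, axis=0)
    dmean = np.sum(-dx_hat / std, axis=0) + dvar * np.mean(-2 * (x - mean), axis=0)
    dx = dx_hat / std + dvar * 2 * (x - mean) / N + dmean / N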
pass
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
###########################################################################
# END OF YOUR CODE #
###########################################################################
return dx, dgamma, dbeta
def dropout_forward(x, dropout_param):
"""
Performs the forward pass for (inverted) dropout.
Inputs:
- x: Input data, of any shape
- dropout_param: A dictionary with the following keys:
- p: Dropout parameter. We keep each neuron output with probability p.
- mode: 'test' or 'train'. If the mode is train, then perform dropout;
if the mode is test, then just return the input.
- seed: Seed for the random number generator. Passing seed makes this
function deterministic, which is needed for gradient checking but not
in real networks.
Outputs:
- out: Array of the same shape as x.
- cache: tuple (dropout_param, mask). In training mode, mask is the dropout
mask that was used to multiply the input; in test mode, mask is None.
NOTE: Please implement **inverted** dropout, not the vanilla version of dropout.
See http://cs231n.github.io/neural-networks-2/#reg for more details.
    NOTE 2: Keep in mind that p is the probability of **keeping** a neuron
output; this might be contrary to some sources, where it is referred to
as the probability of dropping a neuron output.
"""
p, mode = dropout_param["p"], dropout_param["mode"]
if "seed" in dropout_param:
np.random.seed(dropout_param["seed"])
mask = None
out = None
if mode == "train":
#######################################################################
# TODO: Implement training phase forward pass for inverted dropout. #
# Store the dropout mask in the mask variable. #
#######################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
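        # Sketch of inverted dropout: keep each unit with probability p and rescale
        # by 1/p at train time so the test-time pass needs no scaling.
        mask = (np.random.rand(*x.shape) < p) / p
        out = x * mask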
pass
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
#######################################################################
# END OF YOUR CODE #
#######################################################################
elif mode == "test":
#######################################################################
# TODO: Implement the test phase forward pass for inverted dropout. #
#######################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
pass
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
#######################################################################
# END OF YOUR CODE #
#######################################################################
cache = (dropout_param, mask)
out = out.astype(x.dtype, copy=False)
return out, cache
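# Hedged sketch of inverted dropout for reference (not the graded TODO body).
# Train time keeps each unit with probability p and rescales by 1/p, so the
# test-time pass is a plain identity.
def _dropout_forward_sketch(x, dropout_param):
    p, mode = dropout_param["p"], dropout_param["mode"]
    if "seed" in dropout_param:
        np.random.seed(dropout_param["seed"])
    mask, out = None, x
    if mode == "train":
        mask = (np.random.rand(*x.shape) < p) / p  # rescale now, not at test time
        out = x * mask
    return out.astype(x.dtype, copy=False), (dropout_param, mask)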
def dropout_backward(dout, cache):
"""
Perform the backward pass for (inverted) dropout.
Inputs:
- dout: Upstream derivatives, of any shape
- cache: (dropout_param, mask) from dropout_forward.
"""
dropout_param, mask = cache
mode = dropout_param["mode"]
dx = None
if mode == "train":
#######################################################################
# TODO: Implement training phase backward pass for inverted dropout #
#######################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
pass
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
#######################################################################
# END OF YOUR CODE #
#######################################################################
elif mode == "test":
dx = dout
return dx
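# Matching backward sketch: in train mode the upstream gradient simply passes
# through the same (already rescaled) mask; in test mode dropout was a no-op.
def _dropout_backward_sketch(dout, cache):
    dropout_param, mask = cache
    return dout * mask if dropout_param["mode"] == "train" else dout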
def conv_forward_naive(x, w, b, conv_param):
"""
A naive implementation of the forward pass for a convolutional layer.
The input consists of N data points, each with C channels, height H and
width W. We convolve each input with F different filters, where each filter
spans all C channels and has height HH and width WW.
Input:
- x: Input data of shape (N, C, H, W)
- w: Filter weights of shape (F, C, HH, WW)
- b: Biases, of shape (F,)
- conv_param: A dictionary with the following keys:
- 'stride': The number of pixels between adjacent receptive fields in the
horizontal and vertical directions.
- 'pad': The number of pixels that will be used to zero-pad the input.
During padding, 'pad' zeros should be placed symmetrically (i.e., equally on both sides)
along the height and width axes of the input. Be careful not to modify the original
input x directly.
Returns a tuple of:
- out: Output data, of shape (N, F, H', W') where H' and W' are given by
H' = 1 + (H + 2 * pad - HH) / stride
W' = 1 + (W + 2 * pad - WW) / stride
- cache: (x, w, b, conv_param)
"""
out = None
###########################################################################
# TODO: Implement the convolutional forward pass. #
# Hint: you can use the function np.pad for padding. #
###########################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
pass
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
###########################################################################
# END OF YOUR CODE #
###########################################################################
cache = (x, w, b, conv_param)
return out, cache
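# One straightforward (and deliberately slow) realization of the loop-based
# convolution described above; a reference sketch, not the graded TODO body.
def _conv_forward_naive_sketch(x, w, b, conv_param):
    N, C, H, W = x.shape
    F, _, HH, WW = w.shape
    stride, pad = conv_param["stride"], conv_param["pad"]
    H_out = 1 + (H + 2 * pad - HH) // stride
    W_out = 1 + (W + 2 * pad - WW) // stride

    # Zero-pad only the spatial axes; x itself is left untouched.
    x_pad = np.pad(x, ((0, 0), (0, 0), (pad, pad), (pad, pad)), mode="constant")
    out = np.zeros((N, F, H_out, W_out), dtype=x.dtype)
    for n in range(N):
        for f in range(F):
            for i in range(H_out):
                for j in range(W_out):
                    h0, w0 = i * stride, j * stride
                    window = x_pad[n, :, h0:h0 + HH, w0:w0 + WW]
                    out[n, f, i, j] = np.sum(window * w[f]) + b[f]
    return out, (x, w, b, conv_param)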
def conv_backward_naive(dout, cache):
"""
A naive implementation of the backward pass for a convolutional layer.
Inputs:
- dout: Upstream derivatives.
- cache: A tuple of (x, w, b, conv_param) as in conv_forward_naive
Returns a tuple of:
- dx: Gradient with respect to x
- dw: Gradient with respect to w
- db: Gradient with respect to b
"""
dx, dw, db = None, None, None
###########################################################################
# TODO: Implement the convolutional backward pass. #
###########################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
pass
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
###########################################################################
# END OF YOUR CODE #
###########################################################################
return dx, dw, db
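# Corresponding naive backward sketch: each output element routes its upstream
# gradient back to the input window and filter that produced it.
def _conv_backward_naive_sketch(dout, cache):
    x, w, b, conv_param = cache
    stride, pad = conv_param["stride"], conv_param["pad"]
    N, C, H, W = x.shape
    F, _, HH, WW = w.shape
    _, _, H_out, W_out = dout.shape

    x_pad = np.pad(x, ((0, 0), (0, 0), (pad, pad), (pad, pad)), mode="constant")
    dx_pad = np.zeros_like(x_pad)
    dw = np.zeros_like(w)
    db = dout.sum(axis=(0, 2, 3))
    for n in range(N):
        for f in range(F):
            for i in range(H_out):
                for j in range(W_out):
                    h0, w0 = i * stride, j * stride
                    dw[f] += x_pad[n, :, h0:h0 + HH, w0:w0 + WW] * dout[n, f, i, j]
                    dx_pad[n, :, h0:h0 + HH, w0:w0 + WW] += w[f] * dout[n, f, i, j]
    dx = dx_pad[:, :, pad:pad + H, pad:pad + W]  # strip the padding again
    return dx, dw, db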
def max_pool_forward_naive(x, pool_param):
"""
A naive implementation of the forward pass for a max-pooling layer.
Inputs:
- x: Input data, of shape (N, C, H, W)
- pool_param: dictionary with the following keys:
- 'pool_height': The height of each pooling region
- 'pool_width': The width of each pooling region
- 'stride': The distance between adjacent pooling regions
No padding is necessary here.
Returns a tuple of:
- out: Output data, of shape (N, C, H', W') where H' and W' are given by
H' = 1 + (H - pool_height) / stride
W' = 1 + (W - pool_width) / stride
- cache: (x, pool_param)
"""
out = None
###########################################################################
# TODO: Implement the max-pooling forward pass #
###########################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
pass
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
###########################################################################
# END OF YOUR CODE #
###########################################################################
cache = (x, pool_param)
return out, cache
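# Naive max-pooling forward sketch for reference (not the graded TODO body):
# take the maximum over each pooling window, one output location at a time.
def _max_pool_forward_naive_sketch(x, pool_param):
    N, C, H, W = x.shape
    ph, pw = pool_param["pool_height"], pool_param["pool_width"]
    stride = pool_param["stride"]
    H_out = 1 + (H - ph) // stride
    W_out = 1 + (W - pw) // stride
    out = np.zeros((N, C, H_out, W_out), dtype=x.dtype)
    for i in range(H_out):
        for j in range(W_out):
            window = x[:, :, i * stride:i * stride + ph, j * stride:j * stride + pw]
            out[:, :, i, j] = window.max(axis=(2, 3))
    return out, (x, pool_param)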
def max_pool_backward_naive(dout, cache):
"""
A naive implementation of the backward pass for a max-pooling layer.
Inputs:
- dout: Upstream derivatives
- cache: A tuple of (x, pool_param) as in the forward pass.
Returns:
- dx: Gradient with respect to x
"""
dx = None
###########################################################################
# TODO: Implement the max-pooling backward pass #
###########################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
pass
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
###########################################################################
# END OF YOUR CODE #
###########################################################################
return dx
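# Matching backward sketch: only the argmax position inside each pooling
# window receives the upstream gradient; every other position gets zero.
def _max_pool_backward_naive_sketch(dout, cache):
    x, pool_param = cache
    N, C, H, W = x.shape
    ph, pw = pool_param["pool_height"], pool_param["pool_width"]
    stride = pool_param["stride"]
    _, _, H_out, W_out = dout.shape
    dx = np.zeros_like(x)
    for n in range(N):
        for c in range(C):
            for i in range(H_out):
                for j in range(W_out):
                    h0, w0 = i * stride, j * stride
                    window = x[n, c, h0:h0 + ph, w0:w0 + pw]
                    r, col = np.unravel_index(np.argmax(window), window.shape)
                    dx[n, c, h0 + r, w0 + col] += dout[n, c, i, j]
    return dx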
def spatial_batchnorm_forward(x, gamma, beta, bn_param):
"""
Computes the forward pass for spatial batch normalization.
Inputs:
- x: Input data of shape (N, C, H, W)
- gamma: Scale parameter, of shape (C,)
- beta: Shift parameter, of shape (C,)
- bn_param: Dictionary with the following keys:
- mode: 'train' or 'test'; required
- eps: Constant for numeric stability
- momentum: Constant for running mean / variance. momentum=0 means that
old information is discarded completely at every time step, while
momentum=1 means that new information is never incorporated. The
default of momentum=0.9 should work well in most situations.
- running_mean: Array of shape (C,) giving running mean of features
- running_var: Array of shape (C,) giving running variance of features
Returns a tuple of:
- out: Output data, of shape (N, C, H, W)
- cache: Values needed for the backward pass
"""
out, cache = None, None
###########################################################################
# TODO: Implement the forward pass for spatial batch normalization. #
# #
# HINT: You can implement spatial batch normalization by calling the #
# vanilla version of batch normalization you implemented above. #
# Your implementation should be very short; ours is less than five lines. #
###########################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
pass
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
###########################################################################
# END OF YOUR CODE #
###########################################################################
return out, cache
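# Sketch of the reshape trick the hint alludes to: move the channel axis last,
# flatten the other axes, and reuse the vanilla batchnorm_forward defined
# earlier in this file (once its TODO has been filled in).
def _spatial_batchnorm_forward_sketch(x, gamma, beta, bn_param):
    N, C, H, W = x.shape
    x_flat = x.transpose(0, 2, 3, 1).reshape(-1, C)            # (N*H*W, C)
    out_flat, cache = batchnorm_forward(x_flat, gamma, beta, bn_param)
    out = out_flat.reshape(N, H, W, C).transpose(0, 3, 1, 2)   # back to (N, C, H, W)
    return out, cache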
def spatial_batchnorm_backward(dout, cache):
"""
Computes the backward pass for spatial batch normalization.
Inputs:
- dout: Upstream derivatives, of shape (N, C, H, W)
- cache: Values from the forward pass
Returns a tuple of:
- dx: Gradient with respect to inputs, of shape (N, C, H, W)
- dgamma: Gradient with respect to scale parameter, of shape (C,)
- dbeta: Gradient with respect to shift parameter, of shape (C,)
"""
dx, dgamma, dbeta = None, None, None
###########################################################################
# TODO: Implement the backward pass for spatial batch normalization. #
# #
# HINT: You can implement spatial batch normalization by calling the #
# vanilla version of batch normalization you implemented above. #
# Your implementation should be very short; ours is less than five lines. #
###########################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
pass
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
###########################################################################
# END OF YOUR CODE #
###########################################################################
return dx, dgamma, dbeta
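# Matching backward sketch: apply the same reshape to dout and delegate to the
# vanilla batchnorm_backward above.
def _spatial_batchnorm_backward_sketch(dout, cache):
    N, C, H, W = dout.shape
    dout_flat = dout.transpose(0, 2, 3, 1).reshape(-1, C)
    dx_flat, dgamma, dbeta = batchnorm_backward(dout_flat, cache)
    dx = dx_flat.reshape(N, H, W, C).transpose(0, 3, 1, 2)
    return dx, dgamma, dbeta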
def spatial_groupnorm_forward(x, gamma, beta, G, gn_param):
"""
Computes the forward pass for spatial group normalization.
In contrast to layer normalization, group normalization splits each entry
in the data into G contiguous pieces, which it then normalizes independently.
Per-feature shifting and scaling are then applied to the data, in a manner
identical to that of batch normalization and layer normalization.
Inputs:
- x: Input data of shape (N, C, H, W)
- gamma: Scale parameter, of shape (C,)
- beta: Shift parameter, of shape (C,)
- G: Integer number of groups to split the channels into; should be a divisor of C
- gn_param: Dictionary with the following keys:
- eps: Constant for numeric stability
Returns a tuple of:
- out: Output data, of shape (N, C, H, W)
- cache: Values needed for the backward pass
"""
out, cache = None, None
eps = gn_param.get("eps", 1e-5)
###########################################################################
# TODO: Implement the forward pass for spatial group normalization. #
# This will be extremely similar to the layer norm implementation. #
# In particular, think about how you could transform the matrix so that #
# the bulk of the code is similar to both train-time batch normalization #
# and layer normalization! #
###########################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
pass
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
###########################################################################
# END OF YOUR CODE #
###########################################################################
return out, cache
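# Sketch of spatial group normalization: normalize each (sample, group) slab
# of C//G channels independently, then apply the per-channel scale and shift.
# Not the graded TODO body; the cache layout is just one convenient choice.
def _spatial_groupnorm_forward_sketch(x, gamma, beta, G, gn_param):
    eps = gn_param.get("eps", 1e-5)
    N, C, H, W = x.shape
    x_g = x.reshape(N, G, C // G, H, W)
    mu = x_g.mean(axis=(2, 3, 4), keepdims=True)
    var = x_g.var(axis=(2, 3, 4), keepdims=True)
    x_hat = ((x_g - mu) / np.sqrt(var + eps)).reshape(N, C, H, W)
    out = gamma.reshape(1, C, 1, 1) * x_hat + beta.reshape(1, C, 1, 1)
    return out, (x_hat, gamma, var, eps, G)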
def spatial_groupnorm_backward(dout, cache):
"""
Computes the backward pass for spatial group normalization.
Inputs:
- dout: Upstream derivatives, of shape (N, C, H, W)
- cache: Values from the forward pass
Returns a tuple of:
- dx: Gradient with respect to inputs, of shape (N, C, H, W)
- dgamma: Gradient with respect to scale parameter, of shape (C,)
- dbeta: Gradient with respect to shift parameter, of shape (C,)
"""
dx, dgamma, dbeta = None, None, None
###########################################################################
# TODO: Implement the backward pass for spatial group normalization. #
# This will be extremely similar to the layer norm implementation. #
###########################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
pass
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
###########################################################################
# END OF YOUR CODE #
###########################################################################
return dx, dgamma, dbeta
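# Matching backward sketch, using the compact per-group normalization gradient
# (the same closed form as layer norm, applied within each group). Assumes the
# cache produced by the forward sketch above.
def _spatial_groupnorm_backward_sketch(dout, cache):
    x_hat, gamma, var, eps, G = cache
    N, C, H, W = dout.shape

    dbeta = dout.sum(axis=(0, 2, 3))
    dgamma = (dout * x_hat).sum(axis=(0, 2, 3))

    dxhat = (dout * gamma.reshape(1, C, 1, 1)).reshape(N, G, C // G, H, W)
    xhat_g = x_hat.reshape(N, G, C // G, H, W)
    std = np.sqrt(var + eps)                       # shape (N, G, 1, 1, 1)
    dx_g = (dxhat
            - dxhat.mean(axis=(2, 3, 4), keepdims=True)
            - xhat_g * (dxhat * xhat_g).mean(axis=(2, 3, 4), keepdims=True)) / std
    return dx_g.reshape(N, C, H, W), dgamma, dbeta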
def svm_loss(x, y):
"""
Computes the loss and gradient for multiclass SVM classification.
Inputs:
- x: Input data, of shape (N, C) where x[i, j] is the score for the jth
class for the ith input.
- y: Vector of labels, of shape (N,) where y[i] is the label for x[i] and
0 <= y[i] < C
Returns a tuple of:
- loss: Scalar giving the loss
- dx: Gradient of the loss with respect to x
"""
N = x.shape[0]
correct_class_scores = x[np.arange(N), y]
margins = np.maximum(0, x - correct_class_scores[:, np.newaxis] + 1.0)
margins[np.arange(N), y] = 0
loss = np.sum(margins) / N
num_pos = np.sum(margins > 0, axis=1)
dx = np.zeros_like(x)
dx[margins > 0] = 1
dx[np.arange(N), y] -= num_pos
dx /= N
return loss, dx
def softmax_loss(x, y):
"""
Computes the loss and gradient for softmax classification.
Inputs:
- x: Input data, of shape (N, C) where x[i, j] is the score for the jth
class for the ith input.
- y: Vector of labels, of shape (N,) where y[i] is the label for x[i] and
0 <= y[i] < C
Returns a tuple of:
- loss: Scalar giving the loss
- dx: Gradient of the loss with respect to x
"""
shifted_logits = x - np.max(x, axis=1, keepdims=True)
Z = np.sum(np.exp(shifted_logits), axis=1, keepdims=True)
log_probs = shifted_logits - np.log(Z)
probs = np.exp(log_probs)
N = x.shape[0]
loss = -np.sum(log_probs[np.arange(N), y]) / N
dx = probs.copy()
dx[np.arange(N), y] -= 1
dx /= N
return loss, dx

@ -0,0 +1,162 @@
import numpy as np
"""
This file implements various first-order update rules that are commonly used
for training neural networks. Each update rule accepts current weights and the
gradient of the loss with respect to those weights and produces the next set of
weights. Each update rule has the same interface:
def update(w, dw, config=None):
Inputs:
- w: A numpy array giving the current weights.
- dw: A numpy array of the same shape as w giving the gradient of the
loss with respect to w.
- config: A dictionary containing hyperparameter values such as learning
rate, momentum, etc. If the update rule requires caching values over many
iterations, then config will also hold these cached values.
Returns:
- next_w: The next point after the update.
- config: The config dictionary to be passed to the next iteration of the
update rule.
NOTE: For most update rules, the default learning rate will probably not
perform well; however, the default values of the other hyperparameters should
work well for a variety of different problems.
For efficiency, update rules may perform in-place updates, mutating w and
setting next_w equal to w.
"""
def sgd(w, dw, config=None):
"""
Performs vanilla stochastic gradient descent.
config format:
- learning_rate: Scalar learning rate.
"""
if config is None:
config = {}
config.setdefault("learning_rate", 1e-2)
w -= config["learning_rate"] * dw
return w, config
def sgd_momentum(w, dw, config=None):
"""
Performs stochastic gradient descent with momentum.
config format:
- learning_rate: Scalar learning rate.
- momentum: Scalar between 0 and 1 giving the momentum value.
Setting momentum = 0 reduces to sgd.
- velocity: A numpy array of the same shape as w and dw used to store a
moving average of the gradients.
"""
if config is None:
config = {}
config.setdefault("learning_rate", 1e-2)
config.setdefault("momentum", 0.9)
v = config.get("velocity", np.zeros_like(w))
next_w = None
###########################################################################
# TODO: Implement the momentum update formula. Store the updated value in #
# the next_w variable. You should also use and update the velocity v. #
###########################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
pass
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
###########################################################################
# END OF YOUR CODE #
###########################################################################
config["velocity"] = v
return next_w, config
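# Hedged sketch of the classic momentum step (not the graded TODO body): the
# velocity is a decaying running sum of past gradients. Assumes the defaults
# above have already been filled into config.
def _sgd_momentum_sketch(w, dw, config):
    v = config.get("velocity", np.zeros_like(w))
    v = config["momentum"] * v - config["learning_rate"] * dw
    next_w = w + v
    config["velocity"] = v
    return next_w, config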
def rmsprop(w, dw, config=None):
"""
Uses the RMSProp update rule, which uses a moving average of squared
gradient values to set adaptive per-parameter learning rates.
config format:
- learning_rate: Scalar learning rate.
- decay_rate: Scalar between 0 and 1 giving the decay rate for the squared
gradient cache.
- epsilon: Small scalar used for smoothing to avoid dividing by zero.
- cache: Moving average of second moments of gradients.
"""
if config is None:
config = {}
config.setdefault("learning_rate", 1e-2)
config.setdefault("decay_rate", 0.99)
config.setdefault("epsilon", 1e-8)
config.setdefault("cache", np.zeros_like(w))
next_w = None
###########################################################################
# TODO: Implement the RMSprop update formula, storing the next value of w #
# in the next_w variable. Don't forget to update cache value stored in #
# config['cache']. #
###########################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
pass
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
###########################################################################
# END OF YOUR CODE #
###########################################################################
return next_w, config
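# Sketch of the RMSProp step: a leaky average of squared gradients scales the
# per-parameter learning rate. Assumes config already holds the defaults set above.
def _rmsprop_sketch(w, dw, config):
    config["cache"] = (config["decay_rate"] * config["cache"]
                       + (1 - config["decay_rate"]) * dw ** 2)
    next_w = w - (config["learning_rate"] * dw
                  / (np.sqrt(config["cache"]) + config["epsilon"]))
    return next_w, config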
def adam(w, dw, config=None):
"""
Uses the Adam update rule, which incorporates moving averages of both the
gradient and its square and a bias correction term.
config format:
- learning_rate: Scalar learning rate.
- beta1: Decay rate for moving average of first moment of gradient.
- beta2: Decay rate for moving average of second moment of gradient.
- epsilon: Small scalar used for smoothing to avoid dividing by zero.
- m: Moving average of gradient.
- v: Moving average of squared gradient.
- t: Iteration number.
"""
if config is None:
config = {}
config.setdefault("learning_rate", 1e-3)
config.setdefault("beta1", 0.9)
config.setdefault("beta2", 0.999)
config.setdefault("epsilon", 1e-8)
config.setdefault("m", np.zeros_like(w))
config.setdefault("v", np.zeros_like(w))
config.setdefault("t", 0)
next_w = None
###########################################################################
# TODO: Implement the Adam update formula, storing the next value of w in #
# the next_w variable. Don't forget to update the m, v, and t variables #
# stored in config. #
# #
# NOTE: In order to match the reference output, please modify t _before_ #
# using it in any calculations. #
###########################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
pass
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
###########################################################################
# END OF YOUR CODE #
###########################################################################
return next_w, config
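# Sketch of the Adam step, incrementing t before the bias corrections as the
# note above requires. Assumes config already holds the defaults set above.
def _adam_sketch(w, dw, config):
    lr, b1, b2, eps = (config["learning_rate"], config["beta1"],
                       config["beta2"], config["epsilon"])
    config["t"] += 1
    config["m"] = b1 * config["m"] + (1 - b1) * dw
    config["v"] = b2 * config["v"] + (1 - b2) * dw ** 2
    m_hat = config["m"] / (1 - b1 ** config["t"])
    v_hat = config["v"] / (1 - b2 ** config["t"])
    next_w = w - lr * m_hat / (np.sqrt(v_hat) + eps)
    return next_w, config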

@ -0,0 +1,12 @@
from distutils.core import setup
from distutils.extension import Extension
from Cython.Build import cythonize
import numpy
extensions = [
Extension(
"im2col_cython", ["im2col_cython.pyx"], include_dirs=[numpy.get_include()]
),
]
setup(ext_modules=cythonize(extensions))

@ -0,0 +1,309 @@
from __future__ import print_function, division
from future import standard_library
standard_library.install_aliases()
from builtins import range
from builtins import object
import os
import pickle
import numpy as np
from scripts import optim
class Solver(object):
"""
A Solver encapsulates all the logic necessary for training classification
models. The Solver performs stochastic gradient descent using different
update rules defined in optim.py.
The solver accepts both training and validation data and labels so it can
periodically check classification accuracy on both training and validation
data to watch out for overfitting.
To train a model, you will first construct a Solver instance, passing the
model, dataset, and various options (learning rate, batch size, etc) to the
constructor. You will then call the train() method to run the optimization
procedure and train the model.
After the train() method returns, model.params will contain the parameters
that performed best on the validation set over the course of training.
In addition, the instance variable solver.loss_history will contain a list
of all losses encountered during training and the instance variables
solver.train_acc_history and solver.val_acc_history will be lists of the
accuracies of the model on the training and validation set at each epoch.
Example usage might look something like this:
data = {
'X_train': # training data
'y_train': # training labels
'X_val': # validation data
'y_val': # validation labels
}
model = MyAwesomeModel(hidden_size=100, reg=10)
solver = Solver(model, data,
update_rule='sgd',
optim_config={
'learning_rate': 1e-3,
},
lr_decay=0.95,
num_epochs=10, batch_size=100,
print_every=100)
solver.train()
A Solver works on a model object that must conform to the following API:
- model.params must be a dictionary mapping string parameter names to numpy
arrays containing parameter values.
- model.loss(X, y) must be a function that computes training-time loss and
gradients, and test-time classification scores, with the following inputs
and outputs:
Inputs:
- X: Array giving a minibatch of input data of shape (N, d_1, ..., d_k)
- y: Array of labels, of shape (N,) giving labels for X where y[i] is the
label for X[i].
Returns:
If y is None, run a test-time forward pass and return:
- scores: Array of shape (N, C) giving classification scores for X where
scores[i, c] gives the score of class c for X[i].
If y is not None, run a training time forward and backward pass and
return a tuple of:
- loss: Scalar giving the loss
- grads: Dictionary with the same keys as self.params mapping parameter
names to gradients of the loss with respect to those parameters.
"""
def __init__(self, model, data, **kwargs):
"""
Construct a new Solver instance.
Required arguments:
- model: A model object conforming to the API described above
- data: A dictionary of training and validation data containing:
'X_train': Array, shape (N_train, d_1, ..., d_k) of training images
'X_val': Array, shape (N_val, d_1, ..., d_k) of validation images
'y_train': Array, shape (N_train,) of labels for training images
'y_val': Array, shape (N_val,) of labels for validation images
Optional arguments:
- update_rule: A string giving the name of an update rule in optim.py.
Default is 'sgd'.
- optim_config: A dictionary containing hyperparameters that will be
passed to the chosen update rule. Each update rule requires different
hyperparameters (see optim.py) but all update rules require a
'learning_rate' parameter so that should always be present.
- lr_decay: A scalar for learning rate decay; after each epoch the
learning rate is multiplied by this value.
- batch_size: Size of minibatches used to compute loss and gradient
during training.
- num_epochs: The number of epochs to run for during training.
- print_every: Integer; training losses will be printed every
print_every iterations.
- verbose: Boolean; if set to false then no output will be printed
during training.
- num_train_samples: Number of training samples used to check training
accuracy; default is 1000; set to None to use entire training set.
- num_val_samples: Number of validation samples to use to check val
accuracy; default is None, which uses the entire validation set.
- checkpoint_name: If not None, then save model checkpoints here every
epoch.
"""
self.model = model
self.X_train = data["X_train"]
self.y_train = data["y_train"]
self.X_val = data["X_val"]
self.y_val = data["y_val"]
# Unpack keyword arguments
self.update_rule = kwargs.pop("update_rule", "sgd")
self.optim_config = kwargs.pop("optim_config", {})
self.lr_decay = kwargs.pop("lr_decay", 1.0)
self.batch_size = kwargs.pop("batch_size", 100)
self.num_epochs = kwargs.pop("num_epochs", 10)
self.num_train_samples = kwargs.pop("num_train_samples", 1000)
self.num_val_samples = kwargs.pop("num_val_samples", None)
self.checkpoint_name = kwargs.pop("checkpoint_name", None)
self.print_every = kwargs.pop("print_every", 10)
self.verbose = kwargs.pop("verbose", True)
# Throw an error if there are extra keyword arguments
if len(kwargs) > 0:
extra = ", ".join('"%s"' % k for k in list(kwargs.keys()))
raise ValueError("Unrecognized arguments %s" % extra)
# Make sure the update rule exists, then replace the string
# name with the actual function
if not hasattr(optim, self.update_rule):
raise ValueError('Invalid update_rule "%s"' % self.update_rule)
self.update_rule = getattr(optim, self.update_rule)
self._reset()
def _reset(self):
"""
Set up some book-keeping variables for optimization. Don't call this
manually.
"""
# Set up some variables for book-keeping
self.epoch = 0
self.best_val_acc = 0
self.best_params = {}
self.loss_history = []
self.train_acc_history = []
self.val_acc_history = []
# Make a deep copy of the optim_config for each parameter
self.optim_configs = {}
for p in self.model.params:
d = {k: v for k, v in self.optim_config.items()}
self.optim_configs[p] = d
def _step(self):
"""
Make a single gradient update. This is called by train() and should not
be called manually.
"""
# Make a minibatch of training data
num_train = self.X_train.shape[0]
batch_mask = np.random.choice(num_train, self.batch_size)
X_batch = self.X_train[batch_mask]
y_batch = self.y_train[batch_mask]
# Compute loss and gradient
loss, grads = self.model.loss(X_batch, y_batch)
self.loss_history.append(loss)
# Perform a parameter update
for p, w in self.model.params.items():
dw = grads[p]
config = self.optim_configs[p]
next_w, next_config = self.update_rule(w, dw, config)
self.model.params[p] = next_w
self.optim_configs[p] = next_config
def _save_checkpoint(self):
if self.checkpoint_name is None:
return
checkpoint = {
"model": self.model,
"update_rule": self.update_rule,
"lr_decay": self.lr_decay,
"optim_config": self.optim_config,
"batch_size": self.batch_size,
"num_train_samples": self.num_train_samples,
"num_val_samples": self.num_val_samples,
"epoch": self.epoch,
"loss_history": self.loss_history,
"train_acc_history": self.train_acc_history,
"val_acc_history": self.val_acc_history,
}
filename = "%s_epoch_%d.pkl" % (self.checkpoint_name, self.epoch)
if self.verbose:
print('Saving checkpoint to "%s"' % filename)
with open(filename, "wb") as f:
pickle.dump(checkpoint, f)
def check_accuracy(self, X, y, num_samples=None, batch_size=100):
"""
Check accuracy of the model on the provided data.
Inputs:
- X: Array of data, of shape (N, d_1, ..., d_k)
- y: Array of labels, of shape (N,)
- num_samples: If not None, subsample the data and only test the model
on num_samples datapoints.
- batch_size: Split X and y into batches of this size to avoid using
too much memory.
Returns:
- acc: Scalar giving the fraction of instances that were correctly
classified by the model.
"""
# Maybe subsample the data
N = X.shape[0]
if num_samples is not None and N > num_samples:
mask = np.random.choice(N, num_samples)
N = num_samples
X = X[mask]
y = y[mask]
# Compute predictions in batches
num_batches = N // batch_size
if N % batch_size != 0:
num_batches += 1
y_pred = []
for i in range(num_batches):
start = i * batch_size
end = (i + 1) * batch_size
scores = self.model.loss(X[start:end])
y_pred.append(np.argmax(scores, axis=1))
y_pred = np.hstack(y_pred)
acc = np.mean(y_pred == y)
return acc
def train(self):
"""
Run optimization to train the model.
"""
num_train = self.X_train.shape[0]
iterations_per_epoch = max(num_train // self.batch_size, 1)
num_iterations = self.num_epochs * iterations_per_epoch
for t in range(num_iterations):
self._step()
# Maybe print training loss
if self.verbose and t % self.print_every == 0:
print(
"(Iteration %d / %d) loss: %f"
% (t + 1, num_iterations, self.loss_history[-1])
)
# At the end of every epoch, increment the epoch counter and decay
# the learning rate.
epoch_end = (t + 1) % iterations_per_epoch == 0
if epoch_end:
self.epoch += 1
for k in self.optim_configs:
self.optim_configs[k]["learning_rate"] *= self.lr_decay
# Check train and val accuracy on the first iteration, the last
# iteration, and at the end of each epoch.
first_it = t == 0
last_it = t == num_iterations - 1
if first_it or last_it or epoch_end:
train_acc = self.check_accuracy(
self.X_train, self.y_train, num_samples=self.num_train_samples
)
val_acc = self.check_accuracy(
self.X_val, self.y_val, num_samples=self.num_val_samples
)
self.train_acc_history.append(train_acc)
self.val_acc_history.append(val_acc)
self._save_checkpoint()
if self.verbose:
print(
"(Epoch %d / %d) train acc: %f; val_acc: %f"
% (self.epoch, self.num_epochs, train_acc, val_acc)
)
# Keep track of the best model
if val_acc > self.best_val_acc:
self.best_val_acc = val_acc
self.best_params = {}
for k, v in self.model.params.items():
self.best_params[k] = v.copy()
# At the end of training swap the best params into the model
self.model.params = self.best_params

@ -0,0 +1,78 @@
from builtins import range
from past.builtins import xrange
from math import sqrt, ceil
import numpy as np
def visualize_grid(Xs, ubound=255.0, padding=1):
"""
Reshape a 4D tensor of image data to a grid for easy visualization.
Inputs:
- Xs: Data of shape (N, H, W, C)
- ubound: Output grid will have values scaled to the range [0, ubound]
- padding: The number of blank pixels between elements of the grid
"""
(N, H, W, C) = Xs.shape
grid_size = int(ceil(sqrt(N)))
grid_height = H * grid_size + padding * (grid_size - 1)
grid_width = W * grid_size + padding * (grid_size - 1)
grid = np.zeros((grid_height, grid_width, C))
next_idx = 0
y0, y1 = 0, H
for y in range(grid_size):
x0, x1 = 0, W
for x in range(grid_size):
if next_idx < N:
img = Xs[next_idx]
low, high = np.min(img), np.max(img)
grid[y0:y1, x0:x1] = ubound * (img - low) / (high - low)
# grid[y0:y1, x0:x1] = Xs[next_idx]
next_idx += 1
x0 += W + padding
x1 += W + padding
y0 += H + padding
y1 += H + padding
# grid_max = np.max(grid)
# grid_min = np.min(grid)
# grid = ubound * (grid - grid_min) / (grid_max - grid_min)
return grid
def vis_grid(Xs):
""" visualize a grid of images """
(N, H, W, C) = Xs.shape
A = int(ceil(sqrt(N)))
G = np.ones((A * H + A, A * W + A, C), Xs.dtype)
G *= np.min(Xs)
n = 0
for y in range(A):
for x in range(A):
if n < N:
G[y * H + y : (y + 1) * H + y, x * W + x : (x + 1) * W + x, :] = Xs[
n, :, :, :
]
n += 1
# normalize to [0,1]
maxg = G.max()
ming = G.min()
G = (G - ming) / (maxg - ming)
return G
def vis_nn(rows):
""" visualize array of arrays of images """
N = len(rows)
D = len(rows[0])
H, W, C = rows[0][0].shape
Xs = rows[0][0]
G = np.ones((N * H + N, D * W + D, C), Xs.dtype)
for y in range(N):
for x in range(D):
G[y * H + y : (y + 1) * H + y, x * W + x : (x + 1) * W + x, :] = rows[y][x]
# normalize to [0,1]
maxg = G.max()
ming = G.min()
G = (G - ming) / (maxg - ming)
return G