mirror of https://github.com/da0c/DL_Course_SamU

add lab_3

parent 32c19488ad
commit 6a053ab090

File diff suppressed because it is too large
@@ -0,0 +1,135 @@
from builtins import object
import numpy as np

from ..layers import *
from ..fast_layers import *
from ..layer_utils import *


class ThreeLayerConvNet(object):
    """
    A three-layer convolutional network with the following architecture:

    conv - relu - 2x2 max pool - affine - relu - affine - softmax

    The network operates on minibatches of data that have shape (N, C, H, W)
    consisting of N images, each with height H and width W and with C input
    channels.
    """

    def __init__(
        self,
        input_dim=(3, 32, 32),
        num_filters=32,
        filter_size=7,
        hidden_dim=100,
        num_classes=10,
        weight_scale=1e-3,
        reg=0.0,
        dtype=np.float32,
    ):
        """
        Initialize a new network.

        Inputs:
        - input_dim: Tuple (C, H, W) giving size of input data
        - num_filters: Number of filters to use in the convolutional layer
        - filter_size: Width/height of filters to use in the convolutional layer
        - hidden_dim: Number of units to use in the fully-connected hidden layer
        - num_classes: Number of scores to produce from the final affine layer.
        - weight_scale: Scalar giving standard deviation for random initialization
          of weights.
        - reg: Scalar giving L2 regularization strength
        - dtype: numpy datatype to use for computation.
        """
        self.params = {}
        self.reg = reg
        self.dtype = dtype

        ############################################################################
        # TODO: Initialize weights and biases for the three-layer convolutional    #
        # network. Weights should be initialized from a Gaussian centered at 0.0   #
        # with standard deviation equal to weight_scale; biases should be          #
        # initialized to zero. All weights and biases should be stored in the      #
        # dictionary self.params. Store weights and biases for the convolutional   #
        # layer using the keys 'W1' and 'b1'; use keys 'W2' and 'b2' for the       #
        # weights and biases of the hidden affine layer, and keys 'W3' and 'b3'    #
        # for the weights and biases of the output affine layer.                   #
        #                                                                          #
        # IMPORTANT: For this assignment, you can assume that the padding          #
        # and stride of the first convolutional layer are chosen so that           #
        # **the width and height of the input are preserved**. Take a look at      #
        # the start of the loss() function to see how that happens.                #
        ############################################################################
        # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

        pass
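        # A minimal sketch of one possible initialization, kept commented out so the
        # assignment skeleton stays unmodified. It assumes the conv layer preserves
        # H and W and that the 2x2 pool halves them; key names follow the TODO above.
        #
        # C, H, W = input_dim
        # self.params["W1"] = weight_scale * np.random.randn(num_filters, C, filter_size, filter_size)
        # self.params["b1"] = np.zeros(num_filters)
        # self.params["W2"] = weight_scale * np.random.randn(num_filters * (H // 2) * (W // 2), hidden_dim)
        # self.params["b2"] = np.zeros(hidden_dim)
        # self.params["W3"] = weight_scale * np.random.randn(hidden_dim, num_classes)
        # self.params["b3"] = np.zeros(num_classes)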

        # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
        ############################################################################
        #                             END OF YOUR CODE                             #
        ############################################################################

        for k, v in self.params.items():
            self.params[k] = v.astype(dtype)

    def loss(self, X, y=None):
        """
        Evaluate loss and gradient for the three-layer convolutional network.

        Input / output: Same API as TwoLayerNet in fc_net.py.
        """
        W1, b1 = self.params["W1"], self.params["b1"]
        W2, b2 = self.params["W2"], self.params["b2"]
        W3, b3 = self.params["W3"], self.params["b3"]

        # pass conv_param to the forward pass for the convolutional layer
        # Padding and stride chosen to preserve the input spatial size
        filter_size = W1.shape[2]
        conv_param = {"stride": 1, "pad": (filter_size - 1) // 2}

        # pass pool_param to the forward pass for the max-pooling layer
        pool_param = {"pool_height": 2, "pool_width": 2, "stride": 2}

        scores = None
        ############################################################################
        # TODO: Implement the forward pass for the three-layer convolutional net,  #
        # computing the class scores for X and storing them in the scores          #
        # variable.                                                                #
        #                                                                          #
        # Remember you can use the functions defined in cs231n/fast_layers.py and  #
        # cs231n/layer_utils.py in your implementation (already imported).         #
        ############################################################################
        # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

        pass
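        # One possible forward pass, left commented out as a sketch rather than the
        # graded solution; it relies on the conv_relu_pool_forward, affine_relu_forward
        # and affine_forward helpers from layer_utils / layers (already imported).
        #
        # out1, cache1 = conv_relu_pool_forward(X, W1, b1, conv_param, pool_param)
        # out2, cache2 = affine_relu_forward(out1, W2, b2)
        # scores, cache3 = affine_forward(out2, W3, b3)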

        # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
        ############################################################################
        #                             END OF YOUR CODE                             #
        ############################################################################

        if y is None:
            return scores

        loss, grads = 0, {}
        ############################################################################
        # TODO: Implement the backward pass for the three-layer convolutional net, #
        # storing the loss and gradients in the loss and grads variables. Compute  #
        # data loss using softmax, and make sure that grads[k] holds the gradients #
        # for self.params[k]. Don't forget to add L2 regularization!               #
        #                                                                          #
        # NOTE: To ensure that your implementation matches ours and you pass the   #
        # automated tests, make sure that your L2 regularization includes a factor #
        # of 0.5 to simplify the expression for the gradient.                      #
        ############################################################################
        # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

        pass
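        # A commented sketch of a matching backward pass, assuming the caches from the
        # forward sketch above and the softmax_loss helper from layers.py; it includes
        # the 0.5 * reg L2 term required by the NOTE above.
        #
        # loss, dscores = softmax_loss(scores, y)
        # loss += 0.5 * self.reg * (np.sum(W1 * W1) + np.sum(W2 * W2) + np.sum(W3 * W3))
        # dout2, grads["W3"], grads["b3"] = affine_backward(dscores, cache3)
        # dout1, grads["W2"], grads["b2"] = affine_relu_backward(dout2, cache2)
        # _, grads["W1"], grads["b1"] = conv_relu_pool_backward(dout1, cache1)
        # grads["W1"] += self.reg * W1
        # grads["W2"] += self.reg * W2
        # grads["W3"] += self.reg * W3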

        # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
        ############################################################################
        #                             END OF YOUR CODE                             #
        ############################################################################

        return loss, grads
@@ -0,0 +1,291 @@
from builtins import range
from builtins import object
import numpy as np

from ..layers import *
from ..layer_utils import *


class TwoLayerNet(object):
    """
    A two-layer fully-connected neural network with ReLU nonlinearity and
    softmax loss that uses a modular layer design. We assume an input dimension
    of D, a hidden dimension of H, and perform classification over C classes.

    The architecture should be affine - relu - affine - softmax.

    Note that this class does not implement gradient descent; instead, it
    will interact with a separate Solver object that is responsible for running
    optimization.

    The learnable parameters of the model are stored in the dictionary
    self.params that maps parameter names to numpy arrays.
    """

    def __init__(
        self,
        input_dim=3 * 32 * 32,
        hidden_dim=100,
        num_classes=10,
        weight_scale=1e-3,
        reg=0.0,
    ):
        """
        Initialize a new network.

        Inputs:
        - input_dim: An integer giving the size of the input
        - hidden_dim: An integer giving the size of the hidden layer
        - num_classes: An integer giving the number of classes to classify
        - weight_scale: Scalar giving the standard deviation for random
          initialization of the weights.
        - reg: Scalar giving L2 regularization strength.
        """
        self.params = {}
        self.reg = reg

        ############################################################################
        # TODO: Initialize the weights and biases of the two-layer net. Weights    #
        # should be initialized from a Gaussian centered at 0.0 with               #
        # standard deviation equal to weight_scale, and biases should be           #
        # initialized to zero. All weights and biases should be stored in the      #
        # dictionary self.params, with first layer weights                         #
        # and biases using the keys 'W1' and 'b1' and second layer                 #
        # weights and biases using the keys 'W2' and 'b2'.                         #
        ############################################################################
        # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

        pass
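        # One possible initialization sketch (commented out, not the graded solution),
        # using the key names required by the TODO above.
        #
        # self.params["W1"] = weight_scale * np.random.randn(input_dim, hidden_dim)
        # self.params["b1"] = np.zeros(hidden_dim)
        # self.params["W2"] = weight_scale * np.random.randn(hidden_dim, num_classes)
        # self.params["b2"] = np.zeros(num_classes)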

        # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
        ############################################################################
        #                             END OF YOUR CODE                             #
        ############################################################################

    def loss(self, X, y=None):
        """
        Compute loss and gradient for a minibatch of data.

        Inputs:
        - X: Array of input data of shape (N, d_1, ..., d_k)
        - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

        Returns:
        If y is None, then run a test-time forward pass of the model and return:
        - scores: Array of shape (N, C) giving classification scores, where
          scores[i, c] is the classification score for X[i] and class c.

        If y is not None, then run a training-time forward and backward pass and
        return a tuple of:
        - loss: Scalar value giving the loss
        - grads: Dictionary with the same keys as self.params, mapping parameter
          names to gradients of the loss with respect to those parameters.
        """
        scores = None
        ############################################################################
        # TODO: Implement the forward pass for the two-layer net, computing the    #
        # class scores for X and storing them in the scores variable.              #
        ############################################################################
        # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

        pass
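        # Commented sketch of one possible forward pass, assuming the affine_relu_forward
        # and affine_forward helpers from layer_utils / layers (imported above).
        #
        # hidden, cache1 = affine_relu_forward(X, self.params["W1"], self.params["b1"])
        # scores, cache2 = affine_forward(hidden, self.params["W2"], self.params["b2"])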

        # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
        ############################################################################
        #                             END OF YOUR CODE                             #
        ############################################################################

        # If y is None then we are in test mode so just return scores
        if y is None:
            return scores

        loss, grads = 0, {}
        ############################################################################
        # TODO: Implement the backward pass for the two-layer net. Store the loss  #
        # in the loss variable and gradients in the grads dictionary. Compute data #
        # loss using softmax, and make sure that grads[k] holds the gradients for  #
        # self.params[k]. Don't forget to add L2 regularization!                   #
        #                                                                          #
        # NOTE: To ensure that your implementation matches ours and you pass the   #
        # automated tests, make sure that your L2 regularization includes a factor #
        # of 0.5 to simplify the expression for the gradient.                      #
        ############################################################################
        # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

        pass
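        # Commented sketch of a matching backward pass with the 0.5 * reg L2 term,
        # assuming softmax_loss and the caches from the forward sketch above.
        #
        # W1, W2 = self.params["W1"], self.params["W2"]
        # loss, dscores = softmax_loss(scores, y)
        # loss += 0.5 * self.reg * (np.sum(W1 * W1) + np.sum(W2 * W2))
        # dhidden, grads["W2"], grads["b2"] = affine_backward(dscores, cache2)
        # _, grads["W1"], grads["b1"] = affine_relu_backward(dhidden, cache1)
        # grads["W1"] += self.reg * W1
        # grads["W2"] += self.reg * W2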

        # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
        ############################################################################
        #                             END OF YOUR CODE                             #
        ############################################################################

        return loss, grads


class FullyConnectedNet(object):
    """
    A fully-connected neural network with an arbitrary number of hidden layers,
    ReLU nonlinearities, and a softmax loss function. This will also implement
    dropout and batch/layer normalization as options. For a network with L layers,
    the architecture will be

    {affine - [batch/layer norm] - relu - [dropout]} x (L - 1) - affine - softmax

    where batch/layer normalization and dropout are optional, and the {...} block is
    repeated L - 1 times.

    Similar to the TwoLayerNet above, learnable parameters are stored in the
    self.params dictionary and will be learned using the Solver class.
    """

    def __init__(
        self,
        hidden_dims,
        input_dim=3 * 32 * 32,
        num_classes=10,
        dropout=1,
        normalization=None,
        reg=0.0,
        weight_scale=1e-2,
        dtype=np.float32,
        seed=None,
    ):
        """
        Initialize a new FullyConnectedNet.

        Inputs:
        - hidden_dims: A list of integers giving the size of each hidden layer.
        - input_dim: An integer giving the size of the input.
        - num_classes: An integer giving the number of classes to classify.
        - dropout: Scalar between 0 and 1 giving dropout strength. If dropout=1 then
          the network should not use dropout at all.
        - normalization: What type of normalization the network should use. Valid values
          are "batchnorm", "layernorm", or None for no normalization (the default).
        - reg: Scalar giving L2 regularization strength.
        - weight_scale: Scalar giving the standard deviation for random
          initialization of the weights.
        - dtype: A numpy datatype object; all computations will be performed using
          this datatype. float32 is faster but less accurate, so you should use
          float64 for numeric gradient checking.
        - seed: If not None, then pass this random seed to the dropout layers. This
          will make the dropout layers deterministic so we can gradient check the
          model.
        """
        self.normalization = normalization
        self.use_dropout = dropout != 1
        self.reg = reg
        self.num_layers = 1 + len(hidden_dims)
        self.dtype = dtype
        self.params = {}

        ############################################################################
        # TODO: Initialize the parameters of the network, storing all values in    #
        # the self.params dictionary. Store weights and biases for the first layer #
        # in W1 and b1; for the second layer use W2 and b2, etc. Weights should be #
        # initialized from a normal distribution centered at 0 with standard       #
        # deviation equal to weight_scale. Biases should be initialized to zero.   #
        #                                                                          #
        # When using batch normalization, store scale and shift parameters for the #
        # first layer in gamma1 and beta1; for the second layer use gamma2 and     #
        # beta2, etc. Scale parameters should be initialized to ones and shift     #
        # parameters should be initialized to zeros.                               #
        ############################################################################
        # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

        pass
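        # A commented sketch of one way to initialize all L layers by looping over
        # [input_dim] + hidden_dims + [num_classes]; gamma/beta are only created for
        # hidden layers when normalization is enabled, as the TODO above describes.
        #
        # dims = [input_dim] + hidden_dims + [num_classes]
        # for i in range(self.num_layers):
        #     self.params["W%d" % (i + 1)] = weight_scale * np.random.randn(dims[i], dims[i + 1])
        #     self.params["b%d" % (i + 1)] = np.zeros(dims[i + 1])
        #     if self.normalization is not None and i < self.num_layers - 1:
        #         self.params["gamma%d" % (i + 1)] = np.ones(dims[i + 1])
        #         self.params["beta%d" % (i + 1)] = np.zeros(dims[i + 1])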

        # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
        ############################################################################
        #                             END OF YOUR CODE                             #
        ############################################################################

        # When using dropout we need to pass a dropout_param dictionary to each
        # dropout layer so that the layer knows the dropout probability and the mode
        # (train / test). You can pass the same dropout_param to each dropout layer.
        self.dropout_param = {}
        if self.use_dropout:
            self.dropout_param = {"mode": "train", "p": dropout}
            if seed is not None:
                self.dropout_param["seed"] = seed

        # With batch normalization we need to keep track of running means and
        # variances, so we need to pass a special bn_param object to each batch
        # normalization layer. You should pass self.bn_params[0] to the forward pass
        # of the first batch normalization layer, self.bn_params[1] to the forward
        # pass of the second batch normalization layer, etc.
        self.bn_params = []
        if self.normalization == "batchnorm":
            self.bn_params = [{"mode": "train"} for i in range(self.num_layers - 1)]
        if self.normalization == "layernorm":
            self.bn_params = [{} for i in range(self.num_layers - 1)]

        # Cast all parameters to the correct datatype
        for k, v in self.params.items():
            self.params[k] = v.astype(dtype)

    def loss(self, X, y=None):
        """
        Compute loss and gradient for the fully-connected net.

        Input / output: Same as TwoLayerNet above.
        """
        X = X.astype(self.dtype)
        mode = "test" if y is None else "train"

        # Set train/test mode for batchnorm params and dropout param since they
        # behave differently during training and testing.
        if self.use_dropout:
            self.dropout_param["mode"] = mode
        if self.normalization == "batchnorm":
            for bn_param in self.bn_params:
                bn_param["mode"] = mode
        scores = None
        ############################################################################
        # TODO: Implement the forward pass for the fully-connected net, computing  #
        # the class scores for X and storing them in the scores variable.          #
        #                                                                          #
        # When using dropout, you'll need to pass self.dropout_param to each       #
        # dropout forward pass.                                                    #
        #                                                                          #
        # When using batch normalization, you'll need to pass self.bn_params[0] to #
        # the forward pass for the first batch normalization layer, pass           #
        # self.bn_params[1] to the forward pass for the second batch normalization #
        # layer, etc.                                                              #
        ############################################################################
        # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

        pass
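        # Commented sketch of a possible forward pass without normalization or dropout;
        # a complete solution would also thread self.bn_params[i] and self.dropout_param
        # through the corresponding helper layers.
        #
        # caches = []
        # out = X
        # for i in range(1, self.num_layers):
        #     out, cache = affine_relu_forward(out, self.params["W%d" % i], self.params["b%d" % i])
        #     caches.append(cache)
        # scores, last_cache = affine_forward(out, self.params["W%d" % self.num_layers],
        #                                     self.params["b%d" % self.num_layers])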

        # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
        ############################################################################
        #                             END OF YOUR CODE                             #
        ############################################################################

        # If test mode return early
        if mode == "test":
            return scores

        loss, grads = 0.0, {}
        ############################################################################
        # TODO: Implement the backward pass for the fully-connected net. Store the #
        # loss in the loss variable and gradients in the grads dictionary. Compute #
        # data loss using softmax, and make sure that grads[k] holds the gradients #
        # for self.params[k]. Don't forget to add L2 regularization!               #
        #                                                                          #
        # When using batch/layer normalization, you don't need to regularize the   #
        # scale and shift parameters.                                              #
        #                                                                          #
        # NOTE: To ensure that your implementation matches ours and you pass the   #
        # automated tests, make sure that your L2 regularization includes a factor #
        # of 0.5 to simplify the expression for the gradient.                      #
        ############################################################################
        # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

        pass
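        # Commented sketch of the matching backward pass (again without normalization or
        # dropout), assuming softmax_loss and the caches gathered in the forward sketch.
        #
        # loss, dout = softmax_loss(scores, y)
        # for i in range(1, self.num_layers + 1):
        #     W = self.params["W%d" % i]
        #     loss += 0.5 * self.reg * np.sum(W * W)
        # dout, dW, db = affine_backward(dout, last_cache)
        # grads["W%d" % self.num_layers] = dW + self.reg * self.params["W%d" % self.num_layers]
        # grads["b%d" % self.num_layers] = db
        # for i in range(self.num_layers - 1, 0, -1):
        #     dout, dW, db = affine_relu_backward(dout, caches[i - 1])
        #     grads["W%d" % i] = dW + self.reg * self.params["W%d" % i]
        #     grads["b%d" % i] = db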

        # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
        ############################################################################
        #                             END OF YOUR CODE                             #
        ############################################################################

        return loss, grads
@@ -0,0 +1,270 @@
from __future__ import print_function

from builtins import range
from six.moves import cPickle as pickle
import numpy as np
import os
from imageio import imread
import platform


def load_pickle(f):
    version = platform.python_version_tuple()
    if version[0] == "2":
        return pickle.load(f)
    elif version[0] == "3":
        return pickle.load(f, encoding="latin1")
    raise ValueError("invalid python version: {}".format(version))


def load_CIFAR_batch(filename):
    """ load single batch of cifar """
    with open(filename, "rb") as f:
        datadict = load_pickle(f)
        X = datadict["data"]
        Y = datadict["labels"]
        X = X.reshape(10000, 3, 32, 32).transpose(0, 2, 3, 1).astype("float")
        Y = np.array(Y)
        return X, Y


def load_CIFAR10(ROOT):
    """ load all of cifar """
    xs = []
    ys = []
    for b in range(1, 6):
        f = os.path.join(ROOT, "data_batch_%d" % (b,))
        X, Y = load_CIFAR_batch(f)
        xs.append(X)
        ys.append(Y)
    Xtr = np.concatenate(xs)
    Ytr = np.concatenate(ys)
    del X, Y
    Xte, Yte = load_CIFAR_batch(os.path.join(ROOT, "test_batch"))
    return Xtr, Ytr, Xte, Yte


def get_CIFAR10_data(
    num_training=49000, num_validation=1000, num_test=1000, subtract_mean=True
):
    """
    Load the CIFAR-10 dataset from disk and perform preprocessing to prepare
    it for classifiers. These are the same steps as we used for the SVM, but
    condensed to a single function.
    """
    # Load the raw CIFAR-10 data
    cifar10_dir = os.path.join(
        os.path.dirname(__file__), "datasets/cifar-10-batches-py"
    )
    X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)

    # Subsample the data
    mask = list(range(num_training, num_training + num_validation))
    X_val = X_train[mask]
    y_val = y_train[mask]
    mask = list(range(num_training))
    X_train = X_train[mask]
    y_train = y_train[mask]
    mask = list(range(num_test))
    X_test = X_test[mask]
    y_test = y_test[mask]

    # Normalize the data: subtract the mean image
    if subtract_mean:
        mean_image = np.mean(X_train, axis=0)
        X_train -= mean_image
        X_val -= mean_image
        X_test -= mean_image

    # Transpose so that channels come first
    X_train = X_train.transpose(0, 3, 1, 2).copy()
    X_val = X_val.transpose(0, 3, 1, 2).copy()
    X_test = X_test.transpose(0, 3, 1, 2).copy()

    # Package data into a dictionary
    return {
        "X_train": X_train,
        "y_train": y_train,
        "X_val": X_val,
        "y_val": y_val,
        "X_test": X_test,
        "y_test": y_test,
    }
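

# Example usage sketch (not part of the original file). It assumes the CIFAR-10
# batches have already been downloaded into cs231n/datasets via get_datasets.sh:
#
#     data = get_CIFAR10_data()
#     for k, v in data.items():
#         print(k, getattr(v, "shape", v))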


def load_tiny_imagenet(path, dtype=np.float32, subtract_mean=True):
    """
    Load TinyImageNet. Each of TinyImageNet-100-A, TinyImageNet-100-B, and
    TinyImageNet-200 have the same directory structure, so this can be used
    to load any of them.

    Inputs:
    - path: String giving path to the directory to load.
    - dtype: numpy datatype used to load the data.
    - subtract_mean: Whether to subtract the mean training image.

    Returns: A dictionary with the following entries:
    - class_names: A list where class_names[i] is a list of strings giving the
      WordNet names for class i in the loaded dataset.
    - X_train: (N_tr, 3, 64, 64) array of training images
    - y_train: (N_tr,) array of training labels
    - X_val: (N_val, 3, 64, 64) array of validation images
    - y_val: (N_val,) array of validation labels
    - X_test: (N_test, 3, 64, 64) array of testing images.
    - y_test: (N_test,) array of test labels; if test labels are not available
      (such as in student code) then y_test will be None.
    - mean_image: (3, 64, 64) array giving mean training image
    """
    # First load wnids
    with open(os.path.join(path, "wnids.txt"), "r") as f:
        wnids = [x.strip() for x in f]

    # Map wnids to integer labels
    wnid_to_label = {wnid: i for i, wnid in enumerate(wnids)}

    # Use words.txt to get names for each class
    with open(os.path.join(path, "words.txt"), "r") as f:
        wnid_to_words = dict(line.split("\t") for line in f)
        for wnid, words in wnid_to_words.items():
            wnid_to_words[wnid] = [w.strip() for w in words.split(",")]
    class_names = [wnid_to_words[wnid] for wnid in wnids]

    # Next load training data.
    X_train = []
    y_train = []
    for i, wnid in enumerate(wnids):
        if (i + 1) % 20 == 0:
            print("loading training data for synset %d / %d" % (i + 1, len(wnids)))
        # To figure out the filenames we need to open the boxes file
        boxes_file = os.path.join(path, "train", wnid, "%s_boxes.txt" % wnid)
        with open(boxes_file, "r") as f:
            filenames = [x.split("\t")[0] for x in f]
        num_images = len(filenames)

        X_train_block = np.zeros((num_images, 3, 64, 64), dtype=dtype)
        y_train_block = wnid_to_label[wnid] * np.ones(num_images, dtype=np.int64)
        for j, img_file in enumerate(filenames):
            img_file = os.path.join(path, "train", wnid, "images", img_file)
            img = imread(img_file)
            if img.ndim == 2:
                ## grayscale file
                img.shape = (64, 64, 1)
            X_train_block[j] = img.transpose(2, 0, 1)
        X_train.append(X_train_block)
        y_train.append(y_train_block)

    # We need to concatenate all training data
    X_train = np.concatenate(X_train, axis=0)
    y_train = np.concatenate(y_train, axis=0)

    # Next load validation data
    with open(os.path.join(path, "val", "val_annotations.txt"), "r") as f:
        img_files = []
        val_wnids = []
        for line in f:
            img_file, wnid = line.split("\t")[:2]
            img_files.append(img_file)
            val_wnids.append(wnid)
        num_val = len(img_files)
        y_val = np.array([wnid_to_label[wnid] for wnid in val_wnids])
        X_val = np.zeros((num_val, 3, 64, 64), dtype=dtype)
        for i, img_file in enumerate(img_files):
            img_file = os.path.join(path, "val", "images", img_file)
            img = imread(img_file)
            if img.ndim == 2:
                img.shape = (64, 64, 1)
            X_val[i] = img.transpose(2, 0, 1)

    # Next load test images
    # Students won't have test labels, so we need to iterate over files in the
    # images directory.
    img_files = os.listdir(os.path.join(path, "test", "images"))
    X_test = np.zeros((len(img_files), 3, 64, 64), dtype=dtype)
    for i, img_file in enumerate(img_files):
        img_file = os.path.join(path, "test", "images", img_file)
        img = imread(img_file)
        if img.ndim == 2:
            img.shape = (64, 64, 1)
        X_test[i] = img.transpose(2, 0, 1)

    y_test = None
    y_test_file = os.path.join(path, "test", "test_annotations.txt")
    if os.path.isfile(y_test_file):
        with open(y_test_file, "r") as f:
            img_file_to_wnid = {}
            for line in f:
                line = line.split("\t")
                img_file_to_wnid[line[0]] = line[1]
        y_test = [wnid_to_label[img_file_to_wnid[img_file]] for img_file in img_files]
        y_test = np.array(y_test)

    mean_image = X_train.mean(axis=0)
    if subtract_mean:
        X_train -= mean_image[None]
        X_val -= mean_image[None]
        X_test -= mean_image[None]

    return {
        "class_names": class_names,
        "X_train": X_train,
        "y_train": y_train,
        "X_val": X_val,
        "y_val": y_val,
        "X_test": X_test,
        "y_test": y_test,
        "mean_image": mean_image,
    }


def load_models(models_dir):
    """
    Load saved models from disk. This will attempt to unpickle all files in a
    directory; any files that give errors on unpickling (such as README.txt)
    will be skipped.

    Inputs:
    - models_dir: String giving the path to a directory containing model files.
      Each model file is a pickled dictionary with a 'model' field.

    Returns:
    A dictionary mapping model file names to models.
    """
    models = {}
    for model_file in os.listdir(models_dir):
        with open(os.path.join(models_dir, model_file), "rb") as f:
            try:
                models[model_file] = load_pickle(f)["model"]
            except pickle.UnpicklingError:
                continue
    return models


def load_imagenet_val(num=None):
    """Load a handful of validation images from ImageNet.

    Inputs:
    - num: Number of images to load (max of 25)

    Returns:
    - X: numpy array with shape [num, 224, 224, 3]
    - y: numpy array of integer image labels, shape [num]
    - class_names: dict mapping integer label to class name
    """
    imagenet_fn = os.path.join(
        os.path.dirname(__file__), "datasets/imagenet_val_25.npz"
    )
    if not os.path.isfile(imagenet_fn):
        print("file %s not found" % imagenet_fn)
        print("Run the following:")
        print("cd cs231n/datasets")
        print("bash get_imagenet_val.sh")
        assert False, "Need to download imagenet_val_25.npz"
    f = np.load(imagenet_fn)
    X = f["X"]
    y = f["y"]
    class_names = f["label_map"].item()
    if num is not None:
        X = X[:num]
        y = y[:num]
    return X, y, class_names
@@ -0,0 +1,5 @@
if [ ! -d "cifar-10-batches-py" ]; then
  wget http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz -O cifar-10-python.tar.gz
  tar -xzvf cifar-10-python.tar.gz
  rm cifar-10-python.tar.gz
fi
@@ -0,0 +1,283 @@
from __future__ import print_function
import numpy as np

try:
    from .im2col_cython import col2im_cython, im2col_cython
    from .im2col_cython import col2im_6d_cython
except ImportError:
    print("""=========== You can safely ignore the message below if you are NOT working on ConvolutionalNetworks.ipynb ===========""")
    print("\tYou will need to compile a Cython extension for a portion of this assignment.")
    print("\tThe instructions to do this will be given in a section of the notebook below.")
    print("\tThere will be an option for Colab users and another for Jupyter (local) users.")

from .im2col import *


def conv_forward_im2col(x, w, b, conv_param):
    """
    A fast implementation of the forward pass for a convolutional layer
    based on im2col and col2im.
    """
    N, C, H, W = x.shape
    num_filters, _, filter_height, filter_width = w.shape
    stride, pad = conv_param["stride"], conv_param["pad"]

    # Check dimensions
    assert (W + 2 * pad - filter_width) % stride == 0, "width does not work"
    assert (H + 2 * pad - filter_height) % stride == 0, "height does not work"

    # Create output
    out_height = (H + 2 * pad - filter_height) // stride + 1
    out_width = (W + 2 * pad - filter_width) // stride + 1
    out = np.zeros((N, num_filters, out_height, out_width), dtype=x.dtype)

    # x_cols = im2col_indices(x, w.shape[2], w.shape[3], pad, stride)
    x_cols = im2col_cython(x, w.shape[2], w.shape[3], pad, stride)
    res = w.reshape((w.shape[0], -1)).dot(x_cols) + b.reshape(-1, 1)

    out = res.reshape(w.shape[0], out.shape[2], out.shape[3], x.shape[0])
    out = out.transpose(3, 0, 1, 2)

    cache = (x, w, b, conv_param, x_cols)
    return out, cache


def conv_forward_strides(x, w, b, conv_param):
    N, C, H, W = x.shape
    F, _, HH, WW = w.shape
    stride, pad = conv_param["stride"], conv_param["pad"]

    # Check dimensions
    # assert (W + 2 * pad - WW) % stride == 0, 'width does not work'
    # assert (H + 2 * pad - HH) % stride == 0, 'height does not work'

    # Pad the input
    p = pad
    x_padded = np.pad(x, ((0, 0), (0, 0), (p, p), (p, p)), mode="constant")

    # Figure out output dimensions
    H += 2 * pad
    W += 2 * pad
    out_h = (H - HH) // stride + 1
    out_w = (W - WW) // stride + 1

    # Perform an im2col operation by picking clever strides
    shape = (C, HH, WW, N, out_h, out_w)
    strides = (H * W, W, 1, C * H * W, stride * W, stride)
    strides = x.itemsize * np.array(strides)
    x_stride = np.lib.stride_tricks.as_strided(x_padded, shape=shape, strides=strides)
    x_cols = np.ascontiguousarray(x_stride)
    x_cols.shape = (C * HH * WW, N * out_h * out_w)

    # Now all our convolutions are a big matrix multiply
    res = w.reshape(F, -1).dot(x_cols) + b.reshape(-1, 1)

    # Reshape the output
    res.shape = (F, N, out_h, out_w)
    out = res.transpose(1, 0, 2, 3)

    # Be nice and return a contiguous array
    # The old version of conv_forward_fast doesn't do this, so for a fair
    # comparison we won't either
    out = np.ascontiguousarray(out)

    cache = (x, w, b, conv_param, x_cols)
    return out, cache


def conv_backward_strides(dout, cache):
    x, w, b, conv_param, x_cols = cache
    stride, pad = conv_param["stride"], conv_param["pad"]

    N, C, H, W = x.shape
    F, _, HH, WW = w.shape
    _, _, out_h, out_w = dout.shape

    db = np.sum(dout, axis=(0, 2, 3))

    dout_reshaped = dout.transpose(1, 0, 2, 3).reshape(F, -1)
    dw = dout_reshaped.dot(x_cols.T).reshape(w.shape)

    dx_cols = w.reshape(F, -1).T.dot(dout_reshaped)
    dx_cols.shape = (C, HH, WW, N, out_h, out_w)
    dx = col2im_6d_cython(dx_cols, N, C, H, W, HH, WW, pad, stride)

    return dx, dw, db


def conv_backward_im2col(dout, cache):
    """
    A fast implementation of the backward pass for a convolutional layer
    based on im2col and col2im.
    """
    x, w, b, conv_param, x_cols = cache
    stride, pad = conv_param["stride"], conv_param["pad"]

    db = np.sum(dout, axis=(0, 2, 3))

    num_filters, _, filter_height, filter_width = w.shape
    dout_reshaped = dout.transpose(1, 2, 3, 0).reshape(num_filters, -1)
    dw = dout_reshaped.dot(x_cols.T).reshape(w.shape)

    dx_cols = w.reshape(num_filters, -1).T.dot(dout_reshaped)
    # dx = col2im_indices(dx_cols, x.shape, filter_height, filter_width, pad, stride)
    dx = col2im_cython(
        dx_cols,
        x.shape[0],
        x.shape[1],
        x.shape[2],
        x.shape[3],
        filter_height,
        filter_width,
        pad,
        stride,
    )

    return dx, dw, db


conv_forward_fast = conv_forward_strides
conv_backward_fast = conv_backward_strides
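

# Illustrative usage sketch (not part of the original file); the backward call assumes
# the im2col_cython extension has been compiled as described in the notebook:
#
#     x, w, b = np.random.randn(2, 3, 8, 8), np.random.randn(4, 3, 3, 3), np.random.randn(4)
#     out, cache = conv_forward_fast(x, w, b, {"stride": 1, "pad": 1})
#     dx, dw, db = conv_backward_fast(np.random.randn(*out.shape), cache)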


def max_pool_forward_fast(x, pool_param):
    """
    A fast implementation of the forward pass for a max pooling layer.

    This chooses between the reshape method and the im2col method. If the pooling
    regions are square and tile the input image, then we can use the reshape
    method which is very fast. Otherwise we fall back on the im2col method, which
    is not much faster than the naive method.
    """
    N, C, H, W = x.shape
    pool_height, pool_width = pool_param["pool_height"], pool_param["pool_width"]
    stride = pool_param["stride"]

    same_size = pool_height == pool_width == stride
    tiles = H % pool_height == 0 and W % pool_width == 0
    if same_size and tiles:
        out, reshape_cache = max_pool_forward_reshape(x, pool_param)
        cache = ("reshape", reshape_cache)
    else:
        out, im2col_cache = max_pool_forward_im2col(x, pool_param)
        cache = ("im2col", im2col_cache)
    return out, cache


def max_pool_backward_fast(dout, cache):
    """
    A fast implementation of the backward pass for a max pooling layer.

    This switches between the reshape method and the im2col method depending on
    which method was used to generate the cache.
    """
    method, real_cache = cache
    if method == "reshape":
        return max_pool_backward_reshape(dout, real_cache)
    elif method == "im2col":
        return max_pool_backward_im2col(dout, real_cache)
    else:
        raise ValueError('Unrecognized method "%s"' % method)


def max_pool_forward_reshape(x, pool_param):
    """
    A fast implementation of the forward pass for the max pooling layer that uses
    some clever reshaping.

    This can only be used for square pooling regions that tile the input.
    """
    N, C, H, W = x.shape
    pool_height, pool_width = pool_param["pool_height"], pool_param["pool_width"]
    stride = pool_param["stride"]
    assert pool_height == pool_width == stride, "Invalid pool params"
    assert H % pool_height == 0
    assert W % pool_height == 0
    x_reshaped = x.reshape(
        N, C, H // pool_height, pool_height, W // pool_width, pool_width
    )
    out = x_reshaped.max(axis=3).max(axis=4)

    cache = (x, x_reshaped, out)
    return out, cache


def max_pool_backward_reshape(dout, cache):
    """
    A fast implementation of the backward pass for the max pooling layer that
    uses some clever broadcasting and reshaping.

    This can only be used if the forward pass was computed using
    max_pool_forward_reshape.

    NOTE: If there are multiple argmaxes, this method will assign gradient to
    ALL argmax elements of the input rather than picking one. In this case the
    gradient will actually be incorrect. However this is unlikely to occur in
    practice, so it shouldn't matter much. One possible solution is to split the
    upstream gradient equally among all argmax elements; this should result in a
    valid subgradient. You can make this happen by uncommenting the line below;
    however this results in a significant performance penalty (about 40% slower)
    and is unlikely to matter in practice so we don't do it.
    """
    x, x_reshaped, out = cache

    dx_reshaped = np.zeros_like(x_reshaped)
    out_newaxis = out[:, :, :, np.newaxis, :, np.newaxis]
    mask = x_reshaped == out_newaxis
    dout_newaxis = dout[:, :, :, np.newaxis, :, np.newaxis]
    dout_broadcast, _ = np.broadcast_arrays(dout_newaxis, dx_reshaped)
    dx_reshaped[mask] = dout_broadcast[mask]
    dx_reshaped /= np.sum(mask, axis=(3, 5), keepdims=True)
    dx = dx_reshaped.reshape(x.shape)

    return dx


def max_pool_forward_im2col(x, pool_param):
    """
    An implementation of the forward pass for max pooling based on im2col.

    This isn't much faster than the naive version, so it should be avoided if
    possible.
    """
    N, C, H, W = x.shape
    pool_height, pool_width = pool_param["pool_height"], pool_param["pool_width"]
    stride = pool_param["stride"]

    assert (H - pool_height) % stride == 0, "Invalid height"
    assert (W - pool_width) % stride == 0, "Invalid width"

    out_height = (H - pool_height) // stride + 1
    out_width = (W - pool_width) // stride + 1

    x_split = x.reshape(N * C, 1, H, W)
    x_cols = im2col(x_split, pool_height, pool_width, padding=0, stride=stride)
    x_cols_argmax = np.argmax(x_cols, axis=0)
    x_cols_max = x_cols[x_cols_argmax, np.arange(x_cols.shape[1])]
    out = x_cols_max.reshape(out_height, out_width, N, C).transpose(2, 3, 0, 1)

    cache = (x, x_cols, x_cols_argmax, pool_param)
    return out, cache


def max_pool_backward_im2col(dout, cache):
    """
    An implementation of the backward pass for max pooling based on im2col.

    This isn't much faster than the naive version, so it should be avoided if
    possible.
    """
    x, x_cols, x_cols_argmax, pool_param = cache
    N, C, H, W = x.shape
    pool_height, pool_width = pool_param["pool_height"], pool_param["pool_width"]
    stride = pool_param["stride"]

    dout_reshaped = dout.transpose(2, 3, 0, 1).flatten()
    dx_cols = np.zeros_like(x_cols)
    dx_cols[x_cols_argmax, np.arange(dx_cols.shape[1])] = dout_reshaped
    dx = col2im_indices(
        dx_cols, (N * C, 1, H, W), pool_height, pool_width, padding=0, stride=stride
    )
    dx = dx.reshape(x.shape)

    return dx
@ -0,0 +1,133 @@
 | 
			
		||||
from __future__ import print_function
 | 
			
		||||
from builtins import range
 | 
			
		||||
from past.builtins import xrange
 | 
			
		||||
 | 
			
		||||
import numpy as np
 | 
			
		||||
from random import randrange
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def eval_numerical_gradient(f, x, verbose=True, h=0.00001):
 | 
			
		||||
    """
 | 
			
		||||
    a naive implementation of numerical gradient of f at x
 | 
			
		||||
    - f should be a function that takes a single argument
 | 
			
		||||
    - x is the point (numpy array) to evaluate the gradient at
 | 
			
		||||
    """
 | 
			
		||||
 | 
			
		||||
    fx = f(x)  # evaluate function value at original point
 | 
			
		||||
    grad = np.zeros_like(x)
 | 
			
		||||
    # iterate over all indexes in x
 | 
			
		||||
    it = np.nditer(x, flags=["multi_index"], op_flags=["readwrite"])
 | 
			
		||||
    while not it.finished:
 | 
			
		||||
 | 
			
		||||
        # evaluate function at x+h
 | 
			
		||||
        ix = it.multi_index
 | 
			
		||||
        oldval = x[ix]
 | 
			
		||||
        x[ix] = oldval + h  # increment by h
 | 
			
		||||
        fxph = f(x)  # evalute f(x + h)
 | 
			
		||||
        x[ix] = oldval - h
 | 
			
		||||
        fxmh = f(x)  # evaluate f(x - h)
 | 
			
		||||
        x[ix] = oldval  # restore
 | 
			
		||||
 | 
			
		||||
        # compute the partial derivative with centered formula
 | 
			
		||||
        grad[ix] = (fxph - fxmh) / (2 * h)  # the slope
 | 
			
		||||
        if verbose:
 | 
			
		||||
            print(ix, grad[ix])
 | 
			
		||||
        it.iternext()  # step to next dimension
 | 
			
		||||
 | 
			
		||||
    return grad
 | 
			
		||||
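# Illustrative usage sketch added editorially (the _demo_* helper is hypothetical, not
# part of the original assignment code): check the gradient of f(x) = sum(x ** 2),
# whose analytic gradient is 2 * x.
def _demo_eval_numerical_gradient():
    x = np.random.randn(3, 4)
    grad = eval_numerical_gradient(lambda z: np.sum(z ** 2), x, verbose=False)
    # The numerical estimate should agree with the analytic gradient up to ~1e-7.
    return np.max(np.abs(grad - 2 * x))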
 | 
			
		||||
 | 
			
		||||
def eval_numerical_gradient_array(f, x, df, h=1e-5):
 | 
			
		||||
    """
 | 
			
		||||
    Evaluate a numeric gradient for a function that accepts a numpy
 | 
			
		||||
    array and returns a numpy array.
 | 
			
		||||
    """
 | 
			
		||||
    grad = np.zeros_like(x)
 | 
			
		||||
    it = np.nditer(x, flags=["multi_index"], op_flags=["readwrite"])
 | 
			
		||||
    while not it.finished:
 | 
			
		||||
        ix = it.multi_index
 | 
			
		||||
 | 
			
		||||
        oldval = x[ix]
 | 
			
		||||
        x[ix] = oldval + h
 | 
			
		||||
        pos = f(x).copy()
 | 
			
		||||
        x[ix] = oldval - h
 | 
			
		||||
        neg = f(x).copy()
 | 
			
		||||
        x[ix] = oldval
 | 
			
		||||
 | 
			
		||||
        grad[ix] = np.sum((pos - neg) * df) / (2 * h)
 | 
			
		||||
        it.iternext()
 | 
			
		||||
    return grad
 | 
			
		||||
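# Illustrative usage sketch added editorially (hypothetical helper, not original code):
# for the elementwise map y = 3 * x, the backpropagated gradient dL/dx equals 3 * df
# for any upstream gradient df, and the numerical estimate should reproduce that.
def _demo_eval_numerical_gradient_array():
    x = np.random.randn(4, 5)
    df = np.random.randn(4, 5)
    grad = eval_numerical_gradient_array(lambda z: 3 * z, x, df)
    return np.max(np.abs(grad - 3 * df))  # expected to be close to zero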
 | 
			
		||||
 | 
			
		||||
def eval_numerical_gradient_blobs(f, inputs, output, h=1e-5):
 | 
			
		||||
    """
 | 
			
		||||
    Compute numeric gradients for a function that operates on input
 | 
			
		||||
    and output blobs.
 | 
			
		||||
 | 
			
		||||
    We assume that f accepts several input blobs as arguments, followed by a
 | 
			
		||||
    blob where outputs will be written. For example, f might be called like:
 | 
			
		||||
 | 
			
		||||
    f(x, w, out)
 | 
			
		||||
 | 
			
		||||
    where x and w are input Blobs, and the result of f will be written to out.
 | 
			
		||||
 | 
			
		||||
    Inputs:
 | 
			
		||||
    - f: function
 | 
			
		||||
    - inputs: tuple of input blobs
 | 
			
		||||
    - output: output blob
 | 
			
		||||
    - h: step size
 | 
			
		||||
    """
 | 
			
		||||
    numeric_diffs = []
 | 
			
		||||
    for input_blob in inputs:
 | 
			
		||||
        diff = np.zeros_like(input_blob.diffs)
 | 
			
		||||
        it = np.nditer(input_blob.vals, flags=["multi_index"], op_flags=["readwrite"])
 | 
			
		||||
        while not it.finished:
 | 
			
		||||
            idx = it.multi_index
 | 
			
		||||
            orig = input_blob.vals[idx]
 | 
			
		||||
 | 
			
		||||
            input_blob.vals[idx] = orig + h
 | 
			
		||||
            f(*(inputs + (output,)))
 | 
			
		||||
            pos = np.copy(output.vals)
 | 
			
		||||
            input_blob.vals[idx] = orig - h
 | 
			
		||||
            f(*(inputs + (output,)))
 | 
			
		||||
            neg = np.copy(output.vals)
 | 
			
		||||
            input_blob.vals[idx] = orig
 | 
			
		||||
 | 
			
		||||
            diff[idx] = np.sum((pos - neg) * output.diffs) / (2.0 * h)
 | 
			
		||||
 | 
			
		||||
            it.iternext()
 | 
			
		||||
        numeric_diffs.append(diff)
 | 
			
		||||
    return numeric_diffs
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def eval_numerical_gradient_net(net, inputs, output, h=1e-5):
 | 
			
		||||
    return eval_numerical_gradient_blobs(
 | 
			
		||||
        lambda *args: net.forward(), inputs, output, h=h
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def grad_check_sparse(f, x, analytic_grad, num_checks=10, h=1e-5):
 | 
			
		||||
    """
 | 
			
		||||
    sample a few random elements and only return the numerical gradient
 | 
			
		||||
    in these dimensions.
 | 
			
		||||
    """
 | 
			
		||||
 | 
			
		||||
    for i in range(num_checks):
 | 
			
		||||
        ix = tuple([randrange(m) for m in x.shape])
 | 
			
		||||
 | 
			
		||||
        oldval = x[ix]
 | 
			
		||||
        x[ix] = oldval + h  # increment by h
 | 
			
		||||
        fxph = f(x)  # evaluate f(x + h)
 | 
			
		||||
        x[ix] = oldval - h  # decrement by h
 | 
			
		||||
        fxmh = f(x)  # evaluate f(x - h)
 | 
			
		||||
        x[ix] = oldval  # reset
 | 
			
		||||
 | 
			
		||||
        grad_numerical = (fxph - fxmh) / (2 * h)
 | 
			
		||||
        grad_analytic = analytic_grad[ix]
 | 
			
		||||
        rel_error = abs(grad_numerical - grad_analytic) / (
 | 
			
		||||
            abs(grad_numerical) + abs(grad_analytic)
 | 
			
		||||
        )
 | 
			
		||||
        print(
 | 
			
		||||
            "numerical: %f analytic: %f, relative error: %e"
 | 
			
		||||
            % (grad_numerical, grad_analytic, rel_error)
 | 
			
		||||
        )
 | 
			
		||||
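# Illustrative usage sketch added editorially (hypothetical helper, not original code):
# spot-check the analytic gradient of f(x) = sum(x ** 3), which is 3 * x ** 2, at a few
# randomly chosen coordinates.
def _demo_grad_check_sparse():
    x = np.random.randn(10, 10)
    grad_check_sparse(lambda z: np.sum(z ** 3), x, 3 * x ** 2, num_checks=5)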
@ -0,0 +1,58 @@
 | 
			
		||||
from builtins import range
 | 
			
		||||
import numpy as np
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def get_im2col_indices(x_shape, field_height, field_width, padding=1, stride=1):
 | 
			
		||||
    # First figure out what the size of the output should be
 | 
			
		||||
    N, C, H, W = x_shape
 | 
			
		||||
    assert (H + 2 * padding - field_height) % stride == 0
 | 
			
		||||
    assert (W + 2 * padding - field_width) % stride == 0
 | 
			
		||||
    out_height = (H + 2 * padding - field_height) // stride + 1
 | 
			
		||||
    out_width = (W + 2 * padding - field_width) // stride + 1
 | 
			
		||||
 | 
			
		||||
    i0 = np.repeat(np.arange(field_height), field_width)
 | 
			
		||||
    i0 = np.tile(i0, C)
 | 
			
		||||
    i1 = stride * np.repeat(np.arange(out_height), out_width)
 | 
			
		||||
    j0 = np.tile(np.arange(field_width), field_height * C)
 | 
			
		||||
    j1 = stride * np.tile(np.arange(out_width), out_height)
 | 
			
		||||
    i = i0.reshape(-1, 1) + i1.reshape(1, -1)
 | 
			
		||||
    j = j0.reshape(-1, 1) + j1.reshape(1, -1)
 | 
			
		||||
 | 
			
		||||
    k = np.repeat(np.arange(C), field_height * field_width).reshape(-1, 1)
 | 
			
		||||
 | 
			
		||||
    return (k, i, j)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def im2col_indices(x, field_height, field_width, padding=1, stride=1):
 | 
			
		||||
    """ An implementation of im2col based on some fancy indexing """
 | 
			
		||||
    # Zero-pad the input
 | 
			
		||||
    p = padding
 | 
			
		||||
    x_padded = np.pad(x, ((0, 0), (0, 0), (p, p), (p, p)), mode="constant")
 | 
			
		||||
 | 
			
		||||
    k, i, j = get_im2col_indices(x.shape, field_height, field_width, padding, stride)
 | 
			
		||||
 | 
			
		||||
    cols = x_padded[:, k, i, j]
 | 
			
		||||
    C = x.shape[1]
 | 
			
		||||
    cols = cols.transpose(1, 2, 0).reshape(field_height * field_width * C, -1)
 | 
			
		||||
    return cols
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def col2im_indices(cols, x_shape, field_height=3, field_width=3, padding=1, stride=1):
 | 
			
		||||
    """ An implementation of col2im based on fancy indexing and np.add.at """
 | 
			
		||||
    N, C, H, W = x_shape
 | 
			
		||||
    H_padded, W_padded = H + 2 * padding, W + 2 * padding
 | 
			
		||||
    x_padded = np.zeros((N, C, H_padded, W_padded), dtype=cols.dtype)
 | 
			
		||||
    k, i, j = get_im2col_indices(x_shape, field_height, field_width, padding, stride)
 | 
			
		||||
    cols_reshaped = cols.reshape(C * field_height * field_width, -1, N)
 | 
			
		||||
    cols_reshaped = cols_reshaped.transpose(2, 0, 1)
 | 
			
		||||
    np.add.at(x_padded, (slice(None), k, i, j), cols_reshaped)
 | 
			
		||||
    if padding == 0:
 | 
			
		||||
        return x_padded
 | 
			
		||||
    return x_padded[:, :, padding:-padding, padding:-padding]
 | 
			
		||||
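# Illustrative shape check added editorially (hypothetical helper, not original code).
# Note that col2im_indices sums over overlapping patches, so it is the adjoint of
# im2col_indices rather than its exact inverse.
def _demo_im2col_shapes():
    x = np.random.randn(2, 3, 8, 8)
    cols = im2col_indices(x, field_height=3, field_width=3, padding=1, stride=1)
    # 3 * 3 * 3 = 27 rows; one column per receptive-field location per image.
    assert cols.shape == (27, 2 * 8 * 8)
    dx = col2im_indices(cols, x.shape, field_height=3, field_width=3, padding=1, stride=1)
    assert dx.shape == x.shape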
 | 
			
		||||
 | 
			
		||||
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
 | 
			
		||||
 | 
			
		||||
pass
 | 
			
		||||
 | 
			
		||||
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
 | 
			
		||||
@ -0,0 +1,121 @@
 | 
			
		||||
import numpy as np
 | 
			
		||||
cimport numpy as np
 | 
			
		||||
cimport cython
 | 
			
		||||
 | 
			
		||||
# DTYPE = np.float64
 | 
			
		||||
# ctypedef np.float64_t DTYPE_t
 | 
			
		||||
 | 
			
		||||
ctypedef fused DTYPE_t:
 | 
			
		||||
    np.float32_t
 | 
			
		||||
    np.float64_t
 | 
			
		||||
 | 
			
		||||
def im2col_cython(np.ndarray[DTYPE_t, ndim=4] x, int field_height,
 | 
			
		||||
                  int field_width, int padding, int stride):
 | 
			
		||||
    cdef int N = x.shape[0]
 | 
			
		||||
    cdef int C = x.shape[1]
 | 
			
		||||
    cdef int H = x.shape[2]
 | 
			
		||||
    cdef int W = x.shape[3]
 | 
			
		||||
    
 | 
			
		||||
    cdef int HH = (H + 2 * padding - field_height) / stride + 1
 | 
			
		||||
    cdef int WW = (W + 2 * padding - field_width) / stride + 1
 | 
			
		||||
 | 
			
		||||
    cdef int p = padding
 | 
			
		||||
    cdef np.ndarray[DTYPE_t, ndim=4] x_padded = np.pad(x,
 | 
			
		||||
            ((0, 0), (0, 0), (p, p), (p, p)), mode='constant')
 | 
			
		||||
 | 
			
		||||
    cdef np.ndarray[DTYPE_t, ndim=2] cols = np.zeros(
 | 
			
		||||
            (C * field_height * field_width, N * HH * WW),
 | 
			
		||||
            dtype=x.dtype)
 | 
			
		||||
 | 
			
		||||
    # Moving the inner loop to a C function with no bounds checking works, but does
 | 
			
		||||
    # not seem to help performance in any measurable way.
 | 
			
		||||
 | 
			
		||||
    im2col_cython_inner(cols, x_padded, N, C, H, W, HH, WW,
 | 
			
		||||
                        field_height, field_width, padding, stride)
 | 
			
		||||
    return cols
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@cython.boundscheck(False)
 | 
			
		||||
cdef int im2col_cython_inner(np.ndarray[DTYPE_t, ndim=2] cols,
 | 
			
		||||
                             np.ndarray[DTYPE_t, ndim=4] x_padded,
 | 
			
		||||
                             int N, int C, int H, int W, int HH, int WW,
 | 
			
		||||
                             int field_height, int field_width, int padding, int stride) except? -1:
 | 
			
		||||
    cdef int c, ii, jj, row, yy, xx, i, col
 | 
			
		||||
 | 
			
		||||
    for c in range(C):
 | 
			
		||||
        for yy in range(HH):
 | 
			
		||||
            for xx in range(WW):
 | 
			
		||||
                for ii in range(field_height):
 | 
			
		||||
                    for jj in range(field_width):
 | 
			
		||||
                        row = c * field_width * field_height + ii * field_height + jj
 | 
			
		||||
                        for i in range(N):
 | 
			
		||||
                            col = yy * WW * N + xx * N + i
 | 
			
		||||
                            cols[row, col] = x_padded[i, c, stride * yy + ii, stride * xx + jj]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def col2im_cython(np.ndarray[DTYPE_t, ndim=2] cols, int N, int C, int H, int W,
 | 
			
		||||
                  int field_height, int field_width, int padding, int stride):
 | 
			
		||||
    cdef np.ndarray x = np.empty((N, C, H, W), dtype=cols.dtype)
 | 
			
		||||
    cdef int HH = (H + 2 * padding - field_height) / stride + 1
 | 
			
		||||
    cdef int WW = (W + 2 * padding - field_width) / stride + 1
 | 
			
		||||
    cdef np.ndarray[DTYPE_t, ndim=4] x_padded = np.zeros((N, C, H + 2 * padding, W + 2 * padding),
 | 
			
		||||
                                        dtype=cols.dtype)
 | 
			
		||||
 | 
			
		||||
    # Moving the inner loop to a C-function with no bounds checking improves
 | 
			
		||||
    # performance quite a bit for col2im.
 | 
			
		||||
    col2im_cython_inner(cols, x_padded, N, C, H, W, HH, WW, 
 | 
			
		||||
                        field_height, field_width, padding, stride)
 | 
			
		||||
    if padding > 0:
 | 
			
		||||
        return x_padded[:, :, padding:-padding, padding:-padding]
 | 
			
		||||
    return x_padded
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@cython.boundscheck(False)
 | 
			
		||||
cdef int col2im_cython_inner(np.ndarray[DTYPE_t, ndim=2] cols,
 | 
			
		||||
                             np.ndarray[DTYPE_t, ndim=4] x_padded,
 | 
			
		||||
                             int N, int C, int H, int W, int HH, int WW,
 | 
			
		||||
                             int field_height, int field_width, int padding, int stride) except? -1:
 | 
			
		||||
    cdef int c, ii, jj, row, yy, xx, i, col
 | 
			
		||||
 | 
			
		||||
    for c in range(C):
 | 
			
		||||
        for ii in range(field_height):
 | 
			
		||||
            for jj in range(field_width):
 | 
			
		||||
                row = c * field_width * field_height + ii * field_height + jj
 | 
			
		||||
                for yy in range(HH):
 | 
			
		||||
                    for xx in range(WW):
 | 
			
		||||
                        for i in range(N):
 | 
			
		||||
                            col = yy * WW * N + xx * N + i
 | 
			
		||||
                            x_padded[i, c, stride * yy + ii, stride * xx + jj] += cols[row, col]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@cython.boundscheck(False)
 | 
			
		||||
@cython.wraparound(False)
 | 
			
		||||
cdef col2im_6d_cython_inner(np.ndarray[DTYPE_t, ndim=6] cols,
 | 
			
		||||
                            np.ndarray[DTYPE_t, ndim=4] x_padded,
 | 
			
		||||
                            int N, int C, int H, int W, int HH, int WW,
 | 
			
		||||
                            int out_h, int out_w, int pad, int stride):
 | 
			
		||||
 | 
			
		||||
    cdef int c, hh, ww, n, h, w
 | 
			
		||||
    for n in range(N):
 | 
			
		||||
        for c in range(C):
 | 
			
		||||
            for hh in range(HH):
 | 
			
		||||
                for ww in range(WW):
 | 
			
		||||
                    for h in range(out_h):
 | 
			
		||||
                        for w in range(out_w):
 | 
			
		||||
                            x_padded[n, c, stride * h + hh, stride * w + ww] += cols[c, hh, ww, n, h, w]
 | 
			
		||||
    
 | 
			
		||||
 | 
			
		||||
def col2im_6d_cython(np.ndarray[DTYPE_t, ndim=6] cols, int N, int C, int H, int W,
 | 
			
		||||
        int HH, int WW, int pad, int stride):
 | 
			
		||||
    cdef np.ndarray x = np.empty((N, C, H, W), dtype=cols.dtype)
 | 
			
		||||
    cdef int out_h = (H + 2 * pad - HH) / stride + 1
 | 
			
		||||
    cdef int out_w = (W + 2 * pad - WW) / stride + 1
 | 
			
		||||
    cdef np.ndarray[DTYPE_t, ndim=4] x_padded = np.zeros((N, C, H + 2 * pad, W + 2 * pad),
 | 
			
		||||
                                                  dtype=cols.dtype)
 | 
			
		||||
 | 
			
		||||
    col2im_6d_cython_inner(cols, x_padded, N, C, H, W, HH, WW, out_h, out_w, pad, stride)
 | 
			
		||||
 | 
			
		||||
    if pad > 0:
 | 
			
		||||
        return x_padded[:, :, pad:-pad, pad:-pad]
 | 
			
		||||
    return x_padded 
 | 
			
		||||
@ -0,0 +1,110 @@
 | 
			
		||||
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
 | 
			
		||||
 | 
			
		||||
pass
 | 
			
		||||
 | 
			
		||||
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
 | 
			
		||||
from .layers import *
 | 
			
		||||
from .fast_layers import *
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def affine_relu_forward(x, w, b):
 | 
			
		||||
    """
 | 
			
		||||
    Convenience layer that performs an affine transform followed by a ReLU
 | 
			
		||||
 | 
			
		||||
    Inputs:
 | 
			
		||||
    - x: Input to the affine layer
 | 
			
		||||
    - w, b: Weights for the affine layer
 | 
			
		||||
 | 
			
		||||
    Returns a tuple of:
 | 
			
		||||
    - out: Output from the ReLU
 | 
			
		||||
    - cache: Object to give to the backward pass
 | 
			
		||||
    """
 | 
			
		||||
    a, fc_cache = affine_forward(x, w, b)
 | 
			
		||||
    out, relu_cache = relu_forward(a)
 | 
			
		||||
    cache = (fc_cache, relu_cache)
 | 
			
		||||
    return out, cache
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def affine_relu_backward(dout, cache):
 | 
			
		||||
    """
 | 
			
		||||
    Backward pass for the affine-relu convenience layer
 | 
			
		||||
    """
 | 
			
		||||
    fc_cache, relu_cache = cache
 | 
			
		||||
    da = relu_backward(dout, relu_cache)
 | 
			
		||||
    dx, dw, db = affine_backward(da, fc_cache)
 | 
			
		||||
    return dx, dw, db
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def conv_relu_forward(x, w, b, conv_param):
 | 
			
		||||
    """
 | 
			
		||||
    A convenience layer that performs a convolution followed by a ReLU.
 | 
			
		||||
 | 
			
		||||
    Inputs:
 | 
			
		||||
    - x: Input to the convolutional layer
 | 
			
		||||
    - w, b, conv_param: Weights and parameters for the convolutional layer
 | 
			
		||||
 | 
			
		||||
    Returns a tuple of:
 | 
			
		||||
    - out: Output from the ReLU
 | 
			
		||||
    - cache: Object to give to the backward pass
 | 
			
		||||
    """
 | 
			
		||||
    a, conv_cache = conv_forward_fast(x, w, b, conv_param)
 | 
			
		||||
    out, relu_cache = relu_forward(a)
 | 
			
		||||
    cache = (conv_cache, relu_cache)
 | 
			
		||||
    return out, cache
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def conv_relu_backward(dout, cache):
 | 
			
		||||
    """
 | 
			
		||||
    Backward pass for the conv-relu convenience layer.
 | 
			
		||||
    """
 | 
			
		||||
    conv_cache, relu_cache = cache
 | 
			
		||||
    da = relu_backward(dout, relu_cache)
 | 
			
		||||
    dx, dw, db = conv_backward_fast(da, conv_cache)
 | 
			
		||||
    return dx, dw, db
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def conv_bn_relu_forward(x, w, b, gamma, beta, conv_param, bn_param):
 | 
			
		||||
    a, conv_cache = conv_forward_fast(x, w, b, conv_param)
 | 
			
		||||
    an, bn_cache = spatial_batchnorm_forward(a, gamma, beta, bn_param)
 | 
			
		||||
    out, relu_cache = relu_forward(an)
 | 
			
		||||
    cache = (conv_cache, bn_cache, relu_cache)
 | 
			
		||||
    return out, cache
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def conv_bn_relu_backward(dout, cache):
 | 
			
		||||
    conv_cache, bn_cache, relu_cache = cache
 | 
			
		||||
    dan = relu_backward(dout, relu_cache)
 | 
			
		||||
    da, dgamma, dbeta = spatial_batchnorm_backward(dan, bn_cache)
 | 
			
		||||
    dx, dw, db = conv_backward_fast(da, conv_cache)
 | 
			
		||||
    return dx, dw, db, dgamma, dbeta
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def conv_relu_pool_forward(x, w, b, conv_param, pool_param):
 | 
			
		||||
    """
 | 
			
		||||
    Convenience layer that performs a convolution, a ReLU, and a pool.
 | 
			
		||||
 | 
			
		||||
    Inputs:
 | 
			
		||||
    - x: Input to the convolutional layer
 | 
			
		||||
    - w, b, conv_param: Weights and parameters for the convolutional layer
 | 
			
		||||
    - pool_param: Parameters for the pooling layer
 | 
			
		||||
 | 
			
		||||
    Returns a tuple of:
 | 
			
		||||
    - out: Output from the pooling layer
 | 
			
		||||
    - cache: Object to give to the backward pass
 | 
			
		||||
    """
 | 
			
		||||
    a, conv_cache = conv_forward_fast(x, w, b, conv_param)
 | 
			
		||||
    s, relu_cache = relu_forward(a)
 | 
			
		||||
    out, pool_cache = max_pool_forward_fast(s, pool_param)
 | 
			
		||||
    cache = (conv_cache, relu_cache, pool_cache)
 | 
			
		||||
    return out, cache
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def conv_relu_pool_backward(dout, cache):
 | 
			
		||||
    """
 | 
			
		||||
    Backward pass for the conv-relu-pool convenience layer
 | 
			
		||||
    """
 | 
			
		||||
    conv_cache, relu_cache, pool_cache = cache
 | 
			
		||||
    ds = max_pool_backward_fast(dout, pool_cache)
 | 
			
		||||
    da = relu_backward(ds, relu_cache)
 | 
			
		||||
    dx, dw, db = conv_backward_fast(da, conv_cache)
 | 
			
		||||
    return dx, dw, db
 | 
			
		||||
@ -0,0 +1,696 @@
 | 
			
		||||
from builtins import range
 | 
			
		||||
import numpy as np
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def affine_forward(x, w, b):
 | 
			
		||||
    """
 | 
			
		||||
    Computes the forward pass for an affine (fully-connected) layer.
 | 
			
		||||
 | 
			
		||||
    The input x has shape (N, d_1, ..., d_k) and contains a minibatch of N
 | 
			
		||||
    examples, where each example x[i] has shape (d_1, ..., d_k). We will
 | 
			
		||||
    reshape each input into a vector of dimension D = d_1 * ... * d_k, and
 | 
			
		||||
    then transform it to an output vector of dimension M.
 | 
			
		||||
 | 
			
		||||
    Inputs:
 | 
			
		||||
    - x: A numpy array containing input data, of shape (N, d_1, ..., d_k)
 | 
			
		||||
    - w: A numpy array of weights, of shape (D, M)
 | 
			
		||||
    - b: A numpy array of biases, of shape (M,)
 | 
			
		||||
 | 
			
		||||
    Returns a tuple of:
 | 
			
		||||
    - out: output, of shape (N, M)
 | 
			
		||||
    - cache: (x, w, b)
 | 
			
		||||
    """
 | 
			
		||||
    out = None
 | 
			
		||||
    ###########################################################################
 | 
			
		||||
    # TODO: Implement the affine forward pass. Store the result in out. You   #
 | 
			
		||||
    # will need to reshape the input into rows.                               #
 | 
			
		||||
    ###########################################################################
 | 
			
		||||
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
 | 
			
		||||
 | 
			
		||||
    pass
 | 
			
		||||
 | 
			
		||||
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
 | 
			
		||||
    ###########################################################################
 | 
			
		||||
    #                             END OF YOUR CODE                            #
 | 
			
		||||
    ###########################################################################
 | 
			
		||||
    cache = (x, w, b)
 | 
			
		||||
    return out, cache
 | 
			
		||||
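# Editorial sketch of the recipe the docstring describes (reshape each example into a
# row, then apply one matrix multiply). This hypothetical helper is an illustration,
# not the graded solution and not meant to replace the TODO block above.
def _affine_forward_sketch(x, w, b):
    x_rows = x.reshape(x.shape[0], -1)  # (N, D) with D = d_1 * ... * d_k
    return x_rows.dot(w) + b            # (N, M)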
 | 
			
		||||
 | 
			
		||||
def affine_backward(dout, cache):
 | 
			
		||||
    """
 | 
			
		||||
    Computes the backward pass for an affine layer.
 | 
			
		||||
 | 
			
		||||
    Inputs:
 | 
			
		||||
    - dout: Upstream derivative, of shape (N, M)
 | 
			
		||||
    - cache: Tuple of:
 | 
			
		||||
      - x: Input data, of shape (N, d_1, ... d_k)
 | 
			
		||||
      - w: Weights, of shape (D, M)
 | 
			
		||||
      - b: Biases, of shape (M,)
 | 
			
		||||
 | 
			
		||||
    Returns a tuple of:
 | 
			
		||||
    - dx: Gradient with respect to x, of shape (N, d1, ..., d_k)
 | 
			
		||||
    - dw: Gradient with respect to w, of shape (D, M)
 | 
			
		||||
    - db: Gradient with respect to b, of shape (M,)
 | 
			
		||||
    """
 | 
			
		||||
    x, w, b = cache
 | 
			
		||||
    dx, dw, db = None, None, None
 | 
			
		||||
    ###########################################################################
 | 
			
		||||
    # TODO: Implement the affine backward pass.                               #
 | 
			
		||||
    ###########################################################################
 | 
			
		||||
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
 | 
			
		||||
 | 
			
		||||
    pass
 | 
			
		||||
 | 
			
		||||
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
 | 
			
		||||
    ###########################################################################
 | 
			
		||||
    #                             END OF YOUR CODE                            #
 | 
			
		||||
    ###########################################################################
 | 
			
		||||
    return dx, dw, db
 | 
			
		||||
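# Editorial sketch of one way to write the gradients listed in the docstring above,
# assuming the row-major reshape used in the forward pass. Hypothetical helper;
# illustration only, not the graded solution.
def _affine_backward_sketch(dout, cache):
    x, w, b = cache
    x_rows = x.reshape(x.shape[0], -1)   # (N, D)
    dx = dout.dot(w.T).reshape(x.shape)  # back to (N, d_1, ..., d_k)
    dw = x_rows.T.dot(dout)              # (D, M)
    db = dout.sum(axis=0)                # (M,)
    return dx, dw, db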
 | 
			
		||||
 | 
			
		||||
def relu_forward(x):
 | 
			
		||||
    """
 | 
			
		||||
    Computes the forward pass for a layer of rectified linear units (ReLUs).
 | 
			
		||||
 | 
			
		||||
    Input:
 | 
			
		||||
    - x: Inputs, of any shape
 | 
			
		||||
 | 
			
		||||
    Returns a tuple of:
 | 
			
		||||
    - out: Output, of the same shape as x
 | 
			
		||||
    - cache: x
 | 
			
		||||
    """
 | 
			
		||||
    out = None
 | 
			
		||||
    ###########################################################################
 | 
			
		||||
    # TODO: Implement the ReLU forward pass.                                  #
 | 
			
		||||
    ###########################################################################
 | 
			
		||||
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
 | 
			
		||||
 | 
			
		||||
    pass
 | 
			
		||||
 | 
			
		||||
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
 | 
			
		||||
    ###########################################################################
 | 
			
		||||
    #                             END OF YOUR CODE                            #
 | 
			
		||||
    ###########################################################################
 | 
			
		||||
    cache = x
 | 
			
		||||
    return out, cache
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def relu_backward(dout, cache):
 | 
			
		||||
    """
 | 
			
		||||
    Computes the backward pass for a layer of rectified linear units (ReLUs).
 | 
			
		||||
 | 
			
		||||
    Input:
 | 
			
		||||
    - dout: Upstream derivatives, of any shape
 | 
			
		||||
    - cache: Input x, of same shape as dout
 | 
			
		||||
 | 
			
		||||
    Returns:
 | 
			
		||||
    - dx: Gradient with respect to x
 | 
			
		||||
    """
 | 
			
		||||
    dx, x = None, cache
 | 
			
		||||
    ###########################################################################
 | 
			
		||||
    # TODO: Implement the ReLU backward pass.                                 #
 | 
			
		||||
    ###########################################################################
 | 
			
		||||
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
 | 
			
		||||
 | 
			
		||||
    pass
 | 
			
		||||
 | 
			
		||||
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
 | 
			
		||||
    ###########################################################################
 | 
			
		||||
    #                             END OF YOUR CODE                            #
 | 
			
		||||
    ###########################################################################
 | 
			
		||||
    return dx
 | 
			
		||||
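# Editorial sketch of the ReLU pair described above: an elementwise max with zero on the
# forward pass and a mask-based backward pass. Hypothetical helper; illustration only.
def _relu_sketch(x, dout):
    out = np.maximum(0, x)   # forward: clamp negative activations to zero
    dx = dout * (x > 0)      # backward: pass gradient only where the input was positive
    return out, dx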
 | 
			
		||||
 | 
			
		||||
def batchnorm_forward(x, gamma, beta, bn_param):
 | 
			
		||||
    """
 | 
			
		||||
    Forward pass for batch normalization.
 | 
			
		||||
 | 
			
		||||
    During training the sample mean and (uncorrected) sample variance are
 | 
			
		||||
    computed from minibatch statistics and used to normalize the incoming data.
 | 
			
		||||
    During training we also keep an exponentially decaying running mean of the
 | 
			
		||||
    mean and variance of each feature, and these averages are used to normalize
 | 
			
		||||
    data at test-time.
 | 
			
		||||
 | 
			
		||||
    At each timestep we update the running averages for mean and variance using
 | 
			
		||||
    an exponential decay based on the momentum parameter:
 | 
			
		||||
 | 
			
		||||
    running_mean = momentum * running_mean + (1 - momentum) * sample_mean
 | 
			
		||||
    running_var = momentum * running_var + (1 - momentum) * sample_var
 | 
			
		||||
 | 
			
		||||
    Note that the batch normalization paper suggests a different test-time
 | 
			
		||||
    behavior: they compute sample mean and variance for each feature using a
 | 
			
		||||
    large number of training images rather than using a running average. For
 | 
			
		||||
    this implementation we have chosen to use running averages instead since
 | 
			
		||||
    they do not require an additional estimation step; the torch7
 | 
			
		||||
    implementation of batch normalization also uses running averages.
 | 
			
		||||
 | 
			
		||||
    Input:
 | 
			
		||||
    - x: Data of shape (N, D)
 | 
			
		||||
    - gamma: Scale parameter of shape (D,)
 | 
			
		||||
    - beta: Shift parameter of shape (D,)
 | 
			
		||||
    - bn_param: Dictionary with the following keys:
 | 
			
		||||
      - mode: 'train' or 'test'; required
 | 
			
		||||
      - eps: Constant for numeric stability
 | 
			
		||||
      - momentum: Constant for running mean / variance.
 | 
			
		||||
      - running_mean: Array of shape (D,) giving running mean of features
 | 
			
		||||
      - running_var: Array of shape (D,) giving running variance of features
 | 
			
		||||
 | 
			
		||||
    Returns a tuple of:
 | 
			
		||||
    - out: of shape (N, D)
 | 
			
		||||
    - cache: A tuple of values needed in the backward pass
 | 
			
		||||
    """
 | 
			
		||||
    mode = bn_param["mode"]
 | 
			
		||||
    eps = bn_param.get("eps", 1e-5)
 | 
			
		||||
    momentum = bn_param.get("momentum", 0.9)
 | 
			
		||||
 | 
			
		||||
    N, D = x.shape
 | 
			
		||||
    running_mean = bn_param.get("running_mean", np.zeros(D, dtype=x.dtype))
 | 
			
		||||
    running_var = bn_param.get("running_var", np.zeros(D, dtype=x.dtype))
 | 
			
		||||
 | 
			
		||||
    out, cache = None, None
 | 
			
		||||
    if mode == "train":
 | 
			
		||||
        #######################################################################
 | 
			
		||||
        # TODO: Implement the training-time forward pass for batch norm.      #
 | 
			
		||||
        # Use minibatch statistics to compute the mean and variance, use      #
 | 
			
		||||
        # these statistics to normalize the incoming data, and scale and      #
 | 
			
		||||
        # shift the normalized data using gamma and beta.                     #
 | 
			
		||||
        #                                                                     #
 | 
			
		||||
        # You should store the output in the variable out. Any intermediates  #
 | 
			
		||||
        # that you need for the backward pass should be stored in the cache   #
 | 
			
		||||
        # variable.                                                           #
 | 
			
		||||
        #                                                                     #
 | 
			
		||||
        # You should also use your computed sample mean and variance together #
 | 
			
		||||
        # with the momentum variable to update the running mean and running   #
 | 
			
		||||
        # variance, storing your result in the running_mean and running_var   #
 | 
			
		||||
        # variables.                                                          #
 | 
			
		||||
        #                                                                     #
 | 
			
		||||
        # Note that though you should be keeping track of the running         #
 | 
			
		||||
        # variance, you should normalize the data based on the standard       #
 | 
			
		||||
        # deviation (square root of variance) instead!                        #
 | 
			
		||||
        # Referencing the original paper (https://arxiv.org/abs/1502.03167)   #
 | 
			
		||||
        # might prove to be helpful.                                          #
 | 
			
		||||
        #######################################################################
 | 
			
		||||
        # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
 | 
			
		||||
 | 
			
		||||
        pass
 | 
			
		||||
 | 
			
		||||
        # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
 | 
			
		||||
        #######################################################################
 | 
			
		||||
        #                           END OF YOUR CODE                          #
 | 
			
		||||
        #######################################################################
 | 
			
		||||
    elif mode == "test":
 | 
			
		||||
        #######################################################################
 | 
			
		||||
        # TODO: Implement the test-time forward pass for batch normalization. #
 | 
			
		||||
        # Use the running mean and variance to normalize the incoming data,   #
 | 
			
		||||
        # then scale and shift the normalized data using gamma and beta.      #
 | 
			
		||||
        # Store the result in the out variable.                               #
 | 
			
		||||
        #######################################################################
 | 
			
		||||
        # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
 | 
			
		||||
 | 
			
		||||
        pass
 | 
			
		||||
 | 
			
		||||
        # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
 | 
			
		||||
        #######################################################################
 | 
			
		||||
        #                          END OF YOUR CODE                           #
 | 
			
		||||
        #######################################################################
 | 
			
		||||
    else:
 | 
			
		||||
        raise ValueError('Invalid forward batchnorm mode "%s"' % mode)
 | 
			
		||||
 | 
			
		||||
    # Store the updated running means back into bn_param
 | 
			
		||||
    bn_param["running_mean"] = running_mean
 | 
			
		||||
    bn_param["running_var"] = running_var
 | 
			
		||||
 | 
			
		||||
    return out, cache
 | 
			
		||||
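# Editorial sketch of the train-time recipe spelled out in the comments above: minibatch
# mean/variance, normalization by the standard deviation, scale and shift with gamma and
# beta, and the exponential running-average update. Hypothetical helper; illustration
# only, not the graded solution.
def _batchnorm_train_sketch(x, gamma, beta, running_mean, running_var, momentum=0.9, eps=1e-5):
    sample_mean = x.mean(axis=0)
    sample_var = x.var(axis=0)
    x_hat = (x - sample_mean) / np.sqrt(sample_var + eps)
    out = gamma * x_hat + beta
    running_mean = momentum * running_mean + (1 - momentum) * sample_mean
    running_var = momentum * running_var + (1 - momentum) * sample_var
    return out, running_mean, running_var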
 | 
			
		||||
 | 
			
		||||
def batchnorm_backward(dout, cache):
 | 
			
		||||
    """
 | 
			
		||||
    Backward pass for batch normalization.
 | 
			
		||||
 | 
			
		||||
    For this implementation, you should write out a computation graph for
 | 
			
		||||
    batch normalization on paper and propagate gradients backward through
 | 
			
		||||
    intermediate nodes.
 | 
			
		||||
 | 
			
		||||
    Inputs:
 | 
			
		||||
    - dout: Upstream derivatives, of shape (N, D)
 | 
			
		||||
    - cache: Variable of intermediates from batchnorm_forward.
 | 
			
		||||
 | 
			
		||||
    Returns a tuple of:
 | 
			
		||||
    - dx: Gradient with respect to inputs x, of shape (N, D)
 | 
			
		||||
    - dgamma: Gradient with respect to scale parameter gamma, of shape (D,)
 | 
			
		||||
    - dbeta: Gradient with respect to shift parameter beta, of shape (D,)
 | 
			
		||||
    """
 | 
			
		||||
    dx, dgamma, dbeta = None, None, None
 | 
			
		||||
    ###########################################################################
 | 
			
		||||
    # TODO: Implement the backward pass for batch normalization. Store the    #
 | 
			
		||||
    # results in the dx, dgamma, and dbeta variables.                         #
 | 
			
		||||
    # Referencing the original paper (https://arxiv.org/abs/1502.03167)       #
 | 
			
		||||
    # might prove to be helpful.                                              #
 | 
			
		||||
    ###########################################################################
 | 
			
		||||
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
 | 
			
		||||
 | 
			
		||||
    pass
 | 
			
		||||
 | 
			
		||||
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
 | 
			
		||||
    ###########################################################################
 | 
			
		||||
    #                             END OF YOUR CODE                            #
 | 
			
		||||
    ###########################################################################
 | 
			
		||||
 | 
			
		||||
    return dx, dgamma, dbeta
 | 
			
		||||
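# Editorial sketch of one common closed form that the computation-graph derivation above
# arrives at. It assumes a cache of (x_hat, gamma, std) from the forward pass; that
# layout is hypothetical, so treat the helper as an illustration only.
def _batchnorm_backward_sketch(dout, x_hat, gamma, std):
    dgamma = np.sum(dout * x_hat, axis=0)
    dbeta = np.sum(dout, axis=0)
    dx_hat = dout * gamma
    dx = (dx_hat - dx_hat.mean(axis=0) - x_hat * np.mean(dx_hat * x_hat, axis=0)) / std
    return dx, dgamma, dbeta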
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def dropout_forward(x, dropout_param):
 | 
			
		||||
    """
 | 
			
		||||
    Performs the forward pass for (inverted) dropout.
 | 
			
		||||
 | 
			
		||||
    Inputs:
 | 
			
		||||
    - x: Input data, of any shape
 | 
			
		||||
    - dropout_param: A dictionary with the following keys:
 | 
			
		||||
      - p: Dropout parameter. We keep each neuron output with probability p.
 | 
			
		||||
      - mode: 'test' or 'train'. If the mode is train, then perform dropout;
 | 
			
		||||
        if the mode is test, then just return the input.
 | 
			
		||||
      - seed: Seed for the random number generator. Passing seed makes this
 | 
			
		||||
        function deterministic, which is needed for gradient checking but not
 | 
			
		||||
        in real networks.
 | 
			
		||||
 | 
			
		||||
    Outputs:
 | 
			
		||||
    - out: Array of the same shape as x.
 | 
			
		||||
    - cache: tuple (dropout_param, mask). In training mode, mask is the dropout
 | 
			
		||||
      mask that was used to multiply the input; in test mode, mask is None.
 | 
			
		||||
 | 
			
		||||
    NOTE: Please implement **inverted** dropout, not the vanilla version of dropout.
 | 
			
		||||
    See http://cs231n.github.io/neural-networks-2/#reg for more details.
 | 
			
		||||
 | 
			
		||||
    NOTE 2: Keep in mind that p is the probability of **keeping** a neuron
 | 
			
		||||
    output; this might be contrary to some sources, where it is referred to
 | 
			
		||||
    as the probability of dropping a neuron output.
 | 
			
		||||
    """
 | 
			
		||||
    p, mode = dropout_param["p"], dropout_param["mode"]
 | 
			
		||||
    if "seed" in dropout_param:
 | 
			
		||||
        np.random.seed(dropout_param["seed"])
 | 
			
		||||
 | 
			
		||||
    mask = None
 | 
			
		||||
    out = None
 | 
			
		||||
 | 
			
		||||
    if mode == "train":
 | 
			
		||||
        #######################################################################
 | 
			
		||||
        # TODO: Implement training phase forward pass for inverted dropout.   #
 | 
			
		||||
        # Store the dropout mask in the mask variable.                        #
 | 
			
		||||
        #######################################################################
 | 
			
		||||
        # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
 | 
			
		||||
 | 
			
		||||
        pass
 | 
			
		||||
 | 
			
		||||
        # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
 | 
			
		||||
        #######################################################################
 | 
			
		||||
        #                           END OF YOUR CODE                          #
 | 
			
		||||
        #######################################################################
 | 
			
		||||
    elif mode == "test":
 | 
			
		||||
        #######################################################################
 | 
			
		||||
        # TODO: Implement the test phase forward pass for inverted dropout.   #
 | 
			
		||||
        #######################################################################
 | 
			
		||||
        # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
 | 
			
		||||
 | 
			
		||||
        pass
 | 
			
		||||
 | 
			
		||||
        # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
 | 
			
		||||
        #######################################################################
 | 
			
		||||
        #                            END OF YOUR CODE                         #
 | 
			
		||||
        #######################################################################
 | 
			
		||||
 | 
			
		||||
    cache = (dropout_param, mask)
 | 
			
		||||
    out = out.astype(x.dtype, copy=False)
 | 
			
		||||
 | 
			
		||||
    return out, cache
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def dropout_backward(dout, cache):
 | 
			
		||||
    """
 | 
			
		||||
    Perform the backward pass for (inverted) dropout.
 | 
			
		||||
 | 
			
		||||
    Inputs:
 | 
			
		||||
    - dout: Upstream derivatives, of any shape
 | 
			
		||||
    - cache: (dropout_param, mask) from dropout_forward.
 | 
			
		||||
    """
 | 
			
		||||
    dropout_param, mask = cache
 | 
			
		||||
    mode = dropout_param["mode"]
 | 
			
		||||
 | 
			
		||||
    dx = None
 | 
			
		||||
    if mode == "train":
 | 
			
		||||
        #######################################################################
 | 
			
		||||
        # TODO: Implement training phase backward pass for inverted dropout   #
 | 
			
		||||
        #######################################################################
 | 
			
		||||
        # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
 | 
			
		||||
 | 
			
		||||
        pass
 | 
			
		||||
 | 
			
		||||
        # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
 | 
			
		||||
        #######################################################################
 | 
			
		||||
        #                          END OF YOUR CODE                           #
 | 
			
		||||
        #######################################################################
 | 
			
		||||
    elif mode == "test":
 | 
			
		||||
        dx = dout
 | 
			
		||||
    return dx
 | 
			
		||||
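# Editorial sketch of inverted dropout as described in the docstrings above: the mask is
# divided by p at train time so no rescaling is needed at test time, and the backward
# pass reuses the same mask. Hypothetical helper; illustration only.
def _inverted_dropout_sketch(x, dout, p):
    mask = (np.random.rand(*x.shape) < p) / p  # keep with probability p, then rescale
    out = x * mask                             # train-time forward
    dx = dout * mask                           # train-time backward
    return out, dx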
 | 
			
		||||
 | 
			
		||||
def conv_forward_naive(x, w, b, conv_param):
 | 
			
		||||
    """
 | 
			
		||||
    A naive implementation of the forward pass for a convolutional layer.
 | 
			
		||||
 | 
			
		||||
    The input consists of N data points, each with C channels, height H and
 | 
			
		||||
    width W. We convolve each input with F different filters, where each filter
 | 
			
		||||
    spans all C channels and has height HH and width WW.
 | 
			
		||||
 | 
			
		||||
    Input:
 | 
			
		||||
    - x: Input data of shape (N, C, H, W)
 | 
			
		||||
    - w: Filter weights of shape (F, C, HH, WW)
 | 
			
		||||
    - b: Biases, of shape (F,)
 | 
			
		||||
    - conv_param: A dictionary with the following keys:
 | 
			
		||||
      - 'stride': The number of pixels between adjacent receptive fields in the
 | 
			
		||||
        horizontal and vertical directions.
 | 
			
		||||
      - 'pad': The number of pixels that will be used to zero-pad the input. 
 | 
			
		||||
        
 | 
			
		||||
 | 
			
		||||
    During padding, 'pad' zeros should be placed symmetrically (i.e. equally on both sides)
 | 
			
		||||
    along the height and width axes of the input. Be careful not to modify the original
 | 
			
		||||
    input x directly.
 | 
			
		||||
 | 
			
		||||
    Returns a tuple of:
 | 
			
		||||
    - out: Output data, of shape (N, F, H', W') where H' and W' are given by
 | 
			
		||||
      H' = 1 + (H + 2 * pad - HH) / stride
 | 
			
		||||
      W' = 1 + (W + 2 * pad - WW) / stride
 | 
			
		||||
    - cache: (x, w, b, conv_param)
 | 
			
		||||
    """
 | 
			
		||||
    out = None
 | 
			
		||||
    ###########################################################################
 | 
			
		||||
    # TODO: Implement the convolutional forward pass.                         #
 | 
			
		||||
    # Hint: you can use the function np.pad for padding.                      #
 | 
			
		||||
    ###########################################################################
 | 
			
		||||
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
 | 
			
		||||
 | 
			
		||||
    pass
 | 
			
		||||
 | 
			
		||||
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
 | 
			
		||||
    ###########################################################################
 | 
			
		||||
    #                             END OF YOUR CODE                            #
 | 
			
		||||
    ###########################################################################
 | 
			
		||||
    cache = (x, w, b, conv_param)
 | 
			
		||||
    return out, cache
 | 
			
		||||
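# Editorial sketch of the naive convolution the docstring describes: explicit loops over
# images, filters, and output positions, with symmetric zero padding. Hypothetical
# helper; illustration only, not tuned for speed and not the graded solution.
def _conv_forward_sketch(x, w, b, conv_param):
    N, C, H, W = x.shape
    F, _, HH, WW = w.shape
    stride, pad = conv_param["stride"], conv_param["pad"]
    H_out = 1 + (H + 2 * pad - HH) // stride
    W_out = 1 + (W + 2 * pad - WW) // stride
    x_pad = np.pad(x, ((0, 0), (0, 0), (pad, pad), (pad, pad)), mode="constant")
    out = np.zeros((N, F, H_out, W_out), dtype=x.dtype)
    for n in range(N):
        for f in range(F):
            for i in range(H_out):
                for j in range(W_out):
                    window = x_pad[n, :, i * stride:i * stride + HH, j * stride:j * stride + WW]
                    out[n, f, i, j] = np.sum(window * w[f]) + b[f]
    return out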
 | 
			
		||||
 | 
			
		||||
def conv_backward_naive(dout, cache):
 | 
			
		||||
    """
 | 
			
		||||
    A naive implementation of the backward pass for a convolutional layer.
 | 
			
		||||
 | 
			
		||||
    Inputs:
 | 
			
		||||
    - dout: Upstream derivatives.
 | 
			
		||||
    - cache: A tuple of (x, w, b, conv_param) as in conv_forward_naive
 | 
			
		||||
 | 
			
		||||
    Returns a tuple of:
 | 
			
		||||
    - dx: Gradient with respect to x
 | 
			
		||||
    - dw: Gradient with respect to w
 | 
			
		||||
    - db: Gradient with respect to b
 | 
			
		||||
    """
 | 
			
		||||
    dx, dw, db = None, None, None
 | 
			
		||||
    ###########################################################################
 | 
			
		||||
    # TODO: Implement the convolutional backward pass.                        #
 | 
			
		||||
    ###########################################################################
 | 
			
		||||
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
 | 
			
		||||
 | 
			
		||||
    pass
 | 
			
		||||
 | 
			
		||||
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
 | 
			
		||||
    ###########################################################################
 | 
			
		||||
    #                             END OF YOUR CODE                            #
 | 
			
		||||
    ###########################################################################
 | 
			
		||||
    return dx, dw, db
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def max_pool_forward_naive(x, pool_param):
 | 
			
		||||
    """
 | 
			
		||||
    A naive implementation of the forward pass for a max-pooling layer.
 | 
			
		||||
 | 
			
		||||
    Inputs:
 | 
			
		||||
    - x: Input data, of shape (N, C, H, W)
 | 
			
		||||
    - pool_param: dictionary with the following keys:
 | 
			
		||||
      - 'pool_height': The height of each pooling region
 | 
			
		||||
      - 'pool_width': The width of each pooling region
 | 
			
		||||
      - 'stride': The distance between adjacent pooling regions
 | 
			
		||||
 | 
			
		||||
    No padding is necessary here. The output size is given below.
 | 
			
		||||
 | 
			
		||||
    Returns a tuple of:
 | 
			
		||||
    - out: Output data, of shape (N, C, H', W') where H' and W' are given by
 | 
			
		||||
      H' = 1 + (H - pool_height) / stride
 | 
			
		||||
      W' = 1 + (W - pool_width) / stride
 | 
			
		||||
    - cache: (x, pool_param)
 | 
			
		||||
    """
 | 
			
		||||
    out = None
 | 
			
		||||
    ###########################################################################
 | 
			
		||||
    # TODO: Implement the max-pooling forward pass                            #
 | 
			
		||||
    ###########################################################################
 | 
			
		||||
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
 | 
			
		||||
 | 
			
		||||
    pass
 | 
			
		||||
 | 
			
		||||
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
 | 
			
		||||
    ###########################################################################
 | 
			
		||||
    #                             END OF YOUR CODE                            #
 | 
			
		||||
    ###########################################################################
 | 
			
		||||
    cache = (x, pool_param)
 | 
			
		||||
    return out, cache
 | 
			
		||||
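# Editorial sketch of the naive max-pooling forward pass described above: loop over the
# output grid and take the maximum of each window across the spatial axes. Hypothetical
# helper; illustration only.
def _max_pool_forward_sketch(x, pool_param):
    N, C, H, W = x.shape
    ph, pw = pool_param["pool_height"], pool_param["pool_width"]
    stride = pool_param["stride"]
    H_out = 1 + (H - ph) // stride
    W_out = 1 + (W - pw) // stride
    out = np.zeros((N, C, H_out, W_out), dtype=x.dtype)
    for i in range(H_out):
        for j in range(W_out):
            window = x[:, :, i * stride:i * stride + ph, j * stride:j * stride + pw]
            out[:, :, i, j] = window.max(axis=(2, 3))
    return out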
 | 
			
		||||
 | 
			
		||||
def max_pool_backward_naive(dout, cache):
 | 
			
		||||
    """
 | 
			
		||||
    A naive implementation of the backward pass for a max-pooling layer.
 | 
			
		||||
 | 
			
		||||
    Inputs:
 | 
			
		||||
    - dout: Upstream derivatives
 | 
			
		||||
    - cache: A tuple of (x, pool_param) as in the forward pass.
 | 
			
		||||
 | 
			
		||||
    Returns:
 | 
			
		||||
    - dx: Gradient with respect to x
 | 
			
		||||
    """
 | 
			
		||||
    dx = None
 | 
			
		||||
    ###########################################################################
 | 
			
		||||
    # TODO: Implement the max-pooling backward pass                           #
 | 
			
		||||
    ###########################################################################
 | 
			
		||||
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
 | 
			
		||||
 | 
			
		||||
    pass
 | 
			
		||||
 | 
			
		||||
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
 | 
			
		||||
    ###########################################################################
 | 
			
		||||
    #                             END OF YOUR CODE                            #
 | 
			
		||||
    ###########################################################################
 | 
			
		||||
    return dx
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def spatial_batchnorm_forward(x, gamma, beta, bn_param):
 | 
			
		||||
    """
 | 
			
		||||
    Computes the forward pass for spatial batch normalization.
 | 
			
		||||
 | 
			
		||||
    Inputs:
 | 
			
		||||
    - x: Input data of shape (N, C, H, W)
 | 
			
		||||
    - gamma: Scale parameter, of shape (C,)
 | 
			
		||||
    - beta: Shift parameter, of shape (C,)
 | 
			
		||||
    - bn_param: Dictionary with the following keys:
 | 
			
		||||
      - mode: 'train' or 'test'; required
 | 
			
		||||
      - eps: Constant for numeric stability
 | 
			
		||||
      - momentum: Constant for running mean / variance. momentum=0 means that
 | 
			
		||||
        old information is discarded completely at every time step, while
 | 
			
		||||
        momentum=1 means that new information is never incorporated. The
 | 
			
		||||
        default of momentum=0.9 should work well in most situations.
 | 
			
		||||
      - running_mean: Array of shape (C,) giving running mean of features
 | 
			
		||||
      - running_var: Array of shape (C,) giving running variance of features
 | 
			
		||||
 | 
			
		||||
    Returns a tuple of:
 | 
			
		||||
    - out: Output data, of shape (N, C, H, W)
 | 
			
		||||
    - cache: Values needed for the backward pass
 | 
			
		||||
    """
 | 
			
		||||
    out, cache = None, None
 | 
			
		||||
 | 
			
		||||
    ###########################################################################
 | 
			
		||||
    # TODO: Implement the forward pass for spatial batch normalization.       #
 | 
			
		||||
    #                                                                         #
 | 
			
		||||
    # HINT: You can implement spatial batch normalization by calling the      #
 | 
			
		||||
    # vanilla version of batch normalization you implemented above.           #
 | 
			
		||||
    # Your implementation should be very short; ours is less than five lines. #
 | 
			
		||||
    ###########################################################################
 | 
			
		||||
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
 | 
			
		||||
 | 
			
		||||
    pass
 | 
			
		||||
 | 
			
		||||
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
 | 
			
		||||
    ###########################################################################
 | 
			
		||||
    #                             END OF YOUR CODE                            #
 | 
			
		||||
    ###########################################################################
 | 
			
		||||
 | 
			
		||||
    return out, cache
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def spatial_batchnorm_backward(dout, cache):
 | 
			
		||||
    """
 | 
			
		||||
    Computes the backward pass for spatial batch normalization.
 | 
			
		||||
 | 
			
		||||
    Inputs:
 | 
			
		||||
    - dout: Upstream derivatives, of shape (N, C, H, W)
 | 
			
		||||
    - cache: Values from the forward pass
 | 
			
		||||
 | 
			
		||||
    Returns a tuple of:
 | 
			
		||||
    - dx: Gradient with respect to inputs, of shape (N, C, H, W)
 | 
			
		||||
    - dgamma: Gradient with respect to scale parameter, of shape (C,)
 | 
			
		||||
    - dbeta: Gradient with respect to shift parameter, of shape (C,)
 | 
			
		||||
    """
 | 
			
		||||
    dx, dgamma, dbeta = None, None, None
 | 
			
		||||
 | 
			
		||||
    ###########################################################################
 | 
			
		||||
    # TODO: Implement the backward pass for spatial batch normalization.      #
 | 
			
		||||
    #                                                                         #
 | 
			
		||||
    # HINT: You can implement spatial batch normalization by calling the      #
 | 
			
		||||
    # vanilla version of batch normalization you implemented above.           #
 | 
			
		||||
    # Your implementation should be very short; ours is less than five lines. #
 | 
			
		||||
    ###########################################################################
 | 
			
		||||
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
 | 
			
		||||
 | 
			
		||||
    pass
 | 
			
		||||
 | 
			
		||||
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
 | 
			
		||||
    ###########################################################################
 | 
			
		||||
    #                             END OF YOUR CODE                            #
 | 
			
		||||
    ###########################################################################
 | 
			
		||||
 | 
			
		||||
    return dx, dgamma, dbeta
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def spatial_groupnorm_forward(x, gamma, beta, G, gn_param):
 | 
			
		||||
    """
 | 
			
		||||
    Computes the forward pass for spatial group normalization.
 | 
			
		||||
    In contrast to layer normalization, group normalization splits each entry 
 | 
			
		||||
    in the data into G contiguous pieces, which it then normalizes independently.
 | 
			
		||||
    Per feature shifting and scaling are then applied to the data, in a manner identical to that of batch normalization and layer normalization.
 | 
			
		||||
 | 
			
		||||
    Inputs:
 | 
			
		||||
    - x: Input data of shape (N, C, H, W)
 | 
			
		||||
    - gamma: Scale parameter, of shape (C,)
 | 
			
		||||
    - beta: Shift parameter, of shape (C,)
 | 
			
		||||
    - G: Integer number of groups to split into, should be a divisor of C
 | 
			
		||||
    - gn_param: Dictionary with the following keys:
 | 
			
		||||
      - eps: Constant for numeric stability
 | 
			
		||||
 | 
			
		||||
    Returns a tuple of:
 | 
			
		||||
    - out: Output data, of shape (N, C, H, W)
 | 
			
		||||
    - cache: Values needed for the backward pass
 | 
			
		||||
    """
 | 
			
		||||
    out, cache = None, None
 | 
			
		||||
    eps = gn_param.get("eps", 1e-5)
 | 
			
		||||
    ###########################################################################
 | 
			
		||||
    # TODO: Implement the forward pass for spatial group normalization.       #
 | 
			
		||||
    # This will be extremely similar to the layer norm implementation.        #
 | 
			
		||||
    # In particular, think about how you could transform the matrix so that   #
 | 
			
		||||
    # the bulk of the code is similar to both train-time batch normalization  #
 | 
			
		||||
    # and layer normalization!                                                #
 | 
			
		||||
    ###########################################################################
 | 
			
		||||
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
 | 
			
		||||
 | 
			
		||||
    pass
 | 
			
		||||
 | 
			
		||||
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
 | 
			
		||||
    ###########################################################################
 | 
			
		||||
    #                             END OF YOUR CODE                            #
 | 
			
		||||
    ###########################################################################
 | 
			
		||||
    return out, cache
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def spatial_groupnorm_backward(dout, cache):
 | 
			
		||||
    """
 | 
			
		||||
    Computes the backward pass for spatial group normalization.
 | 
			
		||||
 | 
			
		||||
    Inputs:
 | 
			
		||||
    - dout: Upstream derivatives, of shape (N, C, H, W)
 | 
			
		||||
    - cache: Values from the forward pass
 | 
			
		||||
 | 
			
		||||
    Returns a tuple of:
 | 
			
		||||
    - dx: Gradient with respect to inputs, of shape (N, C, H, W)
 | 
			
		||||
    - dgamma: Gradient with respect to scale parameter, of shape (C,)
 | 
			
		||||
    - dbeta: Gradient with respect to shift parameter, of shape (C,)
 | 
			
		||||
    """
 | 
			
		||||
    dx, dgamma, dbeta = None, None, None
 | 
			
		||||
 | 
			
		||||
    ###########################################################################
 | 
			
		||||
    # TODO: Implement the backward pass for spatial group normalization.      #
 | 
			
		||||
    # This will be extremely similar to the layer norm implementation.        #
 | 
			
		||||
    ###########################################################################
 | 
			
		||||
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
 | 
			
		||||
 | 
			
		||||
    pass
 | 
			
		||||
 | 
			
		||||
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
 | 
			
		||||
    ###########################################################################
 | 
			
		||||
    #                             END OF YOUR CODE                            #
 | 
			
		||||
    ###########################################################################
 | 
			
		||||
    return dx, dgamma, dbeta
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def svm_loss(x, y):
 | 
			
		||||
    """
 | 
			
		||||
    Computes the loss and gradient for multiclass SVM classification.
 | 
			
		||||
 | 
			
		||||
    Inputs:
 | 
			
		||||
    - x: Input data, of shape (N, C) where x[i, j] is the score for the jth
 | 
			
		||||
      class for the ith input.
 | 
			
		||||
    - y: Vector of labels, of shape (N,) where y[i] is the label for x[i] and
 | 
			
		||||
      0 <= y[i] < C
 | 
			
		||||
 | 
			
		||||
    Returns a tuple of:
 | 
			
		||||
    - loss: Scalar giving the loss
 | 
			
		||||
    - dx: Gradient of the loss with respect to x
 | 
			
		||||
    """
 | 
			
		||||
    N = x.shape[0]
 | 
			
		||||
    correct_class_scores = x[np.arange(N), y]
 | 
			
		||||
    margins = np.maximum(0, x - correct_class_scores[:, np.newaxis] + 1.0)
 | 
			
		||||
    margins[np.arange(N), y] = 0
 | 
			
		||||
    loss = np.sum(margins) / N
 | 
			
		||||
    num_pos = np.sum(margins > 0, axis=1)
 | 
			
		||||
    dx = np.zeros_like(x)
 | 
			
		||||
    dx[margins > 0] = 1
 | 
			
		||||
    dx[np.arange(N), y] -= num_pos
 | 
			
		||||
    dx /= N
 | 
			
		||||
    return loss, dx
 | 
			
		||||


def softmax_loss(x, y):
    """
    Computes the loss and gradient for softmax classification.

    Inputs:
    - x: Input data, of shape (N, C) where x[i, j] is the score for the jth
      class for the ith input.
    - y: Vector of labels, of shape (N,) where y[i] is the label for x[i] and
      0 <= y[i] < C

    Returns a tuple of:
    - loss: Scalar giving the loss
    - dx: Gradient of the loss with respect to x
    """
    shifted_logits = x - np.max(x, axis=1, keepdims=True)
    Z = np.sum(np.exp(shifted_logits), axis=1, keepdims=True)
    log_probs = shifted_logits - np.log(Z)
    probs = np.exp(log_probs)
    N = x.shape[0]
    loss = -np.sum(log_probs[np.arange(N), y]) / N
    dx = probs.copy()
    dx[np.arange(N), y] -= 1
    dx /= N
    return loss, dx
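
# Example usage of the two loss functions above (shapes are illustrative,
# values arbitrary):
#     scores = np.random.randn(5, 10)            # N = 5 samples, C = 10 classes
#     labels = np.random.randint(10, size=5)
#     sm_loss, dscores = softmax_loss(scores, labels)
#     hinge_loss, dscores = svm_loss(scores, labels)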
@ -0,0 +1,162 @@
import numpy as np

"""
This file implements various first-order update rules that are commonly used
for training neural networks. Each update rule accepts current weights and the
gradient of the loss with respect to those weights and produces the next set of
weights. Each update rule has the same interface:

def update(w, dw, config=None):

Inputs:
  - w: A numpy array giving the current weights.
  - dw: A numpy array of the same shape as w giving the gradient of the
    loss with respect to w.
  - config: A dictionary containing hyperparameter values such as learning
    rate, momentum, etc. If the update rule requires caching values over many
    iterations, then config will also hold these cached values.

Returns:
  - next_w: The next point after the update.
  - config: The config dictionary to be passed to the next iteration of the
    update rule.

NOTE: For most update rules, the default learning rate will probably not
perform well; however, the default values of the other hyperparameters should
work well for a variety of different problems.

For efficiency, update rules may perform in-place updates, mutating w and
setting next_w equal to w.
"""


def sgd(w, dw, config=None):
    """
    Performs vanilla stochastic gradient descent.

    config format:
    - learning_rate: Scalar learning rate.
    """
    if config is None:
        config = {}
    config.setdefault("learning_rate", 1e-2)

    w -= config["learning_rate"] * dw
    return w, config
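
# Example of the shared update-rule interface described in the module docstring
# above (hypothetical shapes and values):
#     w = np.zeros((3, 4))
#     dw = np.ones_like(w)
#     w, config = sgd(w, dw, {"learning_rate": 1e-3})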


def sgd_momentum(w, dw, config=None):
    """
    Performs stochastic gradient descent with momentum.

    config format:
    - learning_rate: Scalar learning rate.
    - momentum: Scalar between 0 and 1 giving the momentum value.
      Setting momentum = 0 reduces to sgd.
    - velocity: A numpy array of the same shape as w and dw used to store a
      moving average of the gradients.
    """
    if config is None:
        config = {}
    config.setdefault("learning_rate", 1e-2)
    config.setdefault("momentum", 0.9)
    v = config.get("velocity", np.zeros_like(w))

    next_w = None
    ###########################################################################
    # TODO: Implement the momentum update formula. Store the updated value in #
    # the next_w variable. You should also use and update the velocity v.     #
    ###########################################################################
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    pass

    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    ###########################################################################
    #                             END OF YOUR CODE                            #
    ###########################################################################
    config["velocity"] = v

    return next_w, config
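

# Illustrative sketch only, not the graded solution; the helper name
# _sgd_momentum_sketch is an assumption. It shows the classic momentum update
# that the TODO above asks for.
def _sgd_momentum_sketch(w, dw, v, learning_rate, momentum):
    # Accumulate an exponentially decaying velocity, then step along it.
    v = momentum * v - learning_rate * dw
    next_w = w + v
    return next_w, v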


def rmsprop(w, dw, config=None):
    """
    Uses the RMSProp update rule, which uses a moving average of squared
    gradient values to set adaptive per-parameter learning rates.

    config format:
    - learning_rate: Scalar learning rate.
    - decay_rate: Scalar between 0 and 1 giving the decay rate for the squared
      gradient cache.
    - epsilon: Small scalar used for smoothing to avoid dividing by zero.
    - cache: Moving average of second moments of gradients.
    """
    if config is None:
        config = {}
    config.setdefault("learning_rate", 1e-2)
    config.setdefault("decay_rate", 0.99)
    config.setdefault("epsilon", 1e-8)
    config.setdefault("cache", np.zeros_like(w))

    next_w = None
    ###########################################################################
    # TODO: Implement the RMSprop update formula, storing the next value of w #
    # in the next_w variable. Don't forget to update cache value stored in    #
    # config['cache'].                                                        #
    ###########################################################################
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    pass

    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    ###########################################################################
    #                             END OF YOUR CODE                            #
    ###########################################################################

    return next_w, config
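

# Illustrative sketch only, not the graded solution; the helper name
# _rmsprop_sketch is an assumption. It shows the RMSProp update the TODO
# above asks for.
def _rmsprop_sketch(w, dw, cache, learning_rate, decay_rate, epsilon):
    # Keep a moving average of squared gradients and scale the step by it.
    cache = decay_rate * cache + (1 - decay_rate) * dw ** 2
    next_w = w - learning_rate * dw / (np.sqrt(cache) + epsilon)
    return next_w, cache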


def adam(w, dw, config=None):
    """
    Uses the Adam update rule, which incorporates moving averages of both the
    gradient and its square and a bias correction term.

    config format:
    - learning_rate: Scalar learning rate.
    - beta1: Decay rate for moving average of first moment of gradient.
    - beta2: Decay rate for moving average of second moment of gradient.
    - epsilon: Small scalar used for smoothing to avoid dividing by zero.
    - m: Moving average of gradient.
    - v: Moving average of squared gradient.
    - t: Iteration number.
    """
    if config is None:
        config = {}
    config.setdefault("learning_rate", 1e-3)
    config.setdefault("beta1", 0.9)
    config.setdefault("beta2", 0.999)
    config.setdefault("epsilon", 1e-8)
    config.setdefault("m", np.zeros_like(w))
    config.setdefault("v", np.zeros_like(w))
    config.setdefault("t", 0)

    next_w = None
    ###########################################################################
    # TODO: Implement the Adam update formula, storing the next value of w in #
    # the next_w variable. Don't forget to update the m, v, and t variables   #
    # stored in config.                                                       #
    #                                                                         #
    # NOTE: In order to match the reference output, please modify t _before_  #
    # using it in any calculations.                                           #
    ###########################################################################
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    pass

    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    ###########################################################################
    #                             END OF YOUR CODE                            #
    ###########################################################################

    return next_w, config
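

# Illustrative sketch only, not the graded solution; the helper name
# _adam_sketch is an assumption. It shows the bias-corrected Adam update the
# TODO above asks for, incrementing t before it is used.
def _adam_sketch(w, dw, m, v, t, learning_rate, beta1, beta2, epsilon):
    t += 1
    m = beta1 * m + (1 - beta1) * dw          # first-moment moving average
    v = beta2 * v + (1 - beta2) * dw ** 2     # second-moment moving average
    m_hat = m / (1 - beta1 ** t)              # bias correction
    v_hat = v / (1 - beta2 ** t)
    next_w = w - learning_rate * m_hat / (np.sqrt(v_hat) + epsilon)
    return next_w, m, v, t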
@ -0,0 +1,12 @@
from distutils.core import setup
from distutils.extension import Extension
from Cython.Build import cythonize
import numpy

extensions = [
    Extension(
        "im2col_cython", ["im2col_cython.pyx"], include_dirs=[numpy.get_include()]
    ),
]

setup(ext_modules=cythonize(extensions),)
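
# The extension is typically built in place before use, e.g.:
#     python setup.py build_ext --inplace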
@ -0,0 +1,309 @@
from __future__ import print_function, division
from future import standard_library

standard_library.install_aliases()
from builtins import range
from builtins import object
import os
import pickle as pickle

import numpy as np

from scripts import optim


class Solver(object):
    """
    A Solver encapsulates all the logic necessary for training classification
    models. The Solver performs stochastic gradient descent using different
    update rules defined in optim.py.

    The solver accepts both training and validation data and labels so it can
    periodically check classification accuracy on both training and validation
    data to watch out for overfitting.

    To train a model, you will first construct a Solver instance, passing the
    model, dataset, and various options (learning rate, batch size, etc.) to the
    constructor. You will then call the train() method to run the optimization
    procedure and train the model.

    After the train() method returns, model.params will contain the parameters
    that performed best on the validation set over the course of training.
    In addition, the instance variable solver.loss_history will contain a list
    of all losses encountered during training and the instance variables
    solver.train_acc_history and solver.val_acc_history will be lists of the
    accuracies of the model on the training and validation set at each epoch.

    Example usage might look something like this:

    data = {
      'X_train': # training data
      'y_train': # training labels
      'X_val': # validation data
      'y_val': # validation labels
    }
    model = MyAwesomeModel(hidden_size=100, reg=10)
    solver = Solver(model, data,
                    update_rule='sgd',
                    optim_config={
                      'learning_rate': 1e-3,
                    },
                    lr_decay=0.95,
                    num_epochs=10, batch_size=100,
                    print_every=100)
    solver.train()


    A Solver works on a model object that must conform to the following API:

    - model.params must be a dictionary mapping string parameter names to numpy
      arrays containing parameter values.

    - model.loss(X, y) must be a function that computes training-time loss and
      gradients, and test-time classification scores, with the following inputs
      and outputs:

      Inputs:
      - X: Array giving a minibatch of input data of shape (N, d_1, ..., d_k)
      - y: Array of labels, of shape (N,) giving labels for X where y[i] is the
        label for X[i].

      Returns:
      If y is None, run a test-time forward pass and return:
      - scores: Array of shape (N, C) giving classification scores for X where
        scores[i, c] gives the score of class c for X[i].

      If y is not None, run a training-time forward and backward pass and
      return a tuple of:
      - loss: Scalar giving the loss
      - grads: Dictionary with the same keys as self.params mapping parameter
        names to gradients of the loss with respect to those parameters.
    """

    def __init__(self, model, data, **kwargs):
        """
        Construct a new Solver instance.

        Required arguments:
        - model: A model object conforming to the API described above
        - data: A dictionary of training and validation data containing:
          'X_train': Array, shape (N_train, d_1, ..., d_k) of training images
          'X_val': Array, shape (N_val, d_1, ..., d_k) of validation images
          'y_train': Array, shape (N_train,) of labels for training images
          'y_val': Array, shape (N_val,) of labels for validation images

        Optional arguments:
        - update_rule: A string giving the name of an update rule in optim.py.
          Default is 'sgd'.
        - optim_config: A dictionary containing hyperparameters that will be
          passed to the chosen update rule. Each update rule requires different
          hyperparameters (see optim.py) but all update rules require a
          'learning_rate' parameter so that should always be present.
        - lr_decay: A scalar for learning rate decay; after each epoch the
          learning rate is multiplied by this value.
        - batch_size: Size of minibatches used to compute loss and gradient
          during training.
        - num_epochs: The number of epochs to run for during training.
        - print_every: Integer; training losses will be printed every
          print_every iterations.
        - verbose: Boolean; if set to false then no output will be printed
          during training.
        - num_train_samples: Number of training samples used to check training
          accuracy; default is 1000; set to None to use entire training set.
        - num_val_samples: Number of validation samples to use to check val
          accuracy; default is None, which uses the entire validation set.
        - checkpoint_name: If not None, then save model checkpoints here every
          epoch.
        """
        self.model = model
        self.X_train = data["X_train"]
        self.y_train = data["y_train"]
        self.X_val = data["X_val"]
        self.y_val = data["y_val"]

        # Unpack keyword arguments
        self.update_rule = kwargs.pop("update_rule", "sgd")
        self.optim_config = kwargs.pop("optim_config", {})
        self.lr_decay = kwargs.pop("lr_decay", 1.0)
        self.batch_size = kwargs.pop("batch_size", 100)
        self.num_epochs = kwargs.pop("num_epochs", 10)
        self.num_train_samples = kwargs.pop("num_train_samples", 1000)
        self.num_val_samples = kwargs.pop("num_val_samples", None)

        self.checkpoint_name = kwargs.pop("checkpoint_name", None)
        self.print_every = kwargs.pop("print_every", 10)
        self.verbose = kwargs.pop("verbose", True)

        # Throw an error if there are extra keyword arguments
        if len(kwargs) > 0:
            extra = ", ".join('"%s"' % k for k in list(kwargs.keys()))
            raise ValueError("Unrecognized arguments %s" % extra)

        # Make sure the update rule exists, then replace the string
        # name with the actual function
        if not hasattr(optim, self.update_rule):
            raise ValueError('Invalid update_rule "%s"' % self.update_rule)
        self.update_rule = getattr(optim, self.update_rule)

        self._reset()

    def _reset(self):
        """
        Set up some book-keeping variables for optimization. Don't call this
        manually.
        """
        # Set up some variables for book-keeping
        self.epoch = 0
        self.best_val_acc = 0
        self.best_params = {}
        self.loss_history = []
        self.train_acc_history = []
        self.val_acc_history = []

        # Make a deep copy of the optim_config for each parameter
        self.optim_configs = {}
        for p in self.model.params:
            d = {k: v for k, v in self.optim_config.items()}
            self.optim_configs[p] = d

    def _step(self):
        """
        Make a single gradient update. This is called by train() and should not
        be called manually.
        """
        # Make a minibatch of training data
        num_train = self.X_train.shape[0]
        batch_mask = np.random.choice(num_train, self.batch_size)
        X_batch = self.X_train[batch_mask]
        y_batch = self.y_train[batch_mask]

        # Compute loss and gradient
        loss, grads = self.model.loss(X_batch, y_batch)
        self.loss_history.append(loss)

        # Perform a parameter update
        for p, w in self.model.params.items():
            dw = grads[p]
            config = self.optim_configs[p]
            next_w, next_config = self.update_rule(w, dw, config)
            self.model.params[p] = next_w
            self.optim_configs[p] = next_config

    def _save_checkpoint(self):
        if self.checkpoint_name is None:
            return
        checkpoint = {
            "model": self.model,
            "update_rule": self.update_rule,
            "lr_decay": self.lr_decay,
            "optim_config": self.optim_config,
            "batch_size": self.batch_size,
            "num_train_samples": self.num_train_samples,
            "num_val_samples": self.num_val_samples,
            "epoch": self.epoch,
            "loss_history": self.loss_history,
            "train_acc_history": self.train_acc_history,
            "val_acc_history": self.val_acc_history,
        }
        filename = "%s_epoch_%d.pkl" % (self.checkpoint_name, self.epoch)
        if self.verbose:
            print('Saving checkpoint to "%s"' % filename)
        with open(filename, "wb") as f:
            pickle.dump(checkpoint, f)

    def check_accuracy(self, X, y, num_samples=None, batch_size=100):
        """
        Check accuracy of the model on the provided data.

        Inputs:
        - X: Array of data, of shape (N, d_1, ..., d_k)
        - y: Array of labels, of shape (N,)
        - num_samples: If not None, subsample the data and only test the model
          on num_samples datapoints.
        - batch_size: Split X and y into batches of this size to avoid using
          too much memory.

        Returns:
        - acc: Scalar giving the fraction of instances that were correctly
          classified by the model.
        """

        # Maybe subsample the data
        N = X.shape[0]
        if num_samples is not None and N > num_samples:
            mask = np.random.choice(N, num_samples)
            N = num_samples
            X = X[mask]
            y = y[mask]

        # Compute predictions in batches
        num_batches = N // batch_size
        if N % batch_size != 0:
            num_batches += 1
        y_pred = []
        for i in range(num_batches):
            start = i * batch_size
            end = (i + 1) * batch_size
            scores = self.model.loss(X[start:end])
            y_pred.append(np.argmax(scores, axis=1))
        y_pred = np.hstack(y_pred)
        acc = np.mean(y_pred == y)

        return acc

    def train(self):
        """
        Run optimization to train the model.
        """
        num_train = self.X_train.shape[0]
        iterations_per_epoch = max(num_train // self.batch_size, 1)
        num_iterations = self.num_epochs * iterations_per_epoch

        for t in range(num_iterations):
            self._step()

            # Maybe print training loss
            if self.verbose and t % self.print_every == 0:
                print(
                    "(Iteration %d / %d) loss: %f"
                    % (t + 1, num_iterations, self.loss_history[-1])
                )

            # At the end of every epoch, increment the epoch counter and decay
            # the learning rate.
            epoch_end = (t + 1) % iterations_per_epoch == 0
            if epoch_end:
                self.epoch += 1
                for k in self.optim_configs:
                    self.optim_configs[k]["learning_rate"] *= self.lr_decay

            # Check train and val accuracy on the first iteration, the last
            # iteration, and at the end of each epoch.
            first_it = t == 0
            last_it = t == num_iterations - 1
            if first_it or last_it or epoch_end:
                train_acc = self.check_accuracy(
                    self.X_train, self.y_train, num_samples=self.num_train_samples
                )
                val_acc = self.check_accuracy(
                    self.X_val, self.y_val, num_samples=self.num_val_samples
                )
                self.train_acc_history.append(train_acc)
                self.val_acc_history.append(val_acc)
                self._save_checkpoint()

                if self.verbose:
                    print(
                        "(Epoch %d / %d) train acc: %f; val_acc: %f"
                        % (self.epoch, self.num_epochs, train_acc, val_acc)
                    )

                # Keep track of the best model
                if val_acc > self.best_val_acc:
                    self.best_val_acc = val_acc
                    self.best_params = {}
                    for k, v in self.model.params.items():
                        self.best_params[k] = v.copy()

        # At the end of training swap the best params into the model
        self.model.params = self.best_params
@ -0,0 +1,78 @@
from builtins import range
from past.builtins import xrange

from math import sqrt, ceil
import numpy as np


def visualize_grid(Xs, ubound=255.0, padding=1):
    """
    Reshape a 4D tensor of image data to a grid for easy visualization.

    Inputs:
    - Xs: Data of shape (N, H, W, C)
    - ubound: Output grid will have values scaled to the range [0, ubound]
    - padding: The number of blank pixels between elements of the grid
    """
    (N, H, W, C) = Xs.shape
    grid_size = int(ceil(sqrt(N)))
    grid_height = H * grid_size + padding * (grid_size - 1)
    grid_width = W * grid_size + padding * (grid_size - 1)
    grid = np.zeros((grid_height, grid_width, C))
    next_idx = 0
    y0, y1 = 0, H
    for y in range(grid_size):
        x0, x1 = 0, W
        for x in range(grid_size):
            if next_idx < N:
                img = Xs[next_idx]
                low, high = np.min(img), np.max(img)
                grid[y0:y1, x0:x1] = ubound * (img - low) / (high - low)
                # grid[y0:y1, x0:x1] = Xs[next_idx]
                next_idx += 1
            x0 += W + padding
            x1 += W + padding
        y0 += H + padding
        y1 += H + padding
    # grid_max = np.max(grid)
    # grid_min = np.min(grid)
    # grid = ubound * (grid - grid_min) / (grid_max - grid_min)
    return grid
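
# Example usage (W1 and plt are hypothetical names here): visualize first-layer
# conv filters of shape (F, C, HH, WW) with matplotlib:
#     grid = visualize_grid(W1.transpose(0, 2, 3, 1), padding=3)
#     plt.imshow(grid.astype('uint8'))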


def vis_grid(Xs):
    """ visualize a grid of images """
    (N, H, W, C) = Xs.shape
    A = int(ceil(sqrt(N)))
    G = np.ones((A * H + A, A * W + A, C), Xs.dtype)
    G *= np.min(Xs)
    n = 0
    for y in range(A):
        for x in range(A):
            if n < N:
                G[y * H + y : (y + 1) * H + y, x * W + x : (x + 1) * W + x, :] = Xs[
                    n, :, :, :
                ]
                n += 1
    # normalize to [0,1]
    maxg = G.max()
    ming = G.min()
    G = (G - ming) / (maxg - ming)
    return G


def vis_nn(rows):
    """ visualize array of arrays of images """
    N = len(rows)
    D = len(rows[0])
    H, W, C = rows[0][0].shape
    Xs = rows[0][0]
    G = np.ones((N * H + N, D * W + D, C), Xs.dtype)
    for y in range(N):
        for x in range(D):
            G[y * H + y : (y + 1) * H + y, x * W + x : (x + 1) * W + x, :] = rows[y][x]
    # normalize to [0,1]
    maxg = G.max()
    ming = G.min()
    G = (G - ming) / (maxg - ming)
    return G