# mirror of https://github.com/da0c/DL_Course_SamU
import numpy as np

"""
This file implements various first-order update rules that are commonly used
for training neural networks. Each update rule accepts current weights and the
gradient of the loss with respect to those weights and produces the next set of
weights. Each update rule has the same interface:

def update(w, dw, config=None):

Inputs:
  - w: A numpy array giving the current weights.
  - dw: A numpy array of the same shape as w giving the gradient of the
    loss with respect to w.
  - config: A dictionary containing hyperparameter values such as learning
    rate, momentum, etc. If the update rule requires caching values over many
    iterations, then config will also hold these cached values.

Returns:
  - next_w: The next point after the update.
  - config: The config dictionary to be passed to the next iteration of the
    update rule.

NOTE: For most update rules, the default learning rate will probably not
perform well; however the default values of the other hyperparameters should
work well for a variety of different problems.

For efficiency, update rules may perform in-place updates, mutating w and
setting next_w equal to w.
"""

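# Typical usage of any update rule in this file (an illustrative sketch only;
# compute_gradient and num_iterations stand in for whatever the surrounding
# training code provides):
#
#     config = None
#     for _ in range(num_iterations):
#         dw = compute_gradient(w)
#         w, config = sgd(w, dw, config)   # or sgd_momentum / rmsprop / adam
#
# Passing the same config dict back in on every call is what lets rules with
# state (velocity, moment estimates, the iteration counter t) accumulate it.
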
def sgd(w, dw, config=None):
    """
    Performs vanilla stochastic gradient descent.

    config format:
    - learning_rate: Scalar learning rate.
    """
    if config is None:
        config = {}
    config.setdefault("learning_rate", 1e-2)

    w -= config["learning_rate"] * dw
    return w, config

def sgd_momentum(w, dw, config=None):
    """
    Performs stochastic gradient descent with momentum.

    config format:
    - learning_rate: Scalar learning rate.
    - momentum: Scalar between 0 and 1 giving the momentum value.
      Setting momentum = 0 reduces to sgd.
    - velocity: A numpy array of the same shape as w and dw used to store a
      moving average of the gradients.
    """
    if config is None:
        config = {}
    config.setdefault("learning_rate", 1e-2)
    config.setdefault("momentum", 0.9)
    v = config.get("velocity", np.zeros_like(w))

    next_w = None
    ###########################################################################
    # TODO: Implement the momentum update formula. Store the updated value in #
    # the next_w variable. You should also use and update the velocity v.     #
    ###########################################################################
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    pass

    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    ###########################################################################
    #                             END OF YOUR CODE                            #
    ###########################################################################
    config["velocity"] = v

    return next_w, config

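# A minimal sketch of the classical momentum update described in the TODO
# above, assuming the common formulation v <- mu * v - lr * dw, w <- w + v.
# The helper name below is illustrative, not part of the course interface;
# it keeps its state explicit instead of storing it in a config dict.
def _momentum_step_sketch(w, dw, velocity=None, learning_rate=1e-2, momentum=0.9):
    if velocity is None:
        velocity = np.zeros_like(w)
    # Blend the previous velocity with the current (negative) gradient step.
    velocity = momentum * velocity - learning_rate * dw
    # Move the weights along the accumulated velocity.
    next_w = w + velocity
    return next_w, velocity
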
def rmsprop(w, dw, config=None):
    """
    Uses the RMSProp update rule, which uses a moving average of squared
    gradient values to set adaptive per-parameter learning rates.

    config format:
    - learning_rate: Scalar learning rate.
    - decay_rate: Scalar between 0 and 1 giving the decay rate for the squared
      gradient cache.
    - epsilon: Small scalar used for smoothing to avoid dividing by zero.
    - cache: Moving average of second moments of gradients.
    """
    if config is None:
        config = {}
    config.setdefault("learning_rate", 1e-2)
    config.setdefault("decay_rate", 0.99)
    config.setdefault("epsilon", 1e-8)
    config.setdefault("cache", np.zeros_like(w))

    next_w = None
    ###########################################################################
    # TODO: Implement the RMSprop update formula, storing the next value of w #
    # in the next_w variable. Don't forget to update the cache value stored   #
    # in config['cache'].                                                     #
    ###########################################################################
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    pass

    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    ###########################################################################
    #                             END OF YOUR CODE                            #
    ###########################################################################

    return next_w, config

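# A minimal sketch of the RMSProp step described above, assuming the usual
# formulation: keep an exponential moving average of squared gradients and
# scale each step by its square root. The helper name is illustrative only.
def _rmsprop_step_sketch(w, dw, cache=None, learning_rate=1e-2,
                         decay_rate=0.99, epsilon=1e-8):
    if cache is None:
        cache = np.zeros_like(w)
    # Exponential moving average of squared gradients.
    cache = decay_rate * cache + (1 - decay_rate) * dw * dw
    # Per-parameter scaled step; epsilon guards against division by zero.
    next_w = w - learning_rate * dw / (np.sqrt(cache) + epsilon)
    return next_w, cache
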
def adam(w, dw, config=None):
    """
    Uses the Adam update rule, which incorporates moving averages of both the
    gradient and its square and a bias correction term.

    config format:
    - learning_rate: Scalar learning rate.
    - beta1: Decay rate for moving average of first moment of gradient.
    - beta2: Decay rate for moving average of second moment of gradient.
    - epsilon: Small scalar used for smoothing to avoid dividing by zero.
    - m: Moving average of gradient.
    - v: Moving average of squared gradient.
    - t: Iteration number.
    """
    if config is None:
        config = {}
    config.setdefault("learning_rate", 1e-3)
    config.setdefault("beta1", 0.9)
    config.setdefault("beta2", 0.999)
    config.setdefault("epsilon", 1e-8)
    config.setdefault("m", np.zeros_like(w))
    config.setdefault("v", np.zeros_like(w))
    config.setdefault("t", 0)

    next_w = None
    ###########################################################################
    # TODO: Implement the Adam update formula, storing the next value of w in #
    # the next_w variable. Don't forget to update the m, v, and t variables   #
    # stored in config.                                                       #
    #                                                                         #
    # NOTE: In order to match the reference output, please modify t _before_  #
    # using it in any calculations.                                           #
    ###########################################################################
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    pass

    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    ###########################################################################
    #                             END OF YOUR CODE                            #
    ###########################################################################

    return next_w, config

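# A minimal sketch of the Adam step described above, assuming the standard
# bias-corrected formulation, with t incremented before use as the NOTE in the
# TODO block requests. The helper name is illustrative only.
def _adam_step_sketch(w, dw, m=None, v=None, t=0, learning_rate=1e-3,
                      beta1=0.9, beta2=0.999, epsilon=1e-8):
    if m is None:
        m = np.zeros_like(w)
    if v is None:
        v = np.zeros_like(w)
    t += 1
    # Moving averages of the gradient and of its elementwise square.
    m = beta1 * m + (1 - beta1) * dw
    v = beta2 * v + (1 - beta2) * dw * dw
    # Bias-corrected estimates compensate for the zero initialization of m, v.
    m_hat = m / (1 - beta1 ** t)
    v_hat = v / (1 - beta2 ** t)
    next_w = w - learning_rate * m_hat / (np.sqrt(v_hat) + epsilon)
    return next_w, m, v, t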