"""
# -*- coding: utf-8 -*-
#
# Copyright 2021 Michael Büsch
#
# Licensed under the Apache License version 2.0
# or the MIT license, at your option.
# SPDX-License-Identifier: Apache-2.0 OR MIT
#
"""

__all__ = [
    "AlphaDecay",
    "AlphaDecaySimple",
    "AlphaDecayExp",
    "AlphaDecaySqrt",
    "Optimizer",
    "GradDescent",
    "Momentum",
    "RMSProp",
    "Adam",
]

from abc import ABC, abstractmethod
from mlplib.backward import BackpropGrads
from mlplib.parameters import Parameters
import numpy as np


def _step_number(epoch: int) -> int:
    """Return the one-based step number for a zero-based epoch.

    The bias correction terms ``1 - beta ** step`` and the square root
    decay both divide by a value that becomes zero (or invalid) when
    ``step`` is not positive, so the epoch is validated here with a real
    exception instead of an ``assert`` (asserts vanish under ``python -O``).

    Raises:
        ValueError: If ``epoch + 1`` is not greater than zero.
    """
    step = epoch + 1
    # "not step > 0" (rather than "step <= 0") also rejects NaN.
    if not step > 0:
        raise ValueError(f"epoch must be >= 0, got {epoch!r}")
    return step


class Optimizer(ABC):
    """Optimizer abstract base class.

    Holds the parameters to optimize and the learning rate ``alpha``.
    Subclasses implement :meth:`apply`.
    """

    def __init__(self, params: Parameters, alpha: float) -> None:
        self.params = params
        self.alpha = alpha

    @abstractmethod
    def apply(self, epoch: int, gradients: BackpropGrads) -> None:
        """Optimizer function.

        Adjust the weights and biases in ``self.params`` using the
        derivatives ``gradients.dw`` and ``gradients.db``.
        """


class GradDescent(Optimizer):
    """Simple Gradient Descent optimizer.
    """

    def apply(self, epoch: int, gradients: BackpropGrads) -> None:
        """Subtract ``alpha`` times the gradient from each weight and bias.

        ``epoch`` is not used by this optimizer. The update is done with
        ``-=``, i.e. in place for NumPy arrays.
        """
        alpha = self.alpha
        for ((w, b, *_), dw, db) in zip(self.params,
                                        gradients.dw,
                                        gradients.db):
            # Adjust weights and biases.
            w -= alpha * dw
            b -= alpha * db


class Momentum(Optimizer):
    """Momentum Gradient Descent optimizer.

    Keeps an exponentially weighted average (factor ``beta``) of the
    derivatives per layer and steps along the bias corrected average.
    """

    def __init__(self,
                 params: Parameters,
                 alpha: float = 0.01,
                 beta: float = 0.9) -> None:
        super().__init__(params, alpha)
        self.beta = beta
        # Per-layer running averages of dw and db, starting at zero.
        self.vdw = []
        self.vdb = []
        for (w, b, *_) in params:
            self.vdw.append(np.zeros(w.shape))
            self.vdb.append(np.zeros(b.shape))

    def apply(self, epoch: int, gradients: BackpropGrads) -> None:
        """Update the running averages and adjust weights and biases.

        Args:
            epoch: Zero-based epoch number; ``epoch + 1`` is the exponent
                used in the bias correction.
            gradients: Provides the per-layer derivatives ``dw`` and ``db``.

        Raises:
            ValueError: If ``epoch`` is negative.
        """
        step = _step_number(epoch)
        alpha = self.alpha
        beta = self.beta
        beta_inv = 1.0 - beta
        beta_powi_inv = 1.0 - (beta ** step)
        for (vdw, vdb, (w, b, *_), dw, db) in zip(self.vdw,
                                                  self.vdb,
                                                  self.params,
                                                  gradients.dw,
                                                  gradients.db):
            # Apply momentum to the derivatives.
            # The state arrays are updated in place, so they persist
            # across calls.
            vdw *= beta
            vdw += beta_inv * dw
            vdb *= beta
            vdb += beta_inv * db
            # Bias correction.
            cvdw = vdw / beta_powi_inv
            cvdb = vdb / beta_powi_inv
            # Adjust weights and biases.
            w -= alpha * cvdw
            b -= alpha * cvdb


class RMSProp(Optimizer):
    """RMSProp optimizer.

    Keeps an exponentially weighted average (factor ``beta``) of the
    squared derivatives per layer and divides each step by the square
    root of the bias corrected average.
    """

    def __init__(self,
                 params: Parameters,
                 alpha: float = 0.01,
                 beta: float = 0.9) -> None:
        super().__init__(params, alpha)
        self.beta = beta
        # Small constant added to the denominator to avoid division by zero.
        self.epsilon = np.finfo(np.float32).eps
        # Per-layer running averages of dw**2 and db**2, starting at zero.
        self.sdw = []
        self.sdb = []
        for (w, b, *_) in params:
            self.sdw.append(np.zeros(w.shape))
            self.sdb.append(np.zeros(b.shape))

    def apply(self, epoch: int, gradients: BackpropGrads) -> None:
        """Update the squared-gradient averages and adjust weights and biases.

        Args:
            epoch: Zero-based epoch number; ``epoch + 1`` is the exponent
                used in the bias correction.
            gradients: Provides the per-layer derivatives ``dw`` and ``db``.

        Raises:
            ValueError: If ``epoch`` is negative.
        """
        step = _step_number(epoch)
        alpha = self.alpha
        beta = self.beta
        beta_inv = 1.0 - beta
        beta_powi_inv = 1.0 - (beta ** step)
        epsilon = self.epsilon
        sqrt = np.sqrt
        for (sdw, sdb, (w, b, *_), dw, db) in zip(self.sdw,
                                                  self.sdb,
                                                  self.params,
                                                  gradients.dw,
                                                  gradients.db):
            # Calculate (R)MS of the derivatives.
            # The state arrays are updated in place, so they persist
            # across calls.
            sdw *= beta
            sdw += beta_inv * (dw * dw)
            sdb *= beta
            sdb += beta_inv * (db * db)
            # Bias correction.
            csdw = sdw / beta_powi_inv
            csdb = sdb / beta_powi_inv
            # Adjust weights and biases.
            w -= alpha * (dw / (sqrt(csdw) + epsilon))
            b -= alpha * (db / (sqrt(csdb) + epsilon))


class Adam(Optimizer):
    """Adam optimizer.

    Combines the momentum average of the derivatives (factor ``beta1``)
    with the RMS average of the squared derivatives (factor ``beta2``).
    """

    def __init__(self,
                 params: Parameters,
                 alpha: float = 0.001,
                 beta1: float = 0.9,
                 beta2: float = 0.999) -> None:
        super().__init__(params, alpha)
        self.beta1 = beta1
        self.beta2 = beta2
        # Small constant added to the denominator to avoid division by zero.
        self.epsilon = np.finfo(np.float32).eps
        # Per-layer running averages, starting at zero:
        # vd* average the derivatives, sd* average the squared derivatives.
        self.vdw = []
        self.sdw = []
        self.vdb = []
        self.sdb = []
        for (w, b, *_) in params:
            self.vdw.append(np.zeros(w.shape))
            self.sdw.append(np.zeros(w.shape))
            self.vdb.append(np.zeros(b.shape))
            self.sdb.append(np.zeros(b.shape))

    def apply(self, epoch: int, gradients: BackpropGrads) -> None:
        """Update both running averages and adjust weights and biases.

        Args:
            epoch: Zero-based epoch number; ``epoch + 1`` is the exponent
                used in both bias corrections.
            gradients: Provides the per-layer derivatives ``dw`` and ``db``.

        Raises:
            ValueError: If ``epoch`` is negative.
        """
        step = _step_number(epoch)
        alpha = self.alpha
        beta1 = self.beta1
        beta2 = self.beta2
        beta1_inv = 1.0 - beta1
        beta2_inv = 1.0 - beta2
        beta1_powi_inv = 1.0 - (beta1 ** step)
        beta2_powi_inv = 1.0 - (beta2 ** step)
        epsilon = self.epsilon
        sqrt = np.sqrt
        for (vdw, vdb, sdw, sdb, (w, b, *_), dw, db) in zip(self.vdw,
                                                            self.vdb,
                                                            self.sdw,
                                                            self.sdb,
                                                            self.params,
                                                            gradients.dw,
                                                            gradients.db):
            # Apply momentum to the derivatives.
            vdw *= beta1
            vdw += beta1_inv * dw
            vdb *= beta1
            vdb += beta1_inv * db
            # Calculate (R)MS of the derivatives.
            sdw *= beta2
            sdw += beta2_inv * (dw * dw)
            sdb *= beta2
            sdb += beta2_inv * (db * db)
            # Bias correction.
            cvdw = vdw / beta1_powi_inv
            cvdb = vdb / beta1_powi_inv
            csdw = sdw / beta2_powi_inv
            csdb = sdb / beta2_powi_inv
            # Adjust weights and biases.
            w -= alpha * (cvdw / (sqrt(csdw) + epsilon))
            b -= alpha * (cvdb / (sqrt(csdb) + epsilon))


class AlphaDecay(ABC):
    """Learning rate decay base class.

    Wraps an optimizer and overwrites its ``alpha`` with the value
    returned by :meth:`decay` before every :meth:`apply` call.
    ``decay_rate`` is clamped to the range 0.0 to 1.0.
    """

    def __init__(self, optimizer: Optimizer, decay_rate: float) -> None:
        self.optimizer = optimizer
        # Remember the initial learning rate; decay() is relative to it.
        self.alpha0 = optimizer.alpha
        self.decay_rate = min(max(decay_rate, 0.0), 1.0)

    @property
    def alpha(self) -> float:
        """Current learning rate of the wrapped optimizer."""
        return self.optimizer.alpha

    def apply(self, epoch: int, *args, **kwargs) -> None:
        """Set the decayed learning rate, then run the wrapped optimizer.

        All arguments are forwarded to the wrapped optimizer's ``apply``.
        """
        self.optimizer.alpha = self.decay(epoch)
        self.optimizer.apply(epoch, *args, **kwargs)

    @abstractmethod
    def decay(self, epoch: int) -> float:
        """Learning rate decay function.

        Returns the learning rate to use for the given epoch.
        """


class AlphaDecaySimple(AlphaDecay):
    """
                      1.0
    alpha = ------------------------ * alpha0
            1.0 + decay_rate * epoch
    """

    def decay(self, epoch: int) -> float:
        return self.alpha0 / (1.0 + (self.decay_rate * epoch))


class AlphaDecayExp(AlphaDecay):
    """
    alpha = ((1.0 - decay_rate) ** epoch) * alpha0
    """

    def decay(self, epoch: int) -> float:
        return ((1.0 - self.decay_rate) ** epoch) * self.alpha0


class AlphaDecaySqrt(AlphaDecay):
    """
             decay_rate
    alpha = ----------- * alpha0
            sqrt(epoch)

    The epoch passed to decay() is zero-based; epoch + 1 is used
    in the formula.
    """

    def decay(self, epoch: int) -> float:
        """Return the decayed learning rate.

        Raises:
            ValueError: If ``epoch`` is negative.
        """
        step = _step_number(epoch)
        return (self.decay_rate * self.alpha0) / np.sqrt(step)

# vim: ts=4 sw=4 expandtab