From 9cfaa29d4211611f8079c070e682c2a46b4f9151 Mon Sep 17 00:00:00 2001
From: Andrey Tkachenko
Date: Thu, 4 Jul 2019 17:48:37 +0400
Subject: [PATCH] added Adam optimizer

---
 src/backend.rs         | 34 +++++++++++++++
 src/backends/native.rs | 56 ++++++++++++++++++++++--
 src/layers/linear.rs   |  4 +-
 src/main.rs            |  9 ++--
 src/optimizer.rs       |  6 +--
 src/optimizers/adam.rs | 96 ++++++++++++++++++++++++++++++++++++++++++
 src/optimizers/mod.rs  |  4 +-
 src/optimizers/sgd.rs  | 21 +++++++--
 src/tensor.rs          |  4 +-
 9 files changed, 214 insertions(+), 20 deletions(-)
 create mode 100644 src/optimizers/adam.rs

diff --git a/src/backend.rs b/src/backend.rs
index d31add3..c1a6fa6 100644
--- a/src/backend.rs
+++ b/src/backend.rs
@@ -148,6 +148,17 @@ impl <'a, N, T: BackendAxpy<N>> BackendAxpy<N> for &'a T {
     }
 }
 
+pub trait BackendAxpys<N>: Backend<N> {
+    fn axpys(&self, dst: &mut Self::Tensor, scale: N, a: &Self::Tensor);
+}
+
+impl <'a, N, T: BackendAxpys<N>> BackendAxpys<N> for &'a T {
+    #[inline]
+    fn axpys(&self, dst: &mut Self::Tensor, scale: N, a: &Self::Tensor) {
+        (**self).axpys(dst, scale, a)
+    }
+}
+
 pub trait BackendAdd<N>: Backend<N> {
     fn add(&self, dst: &mut Self::Tensor, a: &Self::Tensor);
 }
@@ -179,4 +190,27 @@ impl <'a, N, T: BackendMul<N>> BackendMul<N> for &'a T {
     fn mul(&self, dst: &mut Self::Tensor, a: &Self::Tensor) {
         (**self).mul(dst, a)
     }
+}
+
+
+pub trait BackendMaximum<N>: Backend<N> {
+    fn maximum(&self, dst: &mut Self::Tensor, a: &Self::Tensor);
+}
+
+impl <'a, N, T: BackendMaximum<N>> BackendMaximum<N> for &'a T {
+    #[inline]
+    fn maximum(&self, dst: &mut Self::Tensor, a: &Self::Tensor) {
+        (**self).maximum(dst, a)
+    }
+}
+
+pub trait BackendAdam<N>: BackendScale<N> + BackendAxpy<N> + BackendAxpys<N> + BackendMaximum<N> {
+    fn adam_p(&self, dst: &mut Self::Tensor, lr: N, moms: &Self::Tensor, vels: &Self::Tensor, eps: N);
+}
+
+impl <'a, N, T: BackendAdam<N>> BackendAdam<N> for &'a T {
+    #[inline]
+    fn adam_p(&self, dst: &mut Self::Tensor, lr: N, moms: &Self::Tensor, vels: &Self::Tensor, eps: N) {
+        (**self).adam_p(dst, lr, moms, vels, eps)
+    }
 }
\ No newline at end of file
diff --git a/src/backends/native.rs b/src/backends/native.rs
index 2a598b7..10e3546 100644
--- a/src/backends/native.rs
+++ b/src/backends/native.rs
@@ -60,13 +60,13 @@ impl Native {
     fn fmt_tensor(&self, t: &NativeTensorF32, f: &mut String) -> fmt::Result {
         let strides = t.shape.default_strides();
         let last_idx = strides.dims - 1;
-        writeln!(f, "default stridses {} {}", t.shape.default_strides(), last_idx);
+        writeln!(f, "default stridses {} {}", t.shape.default_strides(), last_idx)?;
         write!(f, "Tensor(shape={}, data=[", t.shape)?;
 
         for (idx, val) in t.read().iter().enumerate() {
-            let mut is_first = idx == 0;
+            let is_first = idx == 0;
             let mut need_nl = false;
-            let mut padding = 2;
+            let padding = 2;
 
             for (sidx, s) in strides.iter().enumerate() {
                 if sidx != last_idx && idx % s as usize == 0 {
@@ -148,7 +148,7 @@ impl Backend<f32> for Native {
 
     fn print_tensor(&self, t: &Self::Tensor) {
         let mut s = String::new();
-        self.fmt_tensor(t, &mut s);
+        self.fmt_tensor(t, &mut s).unwrap();
         println!("{}", s);
     }
 }
@@ -351,6 +351,21 @@ impl BackendAxpy<f32> for Native {
     }
 }
 
+impl BackendAxpys<f32> for Native {
+    fn axpys(&self, dst: &mut Self::Tensor, scale: f32, a: &Self::Tensor) {
+        let dst_size = dst.shape().size();
+
+        assert!(a.shape() == dst.shape());
+
+        let a_s = &a.read()[0 .. dst_size];
+        let dst_s = &mut dst.write()[0 .. dst_size];
+
+        for i in 0 .. dst_size {
+            dst_s[i] += scale * a_s[i] * a_s[i];
+        }
+    }
+}
+
 impl BackendAdd<f32> for Native {
     fn add(&self, dst: &mut Self::Tensor, a: &Self::Tensor) {
         let dst_size = dst.shape().size();
@@ -401,6 +416,39 @@ impl BackendMul<f32> for Native {
     }
 }
 
+impl BackendMaximum<f32> for Native {
+    fn maximum(&self, dst: &mut Self::Tensor, a: &Self::Tensor) {
+        let dst_size = dst.shape().size();
+
+        assert!(a.shape() == dst.shape());
+
+        let a_s = &a.read()[0 .. dst_size];
+        let dst_s = &mut dst.write()[0 .. dst_size];
+
+        for i in 0 .. dst_size {
+            dst_s[i] = f32::max(a_s[i], dst_s[i]);
+        }
+    }
+}
+
+
+impl BackendAdam<f32> for Native {
+    fn adam_p(&self, dst: &mut Self::Tensor, lr: f32, moms: &Self::Tensor, vels: &Self::Tensor, eps: f32) {
+        let dst_size = dst.shape().size();
+
+        assert!(moms.shape() == dst.shape());
+        assert!(vels.shape() == dst.shape());
+
+        let moms_s = &moms.read()[0 .. dst_size];
+        let vels_s = &vels.read()[0 .. dst_size];
+        let dst_s = &mut dst.write()[0 .. dst_size];
+
+        for i in 0 .. dst_size {
+            dst_s[i] += lr * moms_s[i] / (vels_s[i].sqrt() + eps)
+        }
+    }
+}
+
 
 #[test]
 fn test_matmul() {
diff --git a/src/layers/linear.rs b/src/layers/linear.rs
index 507e05d..d450a0c 100644
--- a/src/layers/linear.rs
+++ b/src/layers/linear.rs
@@ -73,7 +73,7 @@ impl<N, B, O> Optimizable<N, B, O> for Linear<N, B, O>
     }
 
     fn optimize(&mut self, backend: &B, optimizer: &O) {
-        optimizer.update_gradients(backend, &mut self.weights.ctx, &mut self.weights.params, &self.weights.grads);
-        // optimizer.update_gradients(backend, &mut self.biases.ctx, &mut self.biases.params, &self.biases.grads);
+        optimizer.update_params(backend, &mut self.weights.ctx, &mut self.weights.params, &self.weights.grads);
+        // optimizer.update_params(backend, &mut self.biases.ctx, &mut self.biases.params, &self.biases.grads);
     }
 }
\ No newline at end of file
diff --git a/src/main.rs b/src/main.rs
index 9ad9ce8..ad82e67 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -15,7 +15,7 @@ pub mod losses;
 pub mod tensor;
 
 use self::backends::{Native, NativeTensorF32};
-use self::optimizers::Sgd;
+use self::optimizers::Adam;
 use self::layers::*;
 use self::layer::*;
 use crate::backend::{Backend, BackendScale};
@@ -58,16 +58,17 @@ fn main() {
     const BATCH_SIZE: usize = 128;
 
     let backend = Native;
-    let optimizer = Sgd::new(0.01, 0.1, false);
+    // let optimizer = Sgd::new(0.01, 0.1, false);
+    let optimizer = Adam::default();
 
     let hidden_count = 64;
 
-    let mut linear_1: LayerImpl<_, _, _, Linear<_, _, &Sgd<_, _>>> = LayerImpl::new((784, ).into(), &backend, &optimizer, LinearConfig {
+    let mut linear_1: LayerImpl<_, _, _, Linear<_, _, &Adam<_, _>>> = LayerImpl::new((784, ).into(), &backend, &optimizer, LinearConfig {
         outputs: hidden_count
     });
 
     let mut sigmoid_1: LayerImpl<_, _, _, Sigmoid<_, _>> = LayerImpl::new((hidden_count, ).into(), &backend, &optimizer, SigmoidConfig);
 
-    let mut linear_2: LayerImpl<_, _, _, Linear<_, _, &Sgd<_, _>>> = LayerImpl::new((hidden_count, ).into(), &backend, &optimizer, LinearConfig {
+    let mut linear_2: LayerImpl<_, _, _, Linear<_, _, &Adam<_, _>>> = LayerImpl::new((hidden_count, ).into(), &backend, &optimizer, LinearConfig {
         outputs: 10
     });
diff --git a/src/optimizer.rs b/src/optimizer.rs
index fb35a94..5389c3d 100644
--- a/src/optimizer.rs
+++ b/src/optimizer.rs
@@ -9,15 +9,15 @@ pub trait OptimizerContext {
 pub trait Optimizer<N, B: Backend<N>> {
     type Context: OptimizerContext;
 
-    fn update_gradients(&self, backend: &B, ctx: &mut Self::Context, grads: &mut B::Tensor, params: &B::Tensor);
+    fn update_params(&self, backend: &B, ctx: &mut Self::Context, grads: &mut B::Tensor, params: &B::Tensor);
 }
 
 impl <'a, N, B: Backend<N>, O: Optimizer<N, B>> Optimizer<N, B> for &'a O {
     type Context = O::Context;
 
     #[inline]
-    fn update_gradients(&self, backend: &B, ctx: &mut Self::Context, grads: &mut B::Tensor, params: &B::Tensor) {
-        (**self).update_gradients(backend, ctx, grads, params)
+    fn update_params(&self, backend: &B, ctx: &mut Self::Context, grads: &mut B::Tensor, params: &B::Tensor) {
+        (**self).update_params(backend, ctx, grads, params)
     }
 }
 
diff --git a/src/optimizers/adam.rs b/src/optimizers/adam.rs
new file mode 100644
index 0000000..2dfb4b9
--- /dev/null
+++ b/src/optimizers/adam.rs
@@ -0,0 +1,96 @@
+use std::marker::PhantomData;
+use crate::backend::{Backend, BackendAdam};
+use crate::optimizer::{Optimizer, OptimizerContext};
+use crate::tensor::{Tensor, TensorShape};
+use std::cell::Cell;
+
+
+pub struct AdamContext<N, B>
+    where B: Backend<N>
+{
+    moms: B::Tensor,
+    vels: B::Tensor,
+    vhats: B::Tensor,
+    _m: PhantomData<fn(N)>,
+}
+
+impl<N, B: Backend<N>> OptimizerContext for AdamContext<N, B> {
+    fn new<S: Into<TensorShape>>(shape: S) -> Self {
+        let shape = shape.into();
+
+        Self {
+            moms: B::Tensor::new(shape.clone()),
+            vels: B::Tensor::new(shape.clone()),
+            vhats: B::Tensor::new(shape),
+            _m: Default::default(),
+        }
+    }
+}
+
+pub struct Adam<N, B: Backend<N>> {
+    learning_rate: f32,
+    beta_1: f32,
+    beta_2: f32,
+    epsilon: Option<f32>,
+    amsgrad: bool,
+    iteration: Cell<f32>,
+    _m: PhantomData<fn(N, B)>,
+}
+
+impl<N, B> Default for Adam<N, B>
+    where B: Backend<N>
+{
+    fn default() -> Self {
+        Self {
+            learning_rate: 0.001,
+            beta_1: 0.9,
+            beta_2: 0.999,
+            epsilon: None,
+            amsgrad: false,
+            iteration: Cell::new(0.0),
+            _m: Default::default(),
+        }
+    }
+}
+
+impl<N, B: Backend<N>> Adam<N, B> {
+    pub fn new(learning_rate: f32, beta_1: f32, beta_2: f32, amsgrad: bool) -> Self {
+        Self {
+            learning_rate,
+            beta_1,
+            beta_2,
+            epsilon: None,
+            amsgrad,
+            iteration: Cell::new(0.0),
+            _m: Default::default(),
+        }
+    }
+}
+
+impl<N, B: Backend<N> + BackendAdam<N>> Optimizer<N, B> for Adam<N, B> {
+    type Context = AdamContext<N, B>;
+
+    fn update_params(&self, backend: &B, ctx: &mut Self::Context, params: &mut B::Tensor, grads: &B::Tensor) {
+        let iter = self.iteration.get();
+        let t = iter + 1.0;
+        self.iteration.set(iter + 0.25);
+
+        let lr_t = self.learning_rate * ((1.0 - self.beta_2.powf(t)).sqrt() / (1.0 - self.beta_1.powf(t)));
+
+        // m_t = (self.beta_1 * m) + (1. - self.beta_1) * g;
+        backend.scale(&mut ctx.moms, backend.scalar_f32(self.beta_1));
+        backend.axpy(&mut ctx.moms, backend.scalar_f32(1.0 - self.beta_1), grads);
+
+        // v_t = (self.beta_2 * v) + (1. - self.beta_2) * square(grads);
+        backend.scale(&mut ctx.vels, backend.scalar_f32(self.beta_2));
+        backend.axpys(&mut ctx.vels, backend.scalar_f32(1.0 - self.beta_2), grads);
+
+        if self.amsgrad {
+            backend.maximum(&mut ctx.vhats, &ctx.vels);
+            backend.adam_p(params, backend.scalar_f32(-lr_t), &ctx.moms, &ctx.vhats, backend.scalar_f32(self.epsilon.unwrap_or(std::f32::EPSILON)));
+        } else {
+            // p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)
+            backend.adam_p(params, backend.scalar_f32(-lr_t), &ctx.moms, &ctx.vels, backend.scalar_f32(self.epsilon.unwrap_or(std::f32::EPSILON)));
+        }
+    }
+}
diff --git a/src/optimizers/mod.rs b/src/optimizers/mod.rs
index 0bdc1b9..47c9010 100644
--- a/src/optimizers/mod.rs
+++ b/src/optimizers/mod.rs
@@ -1,3 +1,5 @@
 mod sgd;
+mod adam;
 
-pub use self::sgd::*;
\ No newline at end of file
+pub use self::sgd::*;
+pub use self::adam::*;
\ No newline at end of file
diff --git a/src/optimizers/sgd.rs b/src/optimizers/sgd.rs
index f1592cc..5348c41 100644
--- a/src/optimizers/sgd.rs
+++ b/src/optimizers/sgd.rs
@@ -27,7 +27,22 @@ pub struct Sgd<N, B: Backend<N>> {
     _m: PhantomData<fn(N, B)>,
 }
 
-impl<N, B: Backend<N>> Sgd<N, B> {
+impl<N, B> Default for Sgd<N, B>
+    where B: Backend<N>
+{
+    fn default() -> Self {
+        Self {
+            learning_rate: 0.01,
+            momentum: 0.0,
+            nesterov: false,
+            _m: Default::default(),
+        }
+    }
+}
+
+impl<N, B> Sgd<N, B>
+    where B: Backend<N>
+{
     pub fn new(learning_rate: f32, momentum: f32, nesterov: bool) -> Self {
         Self {
             learning_rate,
@@ -41,9 +56,7 @@ impl<N, B: Backend<N>> Sgd<N, B> {
 impl<N, B: Backend<N> + BackendScale<N> + BackendAxpy<N> + BackendAdd<N>> Optimizer<N, B> for Sgd<N, B> {
     type Context = SgdContext<N, B>;
 
-    fn update_gradients(&self, backend: &B, ctx: &mut Self::Context, params: &mut B::Tensor, grads: &B::Tensor) {
-        // backend.axpy(params, backend.scalar_f32(-self.learning_rate), grads);
-
+    fn update_params(&self, backend: &B, ctx: &mut Self::Context, params: &mut B::Tensor, grads: &B::Tensor) {
         // m = momentum * m - lr * grads
         backend.scale(&mut ctx.moments, backend.scalar_f32(self.momentum));
         backend.axpy(&mut ctx.moments, backend.scalar_f32(-self.learning_rate), grads);
diff --git a/src/tensor.rs b/src/tensor.rs
index d2274cf..7542cef 100644
--- a/src/tensor.rs
+++ b/src/tensor.rs
@@ -55,7 +55,7 @@ impl fmt::Display for TensorShape {
                 write!(f, ", ")?;
             }
 
-            write!(f, "{}", self.shape[i]);
+            write!(f, "{}", self.shape[i])?;
         }
 
         write!(f, ")")?;
@@ -178,7 +178,7 @@ impl TensorShape {
 }
 
 impl From<()> for TensorShape {
-    fn from(x: ()) -> Self {
+    fn from(_: ()) -> Self {
         TensorShape {
             shape: [0, 0, 0, 0],
             dims: 0,
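
Note for reviewers (not part of the patch): the arithmetic that Adam::update_params delegates to the backend is the usual Adam step with the bias correction folded into the step size, lr_t = lr * sqrt(1 - beta_2^t) / (1 - beta_1^t). The standalone sketch below replays the same update on plain f32 slices so the adam_p contract is easy to check; the helper name adam_step and the little main driver are illustrative only and assume the default hyper-parameters from Adam::default(), nothing here is an API of this repository.

// Standalone illustration of BackendAdam::adam_p combined with the moment
// updates issued by Adam::update_params. Hypothetical helper, not patch code.
fn adam_step(
    params: &mut [f32],
    grads: &[f32],
    moms: &mut [f32],  // first-moment estimate m
    vels: &mut [f32],  // second-moment estimate v
    lr: f32,
    beta_1: f32,
    beta_2: f32,
    eps: f32,
    t: f32,            // 1-based step counter
) {
    // Bias correction folded into the learning rate, as in update_params:
    // lr_t = lr * sqrt(1 - beta_2^t) / (1 - beta_1^t)
    let lr_t = lr * ((1.0 - beta_2.powf(t)).sqrt() / (1.0 - beta_1.powf(t)));

    for i in 0..params.len() {
        // m_t = beta_1 * m + (1 - beta_1) * g          (scale + axpy)
        moms[i] = beta_1 * moms[i] + (1.0 - beta_1) * grads[i];
        // v_t = beta_2 * v + (1 - beta_2) * g^2        (scale + axpys)
        vels[i] = beta_2 * vels[i] + (1.0 - beta_2) * grads[i] * grads[i];
        // p_t = p - lr_t * m_t / (sqrt(v_t) + eps)     (adam_p called with lr = -lr_t)
        params[i] -= lr_t * moms[i] / (vels[i].sqrt() + eps);
    }
}

fn main() {
    let mut params = vec![0.5_f32, -0.3];
    let grads = vec![0.1_f32, -0.2];
    let (mut moms, mut vels) = (vec![0.0_f32; 2], vec![0.0_f32; 2]);

    // One step with the defaults used by Adam::default() in the patch.
    adam_step(&mut params, &grads, &mut moms, &mut vels,
              0.001, 0.9, 0.999, std::f32::EPSILON, 1.0);
    println!("{:?}", params);
}

When amsgrad is enabled, the patch first replaces v_t with its element-wise running maximum via BackendMaximum::maximum and then performs the same adam_p call on that maximum instead of vels.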