added Adam optimizer

Andrey Tkachenko 2019-07-04 17:48:37 +04:00
parent fe9a8a0ea2
commit 9cfaa29d42
9 changed files with 214 additions and 20 deletions
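For orientation, the new `BackendAdam::adam_p` kernel and the `Adam` optimizer added below implement the usual bias-corrected Adam update, with the corrections folded into the step size. A minimal scalar sketch of that rule (the `adam_step` helper and its names are illustrative, not part of the commit):

/// Scalar version of the update assembled below from backend primitives:
/// `scale`/`axpy` build m_t, `scale`/`axpys` build v_t, and `adam_p` applies
/// p += -lr_t * m_t / (sqrt(v_t) + eps).
fn adam_step(p: &mut f32, g: f32, m: &mut f32, v: &mut f32,
             lr: f32, beta_1: f32, beta_2: f32, eps: f32, t: f32) {
    *m = beta_1 * *m + (1.0 - beta_1) * g;      // first moment estimate
    *v = beta_2 * *v + (1.0 - beta_2) * g * g;  // second moment estimate
    let lr_t = lr * ((1.0 - beta_2.powf(t)).sqrt() / (1.0 - beta_1.powf(t)));
    *p -= lr_t * *m / (v.sqrt() + eps);         // parameter update
}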


@@ -148,6 +148,17 @@ impl <'a, N, T: BackendAxpy<N>> BackendAxpy<N> for &'a T {
    }
}

pub trait BackendAxpys<N>: Backend<N> {
    fn axpys(&self, dst: &mut Self::Tensor, scale: N, a: &Self::Tensor);
}

impl <'a, N, T: BackendAxpys<N>> BackendAxpys<N> for &'a T {
    #[inline]
    fn axpys(&self, dst: &mut Self::Tensor, scale: N, a: &Self::Tensor) {
        (**self).axpys(dst, scale, a)
    }
}

pub trait BackendAdd<N>: Backend<N> {
    fn add(&self, dst: &mut Self::Tensor, a: &Self::Tensor);
}

@@ -179,4 +190,27 @@ impl <'a, N, T: BackendMul<N>> BackendMul<N> for &'a T {
    fn mul(&self, dst: &mut Self::Tensor, a: &Self::Tensor) {
        (**self).mul(dst, a)
    }
}

pub trait BackendMaximum<N>: Backend<N> {
    fn maximum(&self, dst: &mut Self::Tensor, a: &Self::Tensor);
}

impl <'a, N, T: BackendMaximum<N>> BackendMaximum<N> for &'a T {
    #[inline]
    fn maximum(&self, dst: &mut Self::Tensor, a: &Self::Tensor) {
        (**self).maximum(dst, a)
    }
}

pub trait BackendAdam<N>: BackendScale<N> + BackendAxpy<N> + BackendAxpys<N> + BackendMaximum<N> {
    fn adam_p(&self, dst: &mut Self::Tensor, lr: N, moms: &Self::Tensor, vels: &Self::Tensor, eps: N);
}

impl <'a, N, T: BackendAdam<N>> BackendAdam<N> for &'a T {
    #[inline]
    fn adam_p(&self, dst: &mut Self::Tensor, lr: N, moms: &Self::Tensor, vels: &Self::Tensor, eps: N) {
        (**self).adam_p(dst, lr, moms, vels, eps)
    }
}
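
Any backend satisfying these bounds can drive a generic Adam step. A minimal sketch under that assumption (the `adam_apply` helper is illustrative, not part of this commit):

fn adam_apply<N, B: BackendAdam<N>>(backend: &B, params: &mut B::Tensor,
                                    moms: &B::Tensor, vels: &B::Tensor, lr: N, eps: N) {
    // delegate the fused update p += lr * m / (sqrt(v) + eps) to the backend
    backend.adam_p(params, lr, moms, vels, eps);
}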


@@ -60,13 +60,13 @@ impl Native {
    fn fmt_tensor(&self, t: &NativeTensorF32, f: &mut String) -> fmt::Result {
        let strides = t.shape.default_strides();
        let last_idx = strides.dims - 1;
        writeln!(f, "default strides {} {}", t.shape.default_strides(), last_idx)?;
        write!(f, "Tensor(shape={}, data=[", t.shape)?;
        for (idx, val) in t.read().iter().enumerate() {
            let is_first = idx == 0;
            let mut need_nl = false;
            let padding = 2;
            for (sidx, s) in strides.iter().enumerate() {
                if sidx != last_idx && idx % s as usize == 0 {

@@ -148,7 +148,7 @@ impl Backend<f32> for Native {
    fn print_tensor(&self, t: &Self::Tensor) {
        let mut s = String::new();
        self.fmt_tensor(t, &mut s).unwrap();
        println!("{}", s);
    }
}

@@ -351,6 +351,21 @@ impl BackendAxpy<f32> for Native {
    }
}

impl BackendAxpys<f32> for Native {
    // axpy with the input squared: dst += scale * a * a, element-wise
    fn axpys(&self, dst: &mut Self::Tensor, scale: f32, a: &Self::Tensor) {
        let dst_size = dst.shape().size();
        assert!(a.shape() == dst.shape());

        let a_s = &a.read()[0 .. dst_size];
        let dst_s = &mut dst.write()[0 .. dst_size];

        for i in 0 .. dst_size {
            dst_s[i] += scale * a_s[i] * a_s[i];
        }
    }
}

impl BackendAdd<f32> for Native {
    fn add(&self, dst: &mut Self::Tensor, a: &Self::Tensor) {
        let dst_size = dst.shape().size();

@@ -401,6 +416,39 @@ impl BackendMul<f32> for Native {
    }
}

impl BackendMaximum<f32> for Native {
    fn maximum(&self, dst: &mut Self::Tensor, a: &Self::Tensor) {
        let dst_size = dst.shape().size();
        assert!(a.shape() == dst.shape());

        let a_s = &a.read()[0 .. dst_size];
        let dst_s = &mut dst.write()[0 .. dst_size];

        for i in 0 .. dst_size {
            dst_s[i] = f32::max(a_s[i], dst_s[i]);
        }
    }
}

impl BackendAdam<f32> for Native {
    fn adam_p(&self, dst: &mut Self::Tensor, lr: f32, moms: &Self::Tensor, vels: &Self::Tensor, eps: f32) {
        let dst_size = dst.shape().size();
        assert!(moms.shape() == dst.shape());
        assert!(vels.shape() == dst.shape());

        let moms_s = &moms.read()[0 .. dst_size];
        let vels_s = &vels.read()[0 .. dst_size];
        let dst_s = &mut dst.write()[0 .. dst_size];

        // dst += lr * m / (sqrt(v) + eps); the Adam optimizer passes lr = -lr_t,
        // so this addition performs a descent step
        for i in 0 .. dst_size {
            dst_s[i] += lr * moms_s[i] / (vels_s[i].sqrt() + eps);
        }
    }
}
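
// A hedged sanity-check sketch, not part of this commit: it assumes tensors created
// with `NativeTensorF32::new` can be filled through `write()` as the impls above do;
// the test name and values are illustrative.
#[test]
fn test_adam_p_sketch() {
    let backend = Native;
    let mut p = NativeTensorF32::new((1, ).into());
    let mut m = NativeTensorF32::new((1, ).into());
    let mut v = NativeTensorF32::new((1, ).into());

    p.write()[0] = 1.0;
    m.write()[0] = 0.5;
    v.write()[0] = 0.25;

    // the optimizer passes a negative lr, so the parameter should decrease
    backend.adam_p(&mut p, -0.1, &m, &v, 1e-8);

    let expected = 1.0 - 0.1 * 0.5 / (0.25f32.sqrt() + 1e-8);
    assert!((p.read()[0] - expected).abs() < 1e-6);
}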

#[test]
fn test_matmul() {


@@ -73,7 +73,7 @@ impl <N, B, O> Optimizable<N, B, O> for Linear<N, B, O>
    }

    fn optimize(&mut self, backend: &B, optimizer: &O) {
        optimizer.update_params(backend, &mut self.weights.ctx, &mut self.weights.params, &self.weights.grads);
        // optimizer.update_params(backend, &mut self.biases.ctx, &mut self.biases.params, &self.biases.grads);
    }
}


@@ -15,7 +15,7 @@ pub mod losses;
pub mod tensor;

use self::backends::{Native, NativeTensorF32};
use self::optimizers::Adam;
use self::layers::*;
use self::layer::*;
use crate::backend::{Backend, BackendScale};

@@ -58,16 +58,17 @@ fn main() {
    const BATCH_SIZE: usize = 128;

    let backend = Native;
    // let optimizer = Sgd::new(0.01, 0.1, false);
    let optimizer = Adam::default();
    let hidden_count = 64;

    let mut linear_1: LayerImpl<_, _, _, Linear<_, _, &Adam<_, _>>> = LayerImpl::new((784, ).into(), &backend, &optimizer, LinearConfig {
        outputs: hidden_count
    });

    let mut sigmoid_1: LayerImpl<_, _, _, Sigmoid<_, _>> = LayerImpl::new((hidden_count, ).into(), &backend, &optimizer, SigmoidConfig);

    let mut linear_2: LayerImpl<_, _, _, Linear<_, _, &Adam<_, _>>> = LayerImpl::new((hidden_count, ).into(), &backend, &optimizer, LinearConfig {
        outputs: 10
    });


@@ -9,15 +9,15 @@ pub trait OptimizerContext {
pub trait Optimizer<N, B: Backend<N>> {
    type Context: OptimizerContext;

    fn update_params(&self, backend: &B, ctx: &mut Self::Context, grads: &mut B::Tensor, params: &B::Tensor);
}

impl <'a, N, B: Backend<N>, O: Optimizer<N, B>> Optimizer<N, B> for &'a O {
    type Context = O::Context;

    #[inline]
    fn update_params(&self, backend: &B, ctx: &mut Self::Context, grads: &mut B::Tensor, params: &B::Tensor) {
        (**self).update_params(backend, ctx, grads, params)
    }
}

src/optimizers/adam.rs (new file, 96 lines)

@@ -0,0 +1,96 @@
use std::marker::PhantomData;
use std::cell::Cell;

use crate::backend::{Backend, BackendAdam};
use crate::optimizer::{Optimizer, OptimizerContext};
use crate::tensor::{Tensor, TensorShape};

pub struct AdamContext<N, B>
    where B: Backend<N>
{
    moms: B::Tensor,
    vels: B::Tensor,
    vhats: B::Tensor,
    _m: PhantomData<fn(N, B)>,
}

impl<N, B: Backend<N>> OptimizerContext for AdamContext<N, B> {
    fn new<S: Into<TensorShape>>(shape: S) -> Self {
        let shape = shape.into();

        Self {
            moms: B::Tensor::new(shape.clone()),
            vels: B::Tensor::new(shape.clone()),
            vhats: B::Tensor::new(shape),
            _m: Default::default(),
        }
    }
}

pub struct Adam<N, B: Backend<N>> {
    learning_rate: f32,
    beta_1: f32,
    beta_2: f32,
    epsilon: Option<f32>,
    amsgrad: bool,
    iteration: Cell<f32>,
    _m: PhantomData<fn(N, B)>,
}

impl<N, B> Default for Adam<N, B>
    where B: Backend<N>
{
    fn default() -> Self {
        Self {
            learning_rate: 0.001,
            beta_1: 0.9,
            beta_2: 0.999,
            epsilon: None,
            amsgrad: false,
            iteration: Cell::new(0.0),
            _m: Default::default(),
        }
    }
}

impl<N, B: Backend<N>> Adam<N, B> {
    pub fn new(learning_rate: f32, beta_1: f32, beta_2: f32, amsgrad: bool) -> Self {
        Self {
            learning_rate,
            beta_1,
            beta_2,
            epsilon: None,
            amsgrad,
            iteration: Cell::new(0.0),
            _m: Default::default(),
        }
    }
}

impl<N, B: Backend<N> + BackendAdam<N>> Optimizer<N, B> for Adam<N, B> {
    type Context = AdamContext<N, B>;

    fn update_params(&self, backend: &B, ctx: &mut Self::Context, params: &mut B::Tensor, grads: &B::Tensor) {
        let iter = self.iteration.get();
        let t = iter + 1.0;

        // note: the step counter advances by 0.25 per call
        self.iteration.set(iter + 0.25);

        // bias-corrected step size
        let lr_t = self.learning_rate * ((1.0 - self.beta_2.powf(t)).sqrt() / (1.0 - self.beta_1.powf(t)));

        // m_t = (self.beta_1 * m) + (1. - self.beta_1) * g;
        backend.scale(&mut ctx.moms, backend.scalar_f32(self.beta_1));
        backend.axpy(&mut ctx.moms, backend.scalar_f32(1.0 - self.beta_1), grads);

        // v_t = (self.beta_2 * v) + (1. - self.beta_2) * square(grads);
        backend.scale(&mut ctx.vels, backend.scalar_f32(self.beta_2));
        backend.axpys(&mut ctx.vels, backend.scalar_f32(1.0 - self.beta_2), grads);

        if self.amsgrad {
            // vhat_t = max(vhat, v_t)
            backend.maximum(&mut ctx.vhats, &ctx.vels);
            backend.adam_p(params, backend.scalar_f32(-lr_t), &ctx.moms, &ctx.vhats, backend.scalar_f32(self.epsilon.unwrap_or(std::f32::EPSILON)));
        } else {
            // p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)
            backend.adam_p(params, backend.scalar_f32(-lr_t), &ctx.moms, &ctx.vels, backend.scalar_f32(self.epsilon.unwrap_or(std::f32::EPSILON)));
        }
    }
}


@@ -1,3 +1,5 @@
mod sgd;
mod adam;

pub use self::sgd::*;
pub use self::adam::*;


@@ -27,7 +27,22 @@ pub struct Sgd<N, B: Backend<N>> {
    _m: PhantomData<fn(N, B)>,
}

impl<N, B> Default for Sgd<N, B>
    where B: Backend<N>
{
    fn default() -> Self {
        Self {
            learning_rate: 0.01,
            momentum: 0.0,
            nesterov: false,
            _m: Default::default(),
        }
    }
}

impl<N, B> Sgd<N, B>
    where B: Backend<N>
{
    pub fn new(learning_rate: f32, momentum: f32, nesterov: bool) -> Self {
        Self {
            learning_rate,

@@ -41,9 +56,7 @@ impl<N, B: Backend<N>> Sgd<N, B> {
impl<N, B: Backend<N> + BackendScale<N> + BackendAxpy<N> + BackendAdd<N>> Optimizer<N, B> for Sgd<N, B> {
    type Context = SgdContext<N, B>;

    fn update_params(&self, backend: &B, ctx: &mut Self::Context, params: &mut B::Tensor, grads: &B::Tensor) {
        // m = momentum * m - lr * grads
        backend.scale(&mut ctx.moments, backend.scalar_f32(self.momentum));
        backend.axpy(&mut ctx.moments, backend.scalar_f32(-self.learning_rate), grads);


@@ -55,7 +55,7 @@ impl fmt::Display for TensorShape {
                write!(f, ", ")?;
            }
            write!(f, "{}", self.shape[i])?;
        }
        write!(f, ")")?;

@@ -178,7 +178,7 @@
}

impl From<()> for TensorShape {
    fn from(_: ()) -> Self {
        TensorShape {
            shape: [0, 0, 0, 0],
            dims: 0,