added Adam optimizer
This commit is contained in:
parent
fe9a8a0ea2
commit
9cfaa29d42
@ -148,6 +148,17 @@ impl <'a, N, T: BackendAxpy<N>> BackendAxpy<N> for &'a T {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Backend capability for a "squared axpy" accumulation.
///
/// The `Native` `f32` implementation in this change computes
/// `dst[i] += scale * a[i] * a[i]` for every element and asserts that `a`
/// and `dst` have equal shapes. The Adam optimizer uses it to accumulate
/// squared gradients into its second-moment tensor.
pub trait BackendAxpys<N>: Backend<N> {
|
||||||
|
/// Adds `scale` times the element-wise square of `a` into `dst`.
fn axpys(&self, dst: &mut Self::Tensor, scale: N, a: &Self::Tensor);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Allows a shared reference to a backend to stand in for the backend itself:
/// the call is handed unchanged to the referenced implementation.
impl<'a, N, T> BackendAxpys<N> for &'a T
where
    T: BackendAxpys<N>,
{
    #[inline]
    fn axpys(&self, dst: &mut Self::Tensor, scale: N, a: &Self::Tensor) {
        // `self` is `&&T`; strip one reference and dispatch explicitly to `T`
        // so the call can never resolve back to this forwarding impl.
        <T as BackendAxpys<N>>::axpys(*self, dst, scale, a)
    }
}
|
||||||
|
|
||||||
pub trait BackendAdd<N>: Backend<N> {
|
pub trait BackendAdd<N>: Backend<N> {
|
||||||
fn add(&self, dst: &mut Self::Tensor, a: &Self::Tensor);
|
fn add(&self, dst: &mut Self::Tensor, a: &Self::Tensor);
|
||||||
}
|
}
|
||||||
@ -179,4 +190,27 @@ impl <'a, N, T: BackendMul<N>> BackendMul<N> for &'a T {
|
|||||||
fn mul(&self, dst: &mut Self::Tensor, a: &Self::Tensor) {
|
fn mul(&self, dst: &mut Self::Tensor, a: &Self::Tensor) {
|
||||||
(**self).mul(dst, a)
|
(**self).mul(dst, a)
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/// Backend capability for an in-place element-wise maximum.
///
/// The `Native` `f32` implementation in this change stores
/// `f32::max(a[i], dst[i])` into `dst[i]` and asserts that `a` and `dst`
/// have equal shapes. The Adam optimizer calls it in its AMSGrad branch.
pub trait BackendMaximum<N>: Backend<N> {
|
||||||
|
/// Replaces every element of `dst` with the larger of itself and the
/// corresponding element of `a`.
fn maximum(&self, dst: &mut Self::Tensor, a: &Self::Tensor);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Allows a shared reference to a backend to stand in for the backend itself:
/// the call is handed unchanged to the referenced implementation.
impl<'a, N, T> BackendMaximum<N> for &'a T
where
    T: BackendMaximum<N>,
{
    #[inline]
    fn maximum(&self, dst: &mut Self::Tensor, a: &Self::Tensor) {
        // `self` is `&&T`; strip one reference and dispatch explicitly to `T`
        // so the call can never resolve back to this forwarding impl.
        <T as BackendMaximum<N>>::maximum(*self, dst, a)
    }
}
|
||||||
|
|
||||||
|
/// Backend capability bundle required by the Adam optimizer.
///
/// The supertraits (`BackendScale`, `BackendAxpy`, `BackendAxpys`,
/// `BackendMaximum`) are the operations Adam uses to maintain its moment
/// tensors; `adam_p` is the final parameter step.
pub trait BackendAdam<N>: BackendScale<N> + BackendAxpy<N> + BackendAxpys<N> + BackendMaximum<N> {
|
||||||
|
/// Applies the Adam parameter step to `dst`.
///
/// The `Native` `f32` implementation in this change computes
/// `dst[i] += lr * moms[i] / (vels[i].sqrt() + eps)` and asserts that
/// `moms` and `vels` have the same shape as `dst`. The Adam optimizer
/// passes a negated learning rate so the step descends.
fn adam_p(&self, dst: &mut Self::Tensor, lr: N, moms: &Self::Tensor, vels: &Self::Tensor, eps: N);
|
||||||
|
}
|
||||||
|
|
||||||
|
impl <'a, N, T: BackendAdam<N>> BackendAdam<N> for &'a T {
|
||||||
|
#[inline]
|
||||||
|
fn adam_p(&self, dst: &mut Self::Tensor, lr: N, moms: &Self::Tensor, vels: &Self::Tensor, eps: N) {
|
||||||
|
(**self).adam_p(dst, lr, moms, vels, eps)
|
||||||
|
}
|
||||||
}
|
}
|
@ -60,13 +60,13 @@ impl Native {
|
|||||||
fn fmt_tensor(&self, t: &NativeTensorF32, f: &mut String) -> fmt::Result {
|
fn fmt_tensor(&self, t: &NativeTensorF32, f: &mut String) -> fmt::Result {
|
||||||
let strides = t.shape.default_strides();
|
let strides = t.shape.default_strides();
|
||||||
let last_idx = strides.dims - 1;
|
let last_idx = strides.dims - 1;
|
||||||
writeln!(f, "default stridses {} {}", t.shape.default_strides(), last_idx);
|
writeln!(f, "default stridses {} {}", t.shape.default_strides(), last_idx)?;
|
||||||
write!(f, "Tensor(shape={}, data=[", t.shape)?;
|
write!(f, "Tensor(shape={}, data=[", t.shape)?;
|
||||||
|
|
||||||
for (idx, val) in t.read().iter().enumerate() {
|
for (idx, val) in t.read().iter().enumerate() {
|
||||||
let mut is_first = idx == 0;
|
let is_first = idx == 0;
|
||||||
let mut need_nl = false;
|
let mut need_nl = false;
|
||||||
let mut padding = 2;
|
let padding = 2;
|
||||||
|
|
||||||
for (sidx, s) in strides.iter().enumerate() {
|
for (sidx, s) in strides.iter().enumerate() {
|
||||||
if sidx != last_idx && idx % s as usize == 0 {
|
if sidx != last_idx && idx % s as usize == 0 {
|
||||||
@ -148,7 +148,7 @@ impl Backend<f32> for Native {
|
|||||||
|
|
||||||
fn print_tensor(&self, t: &Self::Tensor) {
|
fn print_tensor(&self, t: &Self::Tensor) {
|
||||||
let mut s = String::new();
|
let mut s = String::new();
|
||||||
self.fmt_tensor(t, &mut s);
|
self.fmt_tensor(t, &mut s).unwrap();
|
||||||
println!("{}", s);
|
println!("{}", s);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -351,6 +351,21 @@ impl BackendAxpy<f32> for Native {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Squared-axpy for the native `f32` backend.
impl BackendAxpys<f32> for Native {
    /// Adds `scale * a[i] * a[i]` to every element of `dst`.
    ///
    /// # Panics
    ///
    /// Panics when `a` and `dst` do not have the same shape.
    fn axpys(&self, dst: &mut Self::Tensor, scale: f32, a: &Self::Tensor) {
        let len = dst.shape().size();

        assert!(a.shape() == dst.shape());

        let src = &a.read()[0..len];
        let out = &mut dst.write()[0..len];

        // Walk both buffers in lock-step; the multiplication order matches
        // `scale * x * x` so rounding is unchanged.
        for (acc, &x) in out.iter_mut().zip(src.iter()) {
            *acc += scale * x * x;
        }
    }
}
|
||||||
|
|
||||||
impl BackendAdd<f32> for Native {
|
impl BackendAdd<f32> for Native {
|
||||||
fn add(&self, dst: &mut Self::Tensor, a: &Self::Tensor) {
|
fn add(&self, dst: &mut Self::Tensor, a: &Self::Tensor) {
|
||||||
let dst_size = dst.shape().size();
|
let dst_size = dst.shape().size();
|
||||||
@ -401,6 +416,39 @@ impl BackendMul<f32> for Native {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Element-wise maximum for the native `f32` backend.
impl BackendMaximum<f32> for Native {
    /// Overwrites each element of `dst` with `f32::max(a[i], dst[i])`.
    ///
    /// # Panics
    ///
    /// Panics when `a` and `dst` do not have the same shape.
    fn maximum(&self, dst: &mut Self::Tensor, a: &Self::Tensor) {
        let len = dst.shape().size();

        assert!(a.shape() == dst.shape());

        let src = &a.read()[0..len];
        let out = &mut dst.write()[0..len];

        // Pair every destination slot with its counterpart in `a`.
        for (slot, &candidate) in out.iter_mut().zip(src.iter()) {
            *slot = f32::max(candidate, *slot);
        }
    }
}
|
||||||
|
|
||||||
|
|
||||||
|
/// Adam parameter step for the native `f32` backend.
impl BackendAdam<f32> for Native {
    /// Adds `lr * moms[i] / (sqrt(vels[i]) + eps)` to every element of `dst`.
    ///
    /// # Panics
    ///
    /// Panics when `moms` or `vels` does not have the same shape as `dst`.
    fn adam_p(&self, dst: &mut Self::Tensor, lr: f32, moms: &Self::Tensor, vels: &Self::Tensor, eps: f32) {
        let len = dst.shape().size();

        assert!(moms.shape() == dst.shape());
        assert!(vels.shape() == dst.shape());

        let first = &moms.read()[0..len];
        let second = &vels.read()[0..len];
        let out = &mut dst.write()[0..len];

        // Walk the three buffers together; the expression keeps the original
        // evaluation order `(lr * m) / (sqrt(v) + eps)`.
        for ((param, &m), &v) in out.iter_mut().zip(first.iter()).zip(second.iter()) {
            *param += lr * m / (v.sqrt() + eps);
        }
    }
}
|
||||||
|
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_matmul() {
|
fn test_matmul() {
|
||||||
|
@ -73,7 +73,7 @@ impl <N, B, O> Optimizable<N, B, O> for Linear<N, B, O>
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn optimize(&mut self, backend: &B, optimizer: &O) {
|
fn optimize(&mut self, backend: &B, optimizer: &O) {
|
||||||
optimizer.update_gradients(backend, &mut self.weights.ctx, &mut self.weights.params, &self.weights.grads);
|
optimizer.update_params(backend, &mut self.weights.ctx, &mut self.weights.params, &self.weights.grads);
|
||||||
// optimizer.update_gradients(backend, &mut self.biases.ctx, &mut self.biases.params, &self.biases.grads);
|
// optimizer.update_params(backend, &mut self.biases.ctx, &mut self.biases.params, &self.biases.grads);
|
||||||
}
|
}
|
||||||
}
|
}
|
@ -15,7 +15,7 @@ pub mod losses;
|
|||||||
pub mod tensor;
|
pub mod tensor;
|
||||||
|
|
||||||
use self::backends::{Native, NativeTensorF32};
|
use self::backends::{Native, NativeTensorF32};
|
||||||
use self::optimizers::Sgd;
|
use self::optimizers::Adam;
|
||||||
use self::layers::*;
|
use self::layers::*;
|
||||||
use self::layer::*;
|
use self::layer::*;
|
||||||
use crate::backend::{Backend, BackendScale};
|
use crate::backend::{Backend, BackendScale};
|
||||||
@ -58,16 +58,17 @@ fn main() {
|
|||||||
const BATCH_SIZE: usize = 128;
|
const BATCH_SIZE: usize = 128;
|
||||||
|
|
||||||
let backend = Native;
|
let backend = Native;
|
||||||
let optimizer = Sgd::new(0.01, 0.1, false);
|
// let optimizer = Sgd::new(0.01, 0.1, false);
|
||||||
|
let optimizer = Adam::default();
|
||||||
let hidden_count = 64;
|
let hidden_count = 64;
|
||||||
|
|
||||||
let mut linear_1: LayerImpl<_, _, _, Linear<_, _, &Sgd<_, _>>> = LayerImpl::new((784, ).into(), &backend, &optimizer, LinearConfig {
|
let mut linear_1: LayerImpl<_, _, _, Linear<_, _, &Adam<_, _>>> = LayerImpl::new((784, ).into(), &backend, &optimizer, LinearConfig {
|
||||||
outputs: hidden_count
|
outputs: hidden_count
|
||||||
});
|
});
|
||||||
|
|
||||||
let mut sigmoid_1: LayerImpl<_, _, _, Sigmoid<_, _>> = LayerImpl::new((hidden_count, ).into(), &backend, &optimizer, SigmoidConfig);
|
let mut sigmoid_1: LayerImpl<_, _, _, Sigmoid<_, _>> = LayerImpl::new((hidden_count, ).into(), &backend, &optimizer, SigmoidConfig);
|
||||||
|
|
||||||
let mut linear_2: LayerImpl<_, _, _, Linear<_, _, &Sgd<_, _>>> = LayerImpl::new((hidden_count, ).into(), &backend, &optimizer, LinearConfig {
|
let mut linear_2: LayerImpl<_, _, _, Linear<_, _, &Adam<_, _>>> = LayerImpl::new((hidden_count, ).into(), &backend, &optimizer, LinearConfig {
|
||||||
outputs: 10
|
outputs: 10
|
||||||
});
|
});
|
||||||
|
|
||||||
|
@ -9,15 +9,15 @@ pub trait OptimizerContext {
|
|||||||
pub trait Optimizer<N, B: Backend<N>> {
|
pub trait Optimizer<N, B: Backend<N>> {
|
||||||
type Context: OptimizerContext;
|
type Context: OptimizerContext;
|
||||||
|
|
||||||
fn update_gradients(&self, backend: &B, ctx: &mut Self::Context, grads: &mut B::Tensor, params: &B::Tensor);
|
fn update_params(&self, backend: &B, ctx: &mut Self::Context, grads: &mut B::Tensor, params: &B::Tensor);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Lets a shared reference to an optimizer be used as an optimizer, so one
/// instance can be handed to several layers.
impl<'a, N, B: Backend<N>, O: Optimizer<N, B>> Optimizer<N, B> for &'a O {
    type Context = O::Context;

    /// Forwards to the referenced optimizer. `params` is the tensor that gets
    /// written; `grads` is only read.
    #[inline]
    fn update_params(&self, backend: &B, ctx: &mut Self::Context, params: &mut B::Tensor, grads: &B::Tensor) {
        (**self).update_params(backend, ctx, params, grads)
    }
}
|
||||||
|
|
||||||
|
96
src/optimizers/adam.rs
Normal file
96
src/optimizers/adam.rs
Normal file
@ -0,0 +1,96 @@
|
|||||||
|
use std::marker::PhantomData;
|
||||||
|
use crate::backend::{Backend, BackendAdam};
|
||||||
|
use crate::optimizer::{Optimizer, OptimizerContext};
|
||||||
|
use crate::tensor::{Tensor, TensorShape};
|
||||||
|
use std::cell::Cell;
|
||||||
|
|
||||||
|
|
||||||
|
pub struct AdamContext<N, B>
|
||||||
|
where B: Backend<N>
|
||||||
|
{
|
||||||
|
moms: B::Tensor,
|
||||||
|
vels: B::Tensor,
|
||||||
|
vhats: B::Tensor,
|
||||||
|
_m: PhantomData<fn(N, B)>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<N, B: Backend<N>> OptimizerContext for AdamContext<N, B> {
|
||||||
|
fn new<S: Into<TensorShape>>(shape: S) -> Self {
|
||||||
|
let shape = shape.into();
|
||||||
|
|
||||||
|
Self {
|
||||||
|
moms: B::Tensor::new(shape.clone()),
|
||||||
|
vels: B::Tensor::new(shape.clone()),
|
||||||
|
vhats: B::Tensor::new(shape),
|
||||||
|
_m: Default::default(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct Adam<N, B: Backend<N>> {
|
||||||
|
learning_rate: f32,
|
||||||
|
beta_1: f32,
|
||||||
|
beta_2: f32,
|
||||||
|
epsilon: Option<f32>,
|
||||||
|
amsgrad: bool,
|
||||||
|
iteration: Cell<f32>,
|
||||||
|
_m: PhantomData<fn(N, B)>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<N, B> Default for Adam<N, B>
|
||||||
|
where B: Backend<N>
|
||||||
|
{
|
||||||
|
fn default() -> Self {
|
||||||
|
Self {
|
||||||
|
learning_rate: 0.001,
|
||||||
|
beta_1: 0.9,
|
||||||
|
beta_2: 0.999,
|
||||||
|
epsilon: None,
|
||||||
|
amsgrad: false,
|
||||||
|
iteration: Cell::new(0.0),
|
||||||
|
_m: Default::default(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<N, B: Backend<N>> Adam<N, B> {
|
||||||
|
pub fn new(learning_rate: f32, beta_1: f32, beta_2: f32, amsgrad: bool) -> Self {
|
||||||
|
Self {
|
||||||
|
learning_rate,
|
||||||
|
beta_1,
|
||||||
|
beta_2,
|
||||||
|
epsilon: None,
|
||||||
|
amsgrad,
|
||||||
|
iteration: Cell::new(0.0),
|
||||||
|
_m: Default::default(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<N, B: Backend<N> + BackendAdam<N>> Optimizer<N, B> for Adam<N, B> {
|
||||||
|
type Context = AdamContext<N, B>;
|
||||||
|
|
||||||
|
fn update_params(&self, backend: &B, ctx: &mut Self::Context, params: &mut B::Tensor, grads: &B::Tensor) {
|
||||||
|
let iter = self.iteration.get();
|
||||||
|
let t = iter + 1.0;
|
||||||
|
self.iteration.set(iter + 0.25);
|
||||||
|
|
||||||
|
let lr_t = self.learning_rate * ((1.0 - self.beta_2.powf(t)).sqrt() / (1.0 - self.beta_1.powf(t)));
|
||||||
|
|
||||||
|
// m_t = (self.beta_1 * m) + (1. - self.beta_1) * g;
|
||||||
|
backend.scale(&mut ctx.moms, backend.scalar_f32(self.beta_1));
|
||||||
|
backend.axpy(&mut ctx.moms, backend.scalar_f32(1.0 - self.beta_1), grads);
|
||||||
|
|
||||||
|
// v_t = (self.beta_2 * v) + (1. - self.beta_2) * square(grads);
|
||||||
|
backend.scale(&mut ctx.vels, backend.scalar_f32(self.beta_2));
|
||||||
|
backend.axpys(&mut ctx.vels, backend.scalar_f32(1.0 - self.beta_2), grads);
|
||||||
|
|
||||||
|
if self.amsgrad {
|
||||||
|
backend.maximum(&mut ctx.vhats, &ctx.vels);
|
||||||
|
backend.adam_p(params, backend.scalar_f32(-lr_t), &ctx.moms, &ctx.vhats, backend.scalar_f32(self.epsilon.unwrap_or(std::f32::EPSILON)));
|
||||||
|
} else {
|
||||||
|
// p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)
|
||||||
|
backend.adam_p(params, backend.scalar_f32(-lr_t), &ctx.moms, &ctx.vels, backend.scalar_f32(self.epsilon.unwrap_or(std::f32::EPSILON)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@ -1,3 +1,5 @@
|
|||||||
mod sgd;
|
mod sgd;
|
||||||
|
mod adam;
|
||||||
|
|
||||||
pub use self::sgd::*;
|
pub use self::sgd::*;
|
||||||
|
pub use self::adam::*;
|
@ -27,7 +27,22 @@ pub struct Sgd<N, B: Backend<N>> {
|
|||||||
_m: PhantomData<fn(N, B)>,
|
_m: PhantomData<fn(N, B)>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<N, B: Backend<N>> Sgd<N, B> {
|
impl<N, B> Default for Sgd<N, B>
|
||||||
|
where B: Backend<N>
|
||||||
|
{
|
||||||
|
fn default() -> Self {
|
||||||
|
Self {
|
||||||
|
learning_rate: 0.01,
|
||||||
|
momentum: 0.0,
|
||||||
|
nesterov: false,
|
||||||
|
_m: Default::default(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<N, B> Sgd<N, B>
|
||||||
|
where B: Backend<N>
|
||||||
|
{
|
||||||
pub fn new(learning_rate: f32, momentum: f32, nesterov: bool) -> Self {
|
pub fn new(learning_rate: f32, momentum: f32, nesterov: bool) -> Self {
|
||||||
Self {
|
Self {
|
||||||
learning_rate,
|
learning_rate,
|
||||||
@ -41,9 +56,7 @@ impl<N, B: Backend<N>> Sgd<N, B> {
|
|||||||
impl<N, B: Backend<N> + BackendScale<N> + BackendAxpy<N> + BackendAdd<N>> Optimizer<N, B> for Sgd<N, B> {
|
impl<N, B: Backend<N> + BackendScale<N> + BackendAxpy<N> + BackendAdd<N>> Optimizer<N, B> for Sgd<N, B> {
|
||||||
type Context = SgdContext<N, B>;
|
type Context = SgdContext<N, B>;
|
||||||
|
|
||||||
fn update_gradients(&self, backend: &B, ctx: &mut Self::Context, params: &mut B::Tensor, grads: &B::Tensor) {
|
fn update_params(&self, backend: &B, ctx: &mut Self::Context, params: &mut B::Tensor, grads: &B::Tensor) {
|
||||||
// backend.axpy(params, backend.scalar_f32(-self.learning_rate), grads);
|
|
||||||
|
|
||||||
// m = momentum * m - lr * grads
|
// m = momentum * m - lr * grads
|
||||||
backend.scale(&mut ctx.moments, backend.scalar_f32(self.momentum));
|
backend.scale(&mut ctx.moments, backend.scalar_f32(self.momentum));
|
||||||
backend.axpy(&mut ctx.moments, backend.scalar_f32(-self.learning_rate), grads);
|
backend.axpy(&mut ctx.moments, backend.scalar_f32(-self.learning_rate), grads);
|
||||||
|
@ -55,7 +55,7 @@ impl fmt::Display for TensorShape {
|
|||||||
write!(f, ", ")?;
|
write!(f, ", ")?;
|
||||||
}
|
}
|
||||||
|
|
||||||
write!(f, "{}", self.shape[i]);
|
write!(f, "{}", self.shape[i])?;
|
||||||
}
|
}
|
||||||
write!(f, ")")?;
|
write!(f, ")")?;
|
||||||
|
|
||||||
@ -178,7 +178,7 @@ impl TensorShape {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl From<()> for TensorShape {
|
impl From<()> for TensorShape {
|
||||||
fn from(x: ()) -> Self {
|
fn from(_: ()) -> Self {
|
||||||
TensorShape {
|
TensorShape {
|
||||||
shape: [0, 0, 0, 0],
|
shape: [0, 0, 0, 0],
|
||||||
dims: 0,
|
dims: 0,
|
||||||
|
Loading…
Reference in New Issue
Block a user