Andrey Tkachenko 2024-11-01 14:25:58 +04:00
commit e79a8329ea
18 changed files with 3509 additions and 0 deletions

1
.gitignore vendored Normal file

@@ -0,0 +1 @@
/target

1781
Cargo.lock generated Normal file

File diff suppressed because it is too large

7
Cargo.toml Normal file

@@ -0,0 +1,7 @@
[package]
name = "rax"
version = "0.1.0"
edition = "2021"
[dependencies]
cubecl = { git = "https://github.com/tracel-ai/cubecl.git", features = ["wgpu", "cuda"] }

1
examples/align.rs Normal file

@@ -0,0 +1 @@
fn main() {}

20
src/alignment.rs Normal file

@@ -0,0 +1,20 @@
use crate::misc::DistanceKind;
pub mod block_matching;
pub mod gauss_pyramid;
pub mod local_search;
pub mod upsample_alignments;
pub struct AlignLevelConfig {
pub factor: u32,
pub upsampling_factor: u32,
pub tile_size: u32,
pub search_radius: u32,
pub distance: DistanceKind,
}
pub struct AlignmentConfig {
pub levels: Vec<AlignLevelConfig>,
}
pub struct Alignments {}
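// A minimal sketch of a two-level configuration, listed fine-to-coarse as
// `build_pyramid` expects; the concrete numbers are illustrative assumptions,
// not defaults shipped in this commit:
//
// let config = AlignmentConfig {
//     levels: vec![
//         AlignLevelConfig { factor: 2, upsampling_factor: 2, tile_size: 16, search_radius: 2, distance: DistanceKind::L1 },
//         AlignLevelConfig { factor: 4, upsampling_factor: 4, tile_size: 32, search_radius: 4, distance: DistanceKind::L2 },
//     ],
// };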

96
src/alignment/block_matching.rs Normal file

@@ -0,0 +1,96 @@
use cubecl::{client::ComputeClient, Runtime};
use crate::image::{Alignments, GrayImage};
use super::{
gauss_pyramid::build_pyramid, local_search::local_search,
upsample_alignments::upsample_alignments, AlignLevelConfig,
};
///
/// Aligns `img` against the reference image: returns a patchwise flow such that,
/// for patch indices (py, px):
/// img[py, px] ~= ref_img[py + alignments[py, px, 1],
///                        px + alignments[py, px, 0]]
///
/// Parameters
/// ----------
/// img : Img [imshape_y, imshape_x]
///     Image to be aligned against the reference
///
/// ref_pyramid : [Img]
///     Pyramid representation of the reference image J_1
///
/// Returns
/// -------
/// alignments : Alignments
///     a device array[n_patches_y, n_patches_x, 2]
///     Patchwise flow: V_n(p) for each patch p
///
pub fn align_image_block_matching<R: Runtime>(
cube: &ComputeClient<R::Server, R::Channel>,
img: &GrayImage,
ref_pyramid: &[GrayImage],
level_configs: &[AlignLevelConfig],
) -> Alignments {
let alt_pyramid = build_pyramid::<R>(cube, img, level_configs.iter().map(|x| x.factor));
// Align alternate image to the reference image
let mut alignments = None;
let cfg_iter = level_configs.iter().rev();
let mut prev_tile_size = 64;
for (lv, cfg) in cfg_iter.enumerate() {
alignments = Some(align_on_a_level::<R>(
cube,
&ref_pyramid[lv],
&alt_pyramid[lv],
cfg,
alignments.as_ref(),
prev_tile_size,
));
prev_tile_size = cfg.tile_size;
}
alignments.unwrap()
}
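// Reading the result (a sketch; the index values are illustrative): with
// tile_size = 16, alignments[3, 5] == [2.0, -1.0] means the img patch whose
// top-left corner is at (y, x) = (48, 80) best matches ref_img shifted by
// +2 px in x and -1 px in y, following the convention in the doc comment above.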
///
/// The alignment produced by this function is always integer-valued, but it is
/// stored as DEFAULT_FLOAT_TYPE. This allows the output alignment to be used
/// directly for ICA without any casting from int to float.
///
pub(crate) fn align_on_a_level<R: Runtime>(
cube: &ComputeClient<R::Server, R::Channel>,
ref_pyramid_lvl: &GrayImage,
alt_pyramid_lvl: &GrayImage,
cfg: &AlignLevelConfig,
prev: Option<&Alignments>,
prev_tile_size: u32,
) -> Alignments {
let (ref_h, ref_w) = ref_pyramid_lvl.shape();
// Number of patches that can fit on this level
let h = ref_h / cfg.tile_size;
let w = ref_w / cfg.tile_size;
let mut alignments = Alignments::new_empty::<R>(cube, w, h);
if let Some(prev) = prev {
// use the upsampled previous alignments as initial guesses
upsample_alignments::<R>(
cube,
&mut alignments,
ref_pyramid_lvl,
alt_pyramid_lvl,
prev,
prev_tile_size,
cfg,
)
}
local_search::<R>(cube, &mut alignments, ref_pyramid_lvl, alt_pyramid_lvl, cfg);
alignments
}

71
src/alignment/gauss_pyramid.rs Normal file

@@ -0,0 +1,71 @@
use cubecl::{client::ComputeClient, Runtime};
use crate::image::{downsample::downsample, padding::pad, GrayImage};
///
/// Construct an N-level coarse-to-fine Gaussian pyramid
///
/// Args:
/// image: input image (expected to be a grayscale image downsampled from a Bayer raw image)
/// factors: [int], downsampling factors (fine-to-coarse)
///
pub(crate) fn build_pyramid<R: Runtime>(
cube: &ComputeClient<R::Server, R::Channel>,
img: &GrayImage,
factors: impl Iterator<Item = u32>,
) -> Vec<GrayImage> {
// Subsequent pyramid levels are successively created
// with convolution by a kernel followed by downsampling
let init = Vec::with_capacity(factors.size_hint().1.unwrap_or(factors.size_hint().0));
let mut levels = factors.fold(init, |mut acc, factor| {
acc.push(downsample::<R>(cube, acc.last().unwrap_or(img), factor));
acc
});
// Reverse the pyramid to get it coarse-to-fine
levels.reverse();
levels
}
///
/// Returns the pyramid representation of `img`, that will be used for
/// future block matching
///
/// Parameters
/// ----------
/// img : Img grayscale input image
/// factors : [int] pyramid scaling factors
/// tile_size : int size of the tile
///
/// Returns
/// -------
/// pyramid : [Img]
/// pyramid representation of the image
///
pub fn build_gauss_pyramid<R: Runtime>(
cube: &ComputeClient<R::Server, R::Channel>,
img: &GrayImage,
factors: impl Iterator<Item = u32>,
tile_size: u32,
) -> Vec<GrayImage> {
let (h, w) = img.shape();
// if needed, pad images with zeros so that getTiles contains all image pixels
let padding_patches_height = (tile_size - h % tile_size) % tile_size;
let padding_patches_width = (tile_size - w % tile_size) % tile_size;
if padding_patches_width != 0 || padding_patches_height != 0 {
let img = pad::<R>(
cube,
img,
0,
padding_patches_width,
0,
padding_patches_height,
);
build_pyramid::<R>(cube, &img, factors)
} else {
build_pyramid::<R>(cube, img, factors)
}
}
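// Worked example of the padding arithmetic above: with tile_size = 16 and a
// 1080 x 1920 image, 1080 % 16 == 8 so 8 rows of zeros are appended, while
// 1920 % 16 == 0 requires no extra columns.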

114
src/alignment/local_search.rs Normal file

@@ -0,0 +1,114 @@
use cubecl::{
client::ComputeClient,
prelude::{Float, Line, ScalarArg, Tensor, ABSOLUTE_POS_X, ABSOLUTE_POS_Y},
CubeCount, CubeDim, Runtime,
};
use crate::{
alignment::upsample_alignments::cube_compute_dist,
image::{Alignments, GrayImage},
misc::DistanceKind,
};
use super::AlignLevelConfig;
pub fn local_search<R: Runtime>(
cube: &ComputeClient<R::Server, R::Channel>,
upsampled_alignments: &mut Alignments,
ref_pyramid_lvl: &GrayImage,
alt_pyramid_lvl: &GrayImage,
cfg: &AlignLevelConfig,
) {
let threadsperblock = CubeDim::default();
let h = upsampled_alignments.shape()[0] as u32;
let w = upsampled_alignments.shape()[1] as u32;
let blockspergrid_x = (w + threadsperblock.x - 1) / threadsperblock.x;
let blockspergrid_y = (h + threadsperblock.y - 1) / threadsperblock.y;
unsafe {
cube_local_search::launch_unchecked::<R>(
cube,
CubeCount::Static(blockspergrid_x, blockspergrid_y, 1),
threadsperblock,
ref_pyramid_lvl.as_tensor_arg().as_tensor_arg(1),
alt_pyramid_lvl.as_tensor_arg().as_tensor_arg(1),
upsampled_alignments.as_tensor_arg().as_tensor_arg(1),
ScalarArg::new(cfg.tile_size),
ScalarArg::new(cfg.search_radius as _),
cfg.distance,
)
}
}
#[cubecl::cube(launch_unchecked)]
#[allow(clippy::identity_op)]
fn cube_local_search(
ref_pyramid_lvl: &Tensor<f32>,
alt_pyramid_lvl: &Tensor<f32>,
upsampled_alignments: &mut Tensor<f32>,
tile_size: u32,
search_radius: i32,
#[comptime] distance: DistanceKind,
) {
let n_patchs_y = upsampled_alignments.shape(0);
let n_patchs_x = upsampled_alignments.shape(1);
let tile_x = ABSOLUTE_POS_X;
let tile_y = ABSOLUTE_POS_Y;
if tile_y >= n_patchs_y || tile_x >= n_patchs_x {
return;
}
let offset = tile_y * upsampled_alignments.stride(0) + tile_x * upsampled_alignments.stride(1);
let mut local_flow = Line::empty(2);
local_flow[0] = upsampled_alignments[offset + 0];
local_flow[1] = upsampled_alignments[offset + 1];
// position of the pixel in the top left corner of the patch
let patch_pos_x = tile_x * tile_size;
let patch_pos_y = tile_y * tile_size;
// this should be rewritten to allow patches bigger than 32 (local_ref is a fixed 32 * 32 = 1024 buffer)
let mut local_ref = Line::empty(1024);
for i in 0..tile_size {
for j in 0..tile_size {
let idx = patch_pos_x + j;
let idy = patch_pos_y + i;
local_ref[i * 32 + j] = ref_pyramid_lvl[idy * ref_pyramid_lvl.stride(0) + idx];
}
}
let mut min_dist = f32::new(f32::INFINITY); // init as infty
let mut min_shift_y: i32 = 0;
let mut min_shift_x: i32 = 0;
// window search
for search_shift_y in -search_radius..=search_radius {
for search_shift_x in -search_radius..=search_radius {
// computing dist
let dist = cube_compute_dist(
&local_ref,
alt_pyramid_lvl,
&local_flow,
patch_pos_x as i32 + search_shift_x,
patch_pos_y as i32 + search_shift_y,
tile_size,
distance,
);
if dist < min_dist {
min_dist = dist;
min_shift_y = search_shift_y;
min_shift_x = search_shift_x;
}
}
}
upsampled_alignments[offset + 0] = local_flow[0] + min_shift_x as f32;
upsampled_alignments[offset + 1] = local_flow[1] + min_shift_y as f32;
}
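// Cost note: each tile evaluates (2 * search_radius + 1)^2 candidate offsets,
// e.g. search_radius = 4 means 81 cube_compute_dist evaluations of
// tile_size * tile_size pixels each.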

263
src/alignment/upsample_alignments.rs Normal file

@@ -0,0 +1,263 @@
use cubecl::{
client::ComputeClient,
prelude::{Abs, Float, Line, Max, Min, ScalarArg, Tensor, ABSOLUTE_POS_X, ABSOLUTE_POS_Y},
CubeCount, CubeDim, Runtime,
};
use crate::{
image::{Alignments, GrayImage},
misc::DistanceKind,
};
use super::AlignLevelConfig;
///
/// Upsample alignments to adapt them to the next pyramid level (Section 3.2 of the IPOL article).
///
pub(crate) fn upsample_alignments<R: Runtime>(
cube: &ComputeClient<R::Server, R::Channel>,
dst: &mut Alignments,
ref_pyramid_lvl: &GrayImage,
alt_pyramid_lvl: &GrayImage,
prev_alignments: &Alignments,
prev_tile_size: u32,
cfg: &AlignLevelConfig,
) {
let threadsperblock = CubeDim::default();
let n_tiles_y_new = dst.shape()[0] as u32;
let n_tiles_x_new = dst.shape()[1] as u32;
let blockspergrid_x = (n_tiles_x_new + threadsperblock.x - 1) / threadsperblock.x;
let blockspergrid_y = (n_tiles_y_new + threadsperblock.y - 1) / threadsperblock.y;
unsafe {
cube_upsample_alignments::launch_unchecked::<R>(
cube,
CubeCount::Static(blockspergrid_x, blockspergrid_y, 1),
threadsperblock,
ref_pyramid_lvl.as_tensor_arg().as_tensor_arg(1),
alt_pyramid_lvl.as_tensor_arg().as_tensor_arg(1),
prev_alignments.as_tensor_arg().as_tensor_arg(1),
dst.as_tensor_arg().as_tensor_arg(1),
ScalarArg::new(cfg.upsampling_factor),
ScalarArg::new(cfg.tile_size),
ScalarArg::new(prev_tile_size),
)
}
}
#[allow(clippy::identity_op)]
#[cubecl::cube(launch_unchecked)]
fn cube_upsample_alignments(
ref_pyramid_lvl: &Tensor<f32>,
alt_pyramid_lvl: &Tensor<f32>,
previous_alignments: &Tensor<f32>,
upsampled_alignments: &mut Tensor<f32>,
upsampling_factor: u32,
tile_size: u32,
prev_tile_size: u32,
) {
let subtile_x = ABSOLUTE_POS_X;
let subtile_y = ABSOLUTE_POS_Y;
let n_tiles_y_prev = previous_alignments.shape(0);
let n_tiles_x_prev = previous_alignments.shape(1);
let n_tiles_y_new = upsampled_alignments.shape(0);
let n_tiles_x_new = upsampled_alignments.shape(1);
let w = ref_pyramid_lvl.shape(1);
let repeat_factor = upsampling_factor / (tile_size / prev_tile_size);
if subtile_x >= n_tiles_x_new || subtile_y >= n_tiles_y_new {
return;
}
let des_offset =
subtile_y * upsampled_alignments.stride(0) + subtile_x * upsampled_alignments.stride(1);
// the new subtile is on the side of the image, and is not contained within a bigger old tile
if subtile_x >= repeat_factor * n_tiles_x_prev || subtile_y >= repeat_factor * n_tiles_y_prev {
upsampled_alignments[des_offset + 0] = 0.;
upsampled_alignments[des_offset + 1] = 0.;
return;
}
let prev_tile_x = subtile_x / repeat_factor;
let prev_tile_y = subtile_y / repeat_factor;
// position of the top left pixel in the subtile
let subtile_pos_y = subtile_y * tile_size;
let subtile_pos_x = subtile_x * tile_size;
// copying ref patch into local memory, because it needs to be read 3 times
// this should be rewritten to allow patches bigger than 32 (local_ref is a fixed 32 * 32 = 1024 buffer)
let mut local_ref = Line::empty(1024);
for i in 0..tile_size {
for j in 0..tile_size {
let idx = subtile_pos_x + j;
let idy = subtile_pos_y + i;
local_ref[i * 32 + j] = ref_pyramid_lvl[idy * w + idx];
}
}
// position of the new tile within the old tile
let ups_subtile_x = subtile_x % repeat_factor;
let ups_subtile_y = subtile_y % repeat_factor;
// computing the ids of the 3 closest patches
let x_shift = if 2 * ups_subtile_x + 1 > repeat_factor {
1i32
} else {
-(1i32)
};
let y_shift = if 2 * ups_subtile_y + 1 > repeat_factor {
1i32
} else {
-(1i32)
};
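// Worked example: with repeat_factor = 4, subtiles 0 and 1 inside an old tile
// satisfy 2 * ups_subtile + 1 <= 4 and take shift = -1 (previous neighbour),
// while subtiles 2 and 3 give 2 * ups_subtile + 1 > 4 and take shift = +1.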
// Choosing the best of the 3 alignments by minimising L1 dist
let mut dist = f32::new(f32::INFINITY);
let mut optimal_flow_x = 0.0f32;
let mut optimal_flow_y = 0.0f32;
// The 3 candidate alignments are fetched as early as possible: this way they
// may arrive from global memory before they are needed, since other
// computation is performed during the latency.
let candidate_alignment_0_shift = cube_alignment_flow(
previous_alignments,
prev_tile_x,
prev_tile_y,
upsampling_factor as f32,
);
let candidate_alignment_vert_shift = cube_alignment_flow(
previous_alignments,
prev_tile_x,
clamp(prev_tile_y as i32 + y_shift, 0, n_tiles_y_prev as i32 - 1) as u32,
upsampling_factor as f32,
);
let candidate_alignment_horizontal_shift = cube_alignment_flow(
previous_alignments,
clamp(prev_tile_x as i32 + x_shift, 0, n_tiles_x_prev as i32 - 1) as u32,
prev_tile_y,
upsampling_factor as f32,
);
// 0 shift
let dist_ = cube_compute_dist(
&local_ref,
alt_pyramid_lvl,
&candidate_alignment_0_shift,
subtile_pos_x as i32,
subtile_pos_y as i32,
tile_size,
DistanceKind::L1,
);
if dist_ < dist {
dist = dist_;
optimal_flow_x = candidate_alignment_0_shift[0];
optimal_flow_y = candidate_alignment_0_shift[1];
}
// vertical shift
let dist_ = cube_compute_dist(
&local_ref,
alt_pyramid_lvl,
&candidate_alignment_vert_shift,
subtile_pos_x as i32,
subtile_pos_y as i32,
tile_size,
DistanceKind::L1,
);
if dist_ < dist {
dist = dist_;
optimal_flow_x = candidate_alignment_vert_shift[0];
optimal_flow_y = candidate_alignment_vert_shift[1];
}
// horizontal shift
let dist_ = cube_compute_dist(
&local_ref,
alt_pyramid_lvl,
&candidate_alignment_horizontal_shift,
subtile_pos_x as i32,
subtile_pos_y as i32,
tile_size,
DistanceKind::L1,
);
if dist_ < dist {
optimal_flow_x = candidate_alignment_horizontal_shift[0];
optimal_flow_y = candidate_alignment_horizontal_shift[1];
}
// applying best flow
upsampled_alignments[des_offset + 0] = optimal_flow_x;
upsampled_alignments[des_offset + 1] = optimal_flow_y;
}
#[cubecl::cube]
fn clamp(a: i32, min: i32, max: i32) -> i32 {
// Clamp::clamp(a, min, max)
Min::min(Max::max(a, min), max)
}
#[cubecl::cube]
#[allow(clippy::identity_op)]
fn cube_alignment_flow(
previous_alignments: &Tensor<f32>,
x: u32,
y: u32,
factor: f32,
) -> Line<f32> {
let offset = y * previous_alignments.stride(0) + x * previous_alignments.stride(1);
let mut candidate_alignment: Line<f32> = Line::empty(2);
candidate_alignment[0] = previous_alignments[offset + 0] * factor;
candidate_alignment[1] = previous_alignments[offset + 1] * factor;
candidate_alignment
}
#[cubecl::cube]
pub(crate) fn cube_compute_dist(
local_ref: &Line<f32>,
alt_pyramid_lvl: &Tensor<f32>,
candidate_alignment: &Line<f32>,
pos_x: i32,
pos_y: i32,
tile_size: u32,
#[comptime] distance: DistanceKind,
) -> f32 {
let h = alt_pyramid_lvl.shape(0);
let w = alt_pyramid_lvl.shape(1);
let mut dist = 0.0f32;
for i in 0..tile_size {
for j in 0..tile_size {
let new_idx = pos_x + j as i32 + candidate_alignment[0] as i32;
let new_idy = pos_y + i as i32 + candidate_alignment[1] as i32;
if (0 <= new_idx && new_idx < w as i32) && (0 <= new_idy && new_idy < h as i32) {
let alt_offset = new_idy as u32 * w + new_idx as u32;
let diff = local_ref[i * 32 + j] - alt_pyramid_lvl[alt_offset];
dist += match distance {
DistanceKind::L1 => Abs::abs(diff),
DistanceKind::L2 => diff * diff,
};
} else {
dist = f32::new(f32::INFINITY)
}
}
}
dist
}

69
src/gat.rs Normal file

@@ -0,0 +1,69 @@
use cubecl::prelude::{Max, Sqrt, Tensor, ABSOLUTE_POS_X, ABSOLUTE_POS_Y};
use crate::{image::GrayImage, kernels::NoiseConfig};
///
/// Generalized Anscombe Transform
/// noise model: std^2 = alpha * I + beta,
/// where alpha and beta are ISO-dependent.
///
/// Parameters
/// ----------
/// image : GrayImage
///     input image to variance-stabilize
/// alpha : float
/// value of alpha for the given iso
/// iso : float
/// ISO value
/// beta : float
/// Value of beta for the given iso
///
/// Returns
/// -------
/// VST_image : GrayImage
/// input image with stabilized variance
///
pub fn generalized_anscombe_transform(image: &GrayImage, nc: NoiseConfig) -> GrayImage {
let _ = nc;
let _ = image;
// assert len(image.shape) == 2
// imshape_y, imshape_x = image.shape
// VST_image = cuda.device_array(image.shape, DEFAULT_NUMPY_FLOAT_TYPE)
// threadsperblock = (DEFAULT_THREADS, DEFAULT_THREADS)
// blockspergrid_x = math.ceil(imshape_x/threadsperblock[1])
// blockspergrid_y = math.ceil(imshape_y/threadsperblock[0])
// blockspergrid = (blockspergrid_x, blockspergrid_y)
// cuda_GAT[blockspergrid, threadsperblock](image, VST_image,
// alpha, beta)
// return VST_image
todo!()
}
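// The per-pixel transform applied by cuda_gat below is the GAT closed form:
// T(I) = (2 / alpha) * sqrt(max(0, alpha * I + (3/8) * alpha^2 + beta)),
// which maps noise of variance alpha * I + beta to approximately unit variance.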
#[cubecl::cube]
fn cuda_gat(image: &Tensor<f32>, vst_image: &mut Tensor<f32>, alpha: f32, beta: f32) {
let x = ABSOLUTE_POS_X;
let y = ABSOLUTE_POS_Y;
let imshape_y = image.shape(0);
let imshape_x = image.shape(1);
if y >= imshape_y || x >= imshape_x {
return;
}
let offset = y * image.stride(0) + x;
// ISO should not appear here, since alpha and beta are
// already ISO-dependent.
let vst = Max::max(
0.0,
alpha * image[offset] + 3.0 / 8.0 * alpha * alpha + beta,
);
vst_image[offset] = 2.0 / alpha * Sqrt::sqrt(vst);
}

133
src/image.rs Normal file

@@ -0,0 +1,133 @@
use std::marker::PhantomData;
use cubecl::{
client::ComputeClient,
prelude::{CubePrimitive, TensorHandleRef},
server::Handle,
Runtime,
};
pub mod convolution;
pub mod downsample;
pub mod padding;
#[derive(Debug)]
pub struct Tensor<T: CubePrimitive, const R: usize> {
pub(crate) handle: Handle,
pub(crate) shape: [usize; R],
pub(crate) stride: [usize; R],
_m: PhantomData<T>,
}
impl<T: CubePrimitive, const R: usize> Tensor<T, R> {
pub fn new_empty<Rt: Runtime>(
cube: &ComputeClient<Rt::Server, Rt::Channel>,
width: u32,
height: u32,
) -> Self {
let _ = height;
let _ = width;
let _ = cube;
todo!()
}
#[inline]
pub fn shape(&self) -> [usize; R] {
self.shape
}
#[inline]
pub fn stride(&self) -> [usize; R] {
self.stride
}
#[inline]
pub fn as_tensor_arg<Rt: Runtime>(&self) -> TensorHandleRef<'_, Rt> {
TensorHandleRef {
handle: &self.handle,
strides: &self.stride,
shape: &self.shape,
runtime: PhantomData,
}
}
#[inline]
pub fn from_slice<Rt: Runtime>(
cube: &ComputeClient<Rt::Server, Rt::Channel>,
data: &[T],
) -> Self {
let _ = data;
let _ = cube;
todo!()
}
}
pub struct ImageBuffer<T: CubePrimitive, const R: usize, const S: usize> {
data: Tensor<T, R>,
width: u32,
height: u32,
}
impl<T: CubePrimitive, const R: usize, const S: usize> std::ops::Deref for ImageBuffer<T, R, S> {
type Target = Tensor<T, R>;
fn deref(&self) -> &Self::Target {
&self.data
}
}
impl<T: CubePrimitive, const R: usize, const S: usize> std::ops::DerefMut for ImageBuffer<T, R, S> {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.data
}
}
impl<T: CubePrimitive, const R: usize, const S: usize> ImageBuffer<T, R, S> {
pub fn from_tensor(data: Tensor<T, R>, width: u32, height: u32) -> Self {
assert!(height as usize <= data.shape[0]);
assert!(width as usize <= data.shape[1] * S);
Self {
data,
width,
height,
}
}
#[inline]
pub fn shape(&self) -> (u32, u32) {
(self.height, self.width)
}
#[inline]
pub fn height(&self) -> u32 {
self.height
}
#[inline]
pub fn width(&self) -> u32 {
self.width
}
pub fn clone<Rt: Runtime>(&self, cube: &ComputeClient<Rt::Server, Rt::Channel>) -> Self {
let _ = cube;
todo!()
}
}
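// Reading the aliases below: the const generic R is the tensor rank, and S
// appears to be the number of samples packed per stored element (e.g. three
// 10-bit values per u32), which the width assertion in from_tensor accounts for.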
// Bayer Raw Images
pub type Raw16Image = ImageBuffer<u32, 2, 2>;
pub type Raw10Image = ImageBuffer<u32, 2, 3>;
// Gray Images
pub type GrayImage = ImageBuffer<f32, 2, 1>;
pub type Gray10Image = ImageBuffer<u32, 2, 3>;
pub type Gray16Image = ImageBuffer<u32, 2, 2>;
// Rgb Images
pub type RgbImage = ImageBuffer<f32, 3, 1>;
pub type Rgba8Image = ImageBuffer<u32, 2, 4>;
pub type Rgb10Image = ImageBuffer<u32, 2, 3>;
// Alignments
pub type Alignments = Tensor<f32, 3>;

61
src/image/convolution.rs Normal file

@@ -0,0 +1,61 @@
use cubecl::{
client::ComputeClient,
prelude::{Abs, Float, ABSOLUTE_POS_X, ABSOLUTE_POS_Y},
Runtime,
};
use super::Tensor;
pub fn conv1d<R: Runtime>(
cube: &ComputeClient<R::Server, R::Channel>,
img: &Tensor<f32, 2>,
kern: &Tensor<f32, 1>,
) -> Tensor<f32, 2> {
let _ = kern;
let _ = img;
let _ = cube;
todo!()
}
pub fn conv1d_t<R: Runtime>(
cube: &ComputeClient<R::Server, R::Channel>,
img: &Tensor<f32, 2>,
kern: &Tensor<f32, 1>,
) -> Tensor<f32, 2> {
let _ = kern;
let _ = img;
let _ = cube;
todo!()
}
#[cubecl::cube(launch_unchecked)]
fn cube_conv1d<F: Float>(
input: &cubecl::prelude::Tensor<F>,
kern: &cubecl::prelude::Tensor<F>,
output: &mut cubecl::prelude::Tensor<F>,
#[comptime] trans: bool,
) {
    let kern_half = kern.len() as i32 / 2;
    // highest valid index along each axis, used for mirror ("reflect") padding
    let h = output.shape(0) as i32 - 1;
    let w = output.shape(1) as i32 - 1;
    let x = ABSOLUTE_POS_X as i32;
    let y = ABSOLUTE_POS_Y as i32;
    if y > h || x > w {
        return;
    }
    // accumulate the weighted taps; `trans` is assumed to select the axis
    // (false: convolve along y for conv1d, true: along x for conv1d_t)
    let mut acc = F::new(0.0);
    for i in 0..kern.len() as i32 {
        // Abs::abs(n - Abs::abs(p - n)) reflects an out-of-range tap back in bounds
        let ox = if trans {
            Abs::abs(w - Abs::abs(x + i - kern_half - w))
        } else {
            x
        };
        let oy = if trans {
            y
        } else {
            Abs::abs(h - Abs::abs(y + i - kern_half - h))
        };
        acc += input[oy as u32 * input.stride(0) + ox as u32] * kern[i as u32];
    }
    output[y as u32 * output.stride(0) + x as u32] = acc;
}

65
src/image/downsample.rs Normal file

@@ -0,0 +1,65 @@
use cubecl::{client::ComputeClient, Runtime};
use super::{convolution::{conv1d, conv1d_t}, GrayImage, Tensor};
///
/// Apply a convolution by a kernel if required, then downsample an image.
/// Args:
/// image: Device Array the input image (WARNING: single channel only!)
/// factor: downsampling factor
///
pub fn downsample<R: Runtime>(
cube: &ComputeClient<R::Server, R::Channel>,
img: &GrayImage,
factor: u32,
) -> GrayImage {
// Special case
if factor == 1 {
return img.clone::<R>(cube);
}
// gaussian kernel std is proportional to downsampling factor
// filteredImage = gaussian_filter(image, sigma=factor * 0.5, order=0, output=None, mode='reflect')
let gaussian_kernel = create_gaussian_kernel1d(factor as f32 * 0.5, 4 * ((factor + 1) / 2));
let gaussian_kernel = Tensor::from_slice::<R>(cube, &gaussian_kernel);
let temp = conv1d::<R>(cube, &img.data, &gaussian_kernel); // convolve along y
let filtered_image = conv1d_t::<R>(cube, &temp, &gaussian_kernel); // convolve along x
// Shape of the downsampled image
let [h2, w2] = filtered_image.shape();
let (h2, w2) = (h2 as f32 / factor as f32, w2 as f32 / factor as f32);
let tensor = subsample::<R>(cube, &filtered_image, factor, factor);
GrayImage::from_tensor(tensor, w2 as _, h2 as _)
}
///
/// Computes a 1-D Gaussian convolution kernel.
///
pub fn create_gaussian_kernel1d(sigma: f32, radius: u32) -> Vec<f32> {
let sigma2 = sigma * sigma;
let mut phi_x = (-(radius as i32)..=radius as i32)
.map(|x| f32::exp(-0.5 * (x * x) as f32 / sigma2))
.collect::<Vec<_>>();
let sum: f32 = phi_x.iter().cloned().sum();
phi_x.iter_mut().for_each(|x| *x /= sum);
phi_x
}
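// For example, create_gaussian_kernel1d(1.0, 2) yields approximately
// [0.054, 0.244, 0.403, 0.244, 0.054], which sums to 1.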
pub fn subsample<R: Runtime>(
cube: &ComputeClient<R::Server, R::Channel>,
img: &Tensor<f32, 2>,
y_step: u32,
x_step: u32,
) -> Tensor<f32, 2> {
let _ = x_step;
let _ = y_step;
let _ = img;
let _ = cube;
todo!()
}

20
src/image/padding.rs Normal file

@@ -0,0 +1,20 @@
use cubecl::{client::ComputeClient, Runtime};
use super::GrayImage;
pub fn pad<R: Runtime>(
cube: &ComputeClient<R::Server, R::Channel>,
img: &GrayImage,
left: u32,
right: u32,
top: u32,
bottom: u32,
) -> GrayImage {
let _ = bottom;
let _ = top;
let _ = right;
let _ = left;
let _ = img;
let _ = cube;
todo!()
}

385
src/kernels.rs Normal file

@@ -0,0 +1,385 @@
use cubecl::prelude::{Abs, Float, Line, Max, Min, Sqrt, Tensor, ABSOLUTE_POS_X, ABSOLUTE_POS_Y};
use crate::{
gat::generalized_anscombe_transform,
image::{self, GrayImage},
};
pub struct EstimateKernelsConfig {
k_detail: f32,
k_denoise: f32,
d_th: f32,
d_tr: f32,
k_stretch: f32,
k_shrink: f32,
}
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct NoiseConfig {
pub alpha: f32,
pub beta: f32,
}
///
/// Implementation of Alg. 5: ComputeKernelCovariance
///
/// Returns the kernels covariance matrices for the frame J_n, sampled at the
/// center of every bayer quad (or at the center of every grey pixel in grey
/// mode).
///
/// Parameters
/// ----------
/// img : device Array[imshape_y, imshape_x]
/// Raw image J_n
///
/// Returns
/// -------
/// covs : device Array[imshape_y/2, imshape_x/2, 2, 2]
/// Covariance matrices Omega_n, sampled at the center of each bayer quad.
///
///
pub fn estimate_kernels(
img: &GrayImage,
cfg: EstimateKernelsConfig,
noise_cfg: NoiseConfig,
) -> image::Tensor<f32, 4> {
// Performing Variance Stabilization Transform
let img = generalized_anscombe_transform(img, noise_cfg);
// Decimate to grey
// if bayer_mode :
// img_grey = compute_grey_images_(img, method="decimating")
// else :
// img_grey = img # no need to copy now, they will be copied to gpu later.
let img_gray = img;
let (grey_imshape_y, grey_imshape_x) = img_gray.shape();
// Computing grads
// let th_grey_img = th.as_tensor(img_grey, dtype = DEFAULT_TORCH_FLOAT_TYPE, device = "cuda");
// Horizontal filters
// let grad_kernel1 = np.array([[[[-0.5, 0.5]]], [[[0.5, 0.5]]]]);
// Vertical filters
// let grad_kernel2 = np.array([[[[0.5], [0.5]]], [[[-0.5], [0.5]]]]);
// let tmp = conv1d(th_grey_img, grad_kernel1);
// let th_full_grad = conv1d(tmp, grad_kernel2, groups = 2);
// The default padding mode reduces the shape of grey_img of 1 pixel in each
// direction, as expected
// let cuda_full_grads =
// cuda.as_cuda_array(th_full_grad.squeeze().transpose(0, 1).transpose(1, 2));
// shape [y, x, 2]
// let covs = cuda.device_array(grey_imshape + (2, 2), DEFAULT_NUMPY_FLOAT_TYPE);
// threadsperblock = (DEFAULT_THREADS, DEFAULT_THREADS)
// blockspergrid_x = math.ceil(grey_imshape_x/threadsperblock[1])
// blockspergrid_y = math.ceil(grey_imshape_y/threadsperblock[0])
// blockspergrid = (blockspergrid_x, blockspergrid_y)
// cuda_estimate_kernel[blockspergrid, threadsperblock](cuda_full_grads,
// k_detail, k_denoise, D_th, D_tr, k_stretch, k_shrink,
// covs)
// covs
todo!()
}
#[allow(clippy::identity_op)]
#[cubecl::cube]
fn cuda_estimate_kernel(
full_grads: &Tensor<f32>,
covs: &mut Tensor<f32>,
k_detail: f32,
k_denoise: f32,
d_th: f32,
d_tr: f32,
k_stretch: f32,
k_shrink: f32,
) {
let pixel_idx = ABSOLUTE_POS_X;
let pixel_idy = ABSOLUTE_POS_Y;
let imshape_y = covs.shape(0);
let imshape_x = covs.shape(1);
if pixel_idy >= imshape_y || pixel_idx >= imshape_x {
return;
}
let covs_offset = pixel_idy * covs.stride(0) + pixel_idx * covs.stride(1);
let mut structure_tensor: Line<f32> = Line::empty(4);
structure_tensor[0] = 0.0;
structure_tensor[1] = 0.0;
structure_tensor[2] = 0.0;
structure_tensor[3] = 0.0;
for i in 0..2 {
for j in 0..2 {
let x = (pixel_idx + j) as i32 - 1;
let y = (pixel_idy + i) as i32 - 1;
if (0 <= y && y < full_grads.shape(0) as i32)
&& (0 <= x && x < full_grads.shape(1) as i32)
{
let full_grad_offset = y as u32 * full_grads.stride(0) + x as u32 * full_grads.stride(1);
let full_grad_x = full_grads[full_grad_offset + 0];
let full_grad_y = full_grads[full_grad_offset + 1];
structure_tensor[0] += full_grad_x * full_grad_x;
structure_tensor[1] += full_grad_x * full_grad_y;
structure_tensor[2] += full_grad_x * full_grad_y;
structure_tensor[3] += full_grad_y * full_grad_y;
}
}
}
let mut l = Line::empty(2);
let mut e1 = Line::empty(2);
let mut e2 = Line::empty(2);
let mut k = Line::empty(2);
get_eigen_elmts_2x2(&structure_tensor, &mut l, &mut e1, &mut e2);
compute_k(
&mut k, l[0], l[1], k_detail, k_denoise, d_th, d_tr, k_stretch, k_shrink,
);
let k_1_sq = k[0] * k[0];
let k_2_sq = k[1] * k[1];
covs[covs_offset + 0] = k_1_sq * e1[0] * e1[0] + k_2_sq * e2[0] * e2[0];
covs[covs_offset + 1] = k_1_sq * e1[0] * e1[1] + k_2_sq * e2[0] * e2[1];
covs[covs_offset + 2] = k_1_sq * e1[0] * e1[1] + k_2_sq * e2[0] * e2[1];
covs[covs_offset + 3] = k_1_sq * e1[1] * e1[1] + k_2_sq * e2[1] * e2[1];
}
///
/// Cuda function for solving the 2x2 system A*X = B
/// by using the analytical formula
///
/// Parameters
/// ----------
/// A : Array[2,2]
///
/// B : Array[2]
///
/// Returns
/// -------
/// None
///
///
#[cubecl::cube]
fn solve_2x2(a: &Line<f32>, b: &Line<f32>, x: &mut Line<f32>) {
let det_a = a[0] * a[3] - a[1] * a[2];
x[0] = (a[3] * b[0] - a[1] * b[1]) / det_a;
x[1] = (a[0] * b[1] - a[2] * b[0]) / det_a;
}
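// This is Cramer's rule; the caller must ensure det_a != 0. Quick check:
// a = [2, 0, 0, 4] (row-major), b = [2, 8] gives det_a = 8 and x = [1, 2].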
///
/// inverts the 2x2 M array
///
/// Parameters
/// ----------
/// M : Array[2, 2]
/// Array to invert
/// M_i : Array[2, 2]
///
/// Returns
/// -------
/// None.
///
///
#[cubecl::cube]
fn invert_2x2(m: &Line<f32>, m_i: &mut Line<f32>) {
let det = m[0] * m[3] - m[1] * m[2];
if Abs::abs(det) > f32::new(f32::EPSILON) {
let det_i = 1.0 / det;
m_i[0] = m[3] * det_i;
m_i[1] = -m[1] * det_i;
m_i[2] = -m[2] * det_i;
m_i[3] = m[0] * det_i;
} else {
m_i[0] = 1.0;
m_i[1] = 0.0;
m_i[2] = 0.0;
m_i[3] = 1.0;
}
}
///
/// Returns the two roots of the polynomial a*X^2 + b*X + c = 0 for real
/// numbers a, b and c. The function only returns real roots: make sure they
/// exist before calling. roots[0] contains the root with the largest modulus
/// and roots[1] the smallest.
///
///
/// Parameters
/// ----------
/// a : float
///
/// b : float
///
/// c : float
///
/// roots : Array[2]
///
/// Returns
/// -------
/// None
///
#[cubecl::cube]
fn get_real_polyroots_2(a: f32, b: f32, c: f32, roots: &mut Line<f32>) {
// numerical instabilities can cause delta to be slightly negative despite
// the equation admitting 2 real roots.
let delta_root = Sqrt::sqrt(Max::max(b * b - 4.0 * a * c, 0.0));
let r1 = (-b + delta_root) / (2.0 * a);
let r2 = (-b - delta_root) / (2.0 * a);
let r1_abs: f32 = Abs::abs(r1);
if r1_abs >= Abs::abs(r2) {
roots[0] = r1;
roots[1] = r2;
} else {
roots[0] = r2;
roots[1] = r1;
}
}
#[cubecl::cube]
fn get_eigen_val_2x2(m: &Line<f32>, l: &mut Line<f32>) {
let a = 1.0;
let b = -(m[0] + m[3]);
let c = m[0] * m[3] - m[1] * m[2];
get_real_polyroots_2(a, b, c, l)
}
///
/// return the eigen vectors with norm 1 for the eigen values l
/// M.e1 = l1.e1 ; M.e2 = l2.e2
///
/// Parameters
/// ----------
/// M : Array[2,2]
/// Real Symmetric array for which eigen values are to be determined
/// l : Array[2]
/// e1, e2 : Array[2]
/// sorted Eigenvalues
/// e1, e2 : Array[2, 2]
/// Computed orthogonal and normalized eigen vectors
///
/// Returns
/// -------
/// None.
///
#[cubecl::cube]
fn get_eigen_vect_2x2(m: &Line<f32>, l: &Line<f32>, e1: &mut Line<f32>, e2: &mut Line<f32>) {
// 2x2 algorithm : https://en.wikipedia.org/wiki/Eigenvalue_algorithm
if m[1] == 0.0 && m[0] == m[3] {
// m is a multiple of the identity; pick 2 orthogonal eigenvectors.
e1[0] = 1.0;
e1[1] = 0.0;
e2[0] = 0.0;
e2[1] = 1.0;
} else {
// e1 = (M - l2*I) * [1, 1]^T: both columns of (M - l2*I) are parallel to
// the l1 eigenvector, so summing (averaging) them increases reliability
e1[0] = m[0] + m[1] - l[1];
e1[1] = m[2] + m[3] - l[1];
if e1[0] == 0.0 {
e1[1] = 1.0;
e2[0] = 1.0;
e2[1] = 0.0;
} else if e1[1] == 0.0 {
e1[0] = 1.0;
e2[0] = 0.0;
e2[1] = 1.0;
} else {
let norm_ = Sqrt::sqrt(e1[0] * e1[0] + e1[1] * e1[1]);
e1[0] /= norm_;
e1[1] /= norm_;
let sign = copysign(e1[0]);
e2[1] = Abs::abs(e1[0]);
e2[0] = -e1[1] * sign
}
}
}
#[cubecl::cube]
fn get_eigen_elmts_2x2(m: &Line<f32>, l: &mut Line<f32>, e1: &mut Line<f32>, e2: &mut Line<f32>) {
get_eigen_val_2x2(m, l);
get_eigen_vect_2x2(m, l, e1, e2);
}
///
/// Computes k_1 and k_2 based on lambda1, lambda2 and the constants.
///
/// Parameters
/// ----------
/// l1 : float
/// lambda1 (dominant eigen value)
/// l2 : float
/// lambda2 : second eigenvalue
/// k : Array[2]
/// empty vector where k_1 and k_2 will be stored
/// k_detail : float
/// k_denoise : float
/// D_th : float
/// D_tr : float
/// k_stretch : float
/// k_shrink : float
/// Parameters to compute k_1 and k_2, all detailed in the article.
///
#[cubecl::cube]
fn compute_k(
k: &mut Line<f32>,
l1: f32,
l2: f32,
k_detail: f32,
k_denoise: f32,
d_th: f32,
d_tr: f32,
k_stretch: f32,
k_shrink: f32,
) {
// When a is NaN, we fall back to this condition
let mut k1 = 1.0f32;
let mut k2 = 1.0f32;
let a: f32 = 1.0 + Sqrt::sqrt((l1 - l2) / (l1 + l2));
let d: f32 = clamp(1.0 - Sqrt::sqrt(l1) / d_tr + d_th, 0.0, 1.0);
// This is a very aggressive way of driving anisotropy, but it works well so far.
if a > 1.95 {
k1 = 1.0 / k_shrink;
k2 = k_stretch;
}
k[0] = k_detail * ((1.0 - d) * k1 + d * k_denoise);
k[1] = k_detail * ((1.0 - d) * k2 + d * k_denoise);
}
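// Note: for l1 >= l2 >= 0, (l1 - l2) / (l1 + l2) lies in [0, 1], so a is in
// [1, 2]; the a > 1.95 branch fires only when l2 is below roughly 5% of l1,
// i.e. for strongly one-dimensional (edge-like) structure.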
#[cubecl::cube]
fn copysign(val: f32) -> f32 {
if val < 0.0f32 {
-(1.0f32)
} else {
1.0f32
}
}
#[cubecl::cube]
fn clamp(a: f32, min: f32, max: f32) -> f32 {
// Clamp::clamp(a, min, max)
Min::min(Max::max(a, min), max)
}

6
src/lib.rs Normal file

@@ -0,0 +1,6 @@
pub mod alignment;
pub mod gat;
pub mod image;
pub mod kernels;
// pub mod merge;
pub mod misc;

411
src/merge.rs Normal file

@@ -0,0 +1,411 @@
use cubecl::prelude::{Array, Exp, Floor, Line, Max, Min, Round, Tensor, ABSOLUTE_POS_X, ABSOLUTE_POS_Y};
use crate::image::{self, Alignments, GrayImage};
///
/// Implementation of Alg. 11: AccumulationReference
/// Accumulates the reference frame into num and den, while considering
/// (if enabled) the accumulated robustness mask to enforce single frame SR if
/// necessary.
///
/// Parameters
/// ----------
/// ref_img : device Array[imshape_y, imshape_x]
/// Reference image J_1
/// kernels : device Array[imshape_y//2, imshape_x//2, 2, 2]
/// Covariance Matrices Omega_1
/// num : device Array[s*imshape_y, s*imshape_x, c]
/// Numerator of the accumulator
/// den : device Array[s*imshape_y, s*imshape_x, c]
/// Denominator of the accumulator
/// options : dict
/// verbose options.
/// params : dict
/// parameters (containing the zoom s).
/// acc_rob : [imshape_y, imshape_x], optional
/// accumulated robustness mask. The default is None.
///
/// Returns
/// -------
/// None.
///
pub fn merge_ref() {}
#[cubecl::cube]
fn accumulate_ref(
    ref_img: &Tensor<f32>,
    covs: &Tensor<f32>,
    bayer_mode: bool,
    iso_kernel: bool,
    scale: f32,
    cfa_pattern: &Array<f32>,
    num: &mut Tensor<f32>,
    den: &mut Tensor<f32>,
    acc_rob: &Tensor<f32>,
    robustness_denoise: bool,
    max_frame_count: u32,
    rad_max: u32,
    max_multiplier: u32,
) {
    // NOTE: this kernel is still a transliteration of the original numba CUDA
    // code and the module is disabled in lib.rs; the helpers interpolate_cov,
    // quad_mat_prod, denoise_power_merge and denoise_range_merge are not ported yet.
    let output_pixel_idx = ABSOLUTE_POS_X;
    let output_pixel_idy = ABSOLUTE_POS_Y;
    let output_size_y = num.shape(0);
    let output_size_x = num.shape(1);
    let input_size_y = ref_img.shape(0);
    let input_size_x = ref_img.shape(1);
    if output_pixel_idx >= output_size_x || output_pixel_idy >= output_size_y {
        return;
    }
    let n_channels = if bayer_mode { 3 } else { 1 };
    // per-channel accumulators (only the first slot is used in grey mode)
    let mut acc: Line<f32> = Line::empty(3);
    let mut val: Line<f32> = Line::empty(3);
    // Copying the CFA locally. We will read it 9 times, so it's worth it
    let mut local_cfa: Line<u32> = Line::empty(4);
    for i in 0..2 {
        for j in 0..2 {
            local_cfa[i * 2 + j] = cfa_pattern[i * 2 + j] as u32;
        }
    }
    // y, x position of the output pixel in input coordinates
    let mut coarse_ref_sub_pos: Line<f32> = Line::empty(2);
    coarse_ref_sub_pos[0] = output_pixel_idy as f32 / scale;
    coarse_ref_sub_pos[1] = output_pixel_idx as f32 / scale;
for chan in 0..n_channels {
acc[chan] = 0.0;
val[chan] = 0.0;
}
    // computing kernel
    // this is rather slow and could probably be sped up
    // cov_i lives outside the branch because the main loop below reads it
    let mut cov_i: Line<f32> = Line::empty(4); // inverse 2x2 covariance, row-major
    if !iso_kernel {
        let mut interpolated_cov: Line<f32> = Line::empty(4); // 2x2, row-major
        // fetching the 4 closest covs
        let mut close_covs: Line<f32> = Line::empty(16); // 2x2 grid of 2x2 matrices
        let mut grey_pos: Line<f32> = Line::empty(2);
if bayer_mode {
            // grey grid is offset and twice as sparse
            grey_pos[0] = (coarse_ref_sub_pos[0] - 0.5) / 2.0;
            grey_pos[1] = (coarse_ref_sub_pos[1] - 0.5) / 2.0;
} else {
grey_pos[0] = coarse_ref_sub_pos[0]; // grey grid is exactly the coarse grid
grey_pos[1] = coarse_ref_sub_pos[1];
}
        // clipping the coordinates to stay in bounds
        let floor_x = Max::max(Floor::floor(grey_pos[1]), 0.0) as u32;
        let floor_y = Max::max(Floor::floor(grey_pos[0]), 0.0) as u32;
        let ceil_x = Min::min(floor_x + 1, covs.shape(1) - 1);
        let ceil_y = Min::min(floor_y + 1, covs.shape(0) - 1);
        for i in 0..2 {
            for j in 0..2 {
                let cov_ij = i * 2 + j;
                close_covs[cov_ij] = covs[floor_y * covs.stride(0) + floor_x * covs.stride(1) + cov_ij];
                close_covs[4 + cov_ij] = covs[floor_y * covs.stride(0) + ceil_x * covs.stride(1) + cov_ij];
                close_covs[8 + cov_ij] = covs[ceil_y * covs.stride(0) + floor_x * covs.stride(1) + cov_ij];
                close_covs[12 + cov_ij] = covs[ceil_y * covs.stride(0) + ceil_x * covs.stride(1) + cov_ij];
            }
        }
// interpolating covs
        interpolate_cov(&close_covs, &grey_pos, &mut interpolated_cov); // not ported yet
        invert_2x2(&interpolated_cov, &mut cov_i); // see the copy in kernels.rs
}
// fetching acc robustness if required
    // Acc robustness is known for each raw pixel; an implicit interpolation is
    // done from LR to HR using nearest neighbour.
    let mut local_acc_r = max_frame_count as f32;
    let mut additional_denoise_power = 1.0f32;
    let mut rad = 1i32;
    if robustness_denoise {
        let y_r = Min::min(Round::round(coarse_ref_sub_pos[0]) as u32, acc_rob.shape(0) - 1);
        let x_r = Min::min(Round::round(coarse_ref_sub_pos[1]) as u32, acc_rob.shape(1) - 1);
        local_acc_r = acc_rob[y_r * acc_rob.stride(0) + x_r];
        additional_denoise_power = denoise_power_merge(local_acc_r, max_multiplier, max_frame_count); // not ported yet
        rad = denoise_range_merge(local_acc_r, rad_max, max_frame_count) as i32; // not ported yet
    }
    let center_x = Round::round(coarse_ref_sub_pos[1]) as i32;
    let center_y = Round::round(coarse_ref_sub_pos[0]) as i32;
for i in -rad..=rad {
for j in -rad..=rad {
let pixel_idx = center_x + j;
let pixel_idy = center_y + i;
            // in bound condition
            if (0 <= pixel_idx && pixel_idx < input_size_x as i32)
                && (0 <= pixel_idy && pixel_idy < input_size_y as i32)
            {
                // checking if the pixel is r, g or b
                let channel = if bayer_mode {
                    local_cfa[(pixel_idy % 2) as u32 * 2 + (pixel_idx % 2) as u32]
                } else {
                    0
                };
// By fetching the value now, we can compute the kernel weight
// while it is called from global memory
                let c = ref_img[pixel_idy as u32 * ref_img.stride(0) + pixel_idx as u32];
// computing distance
                let dist_x = pixel_idx as f32 - coarse_ref_sub_pos[1];
                let dist_y = pixel_idy as f32 - coarse_ref_sub_pos[0];
// Computing w
                let mut y = if iso_kernel {
                    Max::max(0.0, 2.0 * (dist_x * dist_x + dist_y * dist_y))
                } else {
                    Max::max(0.0, quad_mat_prod(&cov_i, dist_x, dist_y)) // not ported yet
                };
// y can be slightly negative because of numerical precision.
// I clamp it to not explode the error with exp
// this is equivalent to multiplying the covariance,
// but at the cost of one scalar operation (instead of 4)
y /= additional_denoise_power;
                let w = if bayer_mode {
                    Exp::exp(-0.5 * y)
                } else {
                    // original kernel constants are designed for bayer distances,
                    // not grey ones, hence the x4
                    Exp::exp(-0.5 * 4.0 * y)
                };
// -----
val[channel] += c * w;
acc[channel] += w;
}
}
}
    if robustness_denoise && local_acc_r < max_frame_count as f32 {
        // Overwriting values to enforce single frame demosaicing
        for chan in 0..n_channels {
            let o = output_pixel_idy * num.stride(0) + output_pixel_idx * num.stride(1) + chan;
            num[o] = val[chan];
            den[o] = acc[chan];
        }
    } else {
        for chan in 0..n_channels {
            let o = output_pixel_idy * num.stride(0) + output_pixel_idx * num.stride(1) + chan;
            num[o] += val[chan];
            den[o] += acc[chan];
        }
    }
}
pub struct MergeConfig {
pub scale: f32,
}
///
/// Implementation of Alg. 4: Accumulation
/// Accumulates comp_img (J_n, n>1) into num and den, based on the alignment
/// V_n, the covariance matrices Omega_n and the robustness mask estimated before.
/// The size of the merge_result is adjustable with params['scale']
///
///
/// Parameters
/// ----------
/// comp_imgs : device Array [imsize_y, imsize_x]
/// The non-reference image to merge (J_n)
/// alignments : device Array[n_tiles_y, n_tiles_x, 2]
/// The final estimation of the tiles' alignment V_n(p)
/// covs : device array[imsize_y//2, imsize_x//2, 2, 2]
/// covariance matrices Omega_n
/// r : Device_Array[imsize_y, imsize_x]
/// Robustness mask r_n
/// num : device Array[s*imshape_y, s*imshape_x, c]
/// Numerator of the accumulator
/// den : device Array[s*imshape_y, s*imshape_x, c]
/// Denominator of the accumulator
///
/// options : Dict
/// Options to pass
/// params : Dict
/// parameters
///
/// Returns
/// -------
/// None
///
fn merge(
    comp_img: &GrayImage,
    alignments: &Alignments,
    covs: &image::Tensor<f32, 4>,
    r: &GrayImage,
    num: &mut image::Tensor<f32, 3>,
    den: &mut image::Tensor<f32, 3>,
    cfg: MergeConfig,
) {
// scale = params['scale']
// CFA_pattern = cuda.to_device(params['exif']['CFA Pattern'])
// bayer_mode = params['mode'] == 'bayer'
// iso_kernel = params['kernel'] == 'iso'
// tile_size = params['tuning']['tileSize']
// native_im_size = comp_img.shape
// // casting to integer to account for floating scale
// output_size = (round(scale*native_im_size[0]), round(scale*native_im_size[1]))
// dispatching threads. 1 thread for 1 output pixel
// threadsperblock = (DEFAULT_THREADS, DEFAULT_THREADS) # maximum, we may take less
// blockspergrid_x = math.ceil(output_size[1]/threadsperblock[1])
// blockspergrid_y = math.ceil(output_size[0]/threadsperblock[0])
// blockspergrid = (blockspergrid_x, blockspergrid_y)
// accumulate[blockspergrid, threadsperblock](
// comp_img, alignments, covs, r,
// bayer_mode, iso_kernel, scale, tile_size, CFA_pattern,
// num, den)
    let _ = cfg;
    let _ = den;
    let _ = num;
    let _ = r;
    let _ = covs;
    let _ = alignments;
    let _ = comp_img;
}
#[cubecl::cube]
fn accumulate(
    comp_img: &Tensor<f32>,
    alignments: &Tensor<f32>,
    covs: &Tensor<f32>,
    r: &Tensor<f32>,
    bayer_mode: bool,
    iso_kernel: bool,
    scale: f32,
    tile_size: u32,
    cfa_pattern: &Array<f32>,
    num: &mut Tensor<f32>,
    den: &mut Tensor<f32>,
) {
    // NOTE: like accumulate_ref above, this is an unfinished transliteration of
    // the numba CUDA kernel; the module is disabled in lib.rs.
    let output_pixel_idx = ABSOLUTE_POS_X;
    let output_pixel_idy = ABSOLUTE_POS_Y;
    let output_size_y = num.shape(0);
    let output_size_x = num.shape(1);
    let input_size_y = comp_img.shape(0);
    let input_size_x = comp_img.shape(1);
if output_pixel_idx >= output_size_x || output_pixel_idy >= output_size_y {
return;
}
    let n_channels = if bayer_mode { 3 } else { 1 };
    // per-channel accumulators (only the first slot is used in grey mode)
    let mut acc: Line<f32> = Line::empty(3);
    let mut val: Line<f32> = Line::empty(3);
    // Copying the CFA locally. We will read it 9 times, so it's worth it
    let mut local_cfa: Line<u32> = Line::empty(4);
    for i in 0..2 {
        for j in 0..2 {
            local_cfa[i * 2 + j] = cfa_pattern[i * 2 + j] as u32;
        }
    }
    // y, x position of the output pixel in input coordinates
    let mut coarse_ref_sub_pos: Line<f32> = Line::empty(2);
    coarse_ref_sub_pos[0] = output_pixel_idy as f32 / scale;
    coarse_ref_sub_pos[1] = output_pixel_idx as f32 / scale;
    // fetch the flow as early as possible
    let mut local_optical_flow: Line<f32> = Line::empty(2);
    let patch_idy = (coarse_ref_sub_pos[0] / tile_size as f32) as u32;
    let patch_idx = (coarse_ref_sub_pos[1] / tile_size as f32) as u32;
    let flow_offset = patch_idy * alignments.stride(0) + patch_idx * alignments.stride(1);
    local_optical_flow[0] = alignments[flow_offset];
    local_optical_flow[1] = alignments[flow_offset + 1];
for chan in 0..n_channels {
acc[chan] = 0.0;
val[chan] = 0.0;
}
    let mut patch_center_pos: Line<f32> = Line::empty(2); // y, x
    // fetching robustness
    // The robustness coefficient is known for every raw pixel, and is implicitly
    // interpolated to HR using nearest-neighbour interpolation.
    let y_r = Min::min(Round::round(coarse_ref_sub_pos[0]) as u32, r.shape(0) - 1);
    let x_r = Min::min(Round::round(coarse_ref_sub_pos[1]) as u32, r.shape(1) - 1);
    let local_r = r[y_r * r.stride(0) + x_r];
patch_center_pos[1] = coarse_ref_sub_pos[1] + local_optical_flow[0];
patch_center_pos[0] = coarse_ref_sub_pos[0] + local_optical_flow[1];
// updating inbound condition
    if patch_center_pos[1] >= input_size_x as f32 || patch_center_pos[0] >= input_size_y as f32 {
return;
}
    // computing kernel
    // cov_i lives outside the branch because the main loop below reads it
    let mut cov_i: Line<f32> = Line::empty(4); // inverse 2x2 covariance, row-major
    if !iso_kernel {
        let mut interpolated_cov: Line<f32> = Line::empty(4); // 2x2, row-major
        // fetching the 4 closest covs
        let mut close_covs: Line<f32> = Line::empty(16); // 2x2 grid of 2x2 matrices
        let mut grey_pos: Line<f32> = Line::empty(2);
if bayer_mode {
            // grey grid is offset and twice as sparse
            grey_pos[0] = (patch_center_pos[0] - 0.5) / 2.0;
            grey_pos[1] = (patch_center_pos[1] - 0.5) / 2.0;
} else {
grey_pos[0] = patch_center_pos[0]; // grey grid is exactly the coarse grid
grey_pos[1] = patch_center_pos[1];
}
        // clipping the coordinates to stay in bounds
        let floor_x = Max::max(Floor::floor(grey_pos[1]), 0.0) as u32;
        let floor_y = Max::max(Floor::floor(grey_pos[0]), 0.0) as u32;
        let ceil_x = Min::min(floor_x + 1, covs.shape(1) - 1);
        let ceil_y = Min::min(floor_y + 1, covs.shape(0) - 1);
        for i in 0..2 {
            for j in 0..2 {
                let cov_ij = i * 2 + j;
                close_covs[cov_ij] = covs[floor_y * covs.stride(0) + floor_x * covs.stride(1) + cov_ij];
                close_covs[4 + cov_ij] = covs[floor_y * covs.stride(0) + ceil_x * covs.stride(1) + cov_ij];
                close_covs[8 + cov_ij] = covs[ceil_y * covs.stride(0) + floor_x * covs.stride(1) + cov_ij];
                close_covs[12 + cov_ij] = covs[ceil_y * covs.stride(0) + ceil_x * covs.stride(1) + cov_ij];
            }
        }
// interpolating covs at the desired spot
        interpolate_cov(&close_covs, &grey_pos, &mut interpolated_cov); // not ported yet
        invert_2x2(&interpolated_cov, &mut cov_i); // see the copy in kernels.rs
}
    let center_x = Round::round(patch_center_pos[1]) as i32;
    let center_y = Round::round(patch_center_pos[0]) as i32;
for i in -1..=1 {
for j in -1..=1 {
let pixel_idx = center_x + j;
let pixel_idy = center_y + i;
            // in bound condition
            if (0 <= pixel_idx && pixel_idx < input_size_x as i32)
                && (0 <= pixel_idy && pixel_idy < input_size_y as i32)
            {
                // checking if the pixel is r, g or b
                let channel = if bayer_mode {
                    local_cfa[(pixel_idy % 2) as u32 * 2 + (pixel_idx % 2) as u32]
                } else {
                    0
                };
// By fetching the value now, we can compute the kernel weight
// while it is called from global memory
                let c = comp_img[pixel_idy as u32 * comp_img.stride(0) + pixel_idx as u32];
// computing distance
                let dist_x = pixel_idx as f32 - patch_center_pos[1];
                let dist_y = pixel_idy as f32 - patch_center_pos[0];
// Computing w
                let y = if iso_kernel {
                    Max::max(0.0, 2.0 * (dist_x * dist_x + dist_y * dist_y))
                } else {
                    // y can be slightly negative because of numerical precision;
                    // clamping keeps exp from blowing up the error
                    Max::max(0.0, quad_mat_prod(&cov_i, dist_x, dist_y)) // not ported yet
                };
                let w = Exp::exp(-0.5 * y);
//---
val[channel] += c * w * local_r;
acc[channel] += w * local_r;
}
}
}
    for chan in 0..n_channels {
        let o = output_pixel_idy * num.stride(0) + output_pixel_idx * num.stride(1) + chan;
        num[o] += val[chan];
        den[o] += acc[chan];
    }
}

5
src/misc.rs Normal file

@@ -0,0 +1,5 @@
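/// Patch distance metric used by block matching (see cube_compute_dist):
/// L1 sums absolute differences, L2 sums squared differences.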
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DistanceKind {
L1,
L2,
}