Initial

2024-12-05 17:54:52 +04:00 · 2024-12-05 17:54:52 +04:00 · 80a8d585e2
commit 80a8d585e2
44 changed files with 10650 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1 @@
+/target
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@ -0,0 +1,18 @@
+[package]
+name = "cub"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+arrayvec = "0.7.6"
+rawloader = { git = "https://github.com/andreytkachenko/rawloader" }
+cubecl = { git = "https://github.com/tracel-ai/cubecl.git", features = [
+    # "wgpu-spirv",
+    "wgpu",
+    # "cuda",
+] }
+smallvec = "1.13.2"
+bytemuck = "1.19.0"
+image = "0.25.5"
+thiserror = "2.0.4"
+log = "0.4.22"
--- a/readme.md
+++ b/readme.md
@ -0,0 +1,28 @@
+align_merge_spatial_domain     ok
+color_difference               ok
+estimate_color_noise           ok
+robust_merge                   ok
+align_image                    ok
+avg_pool                       -
+build_pyramid                  ok
+compute_tile_diff              -
+correct_upsampling_error       -
+find_best_tile_alignment       - 
+warp_image                     partial
+add_image                      ok
+add_image_highlights           ok
+add_image_exposure             ok
+add_image_weighted             ok
+blur                           partial
+calculate_black_levels         -
+calculate_weight_highlights    -
+crop_image                     -
+image_mean                     -
+upsample_alignments            -
+find_hotpixels                 -
+normalize_image                -
+prepare_image                  - 
+image_like                     -
+calculate_temporal_average     -
+correct_exposure               partial
+image_max                      -
--- a/src/alignment.rs
+++ b/src/alignment.rs
--- a/src/backend.rs
+++ b/src/backend.rs
@ -0,0 +1,66 @@
+use std::ops;
+
+use cubecl::{client::ComputeClient, prelude::CubePrimitive, Runtime};
+
+use crate::tensor::{CpuTensor, GpuTensor, Tensor};
+
+/// Wrapper around a cubecl [`ComputeClient`] for the runtime `R`.
+///
+/// The inherent impl adds tensor allocation/upload/download helpers, and the
+/// `Deref` impl exposes the wrapped client directly.
+#[derive(Debug, Clone)]
+pub struct Backend<R: Runtime> {
+    /// Compute client every helper of this type forwards to.
+    client: ComputeClient<R::Server, R::Channel>,
+}
+
+impl<Rt: Runtime> Backend<Rt> {
+    /// Reserves a device buffer large enough for a tensor of the given `shape`.
+    ///
+    /// The buffer comes from the client's `empty` call and is sized
+    /// `product(shape) * size_of::<T>()` bytes. Nothing in this function writes
+    /// to it, so callers should fill the tensor before reading it back.
+    pub fn alloc_tensor<T: CubePrimitive, const R: usize>(
+        &self,
+        shape: [usize; R],
+    ) -> GpuTensor<T, R> {
+        Tensor::create(
+            self.client
+                .empty(shape.into_iter().product::<usize>() * size_of::<T>()),
+            shape,
+        )
+    }
+
+    /// Uploads `data` to the device and wraps it as a tensor of the given `shape`.
+    ///
+    /// # Panics
+    ///
+    /// Panics when `data.len()` differs from the product of `shape`.
+    pub fn load_tensor<T: bytemuck::NoUninit + CubePrimitive, const R: usize>(
+        &self,
+        data: &[T],
+        shape: [usize; R],
+    ) -> GpuTensor<T, R> {
+        let size = shape.into_iter().product::<usize>();
+
+        assert_eq!(
+            data.len(),
+            size,
+            "load_tensor: Data size mismatch shape size!"
+        );
+
+        Tensor::create(self.client.create(bytemuck::cast_slice(data)), shape)
+    }
+
+    /// Reads `tensor` back from the device into a host-side [`CpuTensor`].
+    ///
+    /// The bytes returned by the client are copied element by element with
+    /// `bytemuck::pod_read_unaligned`. The byte buffer is allocated with `u8`
+    /// alignment, so it must not be reinterpreted in place as a `Vec<T>`: both
+    /// the pointer alignment and the layout used on deallocation would be wrong.
+    /// Trailing bytes that do not form a whole `T` are ignored.
+    pub fn store_tensor<T: bytemuck::AnyBitPattern + CubePrimitive, const R: usize>(
+        &self,
+        tensor: GpuTensor<T, R>,
+    ) -> CpuTensor<T, R> {
+        let bytes = self.client.read_one(tensor.data.binding());
+
+        let data: Vec<T> = bytes
+            .chunks_exact(std::mem::size_of::<T>())
+            .map(bytemuck::pod_read_unaligned::<T>)
+            .collect();
+
+        CpuTensor::create(data, tensor.shape)
+    }
+
+    /// Creates a backend whose client is obtained from `Rt::client(device)`.
+    pub fn new(device: &Rt::Device) -> Self {
+        Self {
+            client: Rt::client(device),
+        }
+    }
+}
+
+/// Lets a `Backend` be used wherever a reference to the wrapped
+/// `ComputeClient` is expected (for example as the client argument of a kernel launch).
+impl<R: Runtime> ops::Deref for Backend<R> {
+    type Target = ComputeClient<R::Server, R::Channel>;
+
+    /// Returns the wrapped compute client.
+    fn deref(&self) -> &Self::Target {
+        &self.client
+    }
+}
--- a/src/image.rs
+++ b/src/image.rs
@ -0,0 +1,250 @@
+use std::path::Path;
+
+use crate::{backend::Backend, tensor::GpuTensor};
+
+use cubecl::{prelude::*, CubeCount, CubeDim, Runtime, PLANE_DIM_APPROX};
+use image::{GrayImage, RgbImage};
+use rawloader::{RawImageData, Tag};
+
+pub type GpuImage<T> = GpuTensor<T, 3>;
+
+/// Dimension accessors. The tensor shape is read as `[height, width, channels]`.
+///
+/// NOTE(review): images built through `GpuImage::new` have height and width
+/// rounded up to the default cube dimensions, so these return the padded sizes.
+impl<T: bytemuck::AnyBitPattern + CubePrimitive> GpuImage<T> {
+    /// Number of rows (`shape[0]`), truncated to `u32`.
+    #[inline]
+    pub fn height(&self) -> u32 {
+        self.shape[0] as u32
+    }
+
+    /// Number of columns (`shape[1]`), truncated to `u32`.
+    #[inline]
+    pub fn width(&self) -> u32 {
+        self.shape[1] as u32
+    }
+
+    /// Number of channels per pixel (`shape[2]`), truncated to `u32`.
+    pub fn channels(&self) -> u32 {
+        self.shape[2] as u32
+    }
+}
+
+impl<T: bytemuck::AnyBitPattern + CubePrimitive> GpuImage<T> {
+    /// Allocates an image tensor of shape `[height, width, channels]` on the device.
+    ///
+    /// `width` and `height` are rounded up to the next multiple of the default
+    /// `CubeDim` x/y sizes, so the stored shape can be larger than requested.
+    /// The buffer is obtained through `Backend::alloc_tensor` and is not written here.
+    #[inline]
+    pub fn new<Rt: Runtime>(client: &Backend<Rt>, width: u32, height: u32, channels: u32) -> Self {
+        let block = CubeDim::default();
+
+        let padded_rows = height.next_multiple_of(block.y) as usize;
+        let padded_cols = width.next_multiple_of(block.x) as usize;
+
+        client.alloc_tensor([padded_rows, padded_cols, channels as usize])
+    }
+}
+
+impl GpuImage<f32> {
+    /// Downloads the image and converts it to an 8-bit [`image::DynamicImage`].
+    ///
+    /// A single-channel image becomes a grayscale image (gain 2.0); anything
+    /// else is read as RGB from channels 0..3 (gains 4.0 / 3.0 / 4.0). Each
+    /// sample is multiplied by its gain, clamped to `[0, 1]` and scaled to `0..=255`.
+    pub fn to_image<R: Runtime>(self, client: &Backend<R>) -> image::DynamicImage {
+        /// Applies `gain`, clamps to the unit range and scales to the 8-bit range.
+        fn scaled(sample: f32, gain: f32) -> f32 {
+            (sample * gain).clamp(0.0, 1.0) * 255.0
+        }
+
+        let (width, height, channels) = (self.width(), self.height(), self.channels());
+        let pixels = client.store_tensor(self);
+
+        match channels {
+            1 => GrayImage::from_par_fn(width, height, move |col, row| {
+                let luma = scaled(pixels[[row as usize, col as usize, 0]], 2.0);
+
+                image::Luma([luma as u8])
+            })
+            .into(),
+            _ => RgbImage::from_par_fn(width, height, move |col, row| {
+                let (row, col) = (row as usize, col as usize);
+
+                let red = scaled(pixels[[row, col, 0]], 4.0);
+                let green = scaled(pixels[[row, col, 1]], 3.0);
+                let blue = scaled(pixels[[row, col, 2]], 4.0);
+
+                image::Rgb([red as u8, green as u8, blue as u8])
+            })
+            .into(),
+        }
+    }
+}
+
+/// Capture metadata extracted from a raw file's EXIF block.
+///
+/// `load_raw` fills every numeric field with `0.0` when the corresponding tag is missing.
+#[derive(Debug, Clone)]
+pub struct ImageMetadata {
+    /// ISO value used to capture the image
+    pub iso_speed: f32,
+
+    /// Shutter speed used to capture the image (EXIF `ShutterSpeedValue`)
+    pub shutter_speed: f32,
+
+    /// Exposure time used to capture the image (EXIF `ExposureTime`)
+    pub exposure_time: f32,
+
+    /// Exposure bias
+    pub exposure_bias: f32,
+
+    /// Aperture value
+    pub aperture_value: f32,
+
+    /// F-number
+    pub f_number: f32,
+
+    /// Focal length
+    pub focal_length: f32,
+
+    // Colour conversion matrix: xyz to rgb (field currently disabled)
+    // pub ccm: Mat4x3f,
+
+    /// White balance coefficients encoded in the file in RGBE order
+    pub wb_coeffs: [f32; 4],
+}
+
+/// Decodes a raw file and uploads it as a normalised single-channel Bayer image.
+///
+/// Steps:
+/// 1. decode the file with `rawloader` and print basic information about it;
+/// 2. parse the EXIF block into an [`ImageMetadata`] (currently unused);
+/// 3. adjust the top/left crop so the cropped image starts on a red pixel;
+/// 4. run the `normalize_image` kernel, which writes
+///    `(pixel - black_level) / (white_level - black_level)` for every pixel of the cropped area.
+///
+/// The returned image is allocated through `GpuImage::new`, so its width and
+/// height are rounded up to the cube dimensions; the padding is set to `0.0`.
+///
+/// # Errors
+///
+/// Returns the `rawloader` error when the file cannot be decoded.
+///
+/// # Panics
+///
+/// Panics for floating-point raw data and for CFA patterns other than
+/// `RGGB`, `BGGR`, `GRBG` and `GBRG`.
+pub fn load_raw<R: Runtime, P: AsRef<Path>>(
+    client: &Backend<R>,
+    path: P,
+) -> Result<GpuImage<f32>, rawloader::RawLoaderError> {
+    println!("image `{}`:", path.as_ref().display());
+    let image = rawloader::decode_file(path)?;
+
+    println!("  CFA {:?}", image.cfa.name);
+    println!("  white_levels {:?}", image.whitelevels);
+    println!("  black_levels {:?}", image.blacklevels);
+    println!("  size {}x{}", image.width, image.height);
+    println!("  crop {:?}", image.crops);
+    println!();
+
+    // The metadata is parsed but not returned yet; missing tags fall back to 0.0.
+    let _meta = if let Some(exif) = image.exif.as_ref() {
+        let iso_speed = exif
+            .get_rational(Tag::ISOSpeed)
+            .or_else(|| exif.get_uint(Tag::ISOSpeed).map(|x| x as f32))
+            .unwrap_or(0.0);
+
+        let shutter_speed = exif.get_rational(Tag::ShutterSpeedValue).unwrap_or(0.0);
+        let aperture_value = exif.get_rational(Tag::ApertureValue).unwrap_or(0.0);
+        let exposure_time = exif.get_rational(Tag::ExposureTime).unwrap_or(0.0);
+        let exposure_bias = exif
+            .get_rational(Tag::ExposureBiasValue)
+            .or_else(|| {
+                exif.get_uint(Tag::ExposureBiasValue)
+                    .map(|x| x as i32 as f32)
+            })
+            .unwrap_or(0.0);
+
+        let f_number = exif.get_rational(Tag::FNumber).unwrap_or(0.0);
+        let focal_length = exif
+            .get_rational(Tag::FocalLength)
+            .or_else(|| exif.get_uint(Tag::FocalLength).map(|x| x as f32))
+            .unwrap_or(0.0);
+
+        Some(ImageMetadata {
+            iso_speed,
+            shutter_speed,
+            exposure_time,
+            exposure_bias,
+            aperture_value,
+            f_number,
+            focal_length,
+            // ccm: image.cam_to_xyz().into(),
+            wb_coeffs: image.wb_coeffs,
+        })
+    } else {
+        None
+    };
+
+    let image_data = match image.data {
+        RawImageData::Integer(vec) => vec,
+        RawImageData::Float(_) => unimplemented!(),
+    };
+
+    let pad_top = image.crops[0];
+    let pad_right = image.crops[1];
+    let pad_bottom = image.crops[2];
+    let pad_left = image.crops[3];
+
+    // Shift the crop origin by at most one pixel per axis so that the first
+    // cropped pixel is red. The booleans are "pad is even".
+    let (pad_top, pad_left) = match (image.cfa.name.as_str(), pad_top & 1 == 0, pad_left & 1 == 0) {
+        // red at (even row, even column)
+        ("RGGB", false, false) => (pad_top + 1, pad_left + 1),
+        ("RGGB", false, true) => (pad_top + 1, pad_left),
+        ("RGGB", true, false) => (pad_top, pad_left + 1),
+        ("RGGB", true, true) => (pad_top, pad_left),
+
+        // red at (odd row, odd column)
+        ("BGGR", false, false) => (pad_top, pad_left),
+        ("BGGR", false, true) => (pad_top, pad_left + 1),
+        ("BGGR", true, false) => (pad_top + 1, pad_left),
+        ("BGGR", true, true) => (pad_top + 1, pad_left + 1),
+
+        // red at (even row, odd column)
+        ("GRBG", false, false) => (pad_top + 1, pad_left),
+        ("GRBG", false, true) => (pad_top + 1, pad_left + 1),
+        ("GRBG", true, false) => (pad_top, pad_left),
+        ("GRBG", true, true) => (pad_top, pad_left + 1),
+
+        // red at (odd row, even column)
+        ("GBRG", false, false) => (pad_top, pad_left + 1),
+        ("GBRG", false, true) => (pad_top, pad_left),
+        ("GBRG", true, false) => (pad_top + 1, pad_left + 1),
+        ("GBRG", true, true) => (pad_top + 1, pad_left),
+
+        _ => panic!("not implemented {}", image.cfa.name),
+    };
+
+    let real_image_width = image.width - (pad_right + pad_left);
+    let real_image_height = image.height - (pad_top + pad_bottom);
+    let aligned_width = real_image_width.next_multiple_of(PLANE_DIM_APPROX);
+
+    let src_img = client.load_tensor(&image_data, [image.height, image.width, 1]);
+    let dst_img = GpuImage::new(client, real_image_width as u32, real_image_height as u32, 1);
+
+    let mut white_levels = SequenceArg::new();
+    for wl in image.whitelevels {
+        white_levels.push(ScalarArg::new(wl as u32));
+    }
+
+    let mut black_levels = SequenceArg::new();
+    for bl in image.blacklevels {
+        black_levels.push(ScalarArg::new(bl as u32));
+    }
+
+    /// Normalises one cropped pixel per thread.
+    ///
+    /// The input is read as 32-bit words holding two 16-bit samples each
+    /// (even column in the low half, odd column in the high half). Threads that
+    /// fall outside `valid_width` x `valid_height` only write `0.0`, so the
+    /// grid can be rounded up without reading past the input.
+    #[cube(launch_unchecked)]
+    fn normalize_image(
+        input: &Tensor<u32>,
+        output: &mut Tensor<f32>,
+        white_levels: &Sequence<u32>,
+        black_levels: &Sequence<u32>,
+        pad_left: u32,
+        pad_top: u32,
+        valid_width: u32,
+        valid_height: u32,
+    ) {
+        if ABSOLUTE_POS_X >= valid_width || ABSOLUTE_POS_Y >= valid_height {
+            output[ABSOLUTE_POS] = 0.0;
+            return;
+        }
+
+        let input_x = ABSOLUTE_POS_X + pad_left;
+        let input_y = ABSOLUTE_POS_Y + pad_top;
+
+        let var_x = input_x & 1;
+        let var_y = input_y & 1;
+
+        let pix = input[input_y * input.stride(0) + input_x / 2];
+        let pix = match var_x {
+            0 => pix & 0xFFFF,
+            _ => pix >> 16u32,
+        };
+
+        // 0 = (even row, even col), 3 = (odd row, odd col), 1/2 = the two mixed cells
+        let bayer_offset = var_y + var_y + var_x;
+        let wl = match bayer_offset {
+            0 => *white_levels.index(0u32) as i32,
+            3 => *white_levels.index(2u32) as i32,
+            _ => *white_levels.index(1u32) as i32,
+        };
+
+        let bl = match bayer_offset {
+            0 => *black_levels.index(0u32) as i32,
+            3 => *black_levels.index(2u32) as i32,
+            _ => *black_levels.index(1u32) as i32,
+        };
+
+        output[ABSOLUTE_POS] = (pix as i32 - bl) as f32 / (wl - bl) as f32;
+    }
+
+    let dim = CubeDim::default();
+    unsafe {
+        normalize_image::launch_unchecked::<R>(
+            &client,
+            CubeCount::Static(
+                aligned_width as u32 / dim.x,
+                // round up so the last partial block of rows is processed too
+                (real_image_height as u32).div_ceil(dim.y),
+                1,
+            ),
+            dim,
+            src_img.as_word_aligned_tensor_ref().as_tensor_arg(1),
+            dst_img.as_word_aligned_tensor_ref().as_tensor_arg(1),
+            white_levels,
+            black_levels,
+            ScalarArg::new(pad_left as u32),
+            ScalarArg::new(pad_top as u32),
+            ScalarArg::new(real_image_width as u32),
+            ScalarArg::new(real_image_height as u32),
+        )
+    };
+
+    Ok(dst_img)
+}
--- a/src/main.rs
+++ b/src/main.rs
@ -0,0 +1,56 @@
+pub mod alignment;
+pub mod backend;
+pub mod image;
+pub mod ops;
+pub mod tensor;
+
+use backend::Backend;
+use cubecl::{prelude::*, wgpu::WgpuDevice};
+use image::load_raw;
+use ops::{decimate_bayer::decimate_to_gray, downsample::downsample};
+
+/// Runs the demo pipeline on `device`.
+///
+/// Loads `test_burst1/im_00.dng`, converts it to gray with `decimate_to_gray`,
+/// applies `downsample` with factor 2 three times and saves the result as `test.png`.
+///
+/// # Panics
+///
+/// Panics when the raw file cannot be loaded or the PNG cannot be written.
+pub fn launch<R: Runtime>(device: &R::Device) {
+    println!("device: {:?}", device);
+
+    let backend = Backend::<R>::new(device);
+    let raw_img = load_raw(&backend, "test_burst1/im_00.dng").unwrap();
+    let gray_img = decimate_to_gray(&backend, &raw_img);
+
+    // let img = pad::<R>(
+    //     cube,
+    //     img,
+    //     0,
+    //     padding_patches_width,
+    //     0,
+    //     padding_patches_height,
+    // );
+    //
+    //
+    // let (h, w) = img.shape();
+
+    // // if needed, pad images with zeros so that getTiles contains all image pixels
+    // let padding_patches_height = tile_size - (h % tile_size);
+    // let padding_patches_width = tile_size - (w % tile_size);
+
+    // if padding_patches_width != 0 || padding_patches_height != 0 {
+    //     build_pyramid(cube, &img, factors)
+    // } else {
+    //     build_pyramid(cube, img, factors)
+    // }
+
+    // Three successive downsampling passes, each with factor 2.
+    let level1 = downsample(&backend, &gray_img, 2);
+    let level2 = downsample(&backend, &level1, 2);
+    let level3 = downsample(&backend, &level2, 2);
+
+    let preview = level3.to_image(&backend);
+    println!("finish");
+
+    preview.save("test.png").unwrap();
+    println!("saved to test.png");
+}
+
+/// Entry point: runs [`launch`] with the wgpu runtime on integrated GPU 0.
+fn main() {
+    let device = WgpuDevice::IntegratedGpu(0);
+    launch::<cubecl::wgpu::WgpuRuntime>(&device);
+    // launch::<cubecl::cuda::CudaRuntime>(&Default::default());
+}
--- a/src/ops/avg_pool.rs
+++ b/src/ops/avg_pool.rs
@ -0,0 +1,58 @@
+use cubecl::prelude::*;
+
+/// Average-pooling kernel: one output value per `(ABSOLUTE_POS_X, ABSOLUTE_POS_Y)`.
+///
+/// Sums the `scale` x `scale` block of `src` whose top-left corner is
+/// `(x * scale, y * scale)`, subtracting `black_level` from every sample, and
+/// stores the mean in `dst`. There is no bounds check in the kernel, so the
+/// launch grid must not exceed the output (and `scale` times it, the input).
+#[cube(launch_unchecked)]
+fn cube_avg_pool(src: &Tensor<f32>, dst: &mut Tensor<f32>, scale: u32, black_level: f32) {
+    let mut out_pixel = 0.0;
+    // top-left corner of the source block for this output pixel
+    let x0 = ABSOLUTE_POS_X * scale;
+    let y0 = ABSOLUTE_POS_Y * scale;
+
+    for dx in 0..scale {
+        for dy in 0..scale {
+            let x = x0 + dx;
+            let y = y0 + dy;
+
+            out_pixel += src[y * src.stride(0) + x * src.stride(1)] - black_level;
+        }
+    }
+
+    // divide the sum by the number of samples in the block
+    out_pixel /= (scale * scale) as f32;
+    dst[ABSOLUTE_POS_Y * dst.stride(0) + ABSOLUTE_POS_X * dst.stride(1)] = out_pixel;
+}
+
+/// Average-pooling kernel that rescales every sample by a per-position factor first.
+///
+/// The four factors are stored as `[red, green, green, blue]` and the sample at
+/// block offset `(dx, dy)` is multiplied by `mean_factor / norm_factors[dy * scale + dx]`
+/// before `black_level` is subtracted. The block mean is written to `dst`.
+///
+/// NOTE(review): `norm_factors` has four lanes, so `dy * scale + dx` only stays
+/// in range for `scale == 2` - confirm callers never pass another scale.
+#[cube(launch_unchecked)]
+fn cube_avg_pool_normalization(
+    src: &Tensor<f32>,
+    dst: &mut Tensor<f32>,
+    scale: u32,
+    black_level: f32,
+    factor_red: f32,
+    factor_green: f32,
+    factor_blue: f32,
+) {
+    let mut out_pixel = 0.0f32;
+    // top-left corner of the source block for this output pixel
+    let x0 = ABSOLUTE_POS_X * scale;
+    let y0 = ABSOLUTE_POS_Y * scale;
+
+    let mut norm_factors: Line<f32> = Line::empty(4);
+    norm_factors[0] = factor_red;
+    norm_factors[1] = factor_green;
+    norm_factors[2] = factor_green;
+    norm_factors[3] = factor_blue;
+
+    // arithmetic mean of the four factors
+    let mean_factor =
+        0.25 * (norm_factors[0] + norm_factors[1] + norm_factors[2] + norm_factors[3]);
+
+    for dx in 0..scale {
+        for dy in 0..scale {
+            let x = x0 + dx;
+            let y = y0 + dy;
+
+            // precedence: (mean_factor / factor * sample) - black_level
+            out_pixel += mean_factor / norm_factors[dy * scale + dx]
+                * src[y * src.stride(0) + x * src.stride(1)]
+                - black_level;
+        }
+    }
+
+    out_pixel /= (scale * scale) as f32;
+    dst[ABSOLUTE_POS_Y * dst.stride(0) + ABSOLUTE_POS_X * dst.stride(1)] = out_pixel;
+}
--- a/src/ops/block_matching.rs
+++ b/src/ops/block_matching.rs
@ -0,0 +1,466 @@
+use cubecl::{prelude::*, CubeCount, CubeDim, Runtime};
+
+use crate::{backend::Backend, image::GpuImage, tensor::GpuTensor};
+
+use super::gauss_pyramid::build_gauss_pyramid;
+
+/// Per-pixel cost accumulated by `cube_compute_dist` when comparing two tiles.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub enum DistanceKind {
+    /// Sum of absolute differences.
+    L1,
+    /// Sum of squared differences.
+    L2,
+}
+
+/// Block-matching parameters for one pyramid level.
+pub struct AlingLevelConfig {
+    /// Factor passed to `build_gauss_pyramid` for this level.
+    pub factor: u32,
+    /// Multiplier applied to the previous level's alignments when they are upsampled.
+    pub upsampling_factor: u32,
+    /// Side length, in pixels, of the square tiles matched on this level.
+    pub tile_size: u32,
+    /// Local search covers shifts in `-search_radius..=search_radius` on both axes.
+    pub search_radius: u32,
+    /// Distance used by the local search on this level.
+    pub distance: DistanceKind,
+}
+
+/// Complete alignment configuration: one entry per pyramid level.
+pub struct AlignmentConfig {
+    /// Per-level block-matching parameters.
+    pub levels: Vec<AlingLevelConfig>,
+}
+///
+/// Aligns `img` to the reference pyramid and returns a patchwise flow such
+/// that, for patches py, px :
+///     img[py, px] ~= ref_img[py + alignments[py, px, 1],
+///                            px + alignments[py, px, 0]]
+///
+/// Parameters
+/// ----------
+/// cube : backend used for allocations and kernel launches
+///
+/// img : Img [imshape_y, imshape_x]
+///     Image to be compared; its pyramid is built here from the `factor`
+///     of every entry of `level_configs`
+///
+/// ref_pyramid : [Img]
+///     Pyramid representation of the ref image J_1
+///
+/// level_configs : per-level settings; they are visited in reverse order
+///     while the pyramids are indexed from 0 upwards
+///
+/// Returns
+/// -------
+/// alignments : Alignments
+///     a device array[n_patchs_y, n_patchs_x, 2]
+///     Patchwise flow : V_n(p) for each patch (p)
+///
+/// Panics
+/// ------
+/// Panics when `level_configs` is empty.
+///
+pub fn align_image_block_matching<R: Runtime>(
+    cube: &Backend<R>,
+    img: &GpuImage<f32>,
+    ref_pyramid: &[GpuImage<f32>],
+    level_configs: &[AlingLevelConfig],
+) -> GpuTensor<f32, 3> {
+    let alt_pyramid = build_gauss_pyramid(cube, img, level_configs.iter().map(|x| x.factor));
+
+    // Flow estimated on the previously processed level, if any.
+    let mut flow: Option<GpuTensor<f32, 3>> = None;
+    // Tile size of the previously processed level (64 before the first one).
+    let mut last_tile_size = 64;
+
+    for (lv, level_cfg) in level_configs.iter().rev().enumerate() {
+        let refined = align_on_a_level::<R>(
+            cube,
+            &ref_pyramid[lv],
+            &alt_pyramid[lv],
+            level_cfg,
+            flow.as_ref(),
+            last_tile_size,
+        );
+
+        flow = Some(refined);
+        last_tile_size = level_cfg.tile_size;
+    }
+
+    flow.unwrap()
+}
+
+///
+/// Computes the alignments for one pyramid level.
+///
+/// When `prev` holds the alignments of the previously processed level they
+/// are upsampled and used as initial guesses; otherwise the initial guess is
+/// an all-zero flow. The guess is then refined in place by `local_search`.
+///
+/// Alignment will always be an integer with this function, however it is
+/// stored as f32. This enables to directly use the outputted
+/// alignment for ICA without any casting from int to float
+///
+/// Returns a device tensor of shape `[ref_h / tile_size, ref_w / tile_size, 2]`.
+///
+pub(crate) fn align_on_a_level<R: Runtime>(
+    cube: &Backend<R>,
+    ref_pyramid_lvl: &GpuImage<f32>,
+    alt_pyramid_lvl: &GpuImage<f32>,
+    cfg: &AlingLevelConfig,
+    prev: Option<&GpuTensor<f32, 3>>,
+    prev_tile_size: u32,
+) -> GpuTensor<f32, 3> {
+    let [ref_h, ref_w, _] = ref_pyramid_lvl.shape();
+
+    // Number of patches that can fit on this level
+    let h = ref_h as u32 / cfg.tile_size;
+    let w = ref_w as u32 / cfg.tile_size;
+    let shape = [h as usize, w as usize, 2];
+
+    let mut alignments: GpuTensor<f32, 3> = match prev {
+        Some(prev) => {
+            // use the upsampled previous alignments as initial guesses
+            let mut upsampled: GpuTensor<f32, 3> = cube.alloc_tensor(shape);
+            upsample_alignments::<R>(
+                cube,
+                &mut upsampled,
+                ref_pyramid_lvl,
+                alt_pyramid_lvl,
+                prev,
+                prev_tile_size,
+                cfg,
+            );
+            upsampled
+        }
+        None => {
+            // `alloc_tensor` does not initialise its buffer and the local
+            // search reads the flow before writing it, so start from zeros.
+            let zeros = vec![0.0f32; shape.iter().product::<usize>()];
+            cube.load_tensor(&zeros, shape)
+        }
+    };
+
+    local_search::<R>(cube, &mut alignments, ref_pyramid_lvl, alt_pyramid_lvl, cfg);
+
+    alignments
+}
+
+/// Refines `upsampled_alignments` in place by launching `cube_local_search`
+/// with one thread per tile.
+///
+/// The alignment tensor is laid out as `[tiles_y, tiles_x, 2]`, so the grid's
+/// x extent is derived from axis 1 and its y extent from axis 0. Both are
+/// rounded up to whole cubes; the kernel discards the surplus threads.
+pub fn local_search<R: Runtime>(
+    cube: &Backend<R>,
+    upsampled_alignments: &mut GpuTensor<f32, 3>,
+    ref_pyramid_lvl: &GpuImage<f32>,
+    alt_pyramid_lvl: &GpuImage<f32>,
+    cfg: &AlingLevelConfig,
+) {
+    let threadsperblock = CubeDim::default();
+
+    let n_tiles_y = upsampled_alignments.shape()[0] as u32;
+    let n_tiles_x = upsampled_alignments.shape()[1] as u32;
+
+    let blockspergrid_x = n_tiles_x.div_ceil(threadsperblock.x);
+    let blockspergrid_y = n_tiles_y.div_ceil(threadsperblock.y);
+
+    unsafe {
+        cube_local_search::launch_unchecked::<R>(
+            cube,
+            CubeCount::Static(blockspergrid_x, blockspergrid_y, 1),
+            threadsperblock,
+            ref_pyramid_lvl.as_tensor_ref().as_tensor_arg(1),
+            alt_pyramid_lvl.as_tensor_ref().as_tensor_arg(1),
+            upsampled_alignments
+                .as_word_aligned_tensor_ref()
+                .as_tensor_arg(1),
+            ScalarArg::new(cfg.tile_size),
+            ScalarArg::new(cfg.search_radius as _),
+            cfg.distance,
+        )
+    }
+}
+
+/// Local-search kernel: one thread per tile `(ABSOLUTE_POS_X, ABSOLUTE_POS_Y)`.
+///
+/// Reads the tile's current flow from `upsampled_alignments`, copies the
+/// reference tile into a local array, evaluates `cube_compute_dist` for every
+/// shift in `-search_radius..=search_radius` on both axes and adds the shift
+/// with the smallest distance to the stored flow. Ties keep the first shift
+/// visited; if no shift beats infinity the flow is left unchanged.
+#[cubecl::cube(launch_unchecked)]
+#[allow(clippy::identity_op)]
+fn cube_local_search(
+    ref_pyramid_lvl: &Tensor<f32>,
+    alt_pyramid_lvl: &Tensor<f32>,
+    upsampled_alignments: &mut Tensor<f32>,
+    tile_size: u32,
+    search_radius: i32,
+    #[comptime] distance: DistanceKind,
+) {
+    let n_patchs_y = upsampled_alignments.shape(0);
+    let n_patchs_x = upsampled_alignments.shape(1);
+
+    let tile_x = ABSOLUTE_POS_X;
+    let tile_y = ABSOLUTE_POS_Y;
+
+    // threads outside the tile grid have nothing to do
+    if tile_y >= n_patchs_y || tile_x >= n_patchs_x {
+        return;
+    }
+
+    let offset = tile_y * upsampled_alignments.stride(0) + tile_x * upsampled_alignments.stride(1);
+
+    // initial guess: component 0 is the x flow, component 1 the y flow
+    let mut local_flow = Line::empty(2);
+    local_flow[0] = upsampled_alignments[offset + 0];
+    local_flow[1] = upsampled_alignments[offset + 1];
+
+    // position of the pixel in the top left corner of the patch
+    let patch_pos_x = tile_x * tile_size;
+    let patch_pos_y = tile_y * tile_size;
+
+    // this should be rewritten to allow patches bigger than 32
+    // the reference tile is stored row by row with a row length of `tile_size`
+    let mut local_ref = Array::new(tile_size * tile_size);
+    for i in 0..tile_size {
+        for j in 0..tile_size {
+            let idx = patch_pos_x + j;
+            let idy = patch_pos_y + i;
+
+            local_ref[i * tile_size + j] = ref_pyramid_lvl[idy * ref_pyramid_lvl.stride(0) + idx];
+        }
+    }
+
+    let mut min_dist = f32::new(f32::INFINITY); // init as infinity
+    let mut min_shift_y: i32 = 0;
+    let mut min_shift_x: i32 = 0;
+
+    // window search
+    for search_shift_y in -search_radius..=search_radius {
+        for search_shift_x in -search_radius..=search_radius {
+            // computing dist
+            let dist = cube_compute_dist(
+                &local_ref,
+                alt_pyramid_lvl,
+                &local_flow,
+                patch_pos_x as i32 + search_shift_x,
+                patch_pos_y as i32 + search_shift_y,
+                tile_size,
+                distance,
+            );
+
+            if dist < min_dist {
+                min_dist = dist;
+                min_shift_y = search_shift_y;
+                min_shift_x = search_shift_x;
+            }
+        }
+    }
+
+    // store the refined flow: initial guess plus the best integer shift
+    upsampled_alignments[offset + 0] = local_flow[0] + min_shift_x as f32;
+    upsampled_alignments[offset + 1] = local_flow[1] + min_shift_y as f32;
+}
+
+///
+/// Upsamples the alignments of the previous level so they can seed the next
+/// pyramid level (Section 3.2 of the IPOL article).
+///
+/// Launches `cube_upsample_alignments` with one thread per tile of `dst`
+/// (shape `[tiles_y, tiles_x, 2]`); the grid is rounded up to whole cubes.
+///
+pub(crate) fn upsample_alignments<R: Runtime>(
+    cube: &Backend<R>,
+    dst: &mut GpuTensor<f32, 3>,
+    ref_pyramid_lvl: &GpuImage<f32>,
+    alt_pyramid_lvl: &GpuImage<f32>,
+    prev_alignments: &GpuTensor<f32, 3>,
+    prev_tile_size: u32,
+    cfg: &AlingLevelConfig,
+) {
+    let block = CubeDim::default();
+
+    let tiles_y = dst.shape()[0] as u32;
+    let tiles_x = dst.shape()[1] as u32;
+
+    // number of cubes needed to cover every tile along each axis
+    let grid = CubeCount::Static(
+        (tiles_x + block.x - 1) / block.x,
+        (tiles_y + block.y - 1) / block.y,
+        1,
+    );
+
+    unsafe {
+        cube_upsample_alignments::launch_unchecked::<R>(
+            cube,
+            grid,
+            block,
+            ref_pyramid_lvl.as_tensor_ref().as_tensor_arg(1),
+            alt_pyramid_lvl.as_tensor_ref().as_tensor_arg(1),
+            prev_alignments
+                .as_word_aligned_tensor_ref()
+                .as_tensor_arg(1),
+            dst.as_word_aligned_tensor_ref().as_tensor_arg(1),
+            ScalarArg::new(cfg.upsampling_factor),
+            ScalarArg::new(cfg.tile_size),
+            ScalarArg::new(prev_tile_size),
+        )
+    }
+}
+
+/// Upsampling kernel: one thread per tile of `upsampled_alignments`.
+///
+/// For every new tile the kernel looks up the previous-level tile that
+/// contains it plus one vertical and one horizontal neighbour, scales their
+/// flows by `upsampling_factor`, and keeps the candidate with the smallest L1
+/// distance between the reference tile and the shifted alternate image.
+/// Tiles that are not covered by any previous tile get a zero flow.
+///
+/// NOTE(review): `repeat_factor` divides by `tile_size / prev_tile_size`,
+/// which is zero whenever `tile_size < prev_tile_size` - confirm the configs rule this out.
+#[allow(clippy::identity_op)]
+#[cubecl::cube(launch_unchecked)]
+fn cube_upsample_alignments(
+    ref_pyramid_lvl: &Tensor<f32>,
+    alt_pyramid_lvl: &Tensor<f32>,
+    previous_alignments: &Tensor<f32>,
+    upsampled_alignments: &mut Tensor<f32>,
+    upsampling_factor: u32,
+    tile_size: u32,
+    prev_tile_size: u32,
+) {
+    let subtile_x = ABSOLUTE_POS_X;
+    let subtile_y = ABSOLUTE_POS_Y;
+
+    let n_tiles_y_prev = previous_alignments.shape(0);
+    let n_tiles_x_prev = previous_alignments.shape(1);
+
+    let n_tiles_y_new = upsampled_alignments.shape(0);
+    let n_tiles_x_new = upsampled_alignments.shape(1);
+
+    // width of the reference level, used below as its row length
+    let w = ref_pyramid_lvl.shape(1);
+
+    // number of new tiles per previous tile along one axis
+    let repeat_factor = upsampling_factor / (tile_size / prev_tile_size);
+
+    // threads outside the new tile grid have nothing to do
+    if subtile_x >= n_tiles_x_new || subtile_y >= n_tiles_y_new {
+        return;
+    }
+
+    let des_offset =
+        subtile_y * upsampled_alignments.stride(0) + subtile_x * upsampled_alignments.stride(1);
+
+    // the new subtile is on the side of the image, and is not contained within a bigger old tile
+    if subtile_x >= repeat_factor * n_tiles_x_prev || subtile_y >= repeat_factor * n_tiles_y_prev {
+        upsampled_alignments[des_offset + 0] = 0.;
+        upsampled_alignments[des_offset + 1] = 0.;
+        return;
+    }
+
+    let prev_tile_x = subtile_x / repeat_factor;
+    let prev_tile_y = subtile_y / repeat_factor;
+
+    // position of the top left pixel in the subtile
+    let subtile_pos_y = subtile_y * tile_size;
+    let subtile_pos_x = subtile_x * tile_size;
+
+    // copying ref patch into local memory, because it needs to be read 3 times
+    let mut local_ref = Array::new(tile_size * tile_size);
+
+    for i in 0..tile_size {
+        for j in 0..tile_size {
+            let idx = subtile_pos_x + j;
+            let idy = subtile_pos_y + i;
+
+            local_ref[i * tile_size + j] = ref_pyramid_lvl[idy * w + idx];
+        }
+    }
+
+    // position of the new tile within the old tile
+    let ups_subtile_x = subtile_x % repeat_factor;
+    let ups_subtile_y = subtile_y % repeat_factor;
+
+    // computing id for the 3 closest patches:
+    // +1 when `2 * position + 1 > repeat_factor`, -1 otherwise
+    let x_shift = if 2 * ups_subtile_x + 1 > repeat_factor {
+        1i32
+    } else {
+        -(1i32)
+    };
+
+    let y_shift = if 2 * ups_subtile_y + 1 > repeat_factor {
+        1i32
+    } else {
+        -(1i32)
+    };
+
+    // Choosing the best of the 3 alignments by minimising L1 dist
+    // (the running minimum starts at +infinity, written as 1.0 / 0.0)
+    let mut dist = f32::new(1.0f32) / 0.0;
+    let mut optimal_flow_x = 0.0;
+    let mut optimal_flow_y = 0.0;
+
+    // 3 candidate alignments are fetched (by fetching them as early as possible, we may receive
+    // them from global memory before we even require them, as calculations are performed during this delay)
+    let candidate_alignment_0_shift = cube_alignment_flow(
+        previous_alignments,
+        prev_tile_x,
+        prev_tile_y,
+        upsampling_factor as f32,
+    );
+
+    // neighbour indices are clamped to the previous tile grid
+    let candidate_alignment_vert_shift = cube_alignment_flow(
+        previous_alignments,
+        prev_tile_x,
+        clamp(prev_tile_y as i32 + y_shift, 0, n_tiles_y_prev as i32 - 1) as u32,
+        upsampling_factor as f32,
+    );
+
+    let candidate_alignment_horizontal_shift = cube_alignment_flow(
+        previous_alignments,
+        clamp(prev_tile_x as i32 + x_shift, 0, n_tiles_x_prev as i32 - 1) as u32,
+        prev_tile_y,
+        upsampling_factor as f32,
+    );
+
+    // 0 shift
+    let dist_ = cube_compute_dist(
+        &local_ref,
+        alt_pyramid_lvl,
+        &candidate_alignment_0_shift,
+        subtile_pos_x as i32,
+        subtile_pos_y as i32,
+        tile_size,
+        DistanceKind::L1,
+    );
+
+    if dist_ < dist {
+        dist = dist_;
+        optimal_flow_x = candidate_alignment_0_shift[0];
+        optimal_flow_y = candidate_alignment_0_shift[1];
+    }
+
+    // vertical shift
+    let dist_ = cube_compute_dist(
+        &local_ref,
+        alt_pyramid_lvl,
+        &candidate_alignment_vert_shift,
+        subtile_pos_x as i32,
+        subtile_pos_y as i32,
+        tile_size,
+        DistanceKind::L1,
+    );
+
+    if dist_ < dist {
+        dist = dist_;
+        optimal_flow_x = candidate_alignment_vert_shift[0];
+        optimal_flow_y = candidate_alignment_vert_shift[1];
+    }
+
+    // horizontal shift
+    let dist_ = cube_compute_dist(
+        &local_ref,
+        alt_pyramid_lvl,
+        &candidate_alignment_horizontal_shift,
+        subtile_pos_x as i32,
+        subtile_pos_y as i32,
+        tile_size,
+        DistanceKind::L1,
+    );
+
+    // last candidate: the running minimum is not needed afterwards, so it is not updated
+    if dist_ < dist {
+        optimal_flow_x = candidate_alignment_horizontal_shift[0];
+        optimal_flow_y = candidate_alignment_horizontal_shift[1];
+    }
+
+    // applying best flow
+    upsampled_alignments[des_offset + 0] = optimal_flow_x;
+    upsampled_alignments[des_offset + 1] = optimal_flow_y;
+}
+
+/// Restricts `a` to the range `min..=max`, computed as `min(max(a, min), max)`.
+#[cubecl::cube]
+fn clamp(a: i32, min: i32, max: i32) -> i32 {
+    // Clamp::clamp(a, min, max)
+    Min::min(Max::max(a, min), max)
+}
+
+/// Loads the two flow components stored for tile `(x, y)` of
+/// `previous_alignments` and multiplies both by `factor`.
+///
+/// Returns a two-lane line: lane 0 is read from `offset + 0`, lane 1 from `offset + 1`.
+#[cubecl::cube]
+#[allow(clippy::identity_op)]
+fn cube_alignment_flow(
+    previous_alignments: &Tensor<f32>,
+    x: u32,
+    y: u32,
+    factor: f32,
+) -> Line<f32> {
+    let offset = y * previous_alignments.stride(0) + x * previous_alignments.stride(1);
+
+    let mut candidate_alignment: Line<f32> = Line::empty(2);
+    candidate_alignment[0] = previous_alignments[offset + 0] * factor;
+    candidate_alignment[1] = previous_alignments[offset + 1] * factor;
+    candidate_alignment
+}
+
+/// Accumulates the distance between a reference tile and the alternate image
+/// shifted by `candidate_alignment`.
+///
+/// * `local_ref` - reference tile stored row by row with a row length of `tile_size`
+/// * `alt_pyramid_lvl` - alternate image, addressed as `y * width + x`
+/// * `candidate_alignment` - flow to test; lane 0 is the x shift, lane 1 the y shift
+///   (both truncated to integers)
+/// * `pos_x`, `pos_y` - top-left corner of the tile before the shift is applied
+/// * `distance` - per-pixel cost, absolute (`L1`) or squared (`L2`) difference
+///
+/// As soon as a shifted pixel falls outside the alternate image the
+/// accumulated distance becomes infinite.
+#[cubecl::cube]
+pub(crate) fn cube_compute_dist(
+    local_ref: &Array<f32>,
+    alt_pyramid_lvl: &Tensor<f32>,
+    candidate_alignment: &Line<f32>,
+    pos_x: i32,
+    pos_y: i32,
+    tile_size: u32,
+    #[comptime] distance: DistanceKind,
+) -> f32 {
+    let h = alt_pyramid_lvl.shape(0);
+    let w = alt_pyramid_lvl.shape(1);
+
+    let mut dist = 0.0f32;
+    for i in 0..tile_size {
+        for j in 0..tile_size {
+            let new_idx = pos_x + j as i32 + candidate_alignment[0] as i32;
+            let new_idy = pos_y + i as i32 + candidate_alignment[1] as i32;
+
+            if (0 <= new_idx && new_idx < w as i32) && (0 <= new_idy && new_idy < h as i32) {
+                let alt_offset = new_idy as u32 * w + new_idx as u32;
+                // the row length must match the one used to fill `local_ref`
+                let diff = local_ref[i * tile_size + j] - alt_pyramid_lvl[alt_offset];
+
+                dist += match distance {
+                    DistanceKind::L1 => Abs::abs(diff),
+                    DistanceKind::L2 => diff * diff,
+                };
+            } else {
+                dist = f32::new(f32::INFINITY)
+            }
+        }
+    }
+
+    dist
+}
--- a/src/ops/decimate_bayer.rs
+++ b/src/ops/decimate_bayer.rs
@ -0,0 +1,68 @@
+use cubecl::{prelude::*, Runtime};
+
+use crate::{backend::Backend, image::GpuImage};
+
+/// Collapses every 2x2 cell of `img` into one RGB pixel.
+///
+/// Allocates a three-channel image of half the width and height of `img`
+/// and launches the `decimate` kernel with `rgb = true`.
+///
+/// NOTE(review): the argument order is `(img, client)`, the reverse of the
+/// other helpers in this module.
+pub fn decimate_to_rgb<R: Runtime>(img: &GpuImage<f32>, client: &Backend<R>) -> GpuImage<f32> {
+    let mut output_img = GpuImage::new(client, img.width() / 2, img.height() / 2, 3);
+
+    // unchecked launch: the kernel itself performs no bounds checks
+    unsafe {
+        decimate::launch_unchecked::<R>(
+            &client,
+            output_img.cube_count(1, 1),
+            CubeDim::default(),
+            img.as_tensor_ref().as_tensor_arg(1),
+            output_img.as_tensor_mut().as_tensor_arg(1),
+            true,
+        )
+    };
+
+    output_img
+}
+
+/// Collapses every 2x2 cell of `img` into one gray pixel.
+///
+/// Allocates a single-channel image of half the width and height of `img`
+/// and fills it with [`decimate_to_gray_into`].
+#[inline]
+pub fn decimate_to_gray<R: Runtime>(client: &Backend<R>, img: &GpuImage<f32>) -> GpuImage<f32> {
+    let mut dst = GpuImage::new(client, img.width() / 2, img.height() / 2, 1);
+    decimate_to_gray_into(client, &mut dst, img);
+    dst
+}
+
+/// Writes the gray decimation of `img` into the caller-provided `dst`.
+///
+/// Launches the `decimate` kernel with `rgb = false`; the grid comes from
+/// `dst.cube_count(1, 1)`.
+/// NOTE(review): `dst` presumably has to be half the size of `img` in both
+/// dimensions, as in `decimate_to_gray` - nothing here verifies it.
+pub fn decimate_to_gray_into<R: Runtime>(
+    client: &Backend<R>,
+    dst: &mut GpuImage<f32>,
+    img: &GpuImage<f32>,
+) {
+    // unchecked launch: the kernel itself performs no bounds checks
+    unsafe {
+        decimate::launch_unchecked::<R>(
+            &client,
+            dst.cube_count(1, 1),
+            CubeDim::default(),
+            img.as_tensor_ref().as_tensor_arg(1),
+            dst.as_tensor_mut().as_tensor_arg(1),
+            false,
+        )
+    };
+}
+
+/// Decimation kernel: one thread per output pixel `(ABSOLUTE_POS_X, ABSOLUTE_POS_Y)`.
+///
+/// Reads the 2x2 input cell starting at `(2 * x, 2 * y)` as
+/// `[r0 g1; g2 b0]` and averages the two greens. With `rgb` set the three
+/// values are written to consecutive output elements; otherwise a single
+/// weighted sum `(0.299 r + 0.587 g + 0.114 b) * 2.0` is written.
+///
+/// The right-hand neighbours are addressed with `+ 1`, i.e. the kernel
+/// relies on horizontally adjacent input samples being one element apart.
+#[cube(launch_unchecked)]
+fn decimate(input: &Tensor<f32>, output: &mut Tensor<f32>, #[comptime] rgb: bool) {
+    // offset of the top-left sample of the 2x2 cell
+    let in_offset_0 =
+        (ABSOLUTE_POS_Y * 2) * input.stride(0) + (ABSOLUTE_POS_X * 2) * input.stride(1);
+
+    // offset of the bottom-left sample of the 2x2 cell
+    let in_offset_1 =
+        ((ABSOLUTE_POS_Y * 2) + 1) * input.stride(0) + (ABSOLUTE_POS_X * 2) * input.stride(1);
+
+    let r0 = input[in_offset_0 + 0];
+    let g1 = input[in_offset_0 + 1];
+    let g2 = input[in_offset_1 + 0];
+    let b0 = input[in_offset_1 + 1];
+    let g0 = (g1 + g2) * 0.5;
+
+    let offset = ABSOLUTE_POS_Y * output.stride(0) + ABSOLUTE_POS_X * output.stride(1);
+    if rgb {
+        output[offset + 0] = r0;
+        output[offset + 1] = g0;
+        output[offset + 2] = b0;
+    } else {
+        output[offset] = (r0 * 0.299 + g0 * 0.587 + b0 * 0.114) * 2.0;
+    }
+}
--- a/src/ops/demosaic.rs
+++ b/src/ops/demosaic.rs
@ -0,0 +1,207 @@
+use cubecl::prelude::*;
+
+use crate::{backend::Backend, image::GpuImage};
+
+/// Runs the `demosaic` kernel with `write_rgb = true` into a newly allocated
+/// 3-channel image of the same width and height as `img`.
+pub fn demosaic_rgb<R: Runtime>(client: &Backend<R>, img: &GpuImage<f32>) -> GpuImage<f32> {
+    let mut rgb = GpuImage::new(client, img.width(), img.height(), 3);
+    let grid = rgb.cube_count(1, 1);
+
+    // SAFETY: the output has the same extent as `img`; the kernel itself
+    // decides which positions it processes.
+    // NOTE(review): assumes the grid returned by `cube_count` is what the
+    // unchecked launch expects — confirm against `GpuImage::cube_count`.
+    unsafe {
+        demosaic::launch_unchecked::<R>(
+            client,
+            grid,
+            CubeDim::default(),
+            img.as_tensor_ref().as_tensor_arg(1),
+            rgb.as_tensor_mut().as_tensor_arg(1),
+            true,
+        )
+    };
+
+    rgb
+}
+
+/// Runs the `demosaic` kernel with `write_rgb = false` into a newly allocated
+/// single-channel image of the same width and height as `img`.
+pub fn demosaic_gray<R: Runtime>(client: &Backend<R>, img: &GpuImage<f32>) -> GpuImage<f32> {
+    let mut gray = GpuImage::new(client, img.width(), img.height(), 1);
+    let grid = gray.cube_count(1, 1);
+
+    // SAFETY: the output has the same extent as `img`; the kernel itself
+    // decides which positions it processes.
+    // NOTE(review): assumes the grid returned by `cube_count` is what the
+    // unchecked launch expects — confirm against `GpuImage::cube_count`.
+    unsafe {
+        demosaic::launch_unchecked::<R>(
+            client,
+            grid,
+            CubeDim::default(),
+            img.as_tensor_ref().as_tensor_arg(1),
+            gray.as_tensor_mut().as_tensor_arg(1),
+            false,
+        )
+    };
+
+    gray
+}
+
+///
+/// Malvar-He-Cutler Linear Image Demosaicking
+///   - https://www.ipol.im/pub/art/2011/g_mhcd/revisions/2011-08-14/article.pdf
+///   - https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/Demosaicing_ICASSP04.pdf
+///
+/// Each thread reconstructs the two missing colours of one pixel from a 5x5
+/// cross-shaped neighbourhood of the single-channel mosaic `input`.
+///
+/// The colour of a site is derived from the parity of its coordinates:
+/// even row / even column is red, odd row / odd column is blue, and the two
+/// mixed-parity sites are green.
+///
+/// When `write_rgb` is set, the three channels are written to consecutive
+/// elements of the output pixel; otherwise a single weighted sum
+/// `(0.299 r + 0.587 g + 0.114 b) * 2` is written.
+///
+/// The stencil reaches two samples in every direction, so pixels closer than
+/// two samples to any image edge are skipped and their output is left untouched.
+#[cube(launch_unchecked)]
+fn demosaic(input: &Tensor<f32>, output: &mut Tensor<f32>, #[comptime] write_rgb: bool) {
+    // Keep the whole +/-2 stencil inside the image. The additions are written on
+    // the left-hand side so that nothing is subtracted from an unsigned value.
+    if ABSOLUTE_POS_Y >= 2
+        && ABSOLUTE_POS_Y + 2 < input.shape(0)
+        && ABSOLUTE_POS_X >= 2
+        && ABSOLUTE_POS_X + 2 < input.shape(1)
+    {
+        let var_x = ABSOLUTE_POS_X & 1;
+        let var_y = ABSOLUTE_POS_Y & 1;
+
+        // 0: red site, 1: green on a red row, 2: green on a blue row, 3: blue site.
+        let case = var_y + var_y + var_x;
+
+        let in_offset_top2 = (ABSOLUTE_POS_Y - 2) * input.stride(0) + ABSOLUTE_POS_X;
+        let in_offset_top = (ABSOLUTE_POS_Y - 1) * input.stride(0) + ABSOLUTE_POS_X;
+        let in_offset_mid = ABSOLUTE_POS_Y * input.stride(0) + ABSOLUTE_POS_X;
+        let in_offset_bot = (ABSOLUTE_POS_Y + 1) * input.stride(0) + ABSOLUTE_POS_X;
+        let in_offset_bot2 = (ABSOLUTE_POS_Y + 2) * input.stride(0) + ABSOLUTE_POS_X;
+
+        //           t2c0
+        //      t1l1 t1c0 t1r1
+        // m0l2 m0l1 m0c0 m0r1 m0r2
+        //      b1l1 b1c0 b1r1
+        //           b2c0
+
+        let t2c0 = input[in_offset_top2];
+        let t1c0 = input[in_offset_top];
+        let m0c0 = input[in_offset_mid];
+        let b1c0 = input[in_offset_bot];
+        let b2c0 = input[in_offset_bot2];
+
+        let m0l2 = input[in_offset_mid - 2];
+        let m0l1 = input[in_offset_mid - 1];
+        let m0r1 = input[in_offset_mid + 1];
+        let m0r2 = input[in_offset_mid + 2];
+
+        let t1l1 = input[in_offset_top - 1];
+        let t1r1 = input[in_offset_top + 1];
+        let b1l1 = input[in_offset_bot - 1];
+        let b1r1 = input[in_offset_bot + 1];
+
+        let mut r = 0.0f32;
+        let mut g = 0.0f32;
+        let mut b = 0.0f32;
+
+        match case {
+            // red
+            0 => {
+                r = m0c0;
+
+                //       -1
+                //        2
+                // -1  2  4  2  -1
+                //        2
+                //       -1
+                g = (-(t2c0 + b2c0 + m0l2 + m0r2)
+                        + 2.0 * (t1c0 + b1c0 + m0l1 + m0r1) //-
+                        + 4.0 * m0c0)
+                    / 8.0;
+
+                //        -3/2
+                //       2     2
+                // -3/2     6     -3/2
+                //       2     2
+                //        -3/2
+                b = (-1.5 * (t2c0 + b2c0 + m0l2 + m0r2)
+                    + 2.0 * (t1l1 + t1r1 + b1l1 + b1r1)
+                    + 6.0 * m0c0)
+                    / 8.0;
+            }
+
+            // green1
+            1 => {
+                //        1/2
+                //     -1    -1
+                //  -1  4  5  4 -1
+                //     -1    -1
+                //        1/2
+
+                r = (-(m0l2 + m0r2 + t1l1 + t1r1 + b1l1 + b1r1)
+                    + 0.5 * (t2c0 + b2c0)
+                    + 4.0 * (m0l1 + m0r1)
+                    + 5.0 * m0c0)
+                    / 8.0;
+
+                g = m0c0;
+
+                //        -1
+                //     -1  4 -1
+                // 1/2     5     1/2
+                //     -1  4 -1
+                //        -1
+
+                b = (-(t2c0 + b2c0 + t1l1 + t1r1 + b1l1 + b1r1)
+                    + 0.5 * (m0l2 + m0r2)
+                    + 4.0 * (t1c0 + b1c0)
+                    + 5.0 * m0c0)
+                    / 8.0;
+            }
+
+            // green2
+            2 => {
+                //        -1
+                //     -1  4 -1
+                // 1/2     5     1/2
+                //     -1  4 -1
+                //        -1
+
+                r = (-(t2c0 + b2c0 + t1l1 + t1r1 + b1l1 + b1r1)
+                    + 0.5 * (m0l2 + m0r2)
+                    + 4.0 * (t1c0 + b1c0)
+                    + 5.0 * m0c0)
+                    / 8.0;
+
+                g = m0c0;
+
+                //        1/2
+                //     -1    -1
+                //  -1  4  5  4 -1
+                //     -1    -1
+                //        1/2
+
+                b = (-(m0l2 + m0r2 + t1l1 + t1r1 + b1l1 + b1r1)
+                    + 0.5 * (t2c0 + b2c0)
+                    + 4.0 * (m0l1 + m0r1)
+                    + 5.0 * m0c0)
+                    / 8.0;
+            }
+
+            // blue
+            3 => {
+                //        -3/2
+                //       2     2
+                // -3/2     6     -3/2
+                //       2     2
+                //        -3/2
+                r = (-1.5 * (t2c0 + b2c0 + m0l2 + m0r2)
+                    + 2.0 * (t1l1 + t1r1 + b1l1 + b1r1)
+                    + 6.0 * m0c0)
+                    / 8.0;
+
+                //       -1
+                //        2
+                // -1  2  4  2  -1
+                //        2
+                //       -1
+                g = (-(t2c0 + b2c0 + m0l2 + m0r2)
+                        + 2.0 * (t1c0 + b1c0 + m0l1 + m0r1) //-
+                        + 4.0 * m0c0)
+                    / 8.0;
+
+                b = m0c0;
+            }
+            _ => {}
+        }
+        let out_offset = ABSOLUTE_POS_Y * output.stride(0) + ABSOLUTE_POS_X * output.stride(1);
+
+        // `write_rgb` selects the three-channel output; the callers allocate a
+        // 3-channel image when passing `true` and a 1-channel image otherwise.
+        if write_rgb {
+            output[out_offset + 0] = r;
+            output[out_offset + 1] = g;
+            output[out_offset + 2] = b;
+        } else {
+            output[out_offset] = (r * 0.299 + g * 0.587 + b * 0.114) * 2.0;
+        }
+    }
+}
--- a/src/ops/downsample.rs
+++ b/src/ops/downsample.rs
@ -0,0 +1,120 @@
+use cubecl::{prelude::*, CubeDim, Runtime};
+
+use crate::{backend::Backend, image::GpuImage};
+
+/// Builds a normalized 1-D Gaussian kernel with `2 * radius + 1` taps.
+///
+/// Tap `x` (for `x` in `-radius..=radius`) is `exp(-x^2 / (2 * sigma^2))`, and
+/// the taps are divided by their sum so that they add up to one.
+fn create_gaussian_kernel1d(sigma: f32, radius: u32) -> Vec<f32> {
+    let reach = radius as i32;
+    let exponent_scale = -0.5 / (sigma * sigma);
+
+    let mut taps = Vec::with_capacity(2 * radius as usize + 1);
+    for offset in -reach..=reach {
+        taps.push(f32::exp(exponent_scale * (offset * offset) as f32));
+    }
+
+    let total = taps.iter().sum::<f32>();
+    for tap in &mut taps {
+        *tap /= total;
+    }
+
+    taps
+}
+
+///
+/// Blur an image with a separable Gaussian, then keep every `factor`-th pixel
+/// in both directions.
+///
+/// The Gaussian uses `sigma = factor / 2` and a radius of `4 * sigma` rounded to
+/// the nearest integer. The blur runs as two 1-D passes (along x, then along
+/// y), each writing into its own buffer so that no pass reads pixels another
+/// thread is overwriting.
+///
+/// Args:
+///     client: backend used to allocate the buffers and launch the kernels
+///  	img: Device Array the input image (WARNING: single channel only!)
+///  	factor: downsampling factor, must be non-zero
+///
+/// Returns a single-channel image of `img.width() / factor` x
+/// `img.height() / factor`.
+///
+/// # Panics
+///
+/// Panics if `factor` is zero.
+pub fn downsample<R: Runtime>(
+    client: &Backend<R>,
+    img: &GpuImage<f32>,
+    factor: u32,
+) -> GpuImage<f32> {
+    assert!(factor > 0, "downsample: `factor` must be non-zero");
+
+    let kern_host = create_gaussian_kernel1d(
+        factor as f32 * 0.5,
+        (4.0 * factor as f32 * 0.5 + 0.5) as u32,
+    );
+
+    let kern = client.load_tensor(&kern_host, [kern_host.len()]);
+    let mut row_blurred: GpuImage<f32> = GpuImage::new(client, img.width(), img.height(), 1);
+    let mut blurred: GpuImage<f32> = GpuImage::new(client, img.width(), img.height(), 1);
+
+    // Pass 1: blur along x (`transpose = false`).
+    // SAFETY: `row_blurred` has the same extent as `img`, and `conv1d` mirrors
+    // its sample positions back into the image.
+    unsafe {
+        conv1d::launch_unchecked::<R>(
+            client,
+            row_blurred.cube_count(1, 1),
+            CubeDim::default(),
+            img.as_tensor_ref().as_tensor_arg(1),
+            kern.as_array_arg(1),
+            row_blurred.as_tensor_mut().as_tensor_arg(1),
+            false,
+        )
+    };
+
+    // Pass 2: blur along y (`transpose = true`), from `row_blurred` into a
+    // separate buffer. Convolving in place would let threads read rows that
+    // other threads have already overwritten.
+    // SAFETY: same extents as in pass 1; input and output are distinct buffers.
+    unsafe {
+        conv1d::launch_unchecked::<R>(
+            client,
+            blurred.cube_count(1, 1),
+            CubeDim::default(),
+            row_blurred.as_tensor_ref().as_tensor_arg(1),
+            kern.as_array_arg(1),
+            blurred.as_tensor_mut().as_tensor_arg(1),
+            true,
+        )
+    };
+
+    let mut output_img = GpuImage::new(client, img.width() / factor, img.height() / factor, 1);
+
+    // One thread per *output* pixel: the grid must come from the smaller image,
+    // otherwise threads index past the end of both tensors.
+    // SAFETY: output position `(x, y)` reads `(x * factor, y * factor)`, which is
+    // inside `blurred` because the output is `1 / factor` of its size.
+    unsafe {
+        subsample::launch_unchecked::<R>(
+            client,
+            output_img.cube_count(1, 1),
+            CubeDim::default(),
+            blurred.as_tensor_ref().as_tensor_arg(1),
+            output_img.as_tensor_mut().as_tensor_arg(1),
+            ScalarArg::new(factor),
+            ScalarArg::new(factor),
+        )
+    };
+
+    output_img
+}
+
+/// One pass of a separable convolution: correlates `input` with the 1-D
+/// kernel `kern` along a single axis and writes the result to `output`.
+///
+/// With `transpose = false` the kernel runs along x (columns vary); with
+/// `transpose = true` it runs along y (rows vary).
+///
+/// Sample positions falling outside the image are mirrored about the first or
+/// last pixel without repeating it (position `-1` maps to `1`).
+/// NOTE(review): the mirroring is only valid while the kernel half-width does
+/// not exceed the image extent minus one along the filtered axis — confirm
+/// callers respect this.
+///
+/// `input` is addressed with a unit column stride, i.e. as a single-channel
+/// image. Threads outside the output extent return without touching memory.
+#[cube(launch_unchecked)]
+pub fn conv1d(
+    input: &Tensor<f32>,
+    kern: &Array<f32>,
+    output: &mut Tensor<f32>,
+    #[comptime] transpose: bool,
+) {
+    // The launch is unchecked: a grid that overshoots the output must not
+    // read or write past the buffers.
+    if ABSOLUTE_POS_Y >= output.shape(0) || ABSOLUTE_POS_X >= output.shape(1) {
+        return;
+    }
+
+    let kern_half = kern.len() as i32 / 2;
+
+    // Index of the last row / column, used as the mirror axis.
+    let h = output.shape(0) as i32 - 1;
+    let w = output.shape(1) as i32 - 1;
+
+    // Position of the first kernel tap.
+    let px = ABSOLUTE_POS_X as i32 - kern_half;
+    let py = ABSOLUTE_POS_Y as i32 - kern_half;
+
+    let mut val = 0.0f32;
+    for i in 0..kern.len() as u32 {
+        let ox = if transpose {
+            ABSOLUTE_POS_X
+        } else {
+            i32::abs(w - i32::abs(px + i as i32 - w)) as u32
+        };
+
+        let oy = if transpose {
+            i32::abs(h - i32::abs(py + i as i32 - h)) as u32
+        } else {
+            ABSOLUTE_POS_Y
+        };
+
+        val += input[oy * input.stride(0) + ox] * kern[i];
+    }
+
+    output[ABSOLUTE_POS_Y * output.stride(0) + ABSOLUTE_POS_X] = val;
+}
+
+/// Copies every `step_x`-th column and `step_y`-th row of `input` to `output`.
+///
+/// Output position `(x, y)` receives input position `(x * step_x, y * step_y)`.
+/// Both tensors are addressed with a unit column stride, i.e. as
+/// single-channel images.
+///
+/// Threads outside the output extent return without touching memory, so the
+/// kernel stays safe when it is launched over a grid larger than `output`.
+#[cube(launch_unchecked)]
+pub fn subsample(input: &Tensor<f32>, output: &mut Tensor<f32>, step_x: u32, step_y: u32) {
+    if ABSOLUTE_POS_Y >= output.shape(0) || ABSOLUTE_POS_X >= output.shape(1) {
+        return;
+    }
+
+    let in_x = ABSOLUTE_POS_X * step_x;
+    let in_y = ABSOLUTE_POS_Y * step_y;
+
+    output[ABSOLUTE_POS_Y * output.stride(0) + ABSOLUTE_POS_X] =
+        input[in_y * input.stride(0) + in_x];
+}
--- a/src/ops/estimate_kernels.rs
+++ b/src/ops/estimate_kernels.rs
@ -0,0 +1,442 @@
+use cubecl::prelude::*;
+
+use crate::{backend::Backend, image::GpuImage, tensor::GpuTensor};
+
+/// Tuning constants for the kernel estimation step.
+///
+/// NOTE(review): the field names match the parameters of `cube_estimate_kernel`
+/// and `compute_k`, but `estimate_kernels` does not forward them yet. The
+/// per-field notes describe how `compute_k` uses its same-named parameter —
+/// confirm once the wiring exists.
+#[derive(Debug, Clone, Copy, PartialEq)]
+pub struct EstimateKernelsConfig {
+    /// Factor multiplying both kernel scales.
+    pub k_detail: f32,
+    /// Value the kernel scales are blended towards as the clamped term
+    /// `1 - sqrt(l1) / d_tr + d_th` approaches one.
+    pub k_denoise: f32,
+    /// Additive offset in `1 - sqrt(l1) / d_tr + d_th`.
+    pub d_th: f32,
+    /// Divisor of `sqrt(l1)` in `1 - sqrt(l1) / d_tr + d_th`.
+    pub d_tr: f32,
+    /// Scale used for the second kernel axis when the anisotropy measure
+    /// exceeds 1.95.
+    pub k_stretch: f32,
+    /// Its reciprocal is the scale used for the first kernel axis when the
+    /// anisotropy measure exceeds 1.95.
+    pub k_shrink: f32,
+}
+
+/// Parameters of the noise model `std² = alpha * I + beta` used by the
+/// variance stabilization transform.
+#[derive(Debug, Clone, Copy, PartialEq)]
+pub struct NoiseConfig {
+    /// Intensity-dependent noise coefficient. The transform divides by it,
+    /// so it must not be zero.
+    pub alpha: f32,
+    /// Intensity-independent noise term.
+    pub beta: f32,
+}
+
+///
+/// Generalized Anscombe Transform
+/// noise model : std² = alpha * I + beta
+/// Where alpha and beta are ISO dependent.
+///
+/// Allocates a single-channel image with the same width and height as `image`
+/// and fills it through `generalized_ascombe_transform_into`.
+///
+/// Parameters
+/// ----------
+/// client : Backend
+///     backend used to allocate the output and launch the kernel
+/// image : GpuImage
+///     input image
+/// nc : NoiseConfig
+///     values of alpha and beta for the ISO of the image
+///
+/// Returns
+/// -------
+/// VST_image : GpuImage
+///     input image with stabilized variance
+///
+pub fn generalized_ascombe_transform<R: Runtime>(
+    client: &Backend<R>,
+    image: &GpuImage<f32>,
+    nc: NoiseConfig,
+) -> GpuImage<f32> {
+    let (width, height) = (image.width(), image.height());
+    let mut stabilized = GpuImage::new(client, width, height, 1);
+    generalized_ascombe_transform_into(client, &mut stabilized, image, nc);
+    stabilized
+}
+
+/// Applies the generalized Anscombe transform to `image`, writing into `dst`.
+///
+/// Launches `cube_gat` over the grid reported by `image.cube_count(1, 1)` with
+/// `nc.alpha` and `nc.beta` as scalar arguments.
+///
+/// NOTE(review): `dst` is presumably expected to have the same extent and row
+/// stride as `image`; nothing here verifies it.
+pub fn generalized_ascombe_transform_into<R: Runtime>(
+    client: &Backend<R>,
+    dst: &mut GpuImage<f32>,
+    image: &GpuImage<f32>,
+    nc: NoiseConfig,
+) {
+    let grid = image.cube_count(1, 1);
+    let NoiseConfig { alpha, beta } = nc;
+
+    // SAFETY: the kernel discards threads outside `image`; it relies on `dst`
+    // being at least as large as `image`.
+    unsafe {
+        cube_gat::launch_unchecked::<R>(
+            client,
+            grid,
+            CubeDim::default(),
+            image.as_tensor_ref().as_tensor_arg(1),
+            dst.as_tensor_mut().as_tensor_arg(1),
+            ScalarArg::new(alpha),
+            ScalarArg::new(beta),
+        )
+    };
+}
+
+/// Per-pixel generalized Anscombe transform:
+/// `vst = 2 / alpha * sqrt(max(0, alpha * I + 3/8 * alpha² + beta))`.
+///
+/// Both tensors are addressed with the row stride of `image` and a unit column
+/// stride. Threads outside the extent of `image` do nothing.
+#[cube(launch_unchecked)]
+fn cube_gat(image: &Tensor<f32>, vst_image: &mut Tensor<f32>, alpha: f32, beta: f32) {
+    let col = ABSOLUTE_POS_X;
+    let row = ABSOLUTE_POS_Y;
+
+    // Threads outside the image have nothing to do.
+    if row >= image.shape(0) || col >= image.shape(1) {
+        return;
+    }
+
+    let idx = row * image.stride(0) + col;
+
+    // ISO does not appear here: alpha and beta are already ISO dependent.
+    // The radicand is floored at zero so the square root stays real.
+    let radicand = f32::max(
+        0.0,
+        alpha * image[idx] + 3.0 / 8.0 * alpha * alpha + beta,
+    );
+
+    vst_image[idx] = 2.0 / alpha * f32::sqrt(radicand);
+}
+
+/// Estimates per-pixel kernel covariances for `img` (NOT IMPLEMENTED YET).
+///
+/// Only the variance stabilization step runs today; the function then reaches
+/// `todo!()`. `cfg` is not used yet.
+///
+/// The commented-out statements below appear to be the reference
+/// implementation (NumPy / PyTorch / Numba-style calls), kept as a porting
+/// guide for the remaining steps: gradient computation, then one
+/// `cube_estimate_kernel` thread per pixel.
+///
+/// # Panics
+///
+/// Always panics with `todo!()` after launching the variance stabilization.
+pub fn estimate_kernels<R: Runtime>(
+    client: &Backend<R>,
+    img: &GpuImage<f32>,
+    cfg: EstimateKernelsConfig,
+    noise_cfg: NoiseConfig,
+) -> GpuTensor<f32, 4> {
+    // Performing Variance Stabilization Transform
+    let img = generalized_ascombe_transform(client, img, noise_cfg);
+
+    // Decimate to grey
+    // if bayer_mode :
+    //     img_grey = compute_grey_images_(img, method="decimating")
+
+    // else :
+    //     img_grey = img # no need to copy now, they will be copied to gpu later.
+
+    // No decimation is performed yet: the stabilized image is used as the grey image.
+    let img_gray = img;
+    let [grey_imshape_y, grey_imshape_x, _] = img_gray.shape();
+
+    // Computing grads
+    // let th_grey_img = th.as_tensor(img_grey, dtype = DEFAULT_TORCH_FLOAT_TYPE, device = "cuda");
+
+    // Horizontal filters
+    // let grad_kernel1 = np.array([[[[-0.5, 0.5]]], [[[0.5, 0.5]]]]);
+
+    // Vertical filters
+    // let grad_kernel2 = np.array([[[[0.5], [0.5]]], [[[-0.5], [0.5]]]]);
+
+    // let tmp = conv1d(th_grey_img, grad_kernel1);
+    // let th_full_grad = conv1d(tmp, grad_kernel2, groups = 2);
+
+    // The default padding mode reduces the shape of grey_img of 1 pixel in each
+    // direction, as expected
+
+    // let cuda_full_grads =
+    // cuda.as_cuda_array(th_full_grad.squeeze().transpose(0, 1).transpose(1, 2));
+
+    // shape [y, x, 2]
+    // let covs = cuda.device_array(grey_imshape + (2, 2), DEFAULT_NUMPY_FLOAT_TYPE);
+
+    // threadsperblock = (DEFAULT_THREADS, DEFAULT_THREADS)
+    // blockspergrid_x = math.ceil(grey_imshape_x/threadsperblock[1])
+    // blockspergrid_y = math.ceil(grey_imshape_y/threadsperblock[0])
+    // blockspergrid = (blockspergrid_x, blockspergrid_y)
+
+    // cuda_estimate_kernel[blockspergrid, threadsperblock](cuda_full_grads,
+    //                                 k_detail, k_denoise, D_th, D_tr, k_stretch, k_shrink,
+    //                                 covs)
+    // covs
+    todo!()
+}
+
+/// Estimates the 2x2 kernel covariance of one pixel from the local gradients.
+///
+/// For the pixel at `(ABSOLUTE_POS_X, ABSOLUTE_POS_Y)` the kernel
+///   1. accumulates the structure tensor over the 2x2 block of gradient samples
+///      whose top-left corner is one sample up and left of the pixel, skipping
+///      samples outside `full_grads`,
+///   2. extracts its eigenvalues and eigenvectors,
+///   3. turns the eigenvalues into the two kernel scales with `compute_k`,
+///   4. writes `k1² e1 e1ᵀ + k2² e2 e2ᵀ` to the four entries of the pixel in `covs`.
+///
+/// Layout assumptions: `full_grads` is `[y, x, 2]` with the two gradient
+/// components stored consecutively, and `covs` is `[y, x, 2, 2]` with the four
+/// matrix entries stored consecutively. Pixels are addressed through
+/// `stride(0)` and `stride(1)` of each tensor so that neighbouring pixels do not
+/// overlap.
+#[allow(clippy::identity_op)]
+#[cubecl::cube]
+fn cube_estimate_kernel(
+    full_grads: &Tensor<f32>,
+    covs: &mut Tensor<f32>,
+    k_detail: f32,
+    k_denoise: f32,
+    d_th: f32,
+    d_tr: f32,
+    k_stretch: f32,
+    k_shrink: f32,
+) {
+    let pixel_idx = ABSOLUTE_POS_X;
+    let pixel_idy = ABSOLUTE_POS_Y;
+
+    let imshape_y = covs.shape(0);
+    let imshape_x = covs.shape(1);
+
+    if pixel_idy >= imshape_y || pixel_idx >= imshape_x {
+        return;
+    }
+
+    // Each pixel owns four consecutive entries, so step between pixels with
+    // the tensor strides rather than with the pixel count.
+    let covs_offset = pixel_idy * covs.stride(0) + pixel_idx * covs.stride(1);
+
+    let mut structure_tensor: Line<f32> = Line::empty(4);
+    structure_tensor[0] = 0.0;
+    structure_tensor[1] = 0.0;
+    structure_tensor[2] = 0.0;
+    structure_tensor[3] = 0.0;
+
+    for i in 0..2 {
+        for j in 0..2 {
+            let x = (pixel_idx + j) as i32 - 1;
+            let y = (pixel_idy + i) as i32 - 1;
+
+            if (0 <= y && y < full_grads.shape(0) as i32)
+                && (0 <= x && x < full_grads.shape(1) as i32)
+            {
+                // Each gradient sample holds two components, hence `stride(1)`.
+                let full_grad_offset =
+                    y as u32 * full_grads.stride(0) + x as u32 * full_grads.stride(1);
+                let full_grad_x = full_grads[full_grad_offset + 0];
+                let full_grad_y = full_grads[full_grad_offset + 1];
+
+                structure_tensor[0] += full_grad_x * full_grad_x;
+                structure_tensor[1] += full_grad_x * full_grad_y;
+                structure_tensor[2] += full_grad_x * full_grad_y;
+                structure_tensor[3] += full_grad_y * full_grad_y;
+            }
+        }
+    }
+
+    let mut l = Line::empty(2);
+    let mut e1 = Line::empty(2);
+    let mut e2 = Line::empty(2);
+    let mut k = Line::empty(2);
+
+    get_eigen_elmts_2x2(&structure_tensor, &mut l, &mut e1, &mut e2);
+    compute_k(
+        &mut k, l[0], l[1], k_detail, k_denoise, d_th, d_tr, k_stretch, k_shrink,
+    );
+
+    let k_1_sq = k[0] * k[0];
+    let k_2_sq = k[1] * k[1];
+
+    // Symmetric matrix: entries 1 and 2 are identical.
+    covs[covs_offset + 0] = k_1_sq * e1[0] * e1[0] + k_2_sq * e2[0] * e2[0];
+    covs[covs_offset + 1] = k_1_sq * e1[0] * e1[1] + k_2_sq * e2[0] * e2[1];
+    covs[covs_offset + 2] = k_1_sq * e1[0] * e1[1] + k_2_sq * e2[0] * e2[1];
+    covs[covs_offset + 3] = k_1_sq * e1[1] * e1[1] + k_2_sq * e2[1] * e2[1];
+}
+
+///
+/// Solves the 2x2 linear system A*X = B with the analytical
+/// (Cramer's rule) formula.
+///
+/// Parameters
+/// ----------
+/// a : Array[2,2]
+///     system matrix, flattened as [a00, a01, a10, a11]
+/// b : Array[2]
+///     right-hand side
+/// x : Array[2]
+///     receives the solution
+///
+/// The determinant is not checked: a singular `a` yields non-finite values.
+///
+/// Returns
+/// -------
+/// None
+///
+#[cubecl::cube]
+fn solve_2x2(a: &Line<f32>, b: &Line<f32>, x: &mut Line<f32>) {
+    let determinant = a[0] * a[3] - a[1] * a[2];
+
+    let numerator_0 = a[3] * b[0] - a[1] * b[1];
+    let numerator_1 = a[0] * b[1] - a[2] * b[0];
+
+    x[0] = numerator_0 / determinant;
+    x[1] = numerator_1 / determinant;
+}
+
+///
+/// Inverts the 2x2 matrix M, flattened as [m00, m01, m10, m11].
+///
+/// When the absolute value of the determinant does not exceed `f32::EPSILON`,
+/// the identity matrix is written instead of an inverse.
+///
+/// Parameters
+/// ----------
+/// m : Array[2, 2]
+///     Array to invert
+/// m_i : Array[2, 2]
+///     receives the inverse (or the identity fallback)
+///
+/// Returns
+/// -------
+/// None.
+///
+#[cubecl::cube]
+fn invert_2x2(m: &Line<f32>, m_i: &mut Line<f32>) {
+    let determinant = m[0] * m[3] - m[1] * m[2];
+
+    if Abs::abs(determinant) > f32::new(f32::EPSILON) {
+        // Adjugate divided by the determinant.
+        let inv_det = 1.0 / determinant;
+
+        m_i[0] = m[3] * inv_det;
+        m_i[3] = m[0] * inv_det;
+        m_i[1] = -m[1] * inv_det;
+        m_i[2] = -m[2] * inv_det;
+    } else {
+        // (Near-)singular input: fall back to the identity.
+        m_i[0] = 1.0;
+        m_i[3] = 1.0;
+        m_i[1] = 0.0;
+        m_i[2] = 0.0;
+    }
+}
+
+///
+/// Computes the two roots of the polynomial a*X^2 + b*X + c = 0 for real
+/// a, b and c. Only real roots are returned: make sure they exist before
+/// calling the function. roots[0] receives the root with the largest absolute
+/// value and roots[1] the other one.
+///
+/// `a` must be non-zero, since both roots are divided by `2 * a`.
+///
+/// Parameters
+/// ----------
+/// a : float
+///
+/// b : float
+///
+/// c : float
+///
+/// roots : Array[2]
+///
+/// Returns
+/// -------
+/// None
+///
+#[cubecl::cube]
+fn get_real_polyroots_2(a: f32, b: f32, c: f32, roots: &mut Line<f32>) {
+    // Numerical instabilities can make the discriminant slightly negative even
+    // though the equation has two real roots, hence the floor at zero.
+    let discriminant = f32::max(b * b - 4.0 * a * c, 0.0);
+    let delta_root = f32::sqrt(discriminant);
+    let denominator = 2.0 * a;
+
+    let root_plus = (-b + delta_root) / denominator;
+    let root_minus = (-b - delta_root) / denominator;
+
+    // Order the roots by decreasing absolute value.
+    if f32::abs(root_plus) >= f32::abs(root_minus) {
+        roots[0] = root_plus;
+        roots[1] = root_minus;
+    } else {
+        roots[0] = root_minus;
+        roots[1] = root_plus;
+    }
+}
+
+/// Computes the eigenvalues of the 2x2 matrix `m` (flattened as
+/// [m00, m01, m10, m11]) as the roots of its characteristic polynomial
+/// `X^2 - trace(m) * X + det(m)`.
+///
+/// `l[0]` receives the eigenvalue with the largest absolute value.
+#[cubecl::cube]
+fn get_eigen_val_2x2(m: &Line<f32>, l: &mut Line<f32>) {
+    let leading = 1.0;
+    let neg_trace = -(m[0] + m[3]);
+    let determinant = m[0] * m[3] - m[1] * m[2];
+
+    get_real_polyroots_2(leading, neg_trace, determinant, l)
+}
+
+///
+/// Computes eigenvectors of norm 1 for the eigenvalues l
+/// M.e1 = l1.e1 ; M.e2 = l2.e2
+///
+/// Parameters
+/// ----------
+/// m : Array[2,2]
+///   Real symmetric array, flattened as [m00, m01, m10, m11]
+/// l : Array[2]
+///     sorted eigenvalues of m
+/// e1, e2 : Array[2]
+///     receive the computed orthogonal and normalized eigenvectors
+///
+/// Returns
+/// -------
+/// None.
+///
+#[cubecl::cube]
+fn get_eigen_vect_2x2(m: &Line<f32>, l: &Line<f32>, e1: &mut Line<f32>, e2: &mut Line<f32>) {
+    // 2x2 algorithm : https://en.wikipedia.org/wiki/Eigenvalue_algorithm
+    if m[1] == 0.0 && m[0] == m[3] {
+        // m is a multiple of the identity: pick the two canonical orthogonal vectors.
+        e1[0] = 1.0;
+        e1[1] = 0.0;
+        e2[0] = 0.0;
+        e2[1] = 1.0;
+    } else {
+        // Sum of the two columns of (M - l[1]*I); each column is an eigenvector
+        // for l[0], and both are combined for increased reliability.
+        e1[0] = m[0] + m[1] - l[1];
+        e1[1] = m[2] + m[3] - l[1];
+
+        if e1[0] == 0.0 {
+            // e1 is along y: use the canonical pair directly.
+            e1[1] = 1.0;
+            e2[0] = 1.0;
+            e2[1] = 0.0;
+        } else if e1[1] == 0.0 {
+            // e1 is along x: use the canonical pair directly.
+            e1[0] = 1.0;
+            e2[0] = 0.0;
+            e2[1] = 1.0;
+        } else {
+            let norm_ = f32::sqrt(e1[0] * e1[0] + e1[1] * e1[1]);
+            e1[0] /= norm_;
+            e1[1] /= norm_;
+
+            // e2 is e1 rotated by a quarter turn, oriented so that e2[1] >= 0.
+            let sign = copysign(e1[0]);
+            e2[1] = f32::abs(e1[0]);
+            e2[0] = -e1[1] * sign
+        }
+    }
+}
+
+/// Computes the eigenvalues `l` of the 2x2 matrix `m`, then the matching
+/// eigenvectors `e1` and `e2`.
+///
+/// The eigenvalues must be computed first because the eigenvector routine
+/// reads them.
+#[cubecl::cube]
+fn get_eigen_elmts_2x2(m: &Line<f32>, l: &mut Line<f32>, e1: &mut Line<f32>, e2: &mut Line<f32>) {
+    get_eigen_val_2x2(m, l);
+    get_eigen_vect_2x2(m, l, e1, e2);
+}
+
+///
+/// Computes k_1 and k_2 based on lambda1, lambda2 and the constants.
+///
+/// Parameters
+/// ----------
+/// k : Array[2]
+///     vector where k_1 and k_2 are stored
+/// l1 : float
+///     lambda1 (dominant eigenvalue)
+/// l2 : float
+///     lambda2 (second eigenvalue)
+/// k_detail : float
+/// k_denoise : float
+/// d_th : float
+/// d_tr : float
+/// k_stretch : float
+/// k_shrink : float
+///     Parameters to compute k_1 and k_2, all detailed in the article.
+///
+#[cubecl::cube]
+fn compute_k(
+    k: &mut Line<f32>,
+    l1: f32,
+    l2: f32,
+    k_detail: f32,
+    k_denoise: f32,
+    d_th: f32,
+    d_tr: f32,
+    k_stretch: f32,
+    k_shrink: f32,
+) {
+    let anisotropy: f32 = 1.0 + f32::sqrt((l1 - l2) / (l1 + l2));
+    let denoise_weight: f32 = clamp(1.0 - f32::sqrt(l1) / d_tr + d_th, 0.0, 1.0);
+
+    // Isotropic defaults. They also apply when `anisotropy` is NaN, because the
+    // comparison below is then false.
+    let mut k_first = 1.0f32;
+    let mut k_second = 1.0f32;
+
+    // This is a very aggressive way of driving anisotropy, but it works well so far.
+    if anisotropy > 1.95 {
+        k_first = 1.0 / k_shrink;
+        k_second = k_stretch;
+    }
+
+    let detail_weight = 1.0 - denoise_weight;
+
+    k[0] = k_detail * (detail_weight * k_first + denoise_weight * k_denoise);
+    k[1] = k_detail * (detail_weight * k_second + denoise_weight * k_denoise);
+}
+
+/// Returns the sign of `val` as a float: `-1.0` when `val` is strictly
+/// negative, `1.0` otherwise (including for zero).
+#[cubecl::cube]
+fn copysign(val: f32) -> f32 {
+    let mut sign = 1.0f32;
+    if val < 0.0f32 {
+        sign = -(1.0f32);
+    }
+    sign
+}
+
+/// Restricts `a` to the interval `[min, max]`: the lower bound is applied
+/// first, then the upper bound.
+#[cubecl::cube]
+fn clamp(a: f32, min: f32, max: f32) -> f32 {
+    // Clamp::clamp(a, min, max)
+    let raised = f32::max(a, min);
+    f32::min(raised, max)
+}
--- a/src/ops/exposure.rs
+++ b/src/ops/exposure.rs
@ -0,0 +1,3 @@
+use cubecl::prelude::*;
+
+/// Placeholder for the exposure correction step: currently takes no
+/// arguments and does nothing.
+pub fn correct_exposure() {}
--- a/src/ops/fft.rs
+++ b/src/ops/fft.rs
@ -0,0 +1,420 @@
+use cubecl::prelude::*;
+
+/**
+ Highly-optimized inverse (backward) fast Fourier transform of one square tile.
+ The aim of this function is to provide improved performance compared to the more simple function backward_dft() while providing equal results.
+
+ Work distribution: every invocation handles the tile whose first column is ABSOLUTE_POS_X * tile_size and whose first row is ABSOLUTE_POS_Y * tile_size.
+
+ Algorithm:
+ - values are processed as 4-wide lines (accumulators are created with Line::empty(4)), presumably one lane per color channel, so all channels are transformed at once.
+ - first pass, row-wise along x: for every row, 4 small DFTs over every 4th sample are calculated and then combined by two butterfly steps (radix-4 decimation in time). The result goes to the local scratch tile.
+ - second pass, column-wise along y: same scheme applied to the columns of the scratch tile. Only the real part is computed and written.
+ - the inverse transform reuses the forward twiddle factors: the input is conjugated inside the accumulations and the sign of the imaginary part is flipped when storing the first-pass result.
+
+ Parameters:
+ - in_texture_ft: frequency-domain input. Real and imaginary parts are interleaved along x (entries 2*m and 2*m+1). The row offset is taken from stride(0); consecutive entries of a row are assumed to be adjacent (TODO confirm against the caller).
+ - out_texture: spatial-domain output, one entry per pixel, scaled by 1 / (n_textures * tile_size * tile_size).
+ - tile_size: edge length of a tile. The scratch arrays hold 16 and 128 lines, which is only enough for tile_size <= 8; the tile_size / 4 loops additionally expect a multiple of 4.
+ - n_textures: extra divisor of the normalization factor (presumably the number of merged frames; confirm with the caller).
+ */
+ #[cube(launch_unchecked)]
+ #[rustfmt::skip]
+fn backward_fft(in_texture_ft: &Tensor<Line<f32>>, out_texture: &mut Tensor<Line<f32>>, tile_size: u32, n_textures: u32) {
+
+    // compute tile positions from gid
+    let m0 = ABSOLUTE_POS_X * tile_size;
+    let n0 = ABSOLUTE_POS_Y * tile_size;
+
+    // quarter, half and three-quarter offsets used by the radix-4 butterflies
+    let tile_size_14 = tile_size / 4;
+    let tile_size_24 = tile_size / 2;
+    let tile_size_34 = tile_size / 4 * 3;
+
+    // pre-calculate factors for sine and cosine calculation
+    let angle = -2.0 * f32::new(std::f32::consts::PI) / tile_size as f32;
+
+    // pre-initialize some vectors
+    let norm_factor = Line::empty(4).fill(1.0 / (n_textures * tile_size  * tile_size) as f32);
+
+    // tmp_data: one row/column of interleaved (re, im) values -> 2 * tile_size entries are used
+    // tmp_tile: the whole tile after the first pass           -> 2 * tile_size * tile_size entries are used
+    let mut tmp_data: Array<Line<f32>> = Array::new(16);
+    let mut tmp_tile: Array<Line<f32>> = Array::new(128);
+
+    // row-wise one-dimensional fast Fourier transform along x-direction
+    for dn in 0..tile_size {
+        // start of row `dn` inside tmp_tile
+        let n_tmp = dn * 2 * tile_size;
+
+        // copy data to temp vector
+        for dm in 0..tile_size {
+            let offset = (n0 + dn) * in_texture_ft.stride(0) + 2 * (m0 + dm);
+
+            tmp_data[2 * dm + 0] = in_texture_ft[offset + 0];
+            tmp_data[2 * dm + 1] = in_texture_ft[offset + 1];
+        }
+
+        // calculate 4 small discrete Fourier transforms
+        for dm in 0..tile_size / 4 {
+            // fill with zeros
+            let mut re0 = Line::empty(4).fill(0.0);
+            let mut re1 = Line::empty(4).fill(0.0);
+            let mut re2 = Line::empty(4).fill(0.0);
+            let mut re3 = Line::empty(4).fill(0.0);
+            let mut im0 = Line::empty(4).fill(0.0);
+            let mut im1 = Line::empty(4).fill(0.0);
+            let mut im2 = Line::empty(4).fill(0.0);
+            let mut im3 = Line::empty(4).fill(0.0);
+
+            // `dx` counts groups of 4 samples: 8 entries of tmp_data (4 complex values) per step.
+            // The accumulations multiply the twiddle factor with the complex conjugate of the data.
+            // Samples 4k+1 feed re2/im2 and samples 4k+2 feed re1/im1, so that the first butterfly
+            // below pairs the even samples (0 with 1) and the odd samples (2 with 3).
+            for dx in 0..tile_size / 4 {
+                let offset = dx * 8;
+                let dmdx = (dm * dx * 4) as f32;
+
+                // calculate coefficients
+                let coef_re = Line::<f32>::empty(4).fill(f32::cos(angle * dmdx));
+                let coef_im = Line::<f32>::empty(4).fill(f32::sin(angle * dmdx));
+
+                // DFT0
+                let data_re = tmp_data[offset + 0];
+                let data_im = tmp_data[offset + 1];
+
+                re0   += coef_re * data_re + coef_im * data_im;
+                im0   += coef_im * data_re - coef_re * data_im;
+
+                // DFT1
+                let data_re = tmp_data[offset + 2];
+                let data_im = tmp_data[offset + 3];
+
+                re2   += coef_re * data_re + coef_im * data_im;
+                im2   += coef_im * data_re - coef_re * data_im;
+
+                // DFT2
+                let data_re = tmp_data[offset + 4];
+                let data_im = tmp_data[offset + 5];
+
+                re1   += coef_re * data_re + coef_im * data_im;
+                im1   += coef_im * data_re - coef_re * data_im;
+
+                //DFT3
+                let data_re = tmp_data[offset + 6];
+                let data_im = tmp_data[offset + 7];
+
+                re3   += coef_re * data_re + coef_im * data_im;
+                im3   += coef_im * data_re - coef_re * data_im;
+            }
+
+            // first butterfly to combine result
+            let coef_re = Line::<f32>::empty(4).fill(f32::cos(angle * (2 * dm) as f32));
+            let coef_im = Line::<f32>::empty(4).fill(f32::sin(angle * (2 * dm) as f32));
+
+            let re_00 = re0 + coef_re * re1 - coef_im * im1;
+            let im_00 = im0 + coef_im * re1 + coef_re * im1;
+            let re_22 = re2 + coef_re * re3 - coef_im * im3;
+            let im_22 = im2 + coef_im * re3 + coef_re * im3;
+
+            // same combination for the frequency shifted by a quarter tile
+            let coef_re = Line::<f32>::empty(4).fill(f32::cos(angle * (2 * (dm + tile_size_14)) as f32));
+            let coef_im = Line::<f32>::empty(4).fill(f32::sin(angle * (2 * (dm + tile_size_14)) as f32));
+
+            let re_11 = re0 + coef_re * re1 - coef_im * im1;
+            let im_11 = im0 + coef_im * re1 + coef_re * im1;
+            let re_33 = re2 + coef_re * re3 - coef_im * im3;
+            let im_33 = im2 + coef_im * re3 + coef_re * im3;
+
+            // second butterfly to combine results
+            // index k of re/im now stands for output frequency dm + k * tile_size / 4
+            let re0 = re_00 + Line::new(f32::cos(angle * dm as f32)) * re_22                  - Line::new(f32::sin(angle * dm as f32)) * im_22;
+            let re1 = re_11 + Line::new(f32::cos(angle * (dm + tile_size_14) as f32)) * re_33 - Line::new(f32::sin(angle * (dm + tile_size_14) as f32)) * im_33;
+            let re2 = re_00 + Line::new(f32::cos(angle * (dm + tile_size_24) as f32)) * re_22 - Line::new(f32::sin(angle * (dm + tile_size_24) as f32)) * im_22;
+            let re3 = re_11 + Line::new(f32::cos(angle * (dm + tile_size_34) as f32)) * re_33 - Line::new(f32::sin(angle * (dm + tile_size_34) as f32)) * im_33;
+
+            let im0 = im_00 + Line::new(f32::sin(angle * dm as f32)) * re_22                  + Line::new(f32::cos(angle * dm as f32)) * im_22;
+            let im1 = im_11 + Line::new(f32::sin(angle * (dm + tile_size_14) as f32)) * re_33 + Line::new(f32::cos(angle * (dm + tile_size_14) as f32)) * im_33;
+            let im2 = im_00 + Line::new(f32::sin(angle * (dm + tile_size_24) as f32)) * re_22 + Line::new(f32::cos(angle * (dm + tile_size_24) as f32)) * im_22;
+            let im3 = im_11 + Line::new(f32::sin(angle * (dm + tile_size_34) as f32)) * re_33 + Line::new(f32::cos(angle * (dm + tile_size_34) as f32)) * im_33;
+
+            // write into temporary tile storage
+            // (interleaved re/im, hence the column offsets are twice the frequency offsets;
+            //  the imaginary part is negated to undo the conjugation used above)
+            tmp_tile[n_tmp + 2 * dm + 0]                    = re0;
+            tmp_tile[n_tmp + 2 * dm + 1]                    = im0 * Line::new(-1.0);
+            tmp_tile[n_tmp + 2 * dm + tile_size_24 + 0]     = re1;
+            tmp_tile[n_tmp + 2 * dm + tile_size_24 + 1]     = im1 * Line::new(-1.0);
+            tmp_tile[n_tmp + 2 * dm + tile_size + 0]        = re2;
+            tmp_tile[n_tmp + 2 * dm + tile_size + 1]        = im2 * Line::new(-1.0);
+            tmp_tile[n_tmp + 2 * dm + tile_size_24 * 3 + 0] = re3;
+            tmp_tile[n_tmp + 2 * dm + tile_size_24 * 3 + 1] = im3 * Line::new(-1.0);
+        }
+    };
+
+    // column-wise one-dimensional fast Fourier transform along y-direction
+    for dm in 0..tile_size {
+        let m = m0 + dm;
+
+        // copy data to temp vector
+        for dn in 0..tile_size {
+            tmp_data[2 * dn + 0] = tmp_tile[dn * 2 * tile_size + 2 * dm + 0];
+            tmp_data[2 * dn + 1] = tmp_tile[dn * 2 * tile_size + 2 * dm + 1];
+        }
+
+        // calculate 4 small discrete Fourier transforms
+        for dn in 0..tile_size / 4 {
+            let n = n0 + dn;
+
+            // fill with zeros
+            let mut re0 = Line::empty(4).fill(0.0);
+            let mut re1 = Line::empty(4).fill(0.0);
+            let mut re2 = Line::empty(4).fill(0.0);
+            let mut re3 = Line::empty(4).fill(0.0);
+            let mut im0 = Line::empty(4).fill(0.0);
+            let mut im1 = Line::empty(4).fill(0.0);
+            let mut im2 = Line::empty(4).fill(0.0);
+            let mut im3 = Line::empty(4).fill(0.0);
+
+            // same accumulation scheme as in the row pass (see the note on the re1/re2 ordering there)
+            for dy in 0..tile_size / 4 {
+                let offset = dy * 8;
+                let dndy = (dn * dy * 4) as f32;
+
+                // calculate coefficients
+                let coef_re = Line::new(f32::cos(angle * dndy));
+                let coef_im = Line::new(f32::sin(angle * dndy));
+
+                // DFT0
+                let data_re = tmp_data[offset + 0];
+                let data_im = tmp_data[offset + 1];
+                re0   += coef_re * data_re + coef_im * data_im;
+                im0   += coef_im * data_re - coef_re * data_im;
+
+                // DFT1
+                let data_re = tmp_data[offset + 2];
+                let data_im = tmp_data[offset + 3];
+                re2   += coef_re*data_re + coef_im*data_im;
+                im2   += coef_im*data_re - coef_re*data_im;
+
+                // DFT2
+                let data_re = tmp_data[offset + 4];
+                let data_im = tmp_data[offset + 5];
+                re1   += coef_re * data_re + coef_im * data_im;
+                im1   += coef_im * data_re - coef_re * data_im;
+
+                // DFT3
+                let data_re = tmp_data[offset + 6];
+                let data_im = tmp_data[offset + 7];
+                re3   += coef_re * data_re + coef_im * data_im;
+                im3   += coef_im * data_re - coef_re * data_im;
+            }
+
+            // first butterfly to combine results
+            // (im_00 / im_11 are not needed here: only the real part of the result is written out)
+            let coef_re = Line::new(f32::cos(angle * (2 * dn) as f32));
+            let coef_im = Line::new(f32::sin(angle * (2 * dn) as f32));
+
+            let re_00 = re0 + coef_re * re1 - coef_im * im1;
+            // let im_00 = im0 + coef_im * re1 + coef_re * im1;
+            let re_22 = re2 + coef_re * re3 - coef_im * im3;
+            let im_22 = im2 + coef_im * re3 + coef_re * im3;
+
+            let coef_re = Line::new(f32::cos(angle * (2 * (dn + tile_size_14)) as f32));
+            let coef_im = Line::new(f32::sin(angle * (2 * (dn + tile_size_14)) as f32));
+
+            let re_11 = re0 + coef_re * re1 - coef_im * im1;
+            // let im_11 = im0 + coef_im * re1 + coef_re * im1;
+            let re_33 = re2 + coef_re * re3 - coef_im * im3;
+            let im_33 = im2 + coef_im * re3 + coef_re * im3;
+
+            // second butterfly to combine results
+            let re0 = re_00 + Line::new(f32::cos(angle * dn as f32)) * re_22                  - Line::new(f32::sin(angle * dn as f32)) * im_22;
+            let re2 = re_00 + Line::new(f32::cos(angle * (dn + tile_size_24) as f32)) * re_22 - Line::new(f32::sin(angle * (dn + tile_size_24) as f32)) * im_22;
+            let re1 = re_11 + Line::new(f32::cos(angle * (dn + tile_size_14) as f32)) * re_33 - Line::new(f32::sin(angle * (dn + tile_size_14) as f32)) * im_33;
+            let re3 = re_11 + Line::new(f32::cos(angle * (dn + tile_size_34) as f32)) * re_33 - Line::new(f32::sin(angle * (dn + tile_size_34) as f32)) * im_33;
+
+            // write into output textures
+            // (rows n, n + 1/4, n + 2/4 and n + 3/4 of the tile; values are normalized on the way out)
+
+            out_texture[n * out_texture.stride(0) + m] = re0 * norm_factor;
+            out_texture[(n + tile_size_14) * out_texture.stride(0) + m] = re1 * norm_factor;
+            out_texture[(n + tile_size_24) * out_texture.stride(0) + m] = re2 * norm_factor;
+            out_texture[(n + tile_size_34) * out_texture.stride(0) + m] = re3 * norm_factor;
+        }
+    }
+}
+
+/**
+ Highly-optimized forward fast Fourier transform of one square, real-valued tile.
+ The aim of this function is to provide improved performance compared to the more simple function forward_dft() while providing equal results.
+
+ Work distribution: every invocation handles the tile whose first column is ABSOLUTE_POS_X * tile_size and whose first row is ABSOLUTE_POS_Y * tile_size.
+
+ Algorithm:
+ - values are processed as 4-wide lines (accumulators are created with Line::empty(4)), presumably one lane per color channel, so all channels are transformed at once.
+ - first pass, column-wise along y: a plain discrete Fourier transform, two adjacent columns at a time. Every sample is weighted with a modified raised cosine window before it is accumulated. As the input image is real-valued, the frequency domain representation is symmetric and only values for N/2+1 rows have to be calculated.
+ - second pass, row-wise along x: 4 small DFTs over every 4th sample are calculated and then combined by two butterfly steps (radix-4 decimation in time).
+ - due to the symmetry mentioned earlier, only N/2+1 rows have to be transformed; the remaining N/2-1 rows are written as the complex conjugate of the mirrored entry.
+
+ Parameters:
+ - in_texture: real-valued spatial-domain input, one entry per pixel. The row offset is taken from stride(0); consecutive entries of a row are assumed to be adjacent (TODO confirm against the caller).
+ - out_texture_ft: frequency-domain output with real and imaginary parts interleaved along x (entries 2*m and 2*m+1).
+ - tile_size: edge length of a tile. The scratch arrays hold 16 and 80 lines, which is only enough for tile_size <= 8; the loops additionally expect a multiple of 4.
+ */
+
+ #[cube(launch_unchecked)]
+ #[rustfmt::skip]
+fn forward_fft(in_texture: &Tensor<Line<f32>>, out_texture_ft: &mut Tensor<Line<f32>>, tile_size: u32) {
+
+    // compute tile positions from gid
+    let m0 = ABSOLUTE_POS_X * tile_size;
+    let n0 = ABSOLUTE_POS_Y * tile_size;
+
+    // quarter, half and three-quarter offsets used by the radix-4 butterflies
+    let tile_size_14 = tile_size / 4;
+    let tile_size_24 = tile_size / 2;
+    let tile_size_34 = tile_size / 4 * 3;
+
+
+    // pre-calculate factors for sine and cosine calculation
+    let angle = -2.0 * f32::new(std::f32::consts::PI) / tile_size as f32;
+
+    // pre-initialize some vectors
+    // tmp_data: two input columns (first pass) or one interleaved (re, im) row (second pass)
+    // tmp_tile: interleaved (re, im) rows 0..=tile_size/2 of the column-transformed tile
+    let mut tmp_data: Array<Line<f32>> = Array::new(16);
+    let mut tmp_tile: Array<Line<f32>> = Array::new(80);
+
+    // column-wise one-dimensional discrete Fourier transform along y-direction
+    // (two adjacent columns, dm and dm + 1, are handled per iteration)
+    for dm in 0..tile_size / 2 {
+        let dm = dm * 2;
+        let m = m0 + dm;
+
+        // copy data to temp vector
+        for dn in 0..tile_size {
+            let offset = (n0 + dn) * in_texture.stride(0) + m;
+
+            tmp_data[2 * dn + 0] = in_texture[offset + 0];
+            tmp_data[2 * dn + 1] = in_texture[offset + 1];
+        }
+
+        // exploit symmetry of real dft and calculate reduced number of rows
+        for dn in 0..=tile_size / 2 {
+            // start of row `dn` inside tmp_tile
+            let n_tmp = dn * 2 * tile_size;
+
+            // fill with zeros
+            let mut re0 = Line::empty(4).fill(0.0);
+            let mut re1 = Line::empty(4).fill(0.0);
+            let mut im0 = Line::empty(4).fill(0.0);
+            let mut im1 = Line::empty(4).fill(0.0);
+
+            for dy in 0..tile_size {
+                // see section "Overlapped tiles" in https://graphics.stanford.edu/papers/hdrp/hasinoff-hdrplus-sigasia16.pdf or
+                //     section "Overlapped Tiles and Raised Cosine Window" in https://www.ipol.im/pub/art/2021/336/
+                // calculate modified raised cosine window weight for blending tiles to suppress artifacts
+                // (separable weight: x-term for column dm resp. dm + 1 times y-term for row dy, each evaluated at the pixel center)
+                let norm_cosine0 = Line::new((0.5 - 0.5 * f32::cos(-angle * (dm as f32 + 0.5))) * (0.5 - 0.5 * f32::cos(-angle * (dy as f32 + 0.5))));
+                let norm_cosine1 = Line::new((0.5 - 0.5 * f32::cos(-angle * (dm as f32 + 1.5))) * (0.5 - 0.5 * f32::cos(-angle * (dy as f32 + 0.5))));
+
+                // calculate coefficients
+                let coef_re = Line::new(f32::cos(angle * (dn * dy) as f32));
+                let coef_im = Line::new(f32::sin(angle * (dn * dy) as f32));
+
+                // the input is real, so only the real data term contributes
+                let data_re = norm_cosine0 * tmp_data[2 * dy + 0];
+                re0 += coef_re * data_re;
+                im0 += coef_im * data_re;
+
+                let data_re = norm_cosine1 * tmp_data[2 * dy + 1];
+                re1 += coef_re * data_re;
+                im1 += coef_im * data_re;
+            }
+
+            // write into temporary tile storage
+            tmp_tile[n_tmp + 2 * dm + 0] = re0;
+            tmp_tile[n_tmp + 2 * dm + 1] = im0;
+            tmp_tile[n_tmp + 2 * dm + 2] = re1;
+            tmp_tile[n_tmp + 2 * dm + 3] = im1;
+        }
+    }
+
+    // row-wise one-dimensional fast Fourier transform along x-direction
+    // exploit symmetry of real dft and calculate reduced number of rows
+    for dn in 0..=tile_size / 2 {
+        let n = n0 + dn;
+
+        // copy data to temp vector
+        for dm in 0..tile_size {
+            let offset = dn * 2 * tile_size + 2 * dm;
+
+            tmp_data[2 * dm + 0] = tmp_tile[offset + 0];
+            tmp_data[2 * dm + 1] = tmp_tile[offset + 1];
+        }
+
+        // calculate 4 small discrete Fourier transforms
+        for dm in 0..tile_size / 4 {
+            // first of the two interleaved output entries (re, im) for frequency dm
+            let m = 2 * (m0 + dm);
+
+            let mut re0 = Line::empty(4).fill(0.0);
+            let mut re1 = Line::empty(4).fill(0.0);
+            let mut re2 = Line::empty(4).fill(0.0);
+            let mut re3 = Line::empty(4).fill(0.0);
+            let mut im0 = Line::empty(4).fill(0.0);
+            let mut im1 = Line::empty(4).fill(0.0);
+            let mut im2 = Line::empty(4).fill(0.0);
+            let mut im3 = Line::empty(4).fill(0.0);
+
+            // `dx` advances in steps of 4 samples. Samples 4k+1 feed re2/im2 and samples 4k+2 feed
+            // re1/im1, so that the first butterfly below pairs the even samples (0 with 1) and the
+            // odd samples (2 with 3).
+            for dx in 0..tile_size / 4 {
+                let dx = dx * 4;
+
+                // calculate coefficients
+                let coef_re = Line::new(f32::cos(angle * (dm * dx) as f32));
+                let coef_im = Line::new(f32::sin(angle * (dm * dx) as f32));
+
+                // DFT0
+                let data_re = tmp_data[2 * dx + 0];
+                let data_im = tmp_data[2 * dx + 1];
+                re0   += coef_re * data_re - coef_im * data_im;
+                im0   += coef_im * data_re + coef_re * data_im;
+
+                // DFT1
+                let data_re = tmp_data[2 * dx + 2];
+                let data_im = tmp_data[2 * dx + 3];
+                re2   += coef_re * data_re - coef_im * data_im;
+                im2   += coef_im * data_re + coef_re * data_im;
+
+                // DFT2
+                let data_re = tmp_data[2 * dx + 4];
+                let data_im = tmp_data[2 * dx + 5];
+                re1   += coef_re * data_re - coef_im * data_im;
+                im1   += coef_im * data_re + coef_re * data_im;
+
+                // DFT3
+                let data_re = tmp_data[2 * dx + 6];
+                let data_im = tmp_data[2 * dx + 7];
+                re3   += coef_re * data_re - coef_im * data_im;
+                im3   += coef_im * data_re + coef_re * data_im;
+            }
+
+            // first butterfly to combine results
+            let coef_re = Line::new(f32::cos(angle * (2 * dm) as f32));
+            let coef_im = Line::new(f32::sin(angle * (2 * dm) as f32));
+
+            let re_00 = re0 + coef_re * re1 - coef_im * im1;
+            let im_00 = im0 + coef_im * re1 + coef_re * im1;
+            let re_22 = re2 + coef_re * re3 - coef_im * im3;
+            let im_22 = im2 + coef_im * re3 + coef_re * im3;
+
+            // same combination for the frequency shifted by a quarter tile
+            let coef_re = Line::new(f32::cos(angle * (2 * (dm + tile_size_14)) as f32));
+            let coef_im = Line::new(f32::sin(angle * (2 * (dm + tile_size_14)) as f32));
+            let re_11 = re0 + coef_re * re1 - coef_im * im1;
+            let im_11 = im0 + coef_im * re1 + coef_re * im1;
+            let re_33 = re2 + coef_re * re3 - coef_im * im3;
+            let im_33 = im2 + coef_im * re3 + coef_re * im3;
+
+            // second butterfly to combine results
+            // index k of re/im now stands for output frequency dm + k * tile_size / 4
+            let re0 = re_00 + Line::new(f32::cos(angle * dm as f32)) * re_22                - Line::new(f32::sin(angle * dm as f32)) * im_22;
+            let im0 = im_00 + Line::new(f32::sin(angle * dm as f32)) * re_22                + Line::new(f32::cos(angle * dm as f32)) * im_22;
+            let re2 = re_00 + Line::new(f32::cos(angle * (dm+tile_size_24) as f32)) * re_22 - Line::new(f32::sin(angle * (dm+tile_size_24) as f32)) * im_22;
+            let im2 = im_00 + Line::new(f32::sin(angle * (dm+tile_size_24) as f32)) * re_22 + Line::new(f32::cos(angle * (dm+tile_size_24) as f32)) * im_22;
+            let re1 = re_11 + Line::new(f32::cos(angle * (dm+tile_size_14) as f32)) * re_33 - Line::new(f32::sin(angle * (dm+tile_size_14) as f32)) * im_33;
+            let im1 = im_11 + Line::new(f32::sin(angle * (dm+tile_size_14) as f32)) * re_33 + Line::new(f32::cos(angle * (dm+tile_size_14) as f32)) * im_33;
+            let re3 = re_11 + Line::new(f32::cos(angle * (dm+tile_size_34) as f32)) * re_33 - Line::new(f32::sin(angle * (dm+tile_size_34) as f32)) * im_33;
+            let im3 = im_11 + Line::new(f32::sin(angle * (dm+tile_size_34) as f32)) * re_33 + Line::new(f32::cos(angle * (dm+tile_size_34) as f32)) * im_33;
+
+            // write into output texture
+            // (interleaved re/im, hence the column offsets are twice the frequency offsets)
+            let offset = n * out_texture_ft.stride(0) + m;
+            out_texture_ft[offset + 0] = re0;
+            out_texture_ft[offset + 1] = im0;
+            out_texture_ft[offset + tile_size_24 + 0] = re1;
+            out_texture_ft[offset + tile_size_24 + 1] = im1;
+            out_texture_ft[offset + tile_size + 0] = re2;
+            out_texture_ft[offset + tile_size + 1] = im2;
+            out_texture_ft[offset + tile_size_24 * 3 + 0] = re3;
+            out_texture_ft[offset + tile_size_24 * 3 + 1] = im3;
+
+            // exploit symmetry of real dft and set values for remaining rows
+            // (rows 0 and tile_size / 2 are their own mirror image and are skipped)
+            if dn > 0 && dn != tile_size / 2 {
+                // mirrored row, and mirrored columns for the four frequencies computed above
+                let n2 = n0 + tile_size - dn;
+                // Min::min(dm, 1) is 0 for dm == 0 and 1 otherwise, i.e. the branch-free form of:
+                //int const m20 = 2*(m0 + (dm==0 ? 0 : tile_size-dm));
+                let m20 = 2 * (m0 + Min::min(dm, 1) * (tile_size - dm));
+                let m21 = 2 * (m0 + tile_size - dm - tile_size_14);
+                let m22 = 2 * (m0 + tile_size - dm - tile_size_24);
+                let m23 = 2 * (m0 + tile_size - dm - tile_size_14 * 3);
+
+                // write into output texture
+                // (complex conjugate: same real part, negated imaginary part)
+                let offset = n2 * out_texture_ft.stride(0);
+
+                out_texture_ft[offset + m20 + 0] = re0;
+                out_texture_ft[offset + m20 + 1] = im0 * Line::new(-1.0);
+                out_texture_ft[offset + m21 + 0] = re1;
+                out_texture_ft[offset + m21 + 1] = im1 * Line::new(-1.0);
+                out_texture_ft[offset + m22 + 0] = re2;
+                out_texture_ft[offset + m22 + 1] = im2 * Line::new(-1.0);
+                out_texture_ft[offset + m23 + 0] = re3;
+                out_texture_ft[offset + m23 + 1] = im3 * Line::new(-1.0);
+            }
+        }
+    }
+}
--- a/src/ops/gauss_pyramid.rs
+++ b/src/ops/gauss_pyramid.rs
@ -0,0 +1,37 @@
+use cubecl::Runtime;
+
+use crate::{backend::Backend, image::GpuImage};
+
+use super::downsample::downsample;
+
+/// Settings attached to a single pyramid layer.
+///
+/// NOTE(review): nothing in this file reads this struct yet; the field
+/// meanings below are inferred from their names and should be confirmed
+/// against the code that consumes it.
+pub struct PyramidLayerConfig {
+    /// Tile edge length for this layer (presumably in pixels).
+    pub tile_size: u32,
+    /// Presumably the downsampling factor of this layer relative to the previous one.
+    pub factor: u32,
+}
+
+///
+/// Builds a coarse-to-fine pyramid by repeatedly downsampling `img`.
+///
+/// Args:
+///         cube: backend the downsampling runs on
+///         image: input image (expected to be a grayscale image downsampled from a Bayer raw image)
+///         factors: downsampling factors, given fine-to-coarse; the first one is applied
+///                  to `img`, every following one to the level produced just before it
+///
+/// Returns:
+///         one image per factor, coarsest level first; empty when `factors` is empty
+///
+pub(crate) fn build_gauss_pyramid<R: Runtime>(
+    cube: &Backend<R>,
+    img: &GpuImage<f32>,
+    factors: impl Iterator<Item = u32>,
+) -> Vec<GpuImage<f32>> {
+    // Reserve room up front: prefer the iterator's upper bound, fall back to the lower one.
+    let (lower, upper) = factors.size_hint();
+    let mut levels: Vec<GpuImage<f32>> = Vec::with_capacity(upper.unwrap_or(lower));
+
+    // Each level is derived from the previous one (or from the input image for the first level).
+    for factor in factors {
+        let next = downsample(cube, levels.last().unwrap_or(img), factor);
+        levels.push(next);
+    }
+
+    // Levels were produced fine-to-coarse; callers expect coarse-to-fine.
+    levels.reverse();
+    levels
+}
--- a/src/ops/lucas_kanade.rs
+++ b/src/ops/lucas_kanade.rs
@ -0,0 +1,216 @@
+use cubecl::prelude::*;
+
+use crate::{image::GpuImage, tensor::GpuTensor};
+
+/// Host-side entry point for computing the gradients of `img`.
+///
+/// Not implemented yet: the body is `todo!()`, so calling this panics.
+/// NOTE(review): the returned rank-3 tensor presumably holds the x and y
+/// gradients per pixel, matching what `cube_hessian` / `get_new_flow` read
+/// from `grad` — confirm once implemented.
+fn compute_grads(img: &GpuImage<f32>) -> GpuTensor<f32, 3> {
+    todo!()
+}
+
+/// Host-side entry point for computing the per-tile hessian from `grads`.
+///
+/// Not implemented yet: the body is `todo!()`, so calling this panics.
+/// NOTE(review): presumably meant to launch the `cube_hessian` kernel with
+/// the given `tile_size` — confirm once implemented.
+fn compute_hessian(grads: &GpuTensor<f32, 3>, tile_size: u32) -> GpuTensor<f32, 3> {
+    todo!()
+}
+
+///
+///   Accumulates, for every tile, the 2 by 2 matrix built from the products of
+///   the x and y gradients of all pixels of that tile.
+///
+///   One invocation handles the tile at (ABSOLUTE_POS_X, ABSOLUTE_POS_Y).
+///   `grad` is read as two consecutive values (x gradient, y gradient) per pixel,
+///   `hess` receives four consecutive values per tile:
+///   sum(gx*gx), sum(gx*gy), sum(gx*gy), sum(gy*gy).
+///
+#[cube(launch_unchecked)]
+fn cube_hessian(grad: &Tensor<f32>, hess: &mut Tensor<f32>, tile_size: u32) {
+    let tile_x = ABSOLUTE_POS_X;
+    let tile_y = ABSOLUTE_POS_Y;
+
+    let tiles_y = hess.shape(0);
+    let tiles_x = hess.shape(1);
+
+    // invocations outside the tile grid have nothing to do
+    if tile_y >= tiles_y || tile_x >= tiles_x {
+        return;
+    }
+
+    // Top-left pixel of the tile on the image grid. Because the border tiles
+    // are padded, pixels derived from it can lie outside the image.
+    let origin_y = tile_size * tile_y;
+    let origin_x = tile_size * tile_x;
+
+    let mut acc = Line::empty(4);
+    acc[0] = 0.0;
+    acc[1] = 0.0;
+    acc[2] = 0.0;
+    acc[3] = 0.0;
+
+    for row in 0..tile_size {
+        for col in 0..tile_size {
+            let pixel_y = origin_y + row;
+            let pixel_x = origin_x + col;
+
+            // skip the padded part of border tiles
+            if pixel_y < grad.shape(0) && pixel_x < grad.shape(1) {
+                let base = pixel_y * grad.stride(0) + pixel_x * grad.stride(1);
+                let gx = grad[base + 0];
+                let gy = grad[base + 1];
+
+                // the matrix is symmetric: both off-diagonal entries get the same term
+                let gxy = gx * gy;
+
+                acc[0] += gx * gx;
+                acc[1] += gxy;
+                acc[2] += gxy;
+                acc[3] += gy * gy;
+            }
+        }
+    }
+
+    let out = tile_y * hess.stride(0) + tile_x * hess.stride(1);
+    hess[out + 0] = acc[0];
+    hess[out + 1] = acc[1];
+    hess[out + 2] = acc[2];
+    hess[out + 3] = acc[3];
+}
+
+///
+///   Performs one update step of the per-tile alignment (flow) vectors.
+///
+///   The update relies on solving AX = B, a 2 by 2 system.
+///   A is precomputed (read from `hessian`), but B is evaluated each time.
+///
+///   One invocation handles the tile at (ABSOLUTE_POS_X, ABSOLUTE_POS_Y).
+///
+///   Parameters
+///   ----------
+///   ref_img : reference image, indexed as [y, x]
+///   alignment : two consecutive values per tile (x shift, y shift); read, then
+///       overwritten with the updated shift. Left untouched when the tile's
+///       system is not solvable.
+///   comp_img : image that is aligned onto `ref_img`, indexed as [y, x]; its
+///       shape also serves as the image bounds for all checks below
+///   grad : two consecutive values per pixel (x gradient, y gradient)
+///   hessian : four consecutive values per tile, the matrix A
+///   tile_size : edge length of a tile in pixels
+///
+#[cube(launch_unchecked)]
+fn get_new_flow(
+    ref_img: &Tensor<f32>,
+    alignment: &mut Tensor<f32>,
+    comp_img: &Tensor<f32>,
+    grad: &Tensor<f32>,
+    hessian: &Tensor<f32>,
+    tile_size: u32,
+) {
+    let imsize_y = comp_img.shape(0);
+    let imsize_x = comp_img.shape(1);
+
+    let n_patchs_y = alignment.shape(0);
+    let n_patchs_x = alignment.shape(1);
+    let patch_idx = ABSOLUTE_POS_X;
+    let patch_idy = ABSOLUTE_POS_Y;
+
+    // discarding non existing patches
+    if patch_idy >= n_patchs_y || patch_idx >= n_patchs_x {
+        return;
+    }
+
+    let alig_offset = patch_idy * alignment.stride(0) + patch_idx * alignment.stride(1);
+    let hess_offset = patch_idy * hessian.stride(0) + patch_idx * hessian.stride(1);
+
+    // top-left pixel of the patch on the image grid
+    let patch_pos_x = tile_size * patch_idx;
+    let patch_pos_y = tile_size * patch_idy;
+
+    // A, stored row by row: [a00, a01, a10, a11]
+    let mut a = Line::empty(4);
+    a[0] = hessian[hess_offset + 0];
+    a[1] = hessian[hess_offset + 1];
+    a[2] = hessian[hess_offset + 2];
+    a[3] = hessian[hess_offset + 3];
+
+    // By putting non solvable exit this early, the remaining calculations are
+    // skipped for burned patches, which represents most of over-exposed images !
+    if f32::abs(a[0] * a[3] - a[1] * a[2]) < 1e-10 {
+        // system is Not solvable
+        return;
+    }
+
+    // right-hand side B, accumulated over the pixels of the patch
+    let mut b = Line::empty(2);
+    b[0] = 0.0;
+    b[1] = 0.0;
+
+    // current shift of this patch: [0] is added to x, [1] is added to y
+    let mut local_alignment = Line::empty(2);
+    local_alignment[0] = alignment[alig_offset + 0];
+    local_alignment[1] = alignment[alig_offset + 1];
+
+    // scratch for the interpolation: the 4 neighbouring values and the fractional position
+    let mut buffer_val = Line::empty(4);
+    let mut pos = Line::empty(2); // y, x
+
+    for i in 0..tile_size {
+        for j in 0..tile_size {
+            let pixel_global_idx = patch_pos_x + j; // global position on the coarse grey grid.
+                                                    // Because of extremity padding, it can be out of bound
+            let pixel_global_idy = patch_pos_y + i;
+
+            if pixel_global_idx < imsize_x && pixel_global_idy < imsize_y {
+                let grad_offset =
+                    pixel_global_idy * grad.stride(0) + pixel_global_idx * grad.stride(1);
+
+                let local_gradx = grad[grad_offset + 0];
+                let local_grady = grad[grad_offset + 1];
+
+                // Warp I with W(x; p) to compute I(W(x; p))
+                let new_idx = local_alignment[0] + pixel_global_idx as f32;
+                let new_idy = local_alignment[1] + pixel_global_idy as f32;
+                if new_idx >= 0.0
+                    && new_idx < imsize_x as f32 - 1.0 // -1 keeps the +1 neighbour read below in bounds
+                    && new_idy >= 0.0
+                    && new_idy < imsize_y as f32 - 1.0
+                // -1 keeps the +1 neighbour read below in bounds
+                {
+                    // bilinear interpolation of comp_img at the warped position
+                    // https://www.rollpie.com/post/252
+                    // separating floor and floating part
+                    let floor_x = f32::floor(new_idx);
+                    let normalised_pos_x = new_idx - floor_x;
+
+                    let floor_y = f32::floor(new_idy);
+                    let normalised_pos_y = new_idy - floor_y;
+
+                    // non-negative thanks to the bounds check above, so the later `as u32` casts are safe
+                    let floor_x = floor_x as i32;
+                    let floor_y = floor_y as i32;
+
+                    let ceil_x = floor_x + 1;
+                    let ceil_y = floor_y + 1;
+
+                    pos[0] = normalised_pos_y;
+                    pos[1] = normalised_pos_x;
+
+                    // neighbours in the order expected by bilinear_interpolation:
+                    // (floor_y, floor_x), (floor_y, ceil_x), (ceil_y, floor_x), (ceil_y, ceil_x)
+                    let floor_y_offset = floor_y as u32 * comp_img.stride(0);
+                    buffer_val[0] = comp_img[floor_y_offset + floor_x as u32 * comp_img.stride(1)];
+                    buffer_val[1] = comp_img[floor_y_offset + ceil_x as u32 * comp_img.stride(1)];
+
+                    let ceil_y_offset = ceil_y as u32 * comp_img.stride(0);
+                    buffer_val[2] = comp_img[ceil_y_offset + floor_x as u32 * comp_img.stride(1)];
+                    buffer_val[3] = comp_img[ceil_y_offset + ceil_x as u32 * comp_img.stride(1)];
+
+                    let comp_val = bilinear_interpolation(&buffer_val, &pos);
+                    let ref_img_offset =
+                        pixel_global_idy * ref_img.stride(0) + pixel_global_idx * ref_img.stride(1);
+
+                    // difference between the warped compared image and the reference
+                    let gradt = comp_val - ref_img[ref_img_offset];
+
+                    b[0] += -local_gradx * gradt;
+                    b[1] += -local_grady * gradt;
+                }
+            }
+        }
+    }
+
+    let mut alignment_step = Line::empty(2);
+
+    // solvability is ensured by the determinant check near the top of the kernel
+    solve_2x2(&a, &b, &mut alignment_step);
+
+    alignment[alig_offset + 0] = local_alignment[0] + alignment_step[0];
+    alignment[alig_offset + 1] = local_alignment[1] + alignment_step[1];
+}
+
+///
+///    Solves the 2 by 2 linear system A X = B with Cramer's rule.
+///
+///    Parameters
+///    ----------
+///    a : Array[4]
+///        matrix A stored row by row: [a00, a01, a10, a11]
+///    b : Array[2]
+///        right-hand side B
+///    x : Array[2]
+///        output, receives the solution X
+///
+///    The determinant is not checked here; the caller must make sure A is invertible.
+///
+#[cube]
+fn solve_2x2(a: &Line<f32>, b: &Line<f32>, x: &mut Line<f32>) {
+    let determinant = a[0] * a[3] - a[1] * a[2];
+
+    let numerator_0 = a[3] * b[0] - a[1] * b[1];
+    let numerator_1 = a[0] * b[1] - a[2] * b[0];
+
+    x[0] = numerator_0 / determinant;
+    x[1] = numerator_1 / determinant;
+}
+
+///
+///    Bilinear interpolation between the 4 closest neighbours of a position.
+///
+///    Parameters
+///    ----------
+///    values : Array[2, 2]
+///        values on the 4 closest neighbours, flattened row by row:
+///        [(y0, x0), (y0, x1), (y1, x0), (y1, x1)]
+///    pos : Array[2]
+///        position where interpolation must be done (in [0, 1]x[0, 1]). y, x
+///
+///    Returns
+///    -------
+///    val : float
+///        interpolated value
+///
+#[cube]
+fn bilinear_interpolation(values: &Line<f32>, pos: &Line<f32>) -> f32 {
+    // fractional offsets towards the second row / second column
+    let frac_y = pos[0];
+    let frac_x = pos[1];
+
+    // each neighbour is weighted by how close the position is to it
+    let term_y0x0 = values[0] * (1.0 - frac_x) * (1.0 - frac_y);
+    let term_y0x1 = values[1] * (frac_x) * (1.0 - frac_y);
+    let term_y1x0 = values[2] * (1.0 - frac_x) * (frac_y);
+    let term_y1x1 = values[3] * frac_x * frac_y;
+
+    term_y0x0 + term_y0x1 + term_y1x0 + term_y1x1
+}
--- a/src/ops/merge.rs
+++ b/src/ops/merge.rs
@ -0,0 +1,156 @@
+// use cubecl::prelude::*;
+
+// #[cube]
+// fn accumulate(comp_img, alignments, covs, r,
+//                bayer_mode, iso_kernel, scale, tile_size, CFA_pattern,
+//                num, den) {
+
+//     let output_pixel_idx, output_pixel_idy = cuda.grid(2);
+
+//     let output_size_y, output_size_x, _ = num.shape;
+//     let input_size_y, input_size_x = comp_img.shape;
+
+//     if output_pixel_idx >= output_size_x || output_pixel_idy >= output_size_y {
+//         return;
+//     }
+
+//     if bayer_mode {
+//         n_channels = 3;
+//         acc = cuda.local.array(3, dtype=DEFAULT_CUDA_FLOAT_TYPE);
+//         val = cuda.local.array(3, dtype=DEFAULT_CUDA_FLOAT_TYPE);
+//     } else {
+//         n_channels = 1;
+//         acc = cuda.local.array(1, dtype=DEFAULT_CUDA_FLOAT_TYPE);
+//         val = cuda.local.array(1, dtype=DEFAULT_CUDA_FLOAT_TYPE);
+//     }
+
+//     // Copying CFA locally. We will read that 9 times, so it's worth it
+//     let local_cfa = cuda.local.array((2,2), uint8);
+//     for i in 0..2 {
+//         for j in 0..2 {
+//             local_CFA[i,j] = uint8(cfa_pattern[i,j]);
+//         }
+//     }
+
+//     let coarse_ref_sub_pos = cuda.local.array(2, dtype=DEFAULT_CUDA_FLOAT_TYPE); // y, x
+
+//     coarse_ref_sub_pos[0] = output_pixel_idy / scale;
+//     coarse_ref_sub_pos[1] = output_pixel_idx / scale;
+
+//     // fetch of the flow, as early as possible
+//     let local_optical_flow = cuda.local.array(2, dtype=DEFAULT_CUDA_FLOAT_TYPE);
+//     let patch_idy = int(coarse_ref_sub_pos[0] / tile_size);
+//     let patch_idx = int(coarse_ref_sub_pos[1] /tile_size);
+
+//     local_optical_flow[0] = alignments[patch_idy, patch_idx, 0];
+//     local_optical_flow[1] = alignments[patch_idy, patch_idx, 1];
+
+//     for chan in 0..n_channels {
+//         acc[chan] = 0.0;
+//         val[chan] = 0.0;
+//     }
+
+//     let patch_center_pos = cuda.local.array(2, DEFAULT_CUDA_FLOAT_TYPE); // y, x
+
+//     // fetching robustness
+//     // The robustness coefficient is known for every raw pixel, and implicitly
+//     // interpolated to HR using nearest-neighbor interpolation.
+
+//     let y_r = clamp(round(coarse_ref_sub_pos[0]), 0, r.shape[0]-1);
+//     let x_r = clamp(round(coarse_ref_sub_pos[1]), 0, r.shape[1]-1);
+//     let local_r = r[y_r, x_r];
+
+//     patch_center_pos[1] = coarse_ref_sub_pos[1] + local_optical_flow[0];
+//     patch_center_pos[0] = coarse_ref_sub_pos[0] + local_optical_flow[1];
+
+//     // updating inbound condition
+//     if patch_center_pos[1] >= input_size_x || patch_center_pos[0] >= input_size_y {
+//         return;
+//     }
+
+//     // computing kernel
+//     if !iso_kernel {
+//         let interpolated_cov = cuda.local.array((2, 2), dtype = DEFAULT_CUDA_FLOAT_TYPE);
+//         let cov_i = cuda.local.array((2, 2), dtype=DEFAULT_CUDA_FLOAT_TYPE);
+//         // fetching the 4 closest covs
+//         let close_covs = cuda.local.array((2, 2, 2 ,2), DEFAULT_CUDA_FLOAT_TYPE);
+//         let grey_pos = cuda.local.array(2, DEFAULT_CUDA_FLOAT_TYPE);
+
+//         if bayer_mode {
+//             grey_pos[0] = (patch_center_pos[0] - 0.5) /2; // grey grid is offset and twice as sparse
+//             grey_pos[1] = (patch_center_pos[1] - 0.5) /2;
+//         } else {
+//             grey_pos[0] = patch_center_pos[0]; // grey grid is exactly the coarse grid
+//             grey_pos[1] = patch_center_pos[1];
+//         }
+
+//         // clipping the coordinates to stay in bound
+//         let floor_x = int(max(math.floor(grey_pos[1]), 0));
+//         let floor_y = int(max(math.floor(grey_pos[0]), 0));
+
+//         let ceil_x = min(floor_x + 1, covs.shape[1]-1);
+//         let ceil_y = min(floor_y + 1, covs.shape[0]-1);
+//         for i in 0..2 {
+//             for j in range(0, 2) {
+//                 close_covs[0, 0, i, j] = covs[floor_y, floor_x, i, j];
+//                 close_covs[0, 1, i, j] = covs[floor_y, ceil_x, i, j];
+//                 close_covs[1, 0, i, j] = covs[ceil_y, floor_x, i, j];
+//                 close_covs[1, 1, i, j] = covs[ceil_y, ceil_x, i, j];
+//             }
+//         }
+
+//         // interpolating covs at the desired spot
+//         interpolate_cov(close_covs, grey_pos, interpolated_cov);
+//         invert_2x2(interpolated_cov, cov_i);
+//     }
+
+//     let center_x = round(patch_center_pos[1]);
+//     let center_y = round(patch_center_pos[0]);
+
+//     for i in -1..=1 {
+//         for j in -1..=1 {
+//             let pixel_idx = center_x + j;
+//             let pixel_idy = center_y + i;
+
+//             // in bound condition
+//             if (0 <= pixel_idx && pixel_idx< input_size_x) &&
+//                 (0 <= pixel_idy && pixel_idy< input_size_y) {
+
+//                 // checking if pixel is r, g or b
+//                 let channel = if bayer_mode {
+//                     local_CFA[pixel_idy%2, pixel_idx%2]
+//                 } else {
+//                     0
+//                 };
+//                 // By fetching the value now, we can compute the kernel weight
+//                 // while it is called from global memory
+//                 let c = comp_img[pixel_idy, pixel_idx];
+
+//                 // computing distance
+//                 let dist_x = pixel_idx - patch_center_pos[1];
+//                 let dist_y = pixel_idy - patch_center_pos[0];
+
+//                 // Computing w
+//                 let y = if iso_kernel {
+//                     max(0, 2 * (dist_x * dist_x + dist_y * dist_y))
+//                 } else {
+//                     max(0, quad_mat_prod(cov_i, dist_x, dist_y))
+//                     // y can be slightly negative because of numerical precision.
+//                     // I clamp it to not explode the error with exp
+//                 };
+
+//                 w = math.exp(-0.5*y)
+
+//                 //---
+
+//                 val[channel] += c * w * local_r;
+//                 acc[channel] += w * local_r;
+//             }
+//         }
+//     }
+
+//     for chan in 0..n_channels {
+//         num[output_pixel_idy, output_pixel_idx, chan] += val[chan];
+//         den[output_pixel_idy, output_pixel_idx, chan] += acc[chan];
+//     }
+// }
--- a/src/ops/mod.rs
+++ b/src/ops/mod.rs
@ -0,0 +1,12 @@
+pub mod avg_pool;
+pub mod block_matching;
+pub mod decimate_bayer;
+pub mod demosaic;
+pub mod downsample;
+pub mod estimate_kernels;
+pub mod exposure;
+pub mod fft;
+pub mod gauss_pyramid;
+pub mod lucas_kanade;
+pub mod merge;
+pub mod spatial_merge;
--- a/src/ops/spatial_merge.rs
+++ b/src/ops/spatial_merge.rs
--- a/src/tensor.rs
+++ b/src/tensor.rs
@ -0,0 +1,189 @@
+use std::marker::PhantomData;
+
+use cubecl::{
+    prelude::{ArrayArg, CubePrimitive, TensorHandleRef},
+    server::Handle,
+    CubeCount, CubeDim, Runtime,
+};
+
+/// A [`Tensor`] whose storage is a cubecl server [`Handle`].
+pub type GpuTensor<T, const R: usize> = Tensor<T, Handle, R>;
+/// A [`Tensor`] whose storage is a `Vec<T>`.
+pub type CpuTensor<T, const R: usize> = Tensor<T, Vec<T>, R>;
+
+/// An `R`-dimensional tensor of `T` elements kept in a backing container `D`.
+///
+/// Besides the plain `shape`/`stride` pair, a second "word aligned" pair is
+/// stored; `with_strides` derives it from the size of `T` and the extent of
+/// the last dimension.
+#[derive(Debug)]
+pub struct Tensor<T: CubePrimitive, D, const R: usize> {
+    /// Backing storage for the elements.
+    pub(crate) data: D,
+    /// Extent of every dimension.
+    pub(crate) shape: [usize; R],
+    /// Stride of every dimension.
+    pub(crate) stride: [usize; R],
+
+    /// Shape after the word-alignment adjustment made in `with_strides`.
+    pub(crate) word_aligned_shape: [usize; R],
+    /// Strides after the word-alignment adjustment made in `with_strides`.
+    pub(crate) word_aligned_stride: [usize; R],
+    /// Ties the element type `T` to the tensor without storing a value of it.
+    _m: PhantomData<T>,
+}
+
+impl<T: CubePrimitive, D, const R: usize> Tensor<T, D, R> {
+    /// Wraps `data` as a tensor of the given `shape` with contiguous
+    /// row-major strides: the last dimension has stride 1 and every other
+    /// stride is the product of the extents that follow it.
+    pub(crate) fn create(data: D, shape: [usize; R]) -> Self {
+        let mut strides = [1usize; R];
+        for x in (0..R - 1).rev() {
+            strides[x] = strides[x + 1] * shape[x + 1];
+        }
+
+        Self::with_strides(data, shape, strides)
+    }
+
+    /// Wraps `data` as a tensor with explicit `shape` and `stride`, and
+    /// derives the word-aligned shape/stride pair.
+    ///
+    /// The adjustment depends on `size_of::<T>()` and on the extent of the
+    /// last dimension: 1- and 2-byte elements are regrouped so that the
+    /// word-aligned description counts in units of 4 bytes, while 4-byte
+    /// elements are left untouched.
+    ///
+    /// # Panics
+    ///
+    /// Panics for any other combination of element size and last-dimension
+    /// extent (for example 1-byte elements with a last dimension of 3), and
+    /// when `R` is too small for the dimensions a case needs to touch.
+    pub(crate) fn with_strides(data: D, shape: [usize; R], stride: [usize; R]) -> Self {
+        // Divides every entry of `values` by `divisor` in place.
+        fn shrink(values: &mut [usize], divisor: usize) {
+            for value in values {
+                *value /= divisor;
+            }
+        }
+
+        let mut word_aligned_shape = shape;
+        let mut word_aligned_stride = stride;
+
+        match (core::mem::size_of::<T>(), shape[R - 1]) {
+            // Four 1-byte values fill exactly one word.
+            (1, 4) => {
+                word_aligned_shape[R - 1] /= 4;
+                shrink(&mut word_aligned_stride[..R - 1], 4);
+            }
+            // Two 1-byte values are half a word: pair up neighbours along
+            // the second-to-last dimension as well.
+            (1, 2) => {
+                word_aligned_shape[R - 1] /= 2;
+                word_aligned_shape[R - 2] /= 2;
+                word_aligned_stride[R - 2] /= 2;
+                shrink(&mut word_aligned_stride[..R - 2], 4);
+            }
+            // Single 1-byte values: four neighbours along the
+            // second-to-last dimension share a word.
+            (1, 1) => {
+                word_aligned_shape[R - 2] /= 4;
+                shrink(&mut word_aligned_stride[..R - 2], 4);
+            }
+            // Four 2-byte values span two words.
+            (2, 4) => {
+                word_aligned_shape[R - 1] /= 2;
+                shrink(&mut word_aligned_stride[..R - 1], 2);
+            }
+            // Two 2-byte values fill exactly one word, so - as in the
+            // `(1, 4)` case - every leading stride shrinks, including the one
+            // at `R - 2` (previously left at 2, which skipped every other
+            // word).
+            (2, 2) => {
+                word_aligned_shape[R - 1] /= 2;
+                shrink(&mut word_aligned_stride[..R - 1], 2);
+            }
+            // Single 2-byte values: two neighbours along the second-to-last
+            // dimension share a word.
+            (2, 1) => {
+                word_aligned_shape[R - 2] /= 2;
+                shrink(&mut word_aligned_stride[..R - 2], 2);
+            }
+            // 4-byte elements are already word sized.
+            (4, _) => {}
+            (elem_size, last_dim) => unimplemented!(
+                "word alignment is not implemented for {elem_size}-byte elements \
+                 with a last dimension of {last_dim}"
+            ),
+        };
+
+        Self {
+            data,
+            shape,
+            stride,
+            word_aligned_shape,
+            word_aligned_stride,
+            _m: PhantomData,
+        }
+    }
+
+    /// Returns the extent of every dimension.
+    #[inline]
+    pub fn shape(&self) -> [usize; R] {
+        self.shape
+    }
+
+    /// Returns the stride of every dimension.
+    #[inline]
+    pub fn stride(&self) -> [usize; R] {
+        self.stride
+    }
+
+    /// Computes a static cube count from the first two dimensions and the
+    /// default [`CubeDim`].
+    ///
+    /// The x count is `shape[1] / dim.x` (rounded down) divided by `div_x`;
+    /// the y count is `shape[0] / dim.y` (rounded up) divided by `div_y`.
+    ///
+    /// NOTE(review): x rounds down while y rounds up, so a trailing partial
+    /// block of columns gets no cube - confirm this asymmetry is intended.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `R < 2`, or if `div_x` or `div_y` is zero.
+    #[inline]
+    pub fn cube_count(&self, div_x: u32, div_y: u32) -> CubeCount {
+        let dim = CubeDim::default();
+        let x = self.shape[1] / dim.x as usize;
+        let y = (self.shape[0] + dim.y as usize - 1) / dim.y as usize;
+
+        CubeCount::Static(x as u32 / div_x, y as u32 / div_y, 1)
+    }
+
+    /// Total number of elements, i.e. the product of all extents.
+    #[inline]
+    fn len(&self) -> usize {
+        self.shape.iter().product()
+    }
+}
+
+/// A tensor description that borrows its [`Handle`] instead of owning it and
+/// is tagged with the runtime `Rt` it belongs to.
+#[derive(Debug)]
+pub struct CubeGpuTensor<'a, Rt: Runtime, const R: usize> {
+    /// Borrowed storage handle.
+    pub(crate) data: &'a Handle,
+    /// Extent of every dimension.
+    pub(crate) shape: [usize; R],
+    /// Stride of every dimension.
+    pub(crate) stride: [usize; R],
+    /// Ties the runtime type `Rt` to the tensor without storing a value of it.
+    _m: PhantomData<Rt>,
+}
+
+impl<'a, Rt: Runtime, const R: usize> CubeGpuTensor<'a, Rt, R> {
+    /// Builds a [`TensorHandleRef`] over the borrowed handle using this
+    /// tensor's shape and strides; the element size is fixed at 4 bytes.
+    pub fn as_tensor_ref(&'a self) -> TensorHandleRef<'a, Rt> {
+        let handle: &'a Handle = self.data;
+        let shape = &self.shape;
+        let strides = &self.stride;
+
+        TensorHandleRef {
+            handle,
+            strides,
+            shape,
+            elem_size: 4,
+            runtime: PhantomData,
+        }
+    }
+}
+
+impl<T: CubePrimitive, const R: usize> GpuTensor<T, R> {
+    /// Exposes the buffer as a flat [`ArrayArg`] whose length is counted in
+    /// 4-byte units.
+    ///
+    /// The element count is divided by the number of `T` values that fit in
+    /// 4 bytes (at least one), and the element size passed on is 4.
+    #[inline]
+    pub fn as_array_arg<Rt: Runtime>(&self, vectorization: u8) -> ArrayArg<'_, Rt> {
+        let elems_per_word = (4 / core::mem::size_of::<T>()).max(1);
+        let word_count = self.len() / elems_per_word;
+
+        // NOTE(review): the safety contract of `from_raw_parts_and_size` is
+        // not visible here; presumably the handle must cover at least
+        // `word_count` 4-byte items - confirm.
+        unsafe { ArrayArg::from_raw_parts_and_size(&self.data, word_count, vectorization, 4) }
+    }
+
+    /// Describes the buffer with the element-wise shape and strides.
+    #[inline]
+    pub fn as_tensor_ref<Rt: Runtime>(&self) -> TensorHandleRef<'_, Rt> {
+        self.handle_ref_with(&self.shape, &self.stride)
+    }
+
+    /// Same description as [`Self::as_tensor_ref`], obtained through an
+    /// exclusive borrow of the tensor.
+    #[inline]
+    pub fn as_tensor_mut<Rt: Runtime>(&mut self) -> TensorHandleRef<'_, Rt> {
+        self.as_tensor_ref()
+    }
+
+    /// Describes the buffer with the word-aligned shape and strides.
+    #[inline]
+    pub fn as_word_aligned_tensor_ref<Rt: Runtime>(&self) -> TensorHandleRef<'_, Rt> {
+        self.handle_ref_with(&self.word_aligned_shape, &self.word_aligned_stride)
+    }
+
+    /// Assembles a [`TensorHandleRef`] over this tensor's handle with the
+    /// given shape and strides and a fixed element size of 4 bytes.
+    #[inline]
+    fn handle_ref_with<'s, Rt: Runtime>(
+        &'s self,
+        shape: &'s [usize],
+        strides: &'s [usize],
+    ) -> TensorHandleRef<'s, Rt> {
+        TensorHandleRef {
+            handle: &self.data,
+            strides,
+            shape,
+            elem_size: 4,
+            runtime: PhantomData,
+        }
+    }
+}
+
+/// Element access on host tensors by multi-dimensional index.
+impl<T: CubePrimitive, const R: usize> std::ops::Index<[usize; R]> for CpuTensor<T, R> {
+    type Output = T;
+
+    /// Returns the element at `index`.
+    ///
+    /// The flat offset is the sum of `stride[d] * index[d]` over all
+    /// dimensions. Individual coordinates are not checked against `shape`;
+    /// only the resulting flat offset is bounds-checked.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the flat offset is outside the backing vector.
+    fn index(&self, index: [usize; R]) -> &Self::Output {
+        let offset: usize = self
+            .stride
+            .iter()
+            .zip(index)
+            .map(|(stride, coordinate)| stride * coordinate)
+            .sum();
+
+        // The indexing operator is used instead of the `.index()` method so
+        // that `std::ops::Index` does not have to be imported into scope.
+        &self.data[offset]
+    }
+}
--- a/test.cr2
+++ b/test.cr2
--- a/test.dng
+++ b/test.dng
--- a/test.png
+++ b/test.png
--- a/test_burst1/im_00.dng
+++ b/test_burst1/im_00.dng
--- a/test_burst1/im_00.dng.pp3
+++ b/test_burst1/im_00.dng.pp3
@ -0,0 +1,783 @@
+[Version]
+AppVersion=5.11
+Version=351
+
+[General]
+ColorLabel=0
+InTrash=false
+
+[Exposure]
+Auto=false
+Clip=0.02
+Compensation=0
+Brightness=0
+Contrast=0
+Saturation=-100
+Black=-13804
+HighlightCompr=0
+HighlightComprThreshold=0
+ShadowCompr=50
+HistogramMatching=false
+CurveFromHistogramMatching=false
+ClampOOG=true
+CurveMode=FilmLike
+CurveMode2=Standard
+Curve=0;
+Curve2=0;
+
+[HLRecovery]
+Enabled=true
+Method=Coloropp
+Hlbl=0
+Hlth=1
+
+[Retinex]
+Enabled=false
+Str=20
+Scal=3
+Iter=1
+Grad=1
+Grads=1
+Gam=1.3
+Slope=3
+Median=false
+Neigh=80
+Offs=0
+Vart=200
+Limd=8
+highl=4
+skal=3
+complexMethod=normal
+RetinexMethod=high
+mapMethod=none
+viewMethod=none
+Retinexcolorspace=Lab
+Gammaretinex=none
+CDCurve=0;
+MAPCurve=0;
+CDHCurve=0;
+LHCurve=0;
+Highlights=0
+HighlightTonalWidth=80
+Shadows=0
+ShadowTonalWidth=80
+Radius=40
+TransmissionCurve=1;0;0.5;0.34999999999999998;0.34999999999999998;0.59999999999999998;0.75;0.34999999999999998;0.34999999999999998;1;0.5;0.34999999999999998;0.34999999999999998;
+GainTransmissionCurve=1;0;0.10000000000000001;0.34999999999999998;0;0.25;0.25;0.34999999999999998;0.34999999999999998;0.69999999999999996;0.25;0.34999999999999998;0.34999999999999998;1;0.10000000000000001;0;0;
+
+[Local Contrast]
+Enabled=false
+Radius=80
+Amount=0.20000000000000001
+Darkness=1
+Lightness=1
+
+[Channel Mixer]
+Enabled=false
+Red=1000;0;0;
+Green=0;1000;0;
+Blue=0;0;1000;
+
+[Black & White]
+Enabled=false
+Method=Desaturation
+Auto=false
+ComplementaryColors=true
+Setting=RGB-Rel
+Filter=None
+MixerRed=33
+MixerOrange=33
+MixerYellow=33
+MixerGreen=33
+MixerCyan=33
+MixerBlue=33
+MixerMagenta=33
+MixerPurple=33
+GammaRed=0
+GammaGreen=0
+GammaBlue=0
+Algorithm=SP
+LuminanceCurve=0;
+BeforeCurveMode=Standard
+AfterCurveMode=Standard
+BeforeCurve=0;
+AfterCurve=0;
+
+[Luminance Curve]
+Enabled=false
+Brightness=0
+Contrast=0
+Chromaticity=0
+Gamutmunse=MUN
+RedAndSkinTonesProtection=0
+LCredsk=true
+LCurve=0;
+aCurve=0;
+bCurve=0;
+ccCurve=0;
+chCurve=0;
+lhCurve=0;
+hhCurve=0;
+LcCurve=0;
+ClCurve=0;
+
+[Sharpening]
+Enabled=false
+Contrast=30
+Method=rld
+Radius=0.5
+BlurRadius=0.20000000000000001
+Amount=200
+Threshold=20;80;2000;1200;
+OnlyEdges=false
+EdgedetectionRadius=1.8999999999999999
+EdgeTolerance=1800
+HalocontrolEnabled=false
+HalocontrolAmount=85
+DeconvRadius=0.48999999999999999
+DeconvAmount=100
+DeconvDamping=0
+DeconvIterations=30
+
+[Vibrance]
+Enabled=false
+Pastels=40
+Saturated=40
+PSThreshold=0;75;
+ProtectSkins=false
+AvoidColorShift=true
+PastSatTog=true
+SkinTonesCurve=0;
+
+[SharpenEdge]
+Enabled=false
+Passes=2
+Strength=50
+ThreeChannels=false
+
+[SharpenMicro]
+Enabled=false
+Matrix=false
+Strength=20
+Contrast=20
+Uniformity=5
+
+[White Balance]
+Enabled=false
+Setting=Camera
+Temperature=5149
+Green=1.1970000000000001
+Equal=1
+TemperatureBias=0
+StandardObserver=TWO_DEGREES
+Itcwb_green=0
+Itcwb_rangegreen=1
+Itcwb_nopurple=false
+Itcwb_alg=false
+Itcwb_prim=beta
+Itcwb_sampling=false
+CompatibilityVersion=2
+
+[Color appearance]
+Enabled=false
+Degree=90
+AutoDegree=true
+Degreeout=90
+AutoDegreeout=true
+Surround=Average
+complex=normal
+ModelCat=16
+CatCat=clas
+Surrsrc=Average
+AdaptLum=16
+Badpixsl=0
+Model=RawT
+Illum=i50
+Algorithm=JC
+J-Light=0
+Q-Bright=0
+C-Chroma=0
+S-Chroma=0
+M-Chroma=0
+J-Contrast=0
+Q-Contrast=0
+H-Hue=0
+RSTProtection=0
+AdaptScene=2000
+AutoAdapscen=true
+YbScene=18
+Autoybscen=true
+SurrSource=false
+Gamut=true
+Tempout=5003
+Autotempout=true
+Greenout=1
+Tempsc=5003
+Greensc=1
+Ybout=18
+Datacie=false
+Tonecie=false
+CurveMode=Lightness
+CurveMode2=Brightness
+CurveMode3=Chroma
+Curve=0;
+Curve2=0;
+Curve3=0;
+
+[Impulse Denoising]
+Enabled=false
+Threshold=1
+
+[Defringing]
+Enabled=false
+Radius=2
+Threshold=13
+HueCurve=1;0.16666666699999999;0;0.34999999999999998;0.34999999999999998;0.34699999999999998;0;0.34999999999999998;0.34999999999999998;0.51366742600000004;0;0.34999999999999998;0.34999999999999998;0.66894457100000004;0;0.34999999999999998;0.34999999999999998;0.82877752459999998;0.97835991;0.34999999999999998;0.34999999999999998;0.99088838270000001;0;0.34999999999999998;0.34999999999999998;
+
+[Dehaze]
+Enabled=false
+Strength=50
+ShowDepthMap=false
+Depth=25
+Saturation=50
+
+[Directional Pyramid Denoising]
+Enabled=false
+Enhance=false
+Median=false
+Luma=48.390000000000001
+Ldetail=21.890000000000001
+Chroma=6.0999999999999996
+Method=Lab
+LMethod=SLI
+CMethod=MAN
+C2Method=MANU
+SMethod=shal
+MedMethod=soft
+RGBMethod=soft
+MethodMed=Lonly
+Redchro=1.6000000000000001
+Bluechro=-1.2
+AutoGain=true
+Gamma=1.7
+Passes=1
+LCurve=1;0.050000000000000003;0.14999999999999999;0.34999999999999998;0.34999999999999998;0.14055299539170502;0.39631336405529954;0.34999999999999998;0.34999999999999998;0.19815668202764983;0.096774193548387136;0.34999999999999998;0.34999999999999998;0.25345622119815681;0.91474654377880205;0.34999999999999998;0.34999999999999998;0.55000000000000004;0.040000000000000001;0.34999999999999998;0.34999999999999998;0.83410138248847965;0.73271889400921653;0.34999999999999998;0.34999999999999998;
+CCCurve=0;
+
+[EPD]
+Enabled=false
+Strength=0.5
+Gamma=1
+EdgeStopping=1.3999999999999999
+Scale=1
+ReweightingIterates=0
+
+[FattalToneMapping]
+Enabled=false
+Threshold=30
+Amount=20
+Anchor=50
+
+[Shadows & Highlights]
+Enabled=false
+Highlights=0
+HighlightTonalWidth=70
+Shadows=0
+ShadowTonalWidth=30
+Radius=40
+Lab=false
+
+[ToneEqualizer]
+Enabled=false
+Band0=0
+Band1=0
+Band2=0
+Band3=0
+Band4=0
+Band5=0
+Regularization=0
+Pivot=0
+
+[Crop]
+Enabled=false
+X=0
+Y=0
+W=4024
+H=3016
+FixedRatio=true
+Ratio=As Image
+Orientation=As Image
+Guide=Frame
+
+[Coarse Transformation]
+Rotate=0
+HorizontalFlip=false
+VerticalFlip=false
+
+[Common Properties for Transformations]
+Method=log
+AutoFill=true
+
+[Rotation]
+Degree=0
+
+[Distortion]
+Amount=0
+
+[LensProfile]
+LcMode=lfauto
+LCPFile=
+UseDistortion=true
+UseVignette=false
+UseCA=false
+LFCameraMake=
+LFCameraModel=
+LFLens=
+
+[Perspective]
+Method=simple
+Horizontal=0
+Vertical=0
+CameraCropFactor=0
+CameraFocalLength=4.1900000000000004
+CameraPitch=0
+CameraRoll=0
+CameraShiftHorizontal=0
+CameraShiftVertical=0
+CameraYaw=0
+ProjectionShiftHorizontal=0
+ProjectionPitch=0
+ProjectionRotate=0
+ProjectionShiftVertical=0
+ProjectionYaw=0
+ControlLineValues=
+ControlLineTypes=
+
+[Gradient]
+Enabled=false
+Degree=0
+Feather=25
+Strength=0.59999999999999998
+CenterX=0
+CenterY=0
+
+[Locallab]
+Enabled=false
+Selspot=0
+
+[PCVignette]
+Enabled=false
+Strength=0.59999999999999998
+Feather=50
+Roundness=50
+
+[CACorrection]
+Red=0
+Blue=0
+
+[Vignetting Correction]
+Amount=0
+Radius=50
+Strength=1
+CenterX=0
+CenterY=0
+
+[Resize]
+Enabled=false
+Scale=0.22
+AppliesTo=Cropped area
+Method=Lanczos
+DataSpecified=3
+Width=900
+Height=900
+LongEdge=900
+ShortEdge=900
+AllowUpscaling=false
+
+[PostDemosaicSharpening]
+Enabled=false
+Contrast=17
+AutoContrast=true
+AutoRadius=true
+DeconvRadius=0.59999999999999998
+DeconvRadiusOffset=0
+DeconvIterCheck=true
+DeconvIterations=20
+
+[PostResizeSharpening]
+Enabled=false
+Contrast=15
+Method=rld
+Radius=0.5
+Amount=200
+Threshold=20;80;2000;1200;
+OnlyEdges=false
+EdgedetectionRadius=1.8999999999999999
+EdgeTolerance=1800
+HalocontrolEnabled=false
+HalocontrolAmount=85
+DeconvRadius=0.45000000000000001
+DeconvAmount=100
+DeconvDamping=0
+DeconvIterations=100
+
+[Color Management]
+InputProfile=(camera)
+ToneCurve=false
+ApplyLookTable=true
+ApplyBaselineExposureOffset=true
+ApplyHueSatMap=true
+DCPIlluminant=0
+WorkingProfile=ProPhoto
+WorkingTRC=none
+Will=D50
+Wprim=def
+Wcat=brad
+WorkingTRCGamma=2.3999999999999999
+WorkingTRCSlope=12.92
+Wmidtcie=0
+Wsmoothcie=false
+Redx=0.73470000000000002
+Redy=0.26529999999999998
+Grex=0.15959999999999999
+Grey=0.84040000000000004
+Blux=0.036600000000000001
+Bluy=0.0001
+Refi=0
+Shiftx=0
+Shifty=0
+LabGridcieALow=0.51763000000000003
+LabGridcieBLow=-0.33582000000000001
+LabGridcieAHigh=-0.75163000000000002
+LabGridcieBHigh=-0.81799999999999995
+LabGridcieGx=-0.69164000000000003
+LabGridcieGy=-0.70909
+LabGridcieWx=-0.18964
+LabGridcieWy=-0.16636000000000001
+LabGridcieMx=0
+LabGridcieMy=0
+Preser=0
+Fbw=false
+TrcExp=false
+Gamut=false
+OutputProfile=RTv4_sRGB
+aIntent=Relative
+OutputProfileIntent=Relative
+OutputBPC=true
+
+[Wavelet]
+Enabled=false
+Strength=100
+Balance=0
+Sigmafin=1
+Sigmaton=1
+Sigmacol=1
+Sigmadir=1
+Rangeab=20
+Protab=0
+Iter=0
+MaxLev=7
+TilesMethod=full
+complexMethod=normal
+mixMethod=mix7
+sliMethod=sli
+quaMethod=cons
+DaubMethod=4_
+ChoiceLevMethod=all
+BackMethod=grey
+LevMethod=4
+DirMethod=all
+CBgreenhigh=0
+CBgreenmed=0
+CBgreenlow=0
+CBbluehigh=0
+CBbluemed=0
+CBbluelow=0
+Ballum=7
+Sigm=1
+Levden=0
+Thrden=0
+Limden=0
+Balchrom=0
+Chromfine=0
+Chromcoarse=0
+MergeL=20
+MergeC=20
+Softrad=0
+Softradend=0
+Strend=50
+Detend=0
+Thrend=0
+Expcontrast=false
+Expchroma=false
+Expedge=false
+expbl=false
+Expresid=false
+Expfinal=false
+Exptoning=false
+Expnoise=false
+Expclari=false
+LabGridALow=0
+LabGridBLow=0
+LabGridAHigh=0
+LabGridBHigh=0
+Contrast1=0
+Contrast2=0
+Contrast3=0
+Contrast4=0
+Contrast5=0
+Contrast6=0
+Contrast7=0
+Contrast8=0
+Contrast9=0
+Chroma1=0
+Chroma2=0
+Chroma3=0
+Chroma4=0
+Chroma5=0
+Chroma6=0
+Chroma7=0
+Chroma8=0
+Chroma9=0
+ContExtra=0
+HSMethod=with
+HLRange=50;75;100;98;
+SHRange=0;2;50;25;
+Edgcont=0;10;75;40;
+Level0noise=0;0;
+Level1noise=0;0;
+Level2noise=0;0;
+Level3noise=0;0;
+Leveldenoise=0;0;
+Levelsigm=1;1;
+ThresholdHighlight=4
+ThresholdShadow=5
+Edgedetect=90
+Edgedetectthr=20
+EdgedetectthrHi=0
+Edgesensi=60
+Edgeampli=10
+ThresholdChroma=5
+CHromaMethod=link
+Medgreinf=less
+Ushamethod=clari
+CHSLromaMethod=SL
+EDMethod=CU
+NPMethod=none
+BAMethod=none
+TMMethod=cont
+ChromaLink=0
+ContrastCurve=1;0;0.25;0.34999999999999998;0.34999999999999998;0.5;0.75;0.34999999999999998;0.34999999999999998;0.90000000000000002;0;0.34999999999999998;0.34999999999999998;
+blcurve=1;0;0;0;0.34999999999999998;0.5;0;0.34999999999999998;0.34999999999999998;1;0;0.34999999999999998;0.34999999999999998;
+Pastlev=0;2;30;20;
+Satlev=30;45;130;100;
+OpacityCurveRG=1;0;0.5;0.34999999999999998;0.34999999999999998;1;0.5;0.34999999999999998;0.34999999999999998;
+OpacityCurveBY=1;0;0.5;0.34999999999999998;0.34999999999999998;1;0.5;0.34999999999999998;0.34999999999999998;
+wavdenoise=1;0;1;0.34999999999999998;0.34999999999999998;0.5;1;0.34999999999999998;0.34999999999999998;1;1;0.34999999999999998;0.34999999999999998;
+wavdenoiseh=1;0;1;0.34999999999999998;0.34999999999999998;0.5;1;0.34999999999999998;0.34999999999999998;1;1;0.34999999999999998;0.34999999999999998;
+OpacityCurveW=1;0;0.34999999999999998;0.34999999999999998;0;0.34999999999999998;0.75;0.34999999999999998;0.34999999999999998;0.59999999999999998;0.75;0.34999999999999998;0.34999999999999998;1;0.34999999999999998;0;0;
+OpacityCurveWL=1;0;0.5;0.34999999999999998;0.34999999999999998;1;0.5;0.34999999999999998;0.34999999999999998;
+HHcurve=0;
+Wavguidcurve=0;
+Wavhuecurve=0;
+CHcurve=0;
+WavclCurve=0;
+Median=false
+Medianlev=false
+Linkedg=false
+CBenab=false
+Lipst=false
+Skinprotect=0
+chrwav=0
+bluwav=1
+Hueskin=-5;25;170;120;
+Edgrad=15
+Edgeffect=1
+Edgval=0
+ThrEdg=10
+AvoidColorShift=false
+Showmask=false
+Oldsh=true
+TMr=false
+Sigma=1
+Offset=1
+Lowthr=40
+ResidualcontShadow=0
+ResidualcontHighlight=0
+ThresholdResidShadow=30
+ThresholdResidHighLight=70
+Residualradius=40
+Residualchroma=0
+Residualblur=0
+Residualblurc=0
+ResidualTM=0
+ResidualEDGS=1.3999999999999999
+ResidualSCALE=1
+Residualgamma=1
+HueRangeResidual=0
+HueRange=-260;-250;-130;-140;
+Contrast=0
+
+[Spot removal]
+Enabled=false
+
+[Directional Pyramid Equalizer]
+Enabled=false
+Gamutlab=false
+cbdlMethod=bef
+Mult0=1
+Mult1=1
+Mult2=1
+Mult3=1
+Mult4=1
+Mult5=1
+Threshold=0.20000000000000001
+Skinprotect=0
+Hueskin=-5;25;170;120;
+
+[HSV Equalizer]
+Enabled=false
+HCurve=0;
+SCurve=0;
+VCurve=0;
+
+[SoftLight]
+Enabled=false
+Strength=30
+
+[Film Simulation]
+Enabled=false
+ClutFilename=
+Strength=100
+
+[RGB Curves]
+Enabled=false
+LumaMode=false
+rCurve=0;
+gCurve=0;
+bCurve=0;
+
+[ColorToning]
+Enabled=false
+Method=LabRegions
+Lumamode=true
+Twocolor=Std
+Redlow=0
+Greenlow=0
+Bluelow=0
+Satlow=0
+Balance=0
+Sathigh=0
+Redmed=0
+Greenmed=0
+Bluemed=0
+Redhigh=0
+Greenhigh=0
+Bluehigh=0
+Autosat=true
+OpacityCurve=1;0;0.29999999999999999;0.34999999999999998;0;0.25;0.80000000000000004;0.34999999999999998;0.34999999999999998;0.69999999999999996;0.80000000000000004;0.34999999999999998;0.34999999999999998;1;0.29999999999999999;0;0;
+ColorCurve=1;0.050000000000000003;0.62;0.25;0.25;0.58499999999999996;0.11;0.25;0.25;
+SatProtectionThreshold=30
+SaturatedOpacity=80
+Strength=50
+HighlightsColorSaturation=60;80;
+ShadowsColorSaturation=80;208;
+ClCurve=3;0;0;0.34999999999999998;0.65000000000000002;1;1;
+Cl2Curve=3;0;0;0.34999999999999998;0.65000000000000002;1;1;
+LabGridALow=0
+LabGridBLow=0
+LabGridAHigh=0
+LabGridBHigh=0
+LabRegionA_1=0
+LabRegionB_1=0
+LabRegionSaturation_1=0
+LabRegionSlope_1=1
+LabRegionOffset_1=0
+LabRegionPower_1=1
+LabRegionHueMask_1=1;0.16666666699999999;1;0.34999999999999998;0.34999999999999998;0.82877752459999998;1;0.34999999999999998;0.34999999999999998;
+LabRegionChromaticityMask_1=1;0;1;0.34999999999999998;0.34999999999999998;1;1;0.34999999999999998;0.34999999999999998;
+LabRegionLightnessMask_1=1;0;1;0.34999999999999998;0.34999999999999998;1;1;0.34999999999999998;0.34999999999999998;
+LabRegionMaskBlur_1=0
+LabRegionChannel_1=-1
+LabRegionsShowMask=-1
+
+[RAW]
+DarkFrame=/szeva
+DarkFrameAuto=false
+FlatFieldFile=/szeva
+FlatFieldAutoSelect=false
+FlatFieldFromMetaData=false
+FlatFieldBlurRadius=32
+FlatFieldBlurType=Area Flatfield
+FlatFieldAutoClipControl=false
+FlatFieldClipControl=0
+CA=false
+CAAvoidColourshift=true
+CAAutoIterations=2
+CARed=0
+CABlue=0
+HotPixelFilter=false
+DeadPixelFilter=false
+HotDeadPixelThresh=100
+PreExposure=1
+
+[RAW Bayer]
+Method=igv
+Border=4
+ImageNum=1
+CcSteps=0
+PreBlack0=0
+PreBlack1=0
+PreBlack2=0
+PreBlack3=0
+PreTwoGreen=true
+LineDenoise=0
+LineDenoiseDirection=3
+GreenEqThreshold=0
+DCBIterations=2
+DCBEnhance=true
+LMMSEIterations=3
+DualDemosaicAutoContrast=true
+DualDemosaicContrast=16
+PixelShiftMotionCorrectionMethod=1
+PixelShiftEperIso=0
+PixelShiftSigma=1
+PixelShiftShowMotion=false
+PixelShiftShowMotionMaskOnly=false
+pixelShiftHoleFill=true
+pixelShiftAverage=false
+pixelShiftMedian=false
+pixelShiftGreen=true
+pixelShiftBlur=true
+pixelShiftSmoothFactor=0.69999999999999996
+pixelShiftEqualBright=false
+pixelShiftEqualBrightChannel=false
+pixelShiftNonGreenCross=true
+pixelShiftDemosaicMethod=amaze
+PDAFLinesFilter=false
+
+[RAW X-Trans]
+Method=3-pass (best)
+DualDemosaicAutoContrast=true
+DualDemosaicContrast=20
+Border=7
+CcSteps=0
+PreBlackRed=0
+PreBlackGreen=0
+PreBlackBlue=0
+
+[MetaData]
+Mode=1
+ExifKeys=Exif.Image.Artist;Exif.Image.Copyright;Exif.Image.ImageDescription;Exif.Image.Make;Exif.Image.Model;Exif.Image.XResolution;Exif.Image.YResolution;Exif.Photo.DateTimeOriginal;Exif.Photo.ExposureBiasValue;Exif.Photo.ExposureTime;Exif.Photo.FNumber;Exif.Photo.Flash;Exif.Photo.FocalLength;Exif.Photo.ISOSpeedRatings;Exif.Photo.LensModel;Exif.Photo.UserComment;
+
+[Film Negative]
+Enabled=false
+RedRatio=1.3600000000000001
+GreenExponent=1.5
+BlueRatio=0.85999999999999999
+ColorSpace=1
+RefInput=0;0;0;
+RefOutput=2739.961181640625;2740.07568359375;2739.944580078125;
+
+[RAW Preprocess WB]
+Mode=1
--- a/test_burst1/im_00.dng.xmp
+++ b/test_burst1/im_00.dng.xmp
--- a/test_burst1/im_01.dng
+++ b/test_burst1/im_01.dng
--- a/test_burst1/im_01.dng.pp3
+++ b/test_burst1/im_01.dng.pp3
@ -0,0 +1,783 @@
+[Version]
+AppVersion=5.11
+Version=351
+
+[General]
+ColorLabel=0
+InTrash=false
+
+[Exposure]
+Auto=false
+Clip=0.02
+Compensation=0
+Brightness=0
+Contrast=0
+Saturation=0
+Black=0
+HighlightCompr=0
+HighlightComprThreshold=0
+ShadowCompr=50
+HistogramMatching=true
+CurveFromHistogramMatching=true
+ClampOOG=true
+CurveMode=FilmLike
+CurveMode2=Standard
+Curve=0;
+Curve2=0;
+
+[HLRecovery]
+Enabled=true
+Method=Coloropp
+Hlbl=0
+Hlth=1
+
+[Retinex]
+Enabled=false
+Str=20
+Scal=3
+Iter=1
+Grad=1
+Grads=1
+Gam=1.3
+Slope=3
+Median=false
+Neigh=80
+Offs=0
+Vart=200
+Limd=8
+highl=4
+skal=3
+complexMethod=normal
+RetinexMethod=high
+mapMethod=none
+viewMethod=none
+Retinexcolorspace=Lab
+Gammaretinex=none
+CDCurve=0;
+MAPCurve=0;
+CDHCurve=0;
+LHCurve=0;
+Highlights=0
+HighlightTonalWidth=80
+Shadows=0
+ShadowTonalWidth=80
+Radius=40
+TransmissionCurve=1;0;0.5;0.34999999999999998;0.34999999999999998;0.59999999999999998;0.75;0.34999999999999998;0.34999999999999998;1;0.5;0.34999999999999998;0.34999999999999998;
+GainTransmissionCurve=1;0;0.10000000000000001;0.34999999999999998;0;0.25;0.25;0.34999999999999998;0.34999999999999998;0.69999999999999996;0.25;0.34999999999999998;0.34999999999999998;1;0.10000000000000001;0;0;
+
+[Local Contrast]
+Enabled=false
+Radius=80
+Amount=0.20000000000000001
+Darkness=1
+Lightness=1
+
+[Channel Mixer]
+Enabled=false
+Red=1000;0;0;
+Green=0;1000;0;
+Blue=0;0;1000;
+
+[Black & White]
+Enabled=false
+Method=Desaturation
+Auto=false
+ComplementaryColors=true
+Setting=RGB-Rel
+Filter=None
+MixerRed=33
+MixerOrange=33
+MixerYellow=33
+MixerGreen=33
+MixerCyan=33
+MixerBlue=33
+MixerMagenta=33
+MixerPurple=33
+GammaRed=0
+GammaGreen=0
+GammaBlue=0
+Algorithm=SP
+LuminanceCurve=0;
+BeforeCurveMode=Standard
+AfterCurveMode=Standard
+BeforeCurve=0;
+AfterCurve=0;
+
+[Luminance Curve]
+Enabled=false
+Brightness=0
+Contrast=0
+Chromaticity=0
+Gamutmunse=MUN
+RedAndSkinTonesProtection=0
+LCredsk=true
+LCurve=0;
+aCurve=0;
+bCurve=0;
+ccCurve=0;
+chCurve=0;
+lhCurve=0;
+hhCurve=0;
+LcCurve=0;
+ClCurve=0;
+
+[Sharpening]
+Enabled=false
+Contrast=20
+Method=usm
+Radius=0.5
+BlurRadius=0.20000000000000001
+Amount=200
+Threshold=20;80;2000;1200;
+OnlyEdges=false
+EdgedetectionRadius=1.8999999999999999
+EdgeTolerance=1800
+HalocontrolEnabled=false
+HalocontrolAmount=85
+DeconvRadius=0.75
+DeconvAmount=100
+DeconvDamping=0
+DeconvIterations=30
+
+[Vibrance]
+Enabled=false
+Pastels=0
+Saturated=0
+PSThreshold=0;75;
+ProtectSkins=false
+AvoidColorShift=true
+PastSatTog=true
+SkinTonesCurve=0;
+
+[SharpenEdge]
+Enabled=false
+Passes=2
+Strength=50
+ThreeChannels=false
+
+[SharpenMicro]
+Enabled=false
+Matrix=false
+Strength=20
+Contrast=20
+Uniformity=5
+
+[White Balance]
+Enabled=true
+Setting=Camera
+Temperature=5149
+Green=1.1969842088453122
+Equal=1
+TemperatureBias=0
+StandardObserver=TWO_DEGREES
+Itcwb_green=0
+Itcwb_rangegreen=1
+Itcwb_nopurple=false
+Itcwb_alg=false
+Itcwb_prim=beta
+Itcwb_sampling=false
+CompatibilityVersion=2
+
+[Color appearance]
+Enabled=false
+Degree=90
+AutoDegree=true
+Degreeout=90
+AutoDegreeout=true
+Surround=Average
+complex=normal
+ModelCat=16
+CatCat=clas
+Surrsrc=Average
+AdaptLum=16
+Badpixsl=0
+Model=RawT
+Illum=i50
+Algorithm=No
+J-Light=0
+Q-Bright=0
+C-Chroma=0
+S-Chroma=0
+M-Chroma=0
+J-Contrast=0
+Q-Contrast=0
+H-Hue=0
+RSTProtection=0
+AdaptScene=2000
+AutoAdapscen=true
+YbScene=18
+Autoybscen=true
+SurrSource=false
+Gamut=true
+Tempout=5003
+Autotempout=true
+Greenout=1
+Tempsc=5003
+Greensc=1
+Ybout=18
+Datacie=false
+Tonecie=false
+CurveMode=Lightness
+CurveMode2=Brightness
+CurveMode3=Chroma
+Curve=0;
+Curve2=0;
+Curve3=0;
+
+[Impulse Denoising]
+Enabled=false
+Threshold=50
+
+[Defringing]
+Enabled=false
+Radius=2
+Threshold=13
+HueCurve=1;0.16666666699999999;0;0.34999999999999998;0.34999999999999998;0.34699999999999998;0;0.34999999999999998;0.34999999999999998;0.51366742600000004;0;0.34999999999999998;0.34999999999999998;0.66894457100000004;0;0.34999999999999998;0.34999999999999998;0.82877752459999998;0.97835991;0.34999999999999998;0.34999999999999998;0.99088838270000001;0;0.34999999999999998;0.34999999999999998;
+
+[Dehaze]
+Enabled=false
+Strength=50
+ShowDepthMap=false
+Depth=25
+Saturation=50
+
+[Directional Pyramid Denoising]
+Enabled=false
+Enhance=false
+Median=false
+Luma=0
+Ldetail=0
+Chroma=15
+Method=Lab
+LMethod=SLI
+CMethod=MAN
+C2Method=AUTO
+SMethod=shal
+MedMethod=soft
+RGBMethod=soft
+MethodMed=none
+Redchro=0
+Bluechro=0
+AutoGain=true
+Gamma=1.7
+Passes=1
+LCurve=1;0.050000000000000003;0.14999999999999999;0.34999999999999998;0.34999999999999998;0.55000000000000004;0.040000000000000001;0.34999999999999998;0.34999999999999998;
+CCCurve=1;0.050000000000000003;0.5;0.34999999999999998;0.34999999999999998;0.34999999999999998;0.050000000000000003;0.34999999999999998;0.34999999999999998;
+
+[EPD]
+Enabled=false
+Strength=0.5
+Gamma=1
+EdgeStopping=1.3999999999999999
+Scale=1
+ReweightingIterates=0
+
+[FattalToneMapping]
+Enabled=false
+Threshold=30
+Amount=20
+Anchor=50
+
+[Shadows & Highlights]
+Enabled=false
+Highlights=0
+HighlightTonalWidth=70
+Shadows=0
+ShadowTonalWidth=30
+Radius=40
+Lab=false
+
+[ToneEqualizer]
+Enabled=false
+Band0=0
+Band1=0
+Band2=0
+Band3=0
+Band4=0
+Band5=0
+Regularization=0
+Pivot=0
+
+[Crop]
+Enabled=false
+X=-1
+Y=-1
+W=4025
+H=3017
+FixedRatio=true
+Ratio=As Image
+Orientation=As Image
+Guide=Frame
+
+[Coarse Transformation]
+Rotate=0
+HorizontalFlip=false
+VerticalFlip=false
+
+[Common Properties for Transformations]
+Method=log
+AutoFill=true
+
+[Rotation]
+Degree=0
+
+[Distortion]
+Amount=0
+
+[LensProfile]
+LcMode=lfauto
+LCPFile=
+UseDistortion=true
+UseVignette=true
+UseCA=false
+LFCameraMake=
+LFCameraModel=
+LFLens=
+
+[Perspective]
+Method=simple
+Horizontal=0
+Vertical=0
+CameraCropFactor=0
+CameraFocalLength=0
+CameraPitch=0
+CameraRoll=0
+CameraShiftHorizontal=0
+CameraShiftVertical=0
+CameraYaw=0
+ProjectionShiftHorizontal=0
+ProjectionPitch=0
+ProjectionRotate=0
+ProjectionShiftVertical=0
+ProjectionYaw=0
+ControlLineValues=
+ControlLineTypes=
+
+[Gradient]
+Enabled=false
+Degree=0
+Feather=25
+Strength=0.59999999999999998
+CenterX=0
+CenterY=0
+
+[Locallab]
+Enabled=false
+Selspot=0
+
+[PCVignette]
+Enabled=false
+Strength=0.59999999999999998
+Feather=50
+Roundness=50
+
+[CACorrection]
+Red=0
+Blue=0
+
+[Vignetting Correction]
+Amount=0
+Radius=50
+Strength=1
+CenterX=0
+CenterY=0
+
+[Resize]
+Enabled=false
+Scale=1
+AppliesTo=Cropped area
+Method=Lanczos
+DataSpecified=3
+Width=900
+Height=900
+LongEdge=900
+ShortEdge=900
+AllowUpscaling=false
+
+[PostDemosaicSharpening]
+Enabled=true
+Contrast=10
+AutoContrast=true
+AutoRadius=true
+DeconvRadius=0.75
+DeconvRadiusOffset=0
+DeconvIterCheck=true
+DeconvIterations=20
+
+[PostResizeSharpening]
+Enabled=false
+Contrast=15
+Method=rld
+Radius=0.5
+Amount=200
+Threshold=20;80;2000;1200;
+OnlyEdges=false
+EdgedetectionRadius=1.8999999999999999
+EdgeTolerance=1800
+HalocontrolEnabled=false
+HalocontrolAmount=85
+DeconvRadius=0.45000000000000001
+DeconvAmount=100
+DeconvDamping=0
+DeconvIterations=100
+
+[Color Management]
+InputProfile=(cameraICC)
+ToneCurve=false
+ApplyLookTable=true
+ApplyBaselineExposureOffset=true
+ApplyHueSatMap=true
+DCPIlluminant=0
+WorkingProfile=ProPhoto
+WorkingTRC=none
+Will=def
+Wprim=def
+Wcat=brad
+WorkingTRCGamma=2.3999999999999999
+WorkingTRCSlope=12.92
+Wmidtcie=0
+Wsmoothcie=false
+Redx=0.73470000000000002
+Redy=0.26529999999999998
+Grex=0.15959999999999999
+Grey=0.84040000000000004
+Blux=0.036600000000000001
+Bluy=0.0001
+Refi=0
+Shiftx=0
+Shifty=0
+LabGridcieALow=0.51763000000000003
+LabGridcieBLow=-0.33582000000000001
+LabGridcieAHigh=-0.75163000000000002
+LabGridcieBHigh=-0.81799999999999995
+LabGridcieGx=-0.69164000000000003
+LabGridcieGy=-0.70909
+LabGridcieWx=-0.18964
+LabGridcieWy=-0.16636000000000001
+LabGridcieMx=0
+LabGridcieMy=0
+Preser=0
+Fbw=false
+TrcExp=false
+Gamut=true
+OutputProfile=RTv4_sRGB
+aIntent=Relative
+OutputProfileIntent=Relative
+OutputBPC=true
+
+[Wavelet]
+Enabled=false
+Strength=100
+Balance=0
+Sigmafin=1
+Sigmaton=1
+Sigmacol=1
+Sigmadir=1
+Rangeab=20
+Protab=0
+Iter=0
+MaxLev=7
+TilesMethod=full
+complexMethod=normal
+mixMethod=mix
+sliMethod=sli
+quaMethod=cons
+DaubMethod=4_
+ChoiceLevMethod=all
+BackMethod=grey
+LevMethod=4
+DirMethod=all
+CBgreenhigh=0
+CBgreenmed=0
+CBgreenlow=0
+CBbluehigh=0
+CBbluemed=0
+CBbluelow=0
+Ballum=7
+Sigm=1
+Levden=0
+Thrden=0
+Limden=0
+Balchrom=0
+Chromfine=0
+Chromcoarse=0
+MergeL=20
+MergeC=20
+Softrad=0
+Softradend=0
+Strend=50
+Detend=0
+Thrend=0
+Expcontrast=false
+Expchroma=false
+Expedge=false
+expbl=false
+Expresid=false
+Expfinal=false
+Exptoning=false
+Expnoise=false
+Expclari=false
+LabGridALow=0
+LabGridBLow=0
+LabGridAHigh=0
+LabGridBHigh=0
+Contrast1=0
+Contrast2=0
+Contrast3=0
+Contrast4=0
+Contrast5=0
+Contrast6=0
+Contrast7=0
+Contrast8=0
+Contrast9=0
+Chroma1=0
+Chroma2=0
+Chroma3=0
+Chroma4=0
+Chroma5=0
+Chroma6=0
+Chroma7=0
+Chroma8=0
+Chroma9=0
+ContExtra=0
+HSMethod=with
+HLRange=50;75;100;98;
+SHRange=0;2;50;25;
+Edgcont=0;10;75;40;
+Level0noise=0;0;
+Level1noise=0;0;
+Level2noise=0;0;
+Level3noise=0;0;
+Leveldenoise=0;0;
+Levelsigm=1;1;
+ThresholdHighlight=4
+ThresholdShadow=5
+Edgedetect=90
+Edgedetectthr=20
+EdgedetectthrHi=0
+Edgesensi=60
+Edgeampli=10
+ThresholdChroma=5
+CHromaMethod=without
+Medgreinf=less
+Ushamethod=clari
+CHSLromaMethod=SL
+EDMethod=CU
+NPMethod=none
+BAMethod=none
+TMMethod=cont
+ChromaLink=0
+ContrastCurve=1;0;0.25;0.34999999999999998;0.34999999999999998;0.5;0.75;0.34999999999999998;0.34999999999999998;0.90000000000000002;0;0.34999999999999998;0.34999999999999998;
+blcurve=1;0;0;0;0.34999999999999998;0.5;0;0.34999999999999998;0.34999999999999998;1;0;0.34999999999999998;0.34999999999999998;
+Pastlev=0;2;30;20;
+Satlev=30;45;130;100;
+OpacityCurveRG=1;0;0.5;0.34999999999999998;0.34999999999999998;1;0.5;0.34999999999999998;0.34999999999999998;
+OpacityCurveBY=1;0;0.5;0.34999999999999998;0.34999999999999998;1;0.5;0.34999999999999998;0.34999999999999998;
+wavdenoise=1;0;1;0.34999999999999998;0.34999999999999998;0.5;1;0.34999999999999998;0.34999999999999998;1;1;0.34999999999999998;0.34999999999999998;
+wavdenoiseh=1;0;1;0.34999999999999998;0.34999999999999998;0.5;1;0.34999999999999998;0.34999999999999998;1;1;0.34999999999999998;0.34999999999999998;
+OpacityCurveW=1;0;0.34999999999999998;0.34999999999999998;0;0.34999999999999998;0.75;0.34999999999999998;0.34999999999999998;0.59999999999999998;0.75;0.34999999999999998;0.34999999999999998;1;0.34999999999999998;0;0;
+OpacityCurveWL=1;0;0.5;0.34999999999999998;0.34999999999999998;1;0.5;0.34999999999999998;0.34999999999999998;
+HHcurve=0;
+Wavguidcurve=0;
+Wavhuecurve=0;
+CHcurve=0;
+WavclCurve=0;
+Median=false
+Medianlev=false
+Linkedg=false
+CBenab=false
+Lipst=false
+Skinprotect=0
+chrwav=0
+bluwav=1
+Hueskin=-5;25;170;120;
+Edgrad=15
+Edgeffect=1
+Edgval=0
+ThrEdg=10
+AvoidColorShift=false
+Showmask=false
+Oldsh=true
+TMr=false
+Sigma=1
+Offset=1
+Lowthr=40
+ResidualcontShadow=0
+ResidualcontHighlight=0
+ThresholdResidShadow=30
+ThresholdResidHighLight=70
+Residualradius=40
+Residualchroma=0
+Residualblur=0
+Residualblurc=0
+ResidualTM=0
+ResidualEDGS=1.3999999999999999
+ResidualSCALE=1
+Residualgamma=1
+HueRangeResidual=0
+HueRange=-260;-250;-130;-140;
+Contrast=0
+
+[Spot removal]
+Enabled=false
+
+[Directional Pyramid Equalizer]
+Enabled=false
+Gamutlab=false
+cbdlMethod=bef
+Mult0=1
+Mult1=1
+Mult2=1
+Mult3=1
+Mult4=1
+Mult5=1
+Threshold=0.20000000000000001
+Skinprotect=0
+Hueskin=-5;25;170;120;
+
+[HSV Equalizer]
+Enabled=false
+HCurve=0;
+SCurve=0;
+VCurve=0;
+
+[SoftLight]
+Enabled=false
+Strength=30
+
+[Film Simulation]
+Enabled=false
+ClutFilename=
+Strength=100
+
+[RGB Curves]
+Enabled=false
+LumaMode=false
+rCurve=0;
+gCurve=0;
+bCurve=0;
+
+[ColorToning]
+Enabled=false
+Method=LabRegions
+Lumamode=true
+Twocolor=Std
+Redlow=0
+Greenlow=0
+Bluelow=0
+Satlow=0
+Balance=0
+Sathigh=0
+Redmed=0
+Greenmed=0
+Bluemed=0
+Redhigh=0
+Greenhigh=0
+Bluehigh=0
+Autosat=true
+OpacityCurve=1;0;0.29999999999999999;0.34999999999999998;0;0.25;0.80000000000000004;0.34999999999999998;0.34999999999999998;0.69999999999999996;0.80000000000000004;0.34999999999999998;0.34999999999999998;1;0.29999999999999999;0;0;
+ColorCurve=1;0.050000000000000003;0.62;0.25;0.25;0.58499999999999996;0.11;0.25;0.25;
+SatProtectionThreshold=30
+SaturatedOpacity=80
+Strength=50
+HighlightsColorSaturation=60;80;
+ShadowsColorSaturation=80;208;
+ClCurve=3;0;0;0.34999999999999998;0.65000000000000002;1;1;
+Cl2Curve=3;0;0;0.34999999999999998;0.65000000000000002;1;1;
+LabGridALow=0
+LabGridBLow=0
+LabGridAHigh=0
+LabGridBHigh=0
+LabRegionA_1=0
+LabRegionB_1=0
+LabRegionSaturation_1=0
+LabRegionSlope_1=1
+LabRegionOffset_1=0
+LabRegionPower_1=1
+LabRegionHueMask_1=1;0.16666666699999999;1;0.34999999999999998;0.34999999999999998;0.82877752459999998;1;0.34999999999999998;0.34999999999999998;
+LabRegionChromaticityMask_1=1;0;1;0.34999999999999998;0.34999999999999998;1;1;0.34999999999999998;0.34999999999999998;
+LabRegionLightnessMask_1=1;0;1;0.34999999999999998;0.34999999999999998;1;1;0.34999999999999998;0.34999999999999998;
+LabRegionMaskBlur_1=0
+LabRegionChannel_1=-1
+LabRegionsShowMask=-1
+
+[RAW]
+DarkFrame=
+DarkFrameAuto=false
+FlatFieldFile=
+FlatFieldAutoSelect=false
+FlatFieldFromMetaData=false
+FlatFieldBlurRadius=32
+FlatFieldBlurType=Area Flatfield
+FlatFieldAutoClipControl=false
+FlatFieldClipControl=0
+CA=true
+CAAvoidColourshift=true
+CAAutoIterations=2
+CARed=0
+CABlue=0
+HotPixelFilter=false
+DeadPixelFilter=false
+HotDeadPixelThresh=100
+PreExposure=1
+
+[RAW Bayer]
+Method=amaze
+Border=4
+ImageNum=1
+CcSteps=0
+PreBlack0=0
+PreBlack1=0
+PreBlack2=0
+PreBlack3=0
+PreTwoGreen=true
+LineDenoise=0
+LineDenoiseDirection=3
+GreenEqThreshold=0
+DCBIterations=2
+DCBEnhance=true
+LMMSEIterations=2
+DualDemosaicAutoContrast=true
+DualDemosaicContrast=20
+PixelShiftMotionCorrectionMethod=1
+PixelShiftEperIso=0
+PixelShiftSigma=1
+PixelShiftShowMotion=false
+PixelShiftShowMotionMaskOnly=false
+pixelShiftHoleFill=true
+pixelShiftAverage=false
+pixelShiftMedian=false
+pixelShiftGreen=true
+pixelShiftBlur=true
+pixelShiftSmoothFactor=0.69999999999999996
+pixelShiftEqualBright=false
+pixelShiftEqualBrightChannel=false
+pixelShiftNonGreenCross=true
+pixelShiftDemosaicMethod=amaze
+PDAFLinesFilter=false
+
+[RAW X-Trans]
+Method=3-pass (best)
+DualDemosaicAutoContrast=true
+DualDemosaicContrast=20
+Border=7
+CcSteps=0
+PreBlackRed=0
+PreBlackGreen=0
+PreBlackBlue=0
+
+[MetaData]
+Mode=1
+ExifKeys=Exif.Image.Copyright;Exif.Image.Artist;Exif.Image.ImageDescription;Exif.Photo.UserComment;Exif.Image.Make;Exif.Image.Model;Exif.Photo.LensModel;Exif.Photo.FNumber;Exif.Photo.ExposureTime;Exif.Photo.FocalLength;Exif.Photo.ISOSpeedRatings;Exif.Photo.ExposureBiasValue;Exif.Photo.Flash;Exif.Photo.DateTimeOriginal;Exif.Image.XResolution;Exif.Image.YResolution;
+
+[Film Negative]
+Enabled=false
+RedRatio=1.3600000000000001
+GreenExponent=1.5
+BlueRatio=0.85999999999999999
+ColorSpace=1
+RefInput=0;0;0;
+RefOutput=0;0;0;
+
+[RAW Preprocess WB]
+Mode=1
--- a/test_burst1/im_02.dng
+++ b/test_burst1/im_02.dng
--- a/test_burst1/im_03.dng
+++ b/test_burst1/im_03.dng
--- a/test_burst1/im_04.dng
+++ b/test_burst1/im_04.dng
--- a/test_burst1/im_05.dng
+++ b/test_burst1/im_05.dng
--- a/test_burst1/im_06.dng
+++ b/test_burst1/im_06.dng
--- a/test_burst1/im_07.dng
+++ b/test_burst1/im_07.dng
--- a/test_burst1/im_08.dng
+++ b/test_burst1/im_08.dng
--- a/test_burst1/im_09.dng
+++ b/test_burst1/im_09.dng
--- a/test_burst1/im_10.dng
+++ b/test_burst1/im_10.dng
--- a/test_burst1/im_11.dng
+++ b/test_burst1/im_11.dng
--- a/test_burst1/im_12.dng
+++ b/test_burst1/im_12.dng
--- a/test_burst1/im_13.dng
+++ b/test_burst1/im_13.dng
--- a/test_burst1/readme.txt
+++ b/test_burst1/readme.txt
@ -0,0 +1 @@
+Download and unzip the test burst of 13 images taken with the Samsung S8 camera at https://drive.google.com/file/d/1ot0E6guY5AacM-I6-GffHqFzykVb22wV/view?usp=share_link
--- a/x.png
+++ b/x.png
				`@ -0,0 +1 @@`
				`Download and unzip the test burst of 13 images taken with the Samsung S8 camera at https://drive.google.com/file/d/1ot0E6guY5AacM-I6-GffHqFzykVb22wV/view?usp=share_link`