Andrey Tkachenko 2024-11-01 14:25:58 +04:00
commit e79a8329ea
18 changed files with 3509 additions and 0 deletions

1
.gitignore vendored Normal file

@@ -0,0 +1 @@
/target

1781
Cargo.lock generated Normal file

File diff suppressed because it is too large

7
Cargo.toml Normal file

@@ -0,0 +1,7 @@
[package]
name = "rax"
version = "0.1.0"
edition = "2021"
[dependencies]
cubecl = { git = "https://github.com/tracel-ai/cubecl.git", features = ["wgpu", "cuda"] }

1
examples/align.rs Normal file

@@ -0,0 +1 @@
fn main() {}

20
src/alignment.rs Normal file

@@ -0,0 +1,20 @@
use crate::misc::DistanceKind;
pub mod block_matching;
pub mod gauss_pyramid;
pub mod local_search;
pub mod upsample_alignments;
pub struct AlignLevelConfig {
pub factor: u32,
pub upsampling_factor: u32,
pub tile_size: u32,
pub search_radius: u32,
pub distance: DistanceKind,
}
pub struct AlignmentConfig {
pub levels: Vec<AlignLevelConfig>,
}
pub struct Alignments {}
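// A minimal sketch of a two-level configuration, listed fine-to-coarse as
// `build_pyramid` expects; the concrete numbers are illustrative assumptions,
// not defaults shipped in this commit:
//
// let config = AlignmentConfig {
//     levels: vec![
//         AlignLevelConfig { factor: 2, upsampling_factor: 2, tile_size: 16, search_radius: 2, distance: DistanceKind::L1 },
//         AlignLevelConfig { factor: 4, upsampling_factor: 4, tile_size: 32, search_radius: 4, distance: DistanceKind::L2 },
//     ],
// };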

96
src/alignment/block_matching.rs Normal file

@@ -0,0 +1,96 @@
use cubecl::{client::ComputeClient, Runtime};
use crate::image::{Alignments, GrayImage};
use super::{
gauss_pyramid::build_pyramid, local_search::local_search,
upsample_alignments::upsample_alignments, AlignLevelConfig,
};
///
/// Aligns `img` against the reference image: returns a patchwise flow such that,
/// for patch indices (py, px):
/// img[py, px] ~= ref_img[py + alignments[py, px, 1],
///                        px + alignments[py, px, 0]]
///
/// Parameters
/// ----------
/// img : Img [imshape_y, imshape_x]
///     Image to be aligned against the reference
///
/// ref_pyramid : [Img]
///     Pyramid representation of the reference image J_1
///
/// Returns
/// -------
/// alignments : Alignments
///     a device array[n_patches_y, n_patches_x, 2]
///     Patchwise flow: V_n(p) for each patch p
///
pub fn align_image_block_matching<R: Runtime>(
cube: &ComputeClient<R::Server, R::Channel>,
img: &GrayImage,
ref_pyramid: &[GrayImage],
level_configs: &[AlignLevelConfig],
) -> Alignments {
let alt_pyramid = build_pyramid::<R>(cube, img, level_configs.iter().map(|x| x.factor));
// Align alternate image to the reference image
let mut alignments = None;
let cfg_iter = level_configs.iter().rev();
let mut prev_tile_size = 64;
for (lv, cfg) in cfg_iter.enumerate() {
alignments = Some(align_on_a_level::<R>(
cube,
&ref_pyramid[lv],
&alt_pyramid[lv],
cfg,
alignments.as_ref(),
prev_tile_size,
));
prev_tile_size = cfg.tile_size;
}
alignments.unwrap()
}
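// Reading the result (a sketch; the index values are illustrative): with
// tile_size = 16, alignments[3, 5] == [2.0, -1.0] means the img patch whose
// top-left corner is at (y, x) = (48, 80) best matches ref_img shifted by
// +2 px in x and -1 px in y, following the convention in the doc comment above.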
///
/// The alignment produced by this function is always integer-valued, but it is
/// stored as DEFAULT_FLOAT_TYPE. This allows the output alignment to be used
/// directly for ICA without any casting from int to float.
///
pub(crate) fn align_on_a_level<R: Runtime>(
cube: &ComputeClient<R::Server, R::Channel>,
ref_pyramid_lvl: &GrayImage,
alt_pyramid_lvl: &GrayImage,
cfg: &AlignLevelConfig,
prev: Option<&Alignments>,
prev_tile_size: u32,
) -> Alignments {
let (ref_h, ref_w) = ref_pyramid_lvl.shape();
// Number of patches that can fit on this level
let h = ref_h / cfg.tile_size;
let w = ref_w / cfg.tile_size;
let mut alignments = Alignments::new_empty::<R>(cube, w, h);
if let Some(prev) = prev {
// use the upsampled previous alignments as initial guesses
upsample_alignments::<R>(
cube,
&mut alignments,
ref_pyramid_lvl,
alt_pyramid_lvl,
prev,
prev_tile_size,
cfg,
)
}
local_search::<R>(cube, &mut alignments, ref_pyramid_lvl, alt_pyramid_lvl, cfg);
alignments
}

71
src/alignment/gauss_pyramid.rs Normal file

@@ -0,0 +1,71 @@
use cubecl::{client::ComputeClient, Runtime};
use crate::image::{downsample::downsample, padding::pad, GrayImage};
///
/// Construct an N-level coarse-to-fine Gaussian pyramid
///
/// Args:
/// image: input image (expected to be a grayscale image downsampled from a Bayer raw image)
/// factors: [int], downsampling factors (fine-to-coarse)
///
pub(crate) fn build_pyramid<R: Runtime>(
cube: &ComputeClient<R::Server, R::Channel>,
img: &GrayImage,
factors: impl Iterator<Item = u32>,
) -> Vec<GrayImage> {
// Subsequent pyramid levels are successively created
// with convolution by a kernel followed by downsampling
let init = Vec::with_capacity(factors.size_hint().1.unwrap_or(factors.size_hint().0));
let mut levels = factors.fold(init, |mut acc, factor| {
acc.push(downsample::<R>(cube, acc.last().unwrap_or(img), factor));
acc
});
// Reverse the pyramid to get it coarse-to-fine
levels.reverse();
levels
}
///
/// Returns the pyramid representation of `img`, that will be used for
/// future block matching
///
/// Parameters
/// ----------
/// img : Img grayscale input image
/// factors : [int] pyramid scaling factors
/// tile_size : int size of the tile
///
/// Returns
/// -------
/// pyramid : [Img]
/// pyramid representation of the image
///
pub fn build_gauss_pyramid<R: Runtime>(
cube: &ComputeClient<R::Server, R::Channel>,
img: &GrayImage,
factors: impl Iterator<Item = u32>,
tile_size: u32,
) -> Vec<GrayImage> {
let (h, w) = img.shape();
// if needed, pad images with zeros so that getTiles contains all image pixels
let padding_patches_height = (tile_size - h % tile_size) % tile_size;
let padding_patches_width = (tile_size - w % tile_size) % tile_size;
if padding_patches_width != 0 || padding_patches_height != 0 {
let img = pad::<R>(
cube,
img,
0,
padding_patches_width,
0,
padding_patches_height,
);
build_pyramid::<R>(cube, &img, factors)
} else {
build_pyramid::<R>(cube, img, factors)
}
}
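// Worked example of the padding arithmetic above: with tile_size = 16 and a
// 1080 x 1920 image, 1080 % 16 == 8 so 8 rows of zeros are appended, while
// 1920 % 16 == 0 requires no extra columns.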

114
src/alignment/local_search.rs Normal file

@@ -0,0 +1,114 @@
use cubecl::{
client::ComputeClient,
prelude::{Float, Line, ScalarArg, Tensor, ABSOLUTE_POS_X, ABSOLUTE_POS_Y},
CubeCount, CubeDim, Runtime,
};
use crate::{
alignment::upsample_alignments::cube_compute_dist,
image::{Alignments, GrayImage},
misc::DistanceKind,
};
use super::AlignLevelConfig;
pub fn local_search<R: Runtime>(
cube: &ComputeClient<R::Server, R::Channel>,
upsampled_alignments: &mut Alignments,
ref_pyramid_lvl: &GrayImage,
alt_pyramid_lvl: &GrayImage,
cfg: &AlignLevelConfig,
) {
let threadsperblock = CubeDim::default();
let h = upsampled_alignments.shape()[0] as u32;
let w = upsampled_alignments.shape()[1] as u32;
let blockspergrid_x = (w + threadsperblock.x - 1) / threadsperblock.x;
let blockspergrid_y = (h + threadsperblock.y - 1) / threadsperblock.y;
unsafe {
cube_local_search::launch_unchecked::<R>(
cube,
CubeCount::Static(blockspergrid_x, blockspergrid_y, 1),
threadsperblock,
ref_pyramid_lvl.as_tensor_arg().as_tensor_arg(1),
alt_pyramid_lvl.as_tensor_arg().as_tensor_arg(1),
upsampled_alignments.as_tensor_arg().as_tensor_arg(1),
ScalarArg::new(cfg.tile_size),
ScalarArg::new(cfg.search_radius as _),
cfg.distance,
)
}
}
#[cubecl::cube(launch_unchecked)]
#[allow(clippy::identity_op)]
fn cube_local_search(
ref_pyramid_lvl: &Tensor<f32>,
alt_pyramid_lvl: &Tensor<f32>,
upsampled_alignments: &mut Tensor<f32>,
tile_size: u32,
search_radius: i32,
#[comptime] distance: DistanceKind,
) {
let n_patchs_y = upsampled_alignments.shape(0);
let n_patchs_x = upsampled_alignments.shape(1);
let tile_x = ABSOLUTE_POS_X;
let tile_y = ABSOLUTE_POS_Y;
if tile_y >= n_patchs_y || tile_x >= n_patchs_x {
return;
}
let offset = tile_y * upsampled_alignments.stride(0) + tile_x * upsampled_alignments.stride(1);
let mut local_flow = Line::empty(2);
local_flow[0] = upsampled_alignments[offset + 0];
local_flow[1] = upsampled_alignments[offset + 1];
// position of the pixel in the top left corner of the patch
let patch_pos_x = tile_x * tile_size;
let patch_pos_y = tile_y * tile_size;
// this should be rewritten to allow patches bigger than 32 (local_ref is a fixed 32 * 32 = 1024 buffer)
let mut local_ref = Line::empty(1024);
for i in 0..tile_size {
for j in 0..tile_size {
let idx = patch_pos_x + j;
let idy = patch_pos_y + i;
local_ref[i * 32 + j] = ref_pyramid_lvl[idy * ref_pyramid_lvl.stride(0) + idx];
}
}
let mut min_dist = f32::new(f32::INFINITY); // init as infty
let mut min_shift_y: i32 = 0;
let mut min_shift_x: i32 = 0;
// window search
for search_shift_y in -search_radius..=search_radius {
for search_shift_x in -search_radius..=search_radius {
// computing dist
let dist = cube_compute_dist(
&local_ref,
alt_pyramid_lvl,
&local_flow,
patch_pos_x as i32 + search_shift_x,
patch_pos_y as i32 + search_shift_y,
tile_size,
distance,
);
if dist < min_dist {
min_dist = dist;
min_shift_y = search_shift_y;
min_shift_x = search_shift_x;
}
}
}
upsampled_alignments[offset + 0] = local_flow[0] + min_shift_x as f32;
upsampled_alignments[offset + 1] = local_flow[1] + min_shift_y as f32;
}
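// Cost note: each tile evaluates (2 * search_radius + 1)^2 candidate offsets,
// e.g. search_radius = 4 means 81 cube_compute_dist evaluations of
// tile_size * tile_size pixels each.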

263
src/alignment/upsample_alignments.rs Normal file

@@ -0,0 +1,263 @@
use cubecl::{
client::ComputeClient,
prelude::{Abs, Float, Line, Max, Min, ScalarArg, Tensor, ABSOLUTE_POS_X, ABSOLUTE_POS_Y},
CubeCount, CubeDim, Runtime,
};
use crate::{
image::{Alignments, GrayImage},
misc::DistanceKind,
};
use super::AlignLevelConfig;
///
/// Upsample alignments to adapt them to the next pyramid level (Section 3.2 of the IPOL article).
///
pub(crate) fn upsample_alignments<R: Runtime>(
cube: &ComputeClient<R::Server, R::Channel>,
dst: &mut Alignments,
ref_pyramid_lvl: &GrayImage,
alt_pyramid_lvl: &GrayImage,
prev_alignments: &Alignments,
prev_tile_size: u32,
cfg: &AlignLevelConfig,
) {
let threadsperblock = CubeDim::default();
let n_tiles_y_new = dst.shape()[0] as u32;
let n_tiles_x_new = dst.shape()[1] as u32;
let blockspergrid_x = (n_tiles_x_new + threadsperblock.x - 1) / threadsperblock.x;
let blockspergrid_y = (n_tiles_y_new + threadsperblock.y - 1) / threadsperblock.y;
unsafe {
cube_upsample_alignments::launch_unchecked::<R>(
cube,
CubeCount::Static(blockspergrid_x, blockspergrid_y, 1),
threadsperblock,
ref_pyramid_lvl.as_tensor_arg().as_tensor_arg(1),
alt_pyramid_lvl.as_tensor_arg().as_tensor_arg(1),
prev_alignments.as_tensor_arg().as_tensor_arg(1),
dst.as_tensor_arg().as_tensor_arg(1),
ScalarArg::new(cfg.upsampling_factor),
ScalarArg::new(cfg.tile_size),
ScalarArg::new(prev_tile_size),
)
}
}
#[allow(clippy::identity_op)]
#[cubecl::cube(launch_unchecked)]
fn cube_upsample_alignments(
ref_pyramid_lvl: &Tensor<f32>,
alt_pyramid_lvl: &Tensor<f32>,
previous_alignments: &Tensor<f32>,
upsampled_alignments: &mut Tensor<f32>,
upsampling_factor: u32,
tile_size: u32,
prev_tile_size: u32,
) {
let subtile_x = ABSOLUTE_POS_X;
let subtile_y = ABSOLUTE_POS_Y;
let n_tiles_y_prev = previous_alignments.shape(0);
let n_tiles_x_prev = previous_alignments.shape(1);
let n_tiles_y_new = upsampled_alignments.shape(0);
let n_tiles_x_new = upsampled_alignments.shape(1);
let w = ref_pyramid_lvl.shape(1);
let repeat_factor = upsampling_factor / (tile_size / prev_tile_size);
if subtile_x >= n_tiles_x_new || subtile_y >= n_tiles_y_new {
return;
}
let des_offset =
subtile_y * upsampled_alignments.stride(0) + subtile_x * upsampled_alignments.stride(1);
// the new subtile is on the side of the image, and is not contained within a bigger old tile
if subtile_x >= repeat_factor * n_tiles_x_prev || subtile_y >= repeat_factor * n_tiles_y_prev {
upsampled_alignments[des_offset + 0] = 0.;
upsampled_alignments[des_offset + 1] = 0.;
return;
}
let prev_tile_x = subtile_x / repeat_factor;
let prev_tile_y = subtile_y / repeat_factor;
// position of the top left pixel in the subtile
let subtile_pos_y = subtile_y * tile_size;
let subtile_pos_x = subtile_x * tile_size;
// copying ref patch into local memory, because it needs to be read 3 times
// this should be rewritten to allow patches bigger than 32 (local_ref is a fixed 32 * 32 = 1024 buffer)
let mut local_ref = Line::empty(1024);
for i in 0..tile_size {
for j in 0..tile_size {
let idx = subtile_pos_x + j;
let idy = subtile_pos_y + i;
local_ref[i * 32 + j] = ref_pyramid_lvl[idy * w + idx];
}
}
// position of the new tile within the old tile
let ups_subtile_x = subtile_x % repeat_factor;
let ups_subtile_y = subtile_y % repeat_factor;
// computing the ids of the 3 closest patches
let x_shift = if 2 * ups_subtile_x + 1 > repeat_factor {
1i32
} else {
-(1i32)
};
let y_shift = if 2 * ups_subtile_y + 1 > repeat_factor {
1i32
} else {
-(1i32)
};
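// Worked example: with repeat_factor = 4, subtiles 0 and 1 inside an old tile
// satisfy 2 * ups_subtile + 1 <= 4 and take shift = -1 (previous neighbour),
// while subtiles 2 and 3 give 2 * ups_subtile + 1 > 4 and take shift = +1.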
// Choosing the best of the 3 alignments by minimising L1 dist
let mut dist = f32::new(f32::INFINITY);
let mut optimal_flow_x = 0.0f32;
let mut optimal_flow_y = 0.0f32;
// The 3 candidate alignments are fetched as early as possible: this way they
// may arrive from global memory before they are needed, since other
// computation is performed during the latency.
let candidate_alignment_0_shift = cube_alignment_flow(
previous_alignments,
prev_tile_x,
prev_tile_y,
upsampling_factor as f32,
);
let candidate_alignment_vert_shift = cube_alignment_flow(
previous_alignments,
prev_tile_x,
clamp(prev_tile_y as i32 + y_shift, 0, n_tiles_y_prev as i32 - 1) as u32,
upsampling_factor as f32,
);
let candidate_alignment_horizontal_shift = cube_alignment_flow(
previous_alignments,
clamp(prev_tile_x as i32 + x_shift, 0, n_tiles_x_prev as i32 - 1) as u32,
prev_tile_y,
upsampling_factor as f32,
);
// 0 shift
let dist_ = cube_compute_dist(
&local_ref,
alt_pyramid_lvl,
&candidate_alignment_0_shift,
subtile_pos_x as i32,
subtile_pos_y as i32,
tile_size,
DistanceKind::L1,
);
if dist_ < dist {
dist = dist_;
optimal_flow_x = candidate_alignment_0_shift[0];
optimal_flow_y = candidate_alignment_0_shift[1];
}
// vertical shift
let dist_ = cube_compute_dist(
&local_ref,
alt_pyramid_lvl,
&candidate_alignment_vert_shift,
subtile_pos_x as i32,
subtile_pos_y as i32,
tile_size,
DistanceKind::L1,
);
if dist_ < dist {
dist = dist_;
optimal_flow_x = candidate_alignment_vert_shift[0];
optimal_flow_y = candidate_alignment_vert_shift[1];
}
// horizontal shift
let dist_ = cube_compute_dist(
&local_ref,
alt_pyramid_lvl,
&candidate_alignment_horizontal_shift,
subtile_pos_x as i32,
subtile_pos_y as i32,
tile_size,
DistanceKind::L1,
);
if dist_ < dist {
optimal_flow_x = candidate_alignment_horizontal_shift[0];
optimal_flow_y = candidate_alignment_horizontal_shift[1];
}
// applying best flow
upsampled_alignments[des_offset + 0] = optimal_flow_x;
upsampled_alignments[des_offset + 1] = optimal_flow_y;
}
#[cubecl::cube]
fn clamp(a: i32, min: i32, max: i32) -> i32 {
// Clamp::clamp(a, min, max)
Min::min(Max::max(a, min), max)
}
#[cubecl::cube]
#[allow(clippy::identity_op)]
fn cube_alignment_flow(
previous_alignments: &Tensor<f32>,
x: u32,
y: u32,
factor: f32,
) -> Line<f32> {
let offset = y * previous_alignments.stride(0) + x * previous_alignments.stride(1);
let mut candidate_alignment: Line<f32> = Line::empty(2);
candidate_alignment[0] = previous_alignments[offset + 0] * factor;
candidate_alignment[1] = previous_alignments[offset + 1] * factor;
candidate_alignment
}
#[cubecl::cube]
pub(crate) fn cube_compute_dist(
local_ref: &Line<f32>,
alt_pyramid_lvl: &Tensor<f32>,
candidate_alignment: &Line<f32>,
pos_x: i32,
pos_y: i32,
tile_size: u32,
#[comptime] distance: DistanceKind,
) -> f32 {
let h = alt_pyramid_lvl.shape(0);
let w = alt_pyramid_lvl.shape(1);
let mut dist = 0.0f32;
for i in 0..tile_size {
for j in 0..tile_size {
let new_idx = pos_x + j as i32 + candidate_alignment[0] as i32;
let new_idy = pos_y + i as i32 + candidate_alignment[1] as i32;
if (0 <= new_idx && new_idx < w as i32) && (0 <= new_idy && new_idy < h as i32) {
let alt_offset = new_idy as u32 * w + new_idx as u32;
let diff = local_ref[i * 32 + j] - alt_pyramid_lvl[alt_offset];
dist += match distance {
DistanceKind::L1 => Abs::abs(diff),
DistanceKind::L2 => diff * diff,
};
} else {
dist = f32::new(f32::INFINITY)
}
}
}
dist
}

69
src/gat.rs Normal file

@@ -0,0 +1,69 @@
use cubecl::prelude::{Max, Sqrt, Tensor, ABSOLUTE_POS_X, ABSOLUTE_POS_Y};
use crate::{image::GrayImage, kernels::NoiseConfig};
///
/// Generalized Anscombe Transform
/// noise model: std^2 = alpha * I + beta,
/// where alpha and beta are ISO-dependent.
///
/// Parameters
/// ----------
/// image : GrayImage
///     input image to variance-stabilize
/// alpha : float
/// value of alpha for the given iso
/// iso : float
/// ISO value
/// beta : float
/// Value of beta for the given iso
///
/// Returns
/// -------
/// VST_image : GrayImage
/// input image with stabilized variance
///
pub fn generalized_anscombe_transform(image: &GrayImage, nc: NoiseConfig) -> GrayImage {
let _ = nc;
let _ = image;
// assert len(image.shape) == 2
// imshape_y, imshape_x = image.shape
// VST_image = cuda.device_array(image.shape, DEFAULT_NUMPY_FLOAT_TYPE)
// threadsperblock = (DEFAULT_THREADS, DEFAULT_THREADS)
// blockspergrid_x = math.ceil(imshape_x/threadsperblock[1])
// blockspergrid_y = math.ceil(imshape_y/threadsperblock[0])
// blockspergrid = (blockspergrid_x, blockspergrid_y)
// cuda_GAT[blockspergrid, threadsperblock](image, VST_image,
// alpha, beta)
// return VST_image
todo!()
}
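// The per-pixel transform applied by cuda_gat below is the GAT closed form:
// T(I) = (2 / alpha) * sqrt(max(0, alpha * I + (3/8) * alpha^2 + beta)),
// which maps noise of variance alpha * I + beta to approximately unit variance.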
#[cubecl::cube]
fn cuda_gat(image: &Tensor<f32>, vst_image: &mut Tensor<f32>, alpha: f32, beta: f32) {
let x = ABSOLUTE_POS_X;
let y = ABSOLUTE_POS_Y;
let imshape_y = image.shape(0);
let imshape_x = image.shape(1);
if y >= imshape_y || x >= imshape_x {
return;
}
let offset = y * image.stride(0) + x;
// ISO should not appear here, since alpha and beta are
// already ISO-dependent.
let vst = Max::max(
0.0,
alpha * image[offset] + 3.0 / 8.0 * alpha * alpha + beta,
);
vst_image[offset] = 2.0 / alpha * Sqrt::sqrt(vst);
}

133
src/image.rs Normal file

@@ -0,0 +1,133 @@
use std::marker::PhantomData;
use cubecl::{
client::ComputeClient,
prelude::{CubePrimitive, TensorHandleRef},
server::Handle,
Runtime,
};
pub mod convolution;
pub mod downsample;
pub mod padding;
#[derive(Debug)]
pub struct Tensor<T: CubePrimitive, const R: usize> {
pub(crate) handle: Handle,
pub(crate) shape: [usize; R],
pub(crate) stride: [usize; R],
_m: PhantomData<T>,
}
impl<T: CubePrimitive, const R: usize> Tensor<T, R> {
pub fn new_empty<Rt: Runtime>(
cube: &ComputeClient<Rt::Server, Rt::Channel>,
width: u32,
height: u32,
) -> Self {
let _ = height;
let _ = width;
let _ = cube;
todo!()
}
#[inline]
pub fn shape(&self) -> [usize; R] {
self.shape
}
#[inline]
pub fn stride(&self) -> [usize; R] {
self.stride
}
#[inline]
pub fn as_tensor_arg<Rt: Runtime>(&self) -> TensorHandleRef<'_, Rt> {
TensorHandleRef {
handle: &self.handle,
strides: &self.stride,
shape: &self.shape,
runtime: PhantomData,
}
}
#[inline]
pub fn from_slice<Rt: Runtime>(
cube: &ComputeClient<Rt::Server, Rt::Channel>,
data: &[T],
) -> Self {
let _ = data;
let _ = cube;
todo!()
}
}
pub struct ImageBuffer<T: CubePrimitive, const R: usize, const S: usize> {
data: Tensor<T, R>,
width: u32,
height: u32,
}
impl<T: CubePrimitive, const R: usize, const S: usize> std::ops::Deref for ImageBuffer<T, R, S> {
type Target = Tensor<T, R>;
fn deref(&self) -> &Self::Target {
&self.data
}
}
impl<T: CubePrimitive, const R: usize, const S: usize> std::ops::DerefMut for ImageBuffer<T, R, S> {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.data
}
}
impl<T: CubePrimitive, const R: usize, const S: usize> ImageBuffer<T, R, S> {
pub fn from_tensor(data: Tensor<T, R>, width: u32, height: u32) -> Self {
assert!(height as usize <= data.shape[0]);
assert!(width as usize <= data.shape[1] * S);
Self {
data,
width,
height,
}
}
#[inline]
pub fn shape(&self) -> (u32, u32) {
(self.height, self.width)
}
#[inline]
pub fn height(&self) -> u32 {
self.height
}
#[inline]
pub fn width(&self) -> u32 {
self.width
}
pub fn clone<Rt: Runtime>(&self, cube: &ComputeClient<Rt::Server, Rt::Channel>) -> Self {
let _ = cube;
todo!()
}
}
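// Reading the aliases below: the const generic R is the tensor rank, and S
// appears to be the number of samples packed per stored element (e.g. three
// 10-bit values per u32), which the width assertion in from_tensor accounts for.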
// Bayer Raw Images
pub type Raw16Image = ImageBuffer<u32, 2, 2>;
pub type Raw10Image = ImageBuffer<u32, 2, 3>;
// Gray Images
pub type GrayImage = ImageBuffer<f32, 2, 1>;
pub type Gray10Image = ImageBuffer<u32, 2, 3>;
pub type Gray16Image = ImageBuffer<u32, 2, 2>;
// Rgb Images
pub type RgbImage = ImageBuffer<f32, 3, 1>;
pub type Rgba8Image = ImageBuffer<u32, 2, 4>;
pub type Rgb10Image = ImageBuffer<u32, 2, 3>;
// Alignments
pub type Alignments = Tensor<f32, 3>;

61
src/image/convolution.rs Normal file

@@ -0,0 +1,61 @@
use cubecl::{
client::ComputeClient,
prelude::{Abs, Float, ABSOLUTE_POS_X, ABSOLUTE_POS_Y},
Runtime,
};
use super::Tensor;
pub fn conv1d<R: Runtime>(
cube: &ComputeClient<R::Server, R::Channel>,
img: &Tensor<f32, 2>,
kern: &Tensor<f32, 1>,
) -> Tensor<f32, 2> {
let _ = kern;
let _ = img;
let _ = cube;
todo!()
}
pub fn conv1d_t<R: Runtime>(
cube: &ComputeClient<R::Server, R::Channel>,
img: &Tensor<f32, 2>,
kern: &Tensor<f32, 1>,
) -> Tensor<f32, 2> {
let _ = kern;
let _ = img;
let _ = cube;
todo!()
}
#[cubecl::cube(launch_unchecked)]
fn cube_conv1d<F: Float>(
input: &cubecl::prelude::Tensor<F>,
kern: &cubecl::prelude::Tensor<F>,
output: &mut cubecl::prelude::Tensor<F>,
#[comptime] trans: bool,
) {
    let kern_half = kern.len() as i32 / 2;
    // highest valid index along each axis, used for mirror ("reflect") padding
    let h = output.shape(0) as i32 - 1;
    let w = output.shape(1) as i32 - 1;
    let x = ABSOLUTE_POS_X as i32;
    let y = ABSOLUTE_POS_Y as i32;
    if y > h || x > w {
        return;
    }
    // accumulate the weighted taps; `trans` is assumed to select the axis
    // (false: convolve along y for conv1d, true: along x for conv1d_t)
    let mut acc = F::new(0.0);
    for i in 0..kern.len() as i32 {
        // Abs::abs(n - Abs::abs(p - n)) reflects an out-of-range tap back in bounds
        let ox = if trans {
            Abs::abs(w - Abs::abs(x + i - kern_half - w))
        } else {
            x
        };
        let oy = if trans {
            y
        } else {
            Abs::abs(h - Abs::abs(y + i - kern_half - h))
        };
        acc += input[oy as u32 * input.stride(0) + ox as u32] * kern[i as u32];
    }
    output[y as u32 * output.stride(0) + x as u32] = acc;
}

65
src/image/downsample.rs Normal file

@@ -0,0 +1,65 @@
use cubecl::{client::ComputeClient, Runtime};
use super::{convolution::{conv1d, conv1d_t}, GrayImage, Tensor};
///
/// Apply a convolution by a kernel if required, then downsample an image.
/// Args:
/// image: Device Array the input image (WARNING: single channel only!)
/// factor: downsampling factor
///
pub fn downsample<R: Runtime>(
cube: &ComputeClient<R::Server, R::Channel>,
img: &GrayImage,
factor: u32,
) -> GrayImage {
// Special case
if factor == 1 {
return img.clone::<R>(cube);
}
// gaussian kernel std is proportional to downsampling factor
// filteredImage = gaussian_filter(image, sigma=factor * 0.5, order=0, output=None, mode='reflect')
let gaussian_kernel = create_gaussian_kernel1d(factor as f32 * 0.5, 4 * ((factor + 1) / 2));
let gaussian_kernel = Tensor::from_slice::<R>(cube, &gaussian_kernel);
let temp = conv1d::<R>(cube, &img.data, &gaussian_kernel); // convolve along y
let filtered_image = conv1d_t::<R>(cube, &temp, &gaussian_kernel); // convolve along x
// Shape of the downsampled image
let [h2, w2] = filtered_image.shape();
let (h2, w2) = (h2 as f32 / factor as f32, w2 as f32 / factor as f32);
let tensor = subsample::<R>(cube, &filtered_image, factor, factor);
GrayImage::from_tensor(tensor, w2 as _, h2 as _)
}
///
/// Computes a 1-D Gaussian convolution kernel.
///
pub fn create_gaussian_kernel1d(sigma: f32, radius: u32) -> Vec<f32> {
let sigma2 = sigma * sigma;
let mut phi_x = (-(radius as i32)..=radius as i32)
.map(|x| f32::exp(-0.5 * (x * x) as f32 / sigma2))
.collect::<Vec<_>>();
let sum: f32 = phi_x.iter().cloned().sum();
phi_x.iter_mut().for_each(|x| *x /= sum);
phi_x
}
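// For example, create_gaussian_kernel1d(1.0, 2) yields approximately
// [0.054, 0.244, 0.403, 0.244, 0.054], which sums to 1.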
pub fn subsample<R: Runtime>(
cube: &ComputeClient<R::Server, R::Channel>,
img: &Tensor<f32, 2>,
y_step: u32,
x_step: u32,
) -> Tensor<f32, 2> {
let _ = x_step;
let _ = y_step;
let _ = img;
let _ = cube;
todo!()
}

20
src/image/padding.rs Normal file

@@ -0,0 +1,20 @@
use cubecl::{client::ComputeClient, Runtime};
use super::GrayImage;
pub fn pad<R: Runtime>(
cube: &ComputeClient<R::Server, R::Channel>,
img: &GrayImage,
left: u32,
right: u32,
top: u32,
bottom: u32,
) -> GrayImage {
let _ = bottom;
let _ = top;
let _ = right;
let _ = left;
let _ = img;
let _ = cube;
todo!()
}

385
src/kernels.rs Normal file

@@ -0,0 +1,385 @@
use cubecl::prelude::{Abs, Float, Line, Max, Min, Sqrt, Tensor, ABSOLUTE_POS_X, ABSOLUTE_POS_Y};
use crate::{
gat::generalized_anscombe_transform,
image::{self, GrayImage},
};
pub struct EstimateKernelsConfig {
k_detail: f32,
k_denoise: f32,
d_th: f32,
d_tr: f32,
k_stretch: f32,
k_shrink: f32,
}
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct NoiseConfig {
pub alpha: f32,
pub beta: f32,
}
///
/// Implementation of Alg. 5: ComputeKernelCovariance
///
/// Returns the kernels covariance matrices for the frame J_n, sampled at the
/// center of every bayer quad (or at the center of every grey pixel in grey
/// mode).
///
/// Parameters
/// ----------
/// img : device Array[imshape_y, imshape_x]
/// Raw image J_n
///
/// Returns
/// -------
/// covs : device Array[imshape_y/2, imshape_x/2, 2, 2]
/// Covariance matrices Omega_n, sampled at the center of each bayer quad.
///
///
pub fn estimate_kernels(
img: &GrayImage,
cfg: EstimateKernelsConfig,
noise_cfg: NoiseConfig,
) -> image::Tensor<f32, 4> {
// Performing Variance Stabilization Transform
let img = generalized_anscombe_transform(img, noise_cfg);
// Decimate to grey
// if bayer_mode :
// img_grey = compute_grey_images_(img, method="decimating")
// else :
// img_grey = img # no need to copy now, they will be copied to gpu later.
let img_gray = img;
let (grey_imshape_y, grey_imshape_x) = img_gray.shape();
// Computing grads
// let th_grey_img = th.as_tensor(img_grey, dtype = DEFAULT_TORCH_FLOAT_TYPE, device = "cuda");
// Horizontal filters
// let grad_kernel1 = np.array([[[[-0.5, 0.5]]], [[[0.5, 0.5]]]]);
// Vertical filters
// let grad_kernel2 = np.array([[[[0.5], [0.5]]], [[[-0.5], [0.5]]]]);
// let tmp = conv1d(th_grey_img, grad_kernel1);
// let th_full_grad = conv1d(tmp, grad_kernel2, groups = 2);
// The default padding mode reduces the shape of grey_img of 1 pixel in each
// direction, as expected
// let cuda_full_grads =
// cuda.as_cuda_array(th_full_grad.squeeze().transpose(0, 1).transpose(1, 2));
// shape [y, x, 2]
// let covs = cuda.device_array(grey_imshape + (2, 2), DEFAULT_NUMPY_FLOAT_TYPE);
// threadsperblock = (DEFAULT_THREADS, DEFAULT_THREADS)
// blockspergrid_x = math.ceil(grey_imshape_x/threadsperblock[1])
// blockspergrid_y = math.ceil(grey_imshape_y/threadsperblock[0])
// blockspergrid = (blockspergrid_x, blockspergrid_y)
// cuda_estimate_kernel[blockspergrid, threadsperblock](cuda_full_grads,
// k_detail, k_denoise, D_th, D_tr, k_stretch, k_shrink,
// covs)
// covs
todo!()
}
#[allow(clippy::identity_op)]
#[cubecl::cube]
fn cuda_estimate_kernel(
full_grads: &Tensor<f32>,
covs: &mut Tensor<f32>,
k_detail: f32,
k_denoise: f32,
d_th: f32,
d_tr: f32,
k_stretch: f32,
k_shrink: f32,
) {
let pixel_idx = ABSOLUTE_POS_X;
let pixel_idy = ABSOLUTE_POS_Y;
let imshape_y = covs.shape(0);
let imshape_x = covs.shape(1);
if pixel_idy >= imshape_y || pixel_idx >= imshape_x {
return;
}
let covs_offset = pixel_idy * covs.stride(0) + pixel_idx * covs.stride(1);
let mut structure_tensor: Line<f32> = Line::empty(4);
structure_tensor[0] = 0.0;
structure_tensor[1] = 0.0;
structure_tensor[2] = 0.0;
structure_tensor[3] = 0.0;
for i in 0..2 {
for j in 0..2 {
let x = (pixel_idx + j) as i32 - 1;
let y = (pixel_idy + i) as i32 - 1;
if (0 <= y && y < full_grads.shape(0) as i32)
&& (0 <= x && x < full_grads.shape(1) as i32)
{
let full_grad_offset = y as u32 * full_grads.stride(0) + x as u32 * full_grads.stride(1);
let full_grad_x = full_grads[full_grad_offset + 0];
let full_grad_y = full_grads[full_grad_offset + 1];
structure_tensor[0] += full_grad_x * full_grad_x;
structure_tensor[1] += full_grad_x * full_grad_y;
structure_tensor[2] += full_grad_x * full_grad_y;
structure_tensor[3] += full_grad_y * full_grad_y;
}
}
}
let mut l = Line::empty(2);
let mut e1 = Line::empty(2);
let mut e2 = Line::empty(2);
let mut k = Line::empty(2);
get_eigen_elmts_2x2(&structure_tensor, &mut l, &mut e1, &mut e2);
compute_k(
&mut k, l[0], l[1], k_detail, k_denoise, d_th, d_tr, k_stretch, k_shrink,
);
let k_1_sq = k[0] * k[0];
let k_2_sq = k[1] * k[1];
covs[covs_offset + 0] = k_1_sq * e1[0] * e1[0] + k_2_sq * e2[0] * e2[0];
covs[covs_offset + 1] = k_1_sq * e1[0] * e1[1] + k_2_sq * e2[0] * e2[1];
covs[covs_offset + 2] = k_1_sq * e1[0] * e1[1] + k_2_sq * e2[0] * e2[1];
covs[covs_offset + 3] = k_1_sq * e1[1] * e1[1] + k_2_sq * e2[1] * e2[1];
}
///
/// Cuda function for solving the 2x2 system A*X = B
/// by using the analytical formula
///
/// Parameters
/// ----------
/// A : Array[2,2]
///
/// B : Array[2]
///
/// Returns
/// -------
/// None
///
///
#[cubecl::cube]
fn solve_2x2(a: &Line<f32>, b: &Line<f32>, x: &mut Line<f32>) {
let det_a = a[0] * a[3] - a[1] * a[2];
x[0] = (a[3] * b[0] - a[1] * b[1]) / det_a;
x[1] = (a[0] * b[1] - a[2] * b[0]) / det_a;
}
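// This is Cramer's rule; the caller must ensure det_a != 0. Quick check:
// a = [2, 0, 0, 4] (row-major), b = [2, 8] gives det_a = 8 and x = [1, 2].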
///
/// inverts the 2x2 M array
///
/// Parameters
/// ----------
/// M : Array[2, 2]
/// Array to invert
/// M_i : Array[2, 2]
///
/// Returns
/// -------
/// None.
///
///
#[cubecl::cube]
fn invert_2x2(m: &Line<f32>, m_i: &mut Line<f32>) {
let det = m[0] * m[3] - m[1] * m[2];
if Abs::abs(det) > f32::new(f32::EPSILON) {
let det_i = 1.0 / det;
m_i[0] = m[3] * det_i;
m_i[1] = -m[1] * det_i;
m_i[2] = -m[2] * det_i;
m_i[3] = m[0] * det_i;
} else {
m_i[0] = 1.0;
m_i[1] = 0.0;
m_i[2] = 0.0;
m_i[3] = 1.0;
}
}
///
/// Returns the two roots of the polynomial a*X^2 + b*X + c = 0 for real
/// numbers a, b and c. The function only returns real roots: make sure they
/// exist before calling. roots[0] contains the root with the largest modulus
/// and roots[1] the smallest.
///
///
/// Parameters
/// ----------
/// a : float
///
/// b : float
///
/// c : float
///
/// roots : Array[2]
///
/// Returns
/// -------
/// None
///
#[cubecl::cube]
fn get_real_polyroots_2(a: f32, b: f32, c: f32, roots: &mut Line<f32>) {
// numerical instabilities can cause delta to be slightly negative despite
// the equation admitting 2 real roots.
let delta_root = Sqrt::sqrt(Max::max(b * b - 4.0 * a * c, 0.0));
let r1 = (-b + delta_root) / (2.0 * a);
let r2 = (-b - delta_root) / (2.0 * a);
let r1_abs: f32 = Abs::abs(r1);
if r1_abs >= Abs::abs(r2) {
roots[0] = r1;
roots[1] = r2;
} else {
roots[0] = r2;
roots[1] = r1;
}
}
#[cubecl::cube]
fn get_eigen_val_2x2(m: &Line<f32>, l: &mut Line<f32>) {
let a = 1.0;
let b = -(m[0] + m[3]);
let c = m[0] * m[3] - m[1] * m[2];
get_real_polyroots_2(a, b, c, l)
}
///
/// return the eigen vectors with norm 1 for the eigen values l
/// M.e1 = l1.e1 ; M.e2 = l2.e2
///
/// Parameters
/// ----------
/// M : Array[2,2]
/// Real Symmetric array for which eigen values are to be determined
/// l : Array[2]
/// e1, e2 : Array[2]
/// sorted Eigenvalues
/// e1, e2 : Array[2, 2]
/// Computed orthogonal and normalized eigen vectors
///
/// Returns
/// -------
/// None.
///
#[cubecl::cube]
fn get_eigen_vect_2x2(m: &Line<f32>, l: &Line<f32>, e1: &mut Line<f32>, e2: &mut Line<f32>) {
// 2x2 algorithm : https://en.wikipedia.org/wiki/Eigenvalue_algorithm
if m[1] == 0.0 && m[0] == m[3] {
// m is a multiple of the identity; pick 2 orthogonal eigenvectors.
e1[0] = 1.0;
e1[1] = 0.0;
e2[0] = 0.0;
e2[1] = 1.0;
} else {
// e1 = (M - l2*I) * [1, 1]^T: both columns of (M - l2*I) are parallel to
// the l1 eigenvector, so summing (averaging) them increases reliability
e1[0] = m[0] + m[1] - l[1];
e1[1] = m[2] + m[3] - l[1];
if e1[0] == 0.0 {
e1[1] = 1.0;
e2[0] = 1.0;
e2[1] = 0.0;
} else if e1[1] == 0.0 {
e1[0] = 1.0;
e2[0] = 0.0;
e2[1] = 1.0;
} else {
let norm_ = Sqrt::sqrt(e1[0] * e1[0] + e1[1] * e1[1]);
e1[0] /= norm_;
e1[1] /= norm_;
let sign = copysign(e1[0]);
e2[1] = Abs::abs(e1[0]);
e2[0] = -e1[1] * sign
}
}
}
#[cubecl::cube]
fn get_eigen_elmts_2x2(m: &Line<f32>, l: &mut Line<f32>, e1: &mut Line<f32>, e2: &mut Line<f32>) {
get_eigen_val_2x2(m, l);
get_eigen_vect_2x2(m, l, e1, e2);
}
///
/// Computes k_1 and k_2 based on lambda1, lambda2 and the constants.
///
/// Parameters
/// ----------
/// l1 : float
/// lambda1 (dominant eigen value)
/// l2 : float
/// lambda2 : second eigenvalue
/// k : Array[2]
/// empty vector where k_1 and k_2 will be stored
/// k_detail : float
/// k_denoise : float
/// D_th : float
/// D_tr : float
/// k_stretch : float
/// k_shrink : float
/// Parameters to compute k_1 and k_2, all detailed in the article.
///
#[cubecl::cube]
fn compute_k(
k: &mut Line<f32>,
l1: f32,
l2: f32,
k_detail: f32,
k_denoise: f32,
d_th: f32,
d_tr: f32,
k_stretch: f32,
k_shrink: f32,
) {
// When a is NaN, we fall back to this condition
let mut k1 = 1.0f32;
let mut k2 = 1.0f32;
let a: f32 = 1.0 + Sqrt::sqrt((l1 - l2) / (l1 + l2));
let d: f32 = clamp(1.0 - Sqrt::sqrt(l1) / d_tr + d_th, 0.0, 1.0);
// This is a very aggressive way of driving anisotropy, but it works well so far.
if a > 1.95 {
k1 = 1.0 / k_shrink;
k2 = k_stretch;
}
k[0] = k_detail * ((1.0 - d) * k1 + d * k_denoise);
k[1] = k_detail * ((1.0 - d) * k2 + d * k_denoise);
}
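// Note: for l1 >= l2 >= 0, (l1 - l2) / (l1 + l2) lies in [0, 1], so a is in
// [1, 2]; the a > 1.95 branch fires only when l2 is below roughly 5% of l1,
// i.e. for strongly one-dimensional (edge-like) structure.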
#[cubecl::cube]
fn copysign(val: f32) -> f32 {
if val < 0.0f32 {
-(1.0f32)
} else {
1.0f32
}
}
#[cubecl::cube]
fn clamp(a: f32, min: f32, max: f32) -> f32 {
// Clamp::clamp(a, min, max)
Min::min(Max::max(a, min), max)
}

6
src/lib.rs Normal file

@@ -0,0 +1,6 @@
pub mod alignment;
pub mod gat;
pub mod image;
pub mod kernels;
// pub mod merge;
pub mod misc;

411
src/merge.rs Normal file

@@ -0,0 +1,411 @@
use cubecl::prelude::{Array, Exp, Floor, Line, Max, Min, Round, Tensor, ABSOLUTE_POS_X, ABSOLUTE_POS_Y};
use crate::image::{self, Alignments, GrayImage};
///
/// Implementation of Alg. 11: AccumulationReference
/// Accumulates the reference frame into num and den, while considering
/// (if enabled) the accumulated robustness mask to enforce single frame SR if
/// necessary.
///
/// Parameters
/// ----------
/// ref_img : device Array[imshape_y, imshape_x]
/// Reference image J_1
/// kernels : device Array[imshape_y//2, imshape_x//2, 2, 2]
/// Covariance Matrices Omega_1
/// num : device Array[s*imshape_y, s*imshape_x, c]
/// Numerator of the accumulator
/// den : device Array[s*imshape_y, s*imshape_x, c]
/// Denominator of the accumulator
/// options : dict
/// verbose options.
/// params : dict
/// parameters (containing the zoom s).
/// acc_rob : [imshape_y, imshape_x], optional
/// accumulated robustness mask. The default is None.
///
/// Returns
/// -------
/// None.
///
pub fn merge_ref() {}
#[cubecl::cube]
fn accumulate_ref(
    ref_img: &Tensor<f32>,
    covs: &Tensor<f32>,
    bayer_mode: bool,
    iso_kernel: bool,
    scale: f32,
    cfa_pattern: &Array<f32>,
    num: &mut Tensor<f32>,
    den: &mut Tensor<f32>,
    acc_rob: &Tensor<f32>,
    robustness_denoise: bool,
    max_frame_count: u32,
    rad_max: u32,
    max_multiplier: u32,
) {
    // NOTE: this kernel is still a transliteration of the original numba CUDA
    // code and the module is disabled in lib.rs; the helpers interpolate_cov,
    // quad_mat_prod, denoise_power_merge and denoise_range_merge are not ported yet.
    let output_pixel_idx = ABSOLUTE_POS_X;
    let output_pixel_idy = ABSOLUTE_POS_Y;
    let output_size_y = num.shape(0);
    let output_size_x = num.shape(1);
    let input_size_y = ref_img.shape(0);
    let input_size_x = ref_img.shape(1);
    if output_pixel_idx >= output_size_x || output_pixel_idy >= output_size_y {
        return;
    }
    let n_channels = if bayer_mode { 3 } else { 1 };
    // per-channel accumulators (only the first slot is used in grey mode)
    let mut acc: Line<f32> = Line::empty(3);
    let mut val: Line<f32> = Line::empty(3);
    // Copying the CFA locally. We will read it 9 times, so it's worth it
    let mut local_cfa: Line<u32> = Line::empty(4);
    for i in 0..2 {
        for j in 0..2 {
            local_cfa[i * 2 + j] = cfa_pattern[i * 2 + j] as u32;
        }
    }
    // y, x position of the output pixel in input coordinates
    let mut coarse_ref_sub_pos: Line<f32> = Line::empty(2);
    coarse_ref_sub_pos[0] = output_pixel_idy as f32 / scale;
    coarse_ref_sub_pos[1] = output_pixel_idx as f32 / scale;
for chan in 0..n_channels {
acc[chan] = 0.0;
val[chan] = 0.0;
}
    // computing kernel
    // this is rather slow and could probably be sped up
    // cov_i lives outside the branch because the main loop below reads it
    let mut cov_i: Line<f32> = Line::empty(4); // inverse 2x2 covariance, row-major
    if !iso_kernel {
        let mut interpolated_cov: Line<f32> = Line::empty(4); // 2x2, row-major
        // fetching the 4 closest covs
        let mut close_covs: Line<f32> = Line::empty(16); // 2x2 grid of 2x2 matrices
        let mut grey_pos: Line<f32> = Line::empty(2);
if bayer_mode {
            // grey grid is offset and twice as sparse
            grey_pos[0] = (coarse_ref_sub_pos[0] - 0.5) / 2.0;
            grey_pos[1] = (coarse_ref_sub_pos[1] - 0.5) / 2.0;
} else {
grey_pos[0] = coarse_ref_sub_pos[0]; // grey grid is exactly the coarse grid
grey_pos[1] = coarse_ref_sub_pos[1];
}
        // clipping the coordinates to stay in bounds
        let floor_x = Max::max(Floor::floor(grey_pos[1]), 0.0) as u32;
        let floor_y = Max::max(Floor::floor(grey_pos[0]), 0.0) as u32;
        let ceil_x = Min::min(floor_x + 1, covs.shape(1) - 1);
        let ceil_y = Min::min(floor_y + 1, covs.shape(0) - 1);
        for i in 0..2 {
            for j in 0..2 {
                let cov_ij = i * 2 + j;
                close_covs[cov_ij] = covs[floor_y * covs.stride(0) + floor_x * covs.stride(1) + cov_ij];
                close_covs[4 + cov_ij] = covs[floor_y * covs.stride(0) + ceil_x * covs.stride(1) + cov_ij];
                close_covs[8 + cov_ij] = covs[ceil_y * covs.stride(0) + floor_x * covs.stride(1) + cov_ij];
                close_covs[12 + cov_ij] = covs[ceil_y * covs.stride(0) + ceil_x * covs.stride(1) + cov_ij];
            }
        }
// interpolating covs
        interpolate_cov(&close_covs, &grey_pos, &mut interpolated_cov); // not ported yet
        invert_2x2(&interpolated_cov, &mut cov_i); // see the copy in kernels.rs
}
// fetching acc robustness if required
    // Acc robustness is known for each raw pixel; an implicit interpolation is
    // done from LR to HR using nearest neighbour.
    let mut local_acc_r = max_frame_count as f32;
    let mut additional_denoise_power = 1.0f32;
    let mut rad = 1i32;
    if robustness_denoise {
        let y_r = Min::min(Round::round(coarse_ref_sub_pos[0]) as u32, acc_rob.shape(0) - 1);
        let x_r = Min::min(Round::round(coarse_ref_sub_pos[1]) as u32, acc_rob.shape(1) - 1);
        local_acc_r = acc_rob[y_r * acc_rob.stride(0) + x_r];
        additional_denoise_power = denoise_power_merge(local_acc_r, max_multiplier, max_frame_count); // not ported yet
        rad = denoise_range_merge(local_acc_r, rad_max, max_frame_count) as i32; // not ported yet
    }
    let center_x = Round::round(coarse_ref_sub_pos[1]) as i32;
    let center_y = Round::round(coarse_ref_sub_pos[0]) as i32;
for i in -rad..=rad {
for j in -rad..=rad {
let pixel_idx = center_x + j;
let pixel_idy = center_y + i;
            // in bound condition
            if (0 <= pixel_idx && pixel_idx < input_size_x as i32)
                && (0 <= pixel_idy && pixel_idy < input_size_y as i32)
            {
                // checking if the pixel is r, g or b
                let channel = if bayer_mode {
                    local_cfa[(pixel_idy % 2) as u32 * 2 + (pixel_idx % 2) as u32]
                } else {
                    0
                };
// By fetching the value now, we can compute the kernel weight
// while it is called from global memory
                let c = ref_img[pixel_idy as u32 * ref_img.stride(0) + pixel_idx as u32];
// computing distance
                let dist_x = pixel_idx as f32 - coarse_ref_sub_pos[1];
                let dist_y = pixel_idy as f32 - coarse_ref_sub_pos[0];
// Computing w
                let mut y = if iso_kernel {
                    Max::max(0.0, 2.0 * (dist_x * dist_x + dist_y * dist_y))
                } else {
                    Max::max(0.0, quad_mat_prod(&cov_i, dist_x, dist_y)) // not ported yet
                };
// y can be slightly negative because of numerical precision.
// I clamp it to not explode the error with exp
// this is equivalent to multiplying the covariance,
// but at the cost of one scalar operation (instead of 4)
y /= additional_denoise_power;
                let w = if bayer_mode {
                    Exp::exp(-0.5 * y)
                } else {
                    // original kernel constants are designed for bayer distances,
                    // not grey ones, hence the x4
                    Exp::exp(-0.5 * 4.0 * y)
                };
// -----
val[channel] += c * w;
acc[channel] += w;
}
}
}
    if robustness_denoise && local_acc_r < max_frame_count as f32 {
        // Overwriting values to enforce single frame demosaicing
        for chan in 0..n_channels {
            let o = output_pixel_idy * num.stride(0) + output_pixel_idx * num.stride(1) + chan;
            num[o] = val[chan];
            den[o] = acc[chan];
        }
    } else {
        for chan in 0..n_channels {
            let o = output_pixel_idy * num.stride(0) + output_pixel_idx * num.stride(1) + chan;
            num[o] += val[chan];
            den[o] += acc[chan];
        }
    }
}
pub struct MergeConfig {
pub scale: f32,
}
///
/// Implementation of Alg. 4: Accumulation
/// Accumulates comp_img (J_n, n>1) into num and den, based on the alignment
/// V_n, the covariance matrices Omega_n and the robustness mask estimated before.
/// The size of the merge_result is adjustable with params['scale']
///
///
/// Parameters
/// ----------
/// comp_imgs : device Array [imsize_y, imsize_x]
/// The non-reference image to merge (J_n)
/// alignments : device Array[n_tiles_y, n_tiles_x, 2]
/// The final estimation of the tiles' alignment V_n(p)
/// covs : device array[imsize_y//2, imsize_x//2, 2, 2]
/// covariance matrices Omega_n
/// r : Device_Array[imsize_y, imsize_x]
/// Robustness mask r_n
/// num : device Array[s*imshape_y, s*imshape_x, c]
/// Numerator of the accumulator
/// den : device Array[s*imshape_y, s*imshape_x, c]
/// Denominator of the accumulator
///
/// options : Dict
/// Options to pass
/// params : Dict
/// parameters
///
/// Returns
/// -------
/// None
///
fn merge(
    comp_img: &GrayImage,
    alignments: &Alignments,
    covs: &image::Tensor<f32, 4>,
    r: &GrayImage,
    num: &mut image::Tensor<f32, 3>,
    den: &mut image::Tensor<f32, 3>,
    cfg: MergeConfig,
) {
// scale = params['scale']
// CFA_pattern = cuda.to_device(params['exif']['CFA Pattern'])
// bayer_mode = params['mode'] == 'bayer'
// iso_kernel = params['kernel'] == 'iso'
// tile_size = params['tuning']['tileSize']
// native_im_size = comp_img.shape
// // casting to integer to account for floating scale
// output_size = (round(scale*native_im_size[0]), round(scale*native_im_size[1]))
// dispatching threads. 1 thread for 1 output pixel
// threadsperblock = (DEFAULT_THREADS, DEFAULT_THREADS) # maximum, we may take less
// blockspergrid_x = math.ceil(output_size[1]/threadsperblock[1])
// blockspergrid_y = math.ceil(output_size[0]/threadsperblock[0])
// blockspergrid = (blockspergrid_x, blockspergrid_y)
// accumulate[blockspergrid, threadsperblock](
// comp_img, alignments, covs, r,
// bayer_mode, iso_kernel, scale, tile_size, CFA_pattern,
// num, den)
    let _ = cfg;
    let _ = den;
    let _ = num;
    let _ = r;
    let _ = covs;
    let _ = alignments;
    let _ = comp_img;
}
#[cubecl::cube]
fn accumulate(
    comp_img: &Tensor<f32>,
    alignments: &Tensor<f32>,
    covs: &Tensor<f32>,
    r: &Tensor<f32>,
    bayer_mode: bool,
    iso_kernel: bool,
    scale: f32,
    tile_size: u32,
    cfa_pattern: &Array<f32>,
    num: &mut Tensor<f32>,
    den: &mut Tensor<f32>,
) {
    // NOTE: like accumulate_ref above, this is an unfinished transliteration of
    // the numba CUDA kernel; the module is disabled in lib.rs.
    let output_pixel_idx = ABSOLUTE_POS_X;
    let output_pixel_idy = ABSOLUTE_POS_Y;
    let output_size_y = num.shape(0);
    let output_size_x = num.shape(1);
    let input_size_y = comp_img.shape(0);
    let input_size_x = comp_img.shape(1);
if output_pixel_idx >= output_size_x || output_pixel_idy >= output_size_y {
return;
}
    let n_channels = if bayer_mode { 3 } else { 1 };
    // per-channel accumulators (only the first slot is used in grey mode)
    let mut acc: Line<f32> = Line::empty(3);
    let mut val: Line<f32> = Line::empty(3);
    // Copying the CFA locally. We will read it 9 times, so it's worth it
    let mut local_cfa: Line<u32> = Line::empty(4);
    for i in 0..2 {
        for j in 0..2 {
            local_cfa[i * 2 + j] = cfa_pattern[i * 2 + j] as u32;
        }
    }
    // y, x position of the output pixel in input coordinates
    let mut coarse_ref_sub_pos: Line<f32> = Line::empty(2);
    coarse_ref_sub_pos[0] = output_pixel_idy as f32 / scale;
    coarse_ref_sub_pos[1] = output_pixel_idx as f32 / scale;
    // fetch the flow as early as possible
    let mut local_optical_flow: Line<f32> = Line::empty(2);
    let patch_idy = (coarse_ref_sub_pos[0] / tile_size as f32) as u32;
    let patch_idx = (coarse_ref_sub_pos[1] / tile_size as f32) as u32;
    let flow_offset = patch_idy * alignments.stride(0) + patch_idx * alignments.stride(1);
    local_optical_flow[0] = alignments[flow_offset];
    local_optical_flow[1] = alignments[flow_offset + 1];
for chan in 0..n_channels {
acc[chan] = 0.0;
val[chan] = 0.0;
}
    let mut patch_center_pos: Line<f32> = Line::empty(2); // y, x
    // fetching robustness
    // The robustness coefficient is known for every raw pixel, and is implicitly
    // interpolated to HR using nearest-neighbour interpolation.
    let y_r = Min::min(Round::round(coarse_ref_sub_pos[0]) as u32, r.shape(0) - 1);
    let x_r = Min::min(Round::round(coarse_ref_sub_pos[1]) as u32, r.shape(1) - 1);
    let local_r = r[y_r * r.stride(0) + x_r];
patch_center_pos[1] = coarse_ref_sub_pos[1] + local_optical_flow[0];
patch_center_pos[0] = coarse_ref_sub_pos[0] + local_optical_flow[1];
// updating inbound condition
    if patch_center_pos[1] >= input_size_x as f32 || patch_center_pos[0] >= input_size_y as f32 {
return;
}
    // computing kernel
    // cov_i lives outside the branch because the main loop below reads it
    let mut cov_i: Line<f32> = Line::empty(4); // inverse 2x2 covariance, row-major
    if !iso_kernel {
        let mut interpolated_cov: Line<f32> = Line::empty(4); // 2x2, row-major
        // fetching the 4 closest covs
        let mut close_covs: Line<f32> = Line::empty(16); // 2x2 grid of 2x2 matrices
        let mut grey_pos: Line<f32> = Line::empty(2);
if bayer_mode {
            // grey grid is offset and twice as sparse
            grey_pos[0] = (patch_center_pos[0] - 0.5) / 2.0;
            grey_pos[1] = (patch_center_pos[1] - 0.5) / 2.0;
} else {
grey_pos[0] = patch_center_pos[0]; // grey grid is exactly the coarse grid
grey_pos[1] = patch_center_pos[1];
}
        // clipping the coordinates to stay in bounds
        let floor_x = Max::max(Floor::floor(grey_pos[1]), 0.0) as u32;
        let floor_y = Max::max(Floor::floor(grey_pos[0]), 0.0) as u32;
        let ceil_x = Min::min(floor_x + 1, covs.shape(1) - 1);
        let ceil_y = Min::min(floor_y + 1, covs.shape(0) - 1);
        for i in 0..2 {
            for j in 0..2 {
                let cov_ij = i * 2 + j;
                close_covs[cov_ij] = covs[floor_y * covs.stride(0) + floor_x * covs.stride(1) + cov_ij];
                close_covs[4 + cov_ij] = covs[floor_y * covs.stride(0) + ceil_x * covs.stride(1) + cov_ij];
                close_covs[8 + cov_ij] = covs[ceil_y * covs.stride(0) + floor_x * covs.stride(1) + cov_ij];
                close_covs[12 + cov_ij] = covs[ceil_y * covs.stride(0) + ceil_x * covs.stride(1) + cov_ij];
            }
        }
// interpolating covs at the desired spot
        interpolate_cov(&close_covs, &grey_pos, &mut interpolated_cov); // not ported yet
        invert_2x2(&interpolated_cov, &mut cov_i); // see the copy in kernels.rs
}
    let center_x = Round::round(patch_center_pos[1]) as i32;
    let center_y = Round::round(patch_center_pos[0]) as i32;
for i in -1..=1 {
for j in -1..=1 {
let pixel_idx = center_x + j;
let pixel_idy = center_y + i;
            // in bound condition
            if (0 <= pixel_idx && pixel_idx < input_size_x as i32)
                && (0 <= pixel_idy && pixel_idy < input_size_y as i32)
            {
                // checking if the pixel is r, g or b
                let channel = if bayer_mode {
                    local_cfa[(pixel_idy % 2) as u32 * 2 + (pixel_idx % 2) as u32]
                } else {
                    0
                };
// By fetching the value now, we can compute the kernel weight
// while it is called from global memory
                let c = comp_img[pixel_idy as u32 * comp_img.stride(0) + pixel_idx as u32];
// computing distance
                let dist_x = pixel_idx as f32 - patch_center_pos[1];
                let dist_y = pixel_idy as f32 - patch_center_pos[0];
// Computing w
                let y = if iso_kernel {
                    Max::max(0.0, 2.0 * (dist_x * dist_x + dist_y * dist_y))
                } else {
                    // y can be slightly negative because of numerical precision;
                    // clamping keeps exp from blowing up the error
                    Max::max(0.0, quad_mat_prod(&cov_i, dist_x, dist_y)) // not ported yet
                };
                let w = Exp::exp(-0.5 * y);
//---
val[channel] += c * w * local_r;
acc[channel] += w * local_r;
}
}
}
    for chan in 0..n_channels {
        let o = output_pixel_idy * num.stride(0) + output_pixel_idx * num.stride(1) + chan;
        num[o] += val[chan];
        den[o] += acc[chan];
    }
}

5
src/misc.rs Normal file

@@ -0,0 +1,5 @@
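/// Patch distance metric used by block matching (see cube_compute_dist):
/// L1 sums absolute differences, L2 sums squared differences.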
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DistanceKind {
L1,
L2,
}