commit e79a8329ea
initial
1 .gitignore vendored Normal file
@@ -0,0 +1 @@
/target
1781 Cargo.lock generated Normal file
File diff suppressed because it is too large.
7 Cargo.toml Normal file
@@ -0,0 +1,7 @@
[package]
name = "rax"
version = "0.1.0"
edition = "2021"

[dependencies]
cubecl = { git = "https://github.com/tracel-ai/cubecl.git", features = ["wgpu", "cuda"] }
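A note on the dependency above: it tracks the tip of cubecl's default branch, so builds may break as upstream moves. A pinned variant would look like the sketch below; the `rev` value is a placeholder, not a real commit.

cubecl = { git = "https://github.com/tracel-ai/cubecl.git", rev = "<commit-sha>", features = ["wgpu", "cuda"] }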
1 examples/align.rs Normal file
@@ -0,0 +1 @@
fn main() {}
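The example is an empty stub for now. A minimal sketch of where it could head, assuming cubecl's usual `Runtime::client` setup; most library entry points are still `todo!()`, so the commented pipeline call would panic if enabled today.

// Hypothetical usage sketch, not part of the commit.
use cubecl::wgpu::WgpuRuntime;
use cubecl::Runtime;

fn main() {
    // Spin up a compute client on the default wgpu device.
    let client = WgpuRuntime::client(&Default::default());
    // ...load a grayscale frame, then e.g.:
    // let pyramid = rax::alignment::gauss_pyramid::build_gauss_pyramid::<WgpuRuntime>(
    //     &client, &img, [2u32, 2, 4].into_iter(), 16);
    let _ = client;
}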
20 src/alignment.rs Normal file
@@ -0,0 +1,20 @@
use crate::misc::DistanceKind;

pub mod block_matching;
pub mod gauss_pyramid;
pub mod local_search;
pub mod upsample_alignments;

pub struct AlignLevelConfig {
    pub factor: u32,
    pub upsampling_factor: u32,
    pub tile_size: u32,
    pub search_radius: u32,
    pub distance: DistanceKind,
}

pub struct AlignmentConfig {
    pub levels: Vec<AlignLevelConfig>,
}

pub struct Alignments {}
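For illustration, this is how the two structs compose; the numbers are made up, not tuned defaults. Levels are listed fine-to-coarse, matching the order in which `build_pyramid` consumes the factors.

// Hypothetical configuration fragment; every value here is illustrative only.
let config = AlignmentConfig {
    levels: vec![
        AlignLevelConfig { factor: 2, upsampling_factor: 2, tile_size: 16, search_radius: 1, distance: DistanceKind::L2 },
        AlignLevelConfig { factor: 2, upsampling_factor: 2, tile_size: 16, search_radius: 4, distance: DistanceKind::L2 },
        AlignLevelConfig { factor: 4, upsampling_factor: 4, tile_size: 16, search_radius: 4, distance: DistanceKind::L1 },
    ],
};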
96 src/alignment/block_matching.rs Normal file
@@ -0,0 +1,96 @@
use cubecl::{client::ComputeClient, Runtime};

use crate::image::{Alignments, GrayImage};

use super::{
    gauss_pyramid::build_pyramid, local_search::local_search,
    upsample_alignments::upsample_alignments, AlignLevelConfig,
};

///
/// Aligns `img` against the reference image: returns a patchwise flow such that,
/// for patches (py, px):
/// img[py, px] ~= ref_img[py + alignments[py, px, 1],
///                        px + alignments[py, px, 0]]
///
/// Parameters
/// ----------
/// img : Img [imshape_y, imshape_x]
///     Image to be compared
///
/// ref_pyramid : [Img]
///     Pyramid representation of the reference image J_1
///
/// Returns
/// -------
/// alignments : Alignments
///     a device array [n_patchs_y, n_patchs_x, 2]
///     Patchwise flow: V_n(p) for each patch p
///
pub fn align_image_block_matching<R: Runtime>(
    cube: &ComputeClient<R::Server, R::Channel>,
    img: &GrayImage,
    ref_pyramid: &[GrayImage],
    level_configs: &[AlignLevelConfig],
) -> Alignments {
    let alt_pyramid = build_pyramid::<R>(cube, img, level_configs.iter().map(|x| x.factor));

    // Align the alternate image to the reference image, coarse to fine
    let mut alignments = None;
    let cfg_iter = level_configs.iter().rev();
    let mut prev_tile_size = 64; // unused on the coarsest level, where `alignments` is None

    for (lv, cfg) in cfg_iter.enumerate() {
        alignments = Some(align_on_a_level::<R>(
            cube,
            &ref_pyramid[lv],
            &alt_pyramid[lv],
            cfg,
            alignments.as_ref(),
            prev_tile_size,
        ));

        prev_tile_size = cfg.tile_size;
    }

    alignments.unwrap()
}

///
/// The alignment is always an integer with this function, but it is stored as
/// DEFAULT_FLOAT_TYPE. This makes it possible to feed the output alignment
/// directly to ICA without casting from int to float.
///
pub(crate) fn align_on_a_level<R: Runtime>(
    cube: &ComputeClient<R::Server, R::Channel>,
    ref_pyramid_lvl: &GrayImage,
    alt_pyramid_lvl: &GrayImage,
    cfg: &AlignLevelConfig,
    prev: Option<&Alignments>,
    prev_tile_size: u32,
) -> Alignments {
    let (ref_h, ref_w) = ref_pyramid_lvl.shape();

    // Number of patches that fit on this level
    let h = ref_h / cfg.tile_size;
    let w = ref_w / cfg.tile_size;

    let mut alignments = Alignments::new_empty::<R>(cube, w, h);

    if let Some(prev) = prev {
        // use the upsampled previous alignments as initial guesses
        upsample_alignments::<R>(
            cube,
            &mut alignments,
            ref_pyramid_lvl,
            alt_pyramid_lvl,
            prev,
            prev_tile_size,
            cfg,
        )
    }

    local_search::<R>(cube, &mut alignments, ref_pyramid_lvl, alt_pyramid_lvl, cfg);

    alignments
}
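The flow convention in the doc comment above, written out as a scalar helper (hypothetical, not part of the crate): the source coordinates in the reference image are obtained by adding the (dx, dy) pair stored for the patch.

// Sketch: img[py, px] ~= ref_img[py + dy, px + dx], with (dx, dy) stored
// at alignments[py, px, 0..2] (flattened row-major here).
fn patch_source_coords(alignments: &[f32], n_patchs_x: usize, py: usize, px: usize) -> (f32, f32) {
    let o = (py * n_patchs_x + px) * 2;
    let (dx, dy) = (alignments[o], alignments[o + 1]);
    (py as f32 + dy, px as f32 + dx)
}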
71 src/alignment/gauss_pyramid.rs Normal file
@@ -0,0 +1,71 @@
use cubecl::{client::ComputeClient, Runtime};

use crate::image::{downsample::downsample, padding::pad, GrayImage};

///
/// Constructs an N-level coarse-to-fine Gaussian pyramid.
///
/// Args:
///     image: input image (expected to be a grayscale image downsampled from a Bayer raw image)
///     factors: [int], downsampling factors (fine-to-coarse)
///
pub(crate) fn build_pyramid<R: Runtime>(
    cube: &ComputeClient<R::Server, R::Channel>,
    img: &GrayImage,
    factors: impl Iterator<Item = u32>,
) -> Vec<GrayImage> {
    // Each subsequent pyramid level is created by convolution with a Gaussian
    // kernel followed by downsampling.
    let init = Vec::with_capacity(factors.size_hint().1.unwrap_or(factors.size_hint().0));
    let mut levels = factors.fold(init, |mut acc, factor| {
        acc.push(downsample::<R>(cube, acc.last().unwrap_or(img), factor));
        acc
    });

    // Reverse the pyramid to get it coarse-to-fine
    levels.reverse();

    levels
}

///
/// Returns the pyramid representation of `img`, to be used for
/// subsequent block matching.
///
/// Parameters
/// ----------
/// img : Img
///     grayscale input image
/// factors : [int]
///     pyramid scaling factors
/// tile_size : int
///     size of the tiles
///
/// Returns
/// -------
/// pyramid : [Img]
///     pyramid representation of the image
///
pub fn build_gauss_pyramid<R: Runtime>(
    cube: &ComputeClient<R::Server, R::Channel>,
    img: &GrayImage,
    factors: impl Iterator<Item = u32>,
    tile_size: u32,
) -> Vec<GrayImage> {
    let (h, w) = img.shape();

    // If needed, pad the image with zeros so that the tile grid covers every
    // pixel. The outer `% tile_size` keeps the padding at zero when the
    // dimension is already a multiple of the tile size.
    let padding_patches_height = (tile_size - h % tile_size) % tile_size;
    let padding_patches_width = (tile_size - w % tile_size) % tile_size;

    if padding_patches_width != 0 || padding_patches_height != 0 {
        let img = pad::<R>(
            cube,
            img,
            0,
            padding_patches_width,
            0,
            padding_patches_height,
        );
        build_pyramid::<R>(cube, &img, factors)
    } else {
        build_pyramid::<R>(cube, img, factors)
    }
}
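A small test sketch pinning down the padding arithmetic (the closure duplicates the expression, so no GPU code is involved):

#[cfg(test)]
mod tests {
    #[test]
    fn padding_rounds_up_to_a_multiple_of_tile_size() {
        let pad = |len: u32, tile: u32| (tile - len % tile) % tile;
        assert_eq!(pad(100, 16), 12); // 100 + 12 = 112 = 7 * 16
        assert_eq!(pad(96, 16), 0); // already a multiple: no padding
    }
}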
114 src/alignment/local_search.rs Normal file
@@ -0,0 +1,114 @@
use cubecl::{
    client::ComputeClient,
    prelude::{Float, Line, ScalarArg, Tensor, ABSOLUTE_POS_X, ABSOLUTE_POS_Y},
    CubeCount, CubeDim, Runtime,
};

use crate::{
    alignment::upsample_alignments::cube_compute_dist,
    image::{Alignments, GrayImage},
    misc::DistanceKind,
};

use super::AlignLevelConfig;

pub fn local_search<R: Runtime>(
    cube: &ComputeClient<R::Server, R::Channel>,
    upsampled_alignments: &mut Alignments,
    ref_pyramid_lvl: &GrayImage,
    alt_pyramid_lvl: &GrayImage,
    cfg: &AlignLevelConfig,
) {
    let threadsperblock = CubeDim::default();

    // shape is [n_patchs_y, n_patchs_x, 2]
    let h = upsampled_alignments.shape()[0] as u32;
    let w = upsampled_alignments.shape()[1] as u32;

    let blockspergrid_x = (w + threadsperblock.x - 1) / threadsperblock.x;
    let blockspergrid_y = (h + threadsperblock.y - 1) / threadsperblock.y;

    unsafe {
        cube_local_search::launch_unchecked::<R>(
            cube,
            CubeCount::Static(blockspergrid_x, blockspergrid_y, 1),
            threadsperblock,
            ref_pyramid_lvl.as_tensor_arg().as_tensor_arg(1),
            alt_pyramid_lvl.as_tensor_arg().as_tensor_arg(1),
            upsampled_alignments.as_tensor_arg().as_tensor_arg(1),
            ScalarArg::new(cfg.tile_size),
            ScalarArg::new(cfg.search_radius as _),
            cfg.distance,
        )
    }
}

#[cubecl::cube(launch_unchecked)]
#[allow(clippy::identity_op)]
fn cube_local_search(
    ref_pyramid_lvl: &Tensor<f32>,
    alt_pyramid_lvl: &Tensor<f32>,
    upsampled_alignments: &mut Tensor<f32>,
    tile_size: u32,
    search_radius: i32,
    #[comptime] distance: DistanceKind,
) {
    let n_patchs_y = upsampled_alignments.shape(0);
    let n_patchs_x = upsampled_alignments.shape(1);

    let tile_x = ABSOLUTE_POS_X;
    let tile_y = ABSOLUTE_POS_Y;

    if tile_y >= n_patchs_y || tile_x >= n_patchs_x {
        return;
    }

    let offset = tile_y * upsampled_alignments.stride(0) + tile_x * upsampled_alignments.stride(1);

    let mut local_flow = Line::empty(2);
    local_flow[0] = upsampled_alignments[offset + 0];
    local_flow[1] = upsampled_alignments[offset + 1];

    // position of the pixel in the top left corner of the patch
    let patch_pos_x = tile_x * tile_size;
    let patch_pos_y = tile_y * tile_size;

    // Copy the reference patch into local memory.
    // This should be rewritten to allow patches bigger than 32x32.
    let mut local_ref = Line::empty(1024);
    for i in 0..tile_size {
        for j in 0..tile_size {
            let idx = patch_pos_x + j;
            let idy = patch_pos_y + i;

            local_ref[i * 32 + j] = ref_pyramid_lvl[idy * ref_pyramid_lvl.stride(0) + idx];
        }
    }

    let mut min_dist = f32::new(f32::INFINITY); // init as infinity
    let mut min_shift_y: i32 = 0;
    let mut min_shift_x: i32 = 0;

    // window search
    for search_shift_y in -search_radius..=search_radius {
        for search_shift_x in -search_radius..=search_radius {
            // computing the distance for this candidate shift
            let dist = cube_compute_dist(
                &local_ref,
                alt_pyramid_lvl,
                &local_flow,
                patch_pos_x as i32 + search_shift_x,
                patch_pos_y as i32 + search_shift_y,
                tile_size,
                distance,
            );

            if dist < min_dist {
                min_dist = dist;
                min_shift_y = search_shift_y;
                min_shift_x = search_shift_x;
            }
        }
    }

    upsampled_alignments[offset + 0] = local_flow[0] + min_shift_x as f32;
    upsampled_alignments[offset + 1] = local_flow[1] + min_shift_y as f32;
}
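As a cross-check of the kernel's logic, the same exhaustive window search written as a plain CPU function (hypothetical helper, L1 distance only; like the kernel, a candidate that leaves the image gets an infinite distance):

fn best_shift_l1(
    reference: &[f32], // the tile, row-major, tile_size * tile_size values
    alt: &[f32],       // the full alternate image, row-major, h * w values
    h: i32,
    w: i32,
    tile_x: i32, // top-left corner of the tile
    tile_y: i32,
    tile_size: i32,
    radius: i32,
) -> (i32, i32) {
    let (mut best, mut best_dist) = ((0, 0), f32::INFINITY);
    for dy in -radius..=radius {
        for dx in -radius..=radius {
            let mut dist = 0.0f32;
            for i in 0..tile_size {
                for j in 0..tile_size {
                    let (x, y) = (tile_x + j + dx, tile_y + i + dy);
                    if x < 0 || x >= w || y < 0 || y >= h {
                        dist = f32::INFINITY; // candidate leaves the image
                    } else {
                        dist += (reference[(i * tile_size + j) as usize]
                            - alt[(y * w + x) as usize])
                            .abs();
                    }
                }
            }
            if dist < best_dist {
                best_dist = dist;
                best = (dx, dy);
            }
        }
    }
    best
}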
263 src/alignment/upsample_alignments.rs Normal file
@@ -0,0 +1,263 @@
use cubecl::{
    client::ComputeClient,
    prelude::{Abs, Float, Line, Max, Min, ScalarArg, Tensor, ABSOLUTE_POS_X, ABSOLUTE_POS_Y},
    CubeCount, CubeDim, Runtime,
};

use crate::{
    image::{Alignments, GrayImage},
    misc::DistanceKind,
};

use super::AlignLevelConfig;

///
/// Upsamples alignments to adapt them to the next pyramid level (Section 3.2 of the IPOL article).
///
pub(crate) fn upsample_alignments<R: Runtime>(
    cube: &ComputeClient<R::Server, R::Channel>,
    dst: &mut Alignments,
    ref_pyramid_lvl: &GrayImage,
    alt_pyramid_lvl: &GrayImage,
    prev_alignments: &Alignments,
    prev_tile_size: u32,
    cfg: &AlignLevelConfig,
) {
    let threadsperblock = CubeDim::default();

    let n_tiles_y_new = dst.shape()[0] as u32;
    let n_tiles_x_new = dst.shape()[1] as u32;

    let blockspergrid_x = (n_tiles_x_new + threadsperblock.x - 1) / threadsperblock.x;
    let blockspergrid_y = (n_tiles_y_new + threadsperblock.y - 1) / threadsperblock.y;

    unsafe {
        cube_upsample_alignments::launch_unchecked::<R>(
            cube,
            CubeCount::Static(blockspergrid_x, blockspergrid_y, 1),
            threadsperblock,
            ref_pyramid_lvl.as_tensor_arg().as_tensor_arg(1),
            alt_pyramid_lvl.as_tensor_arg().as_tensor_arg(1),
            prev_alignments.as_tensor_arg().as_tensor_arg(1),
            dst.as_tensor_arg().as_tensor_arg(1),
            ScalarArg::new(cfg.upsampling_factor),
            ScalarArg::new(cfg.tile_size),
            ScalarArg::new(prev_tile_size),
        )
    }
}

#[allow(clippy::identity_op)]
#[cubecl::cube(launch_unchecked)]
fn cube_upsample_alignments(
    ref_pyramid_lvl: &Tensor<f32>,
    alt_pyramid_lvl: &Tensor<f32>,
    previous_alignments: &Tensor<f32>,
    upsampled_alignments: &mut Tensor<f32>,
    upsampling_factor: u32,
    tile_size: u32,
    prev_tile_size: u32,
) {
    let subtile_x = ABSOLUTE_POS_X;
    let subtile_y = ABSOLUTE_POS_Y;

    let n_tiles_y_prev = previous_alignments.shape(0);
    let n_tiles_x_prev = previous_alignments.shape(1);

    let n_tiles_y_new = upsampled_alignments.shape(0);
    let n_tiles_x_new = upsampled_alignments.shape(1);

    let w = ref_pyramid_lvl.shape(1);

    let repeat_factor = upsampling_factor / (tile_size / prev_tile_size);

    if subtile_x >= n_tiles_x_new || subtile_y >= n_tiles_y_new {
        return;
    }

    let des_offset =
        subtile_y * upsampled_alignments.stride(0) + subtile_x * upsampled_alignments.stride(1);

    // the new subtile is on the side of the image and is not contained within a bigger old tile
    if subtile_x >= repeat_factor * n_tiles_x_prev || subtile_y >= repeat_factor * n_tiles_y_prev {
        upsampled_alignments[des_offset + 0] = 0.;
        upsampled_alignments[des_offset + 1] = 0.;
        return;
    }

    let prev_tile_x = subtile_x / repeat_factor;
    let prev_tile_y = subtile_y / repeat_factor;

    // position of the top left pixel in the subtile
    let subtile_pos_y = subtile_y * tile_size;
    let subtile_pos_x = subtile_x * tile_size;

    // Copy the ref patch into local memory, because it needs to be read 3 times.
    // This should be rewritten to allow patches bigger than 32x32.
    let mut local_ref = Line::empty(1024);
    for i in 0..tile_size {
        for j in 0..tile_size {
            let idx = subtile_pos_x + j;
            let idy = subtile_pos_y + i;

            local_ref[i * 32 + j] = ref_pyramid_lvl[idy * w + idx];
        }
    }

    // position of the new tile within the old tile
    let ups_subtile_x = subtile_x % repeat_factor;
    let ups_subtile_y = subtile_y % repeat_factor;

    // computing the indices of the 3 closest patches
    let x_shift = if 2 * ups_subtile_x + 1 > repeat_factor {
        1i32
    } else {
        -(1i32)
    };

    let y_shift = if 2 * ups_subtile_y + 1 > repeat_factor {
        1i32
    } else {
        -(1i32)
    };

    // Choosing the best of the 3 alignments by minimising the L1 distance
    let mut dist = f32::new(f32::INFINITY);
    let mut optimal_flow_x = 0.0f32;
    let mut optimal_flow_y = 0.0f32;

    // The 3 candidate alignments are fetched as early as possible: we may receive
    // them from global memory before we even need them, as other calculations are
    // performed during this delay.
    let candidate_alignment_0_shift = cube_alignment_flow(
        previous_alignments,
        prev_tile_x,
        prev_tile_y,
        upsampling_factor as f32,
    );

    let candidate_alignment_vert_shift = cube_alignment_flow(
        previous_alignments,
        prev_tile_x,
        clamp(prev_tile_y as i32 + y_shift, 0, n_tiles_y_prev as i32 - 1) as u32,
        upsampling_factor as f32,
    );

    let candidate_alignment_horizontal_shift = cube_alignment_flow(
        previous_alignments,
        clamp(prev_tile_x as i32 + x_shift, 0, n_tiles_x_prev as i32 - 1) as u32,
        prev_tile_y,
        upsampling_factor as f32,
    );

    // 0 shift
    let dist_ = cube_compute_dist(
        &local_ref,
        alt_pyramid_lvl,
        &candidate_alignment_0_shift,
        subtile_pos_x as i32,
        subtile_pos_y as i32,
        tile_size,
        DistanceKind::L1,
    );

    if dist_ < dist {
        dist = dist_;
        optimal_flow_x = candidate_alignment_0_shift[0];
        optimal_flow_y = candidate_alignment_0_shift[1];
    }

    // vertical shift
    let dist_ = cube_compute_dist(
        &local_ref,
        alt_pyramid_lvl,
        &candidate_alignment_vert_shift,
        subtile_pos_x as i32,
        subtile_pos_y as i32,
        tile_size,
        DistanceKind::L1,
    );

    if dist_ < dist {
        dist = dist_;
        optimal_flow_x = candidate_alignment_vert_shift[0];
        optimal_flow_y = candidate_alignment_vert_shift[1];
    }

    // horizontal shift (last candidate: no need to update `dist` afterwards)
    let dist_ = cube_compute_dist(
        &local_ref,
        alt_pyramid_lvl,
        &candidate_alignment_horizontal_shift,
        subtile_pos_x as i32,
        subtile_pos_y as i32,
        tile_size,
        DistanceKind::L1,
    );

    if dist_ < dist {
        optimal_flow_x = candidate_alignment_horizontal_shift[0];
        optimal_flow_y = candidate_alignment_horizontal_shift[1];
    }

    // applying the best flow
    upsampled_alignments[des_offset + 0] = optimal_flow_x;
    upsampled_alignments[des_offset + 1] = optimal_flow_y;
}

#[cubecl::cube]
fn clamp(a: i32, min: i32, max: i32) -> i32 {
    // Clamp::clamp(a, min, max)
    Min::min(Max::max(a, min), max)
}

#[cubecl::cube]
#[allow(clippy::identity_op)]
fn cube_alignment_flow(
    previous_alignments: &Tensor<f32>,
    x: u32,
    y: u32,
    factor: f32,
) -> Line<f32> {
    let offset = y * previous_alignments.stride(0) + x * previous_alignments.stride(1);

    let mut candidate_alignment: Line<f32> = Line::empty(2);
    candidate_alignment[0] = previous_alignments[offset + 0] * factor;
    candidate_alignment[1] = previous_alignments[offset + 1] * factor;
    candidate_alignment
}

#[cubecl::cube]
pub(crate) fn cube_compute_dist(
    local_ref: &Line<f32>,
    alt_pyramid_lvl: &Tensor<f32>,
    candidate_alignment: &Line<f32>,
    pos_x: i32,
    pos_y: i32,
    tile_size: u32,
    #[comptime] distance: DistanceKind,
) -> f32 {
    let h = alt_pyramid_lvl.shape(0);
    let w = alt_pyramid_lvl.shape(1);

    let mut dist = 0.0f32;
    for i in 0..tile_size {
        for j in 0..tile_size {
            let new_idx = pos_x + j as i32 + candidate_alignment[0] as i32;
            let new_idy = pos_y + i as i32 + candidate_alignment[1] as i32;

            if (0 <= new_idx && new_idx < w as i32) && (0 <= new_idy && new_idy < h as i32) {
                let alt_offset = new_idy as u32 * w + new_idx as u32;
                let diff = local_ref[i * 32 + j] - alt_pyramid_lvl[alt_offset];

                dist += match distance {
                    DistanceKind::L1 => Abs::abs(diff),
                    DistanceKind::L2 => diff * diff,
                };
            } else {
                // the candidate patch leaves the image: reject it
                dist = f32::new(f32::INFINITY)
            }
        }
    }

    dist
}
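The `2 * ups_subtile + 1 > repeat_factor` test above picks, of the two neighbouring coarse tiles along each axis, the one whose centre is closest to the new subtile. A scalar test sketch:

#[cfg(test)]
mod tests {
    #[test]
    fn neighbour_choice_follows_subtile_position() {
        // Mirrors the x_shift / y_shift computation in cube_upsample_alignments:
        // subtiles in the first half of a coarse tile look at the previous
        // neighbour (-1), those in the second half at the next one (+1).
        let shift = |ups_subtile: u32, repeat_factor: u32| -> i32 {
            if 2 * ups_subtile + 1 > repeat_factor { 1 } else { -1 }
        };
        assert_eq!(shift(0, 4), -1);
        assert_eq!(shift(1, 4), -1);
        assert_eq!(shift(2, 4), 1);
        assert_eq!(shift(3, 4), 1);
    }
}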
69 src/gat.rs Normal file
@@ -0,0 +1,69 @@
use cubecl::prelude::{Max, Sqrt, Tensor, ABSOLUTE_POS_X, ABSOLUTE_POS_Y};

use crate::{image::GrayImage, kernels::NoiseConfig};

///
/// Generalized Anscombe Transform
/// noise model: std² = alpha * I + beta,
/// where alpha and beta are ISO-dependent.
///
/// Parameters
/// ----------
/// image : GrayImage
///     input image
/// alpha : float
///     value of alpha for the given ISO
/// beta : float
///     value of beta for the given ISO
///
/// Returns
/// -------
/// VST_image : GrayImage
///     input image with stabilized variance
///
pub fn generalized_anscombe_transform(image: &GrayImage, nc: NoiseConfig) -> GrayImage {
    let _ = nc;
    let _ = image;
    // Port of the reference implementation:
    // assert len(image.shape) == 2
    // imshape_y, imshape_x = image.shape

    // VST_image = cuda.device_array(image.shape, DEFAULT_NUMPY_FLOAT_TYPE)

    // threadsperblock = (DEFAULT_THREADS, DEFAULT_THREADS)
    // blockspergrid_x = math.ceil(imshape_x/threadsperblock[1])
    // blockspergrid_y = math.ceil(imshape_y/threadsperblock[0])
    // blockspergrid = (blockspergrid_x, blockspergrid_y)

    // cuda_GAT[blockspergrid, threadsperblock](image, VST_image,
    //                                          alpha, beta)

    // return VST_image
    todo!()
}

#[cubecl::cube]
fn cuda_gat(image: &Tensor<f32>, vst_image: &mut Tensor<f32>, alpha: f32, beta: f32) {
    let x = ABSOLUTE_POS_X;
    let y = ABSOLUTE_POS_Y;

    let imshape_y = image.shape(0);
    let imshape_x = image.shape(1);

    if y >= imshape_y || x >= imshape_x {
        return;
    }

    let offset = y * image.stride(0) + x;

    // The ISO should not appear here, since alpha and beta are
    // already ISO-dependent.

    let vst = Max::max(
        0.0,
        alpha * image[offset] + 3.0 / 8.0 * alpha * alpha + beta,
    );

    vst_image[offset] = 2.0 / alpha * Sqrt::sqrt(vst);
}
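The kernel's arithmetic, written out as a plain scalar function for reference (a sketch mirroring `cuda_gat`, not used by the crate):

/// With the noise model std² = alpha * i + beta, the transformed value
/// 2/alpha * sqrt(alpha * i + 3/8 * alpha² + beta) has approximately
/// unit variance, which is the point of the transform.
fn gat_scalar(i: f32, alpha: f32, beta: f32) -> f32 {
    let vst = (alpha * i + 3.0 / 8.0 * alpha * alpha + beta).max(0.0);
    2.0 / alpha * vst.sqrt()
}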
133 src/image.rs Normal file
@@ -0,0 +1,133 @@
use std::marker::PhantomData;

use cubecl::{
    client::ComputeClient,
    prelude::{CubePrimitive, TensorHandleRef},
    server::Handle,
    Runtime,
};

pub mod convolution;
pub mod downsample;
pub mod padding;

#[derive(Debug)]
pub struct Tensor<T: CubePrimitive, const R: usize> {
    pub(crate) handle: Handle,
    pub(crate) shape: [usize; R],
    pub(crate) stride: [usize; R],
    _m: PhantomData<T>,
}

impl<T: CubePrimitive, const R: usize> Tensor<T, R> {
    pub fn new_empty<Rt: Runtime>(
        cube: &ComputeClient<Rt::Server, Rt::Channel>,
        width: u32,
        height: u32,
    ) -> Self {
        let _ = height;
        let _ = width;
        let _ = cube;
        todo!()
    }

    #[inline]
    pub fn shape(&self) -> [usize; R] {
        self.shape
    }

    #[inline]
    pub fn stride(&self) -> [usize; R] {
        self.stride
    }

    #[inline]
    pub fn as_tensor_arg<Rt: Runtime>(&self) -> TensorHandleRef<'_, Rt> {
        TensorHandleRef {
            handle: &self.handle,
            strides: &self.stride,
            shape: &self.shape,
            runtime: PhantomData,
        }
    }

    #[inline]
    pub fn from_slice<Rt: Runtime>(
        cube: &ComputeClient<Rt::Server, Rt::Channel>,
        data: &[T],
    ) -> Self {
        let _ = data;
        let _ = cube;
        todo!()
    }
}

pub struct ImageBuffer<T: CubePrimitive, const R: usize, const S: usize> {
    data: Tensor<T, R>,
    width: u32,
    height: u32,
}

impl<T: CubePrimitive, const R: usize, const S: usize> std::ops::Deref for ImageBuffer<T, R, S> {
    type Target = Tensor<T, R>;

    fn deref(&self) -> &Self::Target {
        &self.data
    }
}

impl<T: CubePrimitive, const R: usize, const S: usize> std::ops::DerefMut for ImageBuffer<T, R, S> {
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.data
    }
}

impl<T: CubePrimitive, const R: usize, const S: usize> ImageBuffer<T, R, S> {
    pub fn from_tensor(data: Tensor<T, R>, width: u32, height: u32) -> Self {
        // S is the number of samples packed into one stored element, so a row
        // of shape[1] elements holds up to shape[1] * S pixels.
        assert!(height as usize <= data.shape[0]);
        assert!(width as usize <= data.shape[1] * S);

        Self {
            data,
            width,
            height,
        }
    }

    #[inline]
    pub fn shape(&self) -> (u32, u32) {
        (self.height, self.width)
    }

    #[inline]
    pub fn height(&self) -> u32 {
        self.height
    }

    #[inline]
    pub fn width(&self) -> u32 {
        self.width
    }

    pub fn clone<Rt: Runtime>(&self, cube: &ComputeClient<Rt::Server, Rt::Channel>) -> Self {
        let _ = cube;
        todo!()
    }
}

// Bayer Raw Images
pub type Raw16Image = ImageBuffer<u32, 2, 2>;
pub type Raw10Image = ImageBuffer<u32, 2, 3>;

// Gray Images
pub type GrayImage = ImageBuffer<f32, 2, 1>;
pub type Gray10Image = ImageBuffer<u32, 2, 3>;
pub type Gray16Image = ImageBuffer<u32, 2, 2>;

// Rgb Images
pub type RgbImage = ImageBuffer<f32, 3, 1>;
pub type Rgba8Image = ImageBuffer<u32, 2, 4>;
pub type Rgb10Image = ImageBuffer<u32, 2, 3>;

// Alignments
pub type Alignments = Tensor<f32, 3>;
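Reading the aliases, `S` looks like the number of samples packed into one stored element (three 10-bit values per u32 for `Raw10Image`, for instance). A sketch of the index arithmetic such packing implies; the bit layout is an assumption, nothing in the commit fixes it yet:

// Hypothetical unpacking helper for an S = 3, 10-bit image: sample `x` of a
// row lives in word x / 3, at bit offset 10 * (x % 3). Layout is assumed.
fn unpack_10bit(row: &[u32], x: usize) -> u32 {
    (row[x / 3] >> (10 * (x % 3))) & 0x3ff
}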
61 src/image/convolution.rs Normal file
@@ -0,0 +1,61 @@
use cubecl::{
    client::ComputeClient,
    prelude::{Abs, Float, ABSOLUTE_POS_X, ABSOLUTE_POS_Y},
    Runtime,
};

use super::Tensor;

pub fn conv1d<R: Runtime>(
    cube: &ComputeClient<R::Server, R::Channel>,
    img: &Tensor<f32, 2>,
    kern: &Tensor<f32, 1>,
) -> Tensor<f32, 2> {
    let _ = kern;
    let _ = img;
    let _ = cube;
    todo!()
}

pub fn conv1d_t<R: Runtime>(
    cube: &ComputeClient<R::Server, R::Channel>,
    img: &Tensor<f32, 2>,
    kern: &Tensor<f32, 1>,
) -> Tensor<f32, 2> {
    let _ = kern;
    let _ = img;
    let _ = cube;
    todo!()
}

#[cubecl::cube(launch_unchecked)]
fn cube_conv1d<F: Float>(
    input: &cubecl::prelude::Tensor<F>,
    kern: &cubecl::prelude::Tensor<F>,
    output: &mut cubecl::prelude::Tensor<F>,
    #[comptime] trans: bool,
) {
    let kern_half = kern.len() as i32 / 2;
    let h = output.shape(0) as i32 - 1;
    let w = output.shape(1) as i32 - 1;

    let x = ABSOLUTE_POS_X as i32;
    let y = ABSOLUTE_POS_Y as i32;

    if x > w || y > h {
        return;
    }

    // 1D convolution along x when `trans` is set, along y otherwise,
    // accumulating the kernel taps for this thread's output pixel.
    let mut acc = F::new(0.0);
    for i in 0..kern.len() as i32 {
        // Tap position along the convolved axis, folded back into the image
        // at the borders ("reflect" mode): abs(m - abs(p - m)) maps p to [0, m].
        let ox = if trans {
            Abs::abs(w - Abs::abs(x + i - kern_half - w))
        } else {
            x
        };

        let oy = if trans {
            y
        } else {
            Abs::abs(h - Abs::abs(y + i - kern_half - h))
        };

        acc += input[oy as u32 * input.stride(0) + ox as u32] * kern[i as u32];
    }

    output[y as u32 * output.stride(0) + x as u32] = acc;
}
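A quick scalar test of the reflect-at-the-border index formula used by the kernel:

#[cfg(test)]
mod tests {
    #[test]
    fn reflect_folds_indices_back_into_range() {
        // abs(m - abs(p - m)) maps p into [0, m], reflecting at both ends.
        let reflect = |p: i32, m: i32| (m - (p - m).abs()).abs();
        assert_eq!(reflect(-2, 9), 2); // reflected at 0
        assert_eq!(reflect(5, 9), 5); // in range: unchanged
        assert_eq!(reflect(11, 9), 7); // reflected at m
    }
}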
65 src/image/downsample.rs Normal file
@@ -0,0 +1,65 @@
use cubecl::{client::ComputeClient, Runtime};

use super::{convolution::conv1d, GrayImage, Tensor};

///
/// Applies a Gaussian convolution if required, then downsamples the image.
/// Args:
///     image: Device Array, the input image (WARNING: single channel only!)
///     factor: downsampling factor
///
pub fn downsample<R: Runtime>(
    cube: &ComputeClient<R::Server, R::Channel>,
    img: &GrayImage,
    factor: u32,
) -> GrayImage {
    // Special case
    if factor == 1 {
        return img.clone::<R>(cube);
    }

    // The Gaussian kernel std is proportional to the downsampling factor:
    // filteredImage = gaussian_filter(image, sigma=factor * 0.5, order=0, output=None, mode='reflect')
    let gaussian_kernel = create_gaussian_kernel1d(factor as f32 * 0.5, 4 * ((factor + 1) / 2));
    let gaussian_kernel = Tensor::from_slice::<R>(cube, &gaussian_kernel);

    let temp = conv1d::<R>(cube, &img.data, &gaussian_kernel); // convolve y
    let filtered_image = conv1d::<R>(cube, &temp, &gaussian_kernel); // convolve x

    // Shape of the downsampled image
    let [h2, w2] = filtered_image.shape();
    let (h2, w2) = (h2 as f32 / factor as f32, w2 as f32 / factor as f32);

    let tensor = subsample::<R>(cube, &filtered_image, factor, factor);

    GrayImage::from_tensor(tensor, w2 as _, h2 as _)
}

///
/// Computes a 1-D Gaussian convolution kernel.
///
pub fn create_gaussian_kernel1d(sigma: f32, radius: u32) -> Vec<f32> {
    let sigma2 = sigma * sigma;

    // Unnormalized Gaussian taps exp(-x² / (2σ²)) for x in [-radius, radius]
    let mut phi_x = (-(radius as i32)..=radius as i32)
        .map(|x| f32::exp(-0.5 * (x * x) as f32 / sigma2))
        .collect::<Vec<_>>();

    // Normalize so that the taps sum to 1
    let sum: f32 = phi_x.iter().sum();

    phi_x.iter_mut().for_each(|x| *x /= sum);
    phi_x
}

pub fn subsample<R: Runtime>(
    cube: &ComputeClient<R::Server, R::Channel>,
    img: &Tensor<f32, 2>,
    y_step: u32,
    x_step: u32,
) -> Tensor<f32, 2> {
    let _ = x_step;
    let _ = y_step;
    let _ = img;
    let _ = cube;
    todo!()
}
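A test sketch pinning down the expected properties of the kernel builder (normalized, symmetric, centre-peaked):

#[cfg(test)]
mod tests {
    use super::create_gaussian_kernel1d;

    #[test]
    fn gaussian_kernel_is_normalized_and_symmetric() {
        let k = create_gaussian_kernel1d(1.0, 4);
        assert_eq!(k.len(), 9); // 2 * radius + 1 taps
        let sum: f32 = k.iter().sum();
        assert!((sum - 1.0).abs() < 1e-6);
        for i in 0..k.len() / 2 {
            assert!((k[i] - k[k.len() - 1 - i]).abs() < 1e-6);
        }
        assert!(k[4] > k[3]); // the centre tap dominates
    }
}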
20 src/image/padding.rs Normal file
@@ -0,0 +1,20 @@
use cubecl::{client::ComputeClient, Runtime};

use super::GrayImage;

pub fn pad<R: Runtime>(
    cube: &ComputeClient<R::Server, R::Channel>,
    img: &GrayImage,
    left: u32,
    right: u32,
    top: u32,
    bottom: u32,
) -> GrayImage {
    let _ = bottom;
    let _ = top;
    let _ = right;
    let _ = left;
    let _ = img;
    let _ = cube;
    todo!()
}
385 src/kernels.rs Normal file
@@ -0,0 +1,385 @@
use cubecl::prelude::{Abs, Float, Line, Max, Min, Sqrt, Tensor, ABSOLUTE_POS_X, ABSOLUTE_POS_Y};

use crate::{
    gat::generalized_anscombe_transform,
    image::{self, GrayImage},
};

pub struct EstimateKernelsConfig {
    k_detail: f32,
    k_denoise: f32,
    d_th: f32,
    d_tr: f32,
    k_stretch: f32,
    k_shrink: f32,
}

#[derive(Debug, Clone, Copy, PartialEq)]
pub struct NoiseConfig {
    pub alpha: f32,
    pub beta: f32,
}

///
/// Implementation of Alg. 5: ComputeKernelCovariance
///
/// Returns the kernel covariance matrices for the frame J_n, sampled at the
/// center of every Bayer quad (or at the center of every grey pixel in grey
/// mode).
///
/// Parameters
/// ----------
/// img : device Array[imshape_y, imshape_x]
///     Raw image J_n
///
/// Returns
/// -------
/// covs : device Array[imshape_y/2, imshape_x/2, 2, 2]
///     Covariance matrices Omega_n, sampled at the center of each Bayer quad.
///
pub fn estimate_kernels(
    img: &GrayImage,
    cfg: EstimateKernelsConfig,
    noise_cfg: NoiseConfig,
) -> image::Tensor<f32, 4> {
    // Performing the Variance Stabilization Transform
    let img = generalized_anscombe_transform(img, noise_cfg);

    // Decimate to grey; port of the reference implementation:
    // if bayer_mode :
    //     img_grey = compute_grey_images_(img, method="decimating")
    // else :
    //     img_grey = img # no need to copy now, they will be copied to gpu later.

    let img_gray = img;
    let (grey_imshape_y, grey_imshape_x) = img_gray.shape();

    // Computing gradients
    // let th_grey_img = th.as_tensor(img_grey, dtype = DEFAULT_TORCH_FLOAT_TYPE, device = "cuda");

    // Horizontal filters
    // let grad_kernel1 = np.array([[[[-0.5, 0.5]]], [[[0.5, 0.5]]]]);

    // Vertical filters
    // let grad_kernel2 = np.array([[[[0.5], [0.5]]], [[[-0.5], [0.5]]]]);

    // let tmp = conv1d(th_grey_img, grad_kernel1);
    // let th_full_grad = conv1d(tmp, grad_kernel2, groups = 2);

    // The default padding mode reduces the shape of grey_img by 1 pixel in each
    // direction, as expected

    // let cuda_full_grads =
    //     cuda.as_cuda_array(th_full_grad.squeeze().transpose(0, 1).transpose(1, 2));

    // shape [y, x, 2]
    // let covs = cuda.device_array(grey_imshape + (2, 2), DEFAULT_NUMPY_FLOAT_TYPE);

    // threadsperblock = (DEFAULT_THREADS, DEFAULT_THREADS)
    // blockspergrid_x = math.ceil(grey_imshape_x/threadsperblock[1])
    // blockspergrid_y = math.ceil(grey_imshape_y/threadsperblock[0])
    // blockspergrid = (blockspergrid_x, blockspergrid_y)

    // cuda_estimate_kernel[blockspergrid, threadsperblock](cuda_full_grads,
    //                                                      k_detail, k_denoise, D_th, D_tr, k_stretch, k_shrink,
    //                                                      covs)
    // covs
    todo!()
}

#[allow(clippy::identity_op)]
#[cubecl::cube]
fn cuda_estimate_kernel(
    full_grads: &Tensor<f32>,
    covs: &mut Tensor<f32>,
    k_detail: f32,
    k_denoise: f32,
    d_th: f32,
    d_tr: f32,
    k_stretch: f32,
    k_shrink: f32,
) {
    let pixel_idx = ABSOLUTE_POS_X;
    let pixel_idy = ABSOLUTE_POS_Y;

    let imshape_y = covs.shape(0);
    let imshape_x = covs.shape(1);

    if pixel_idy >= imshape_y || pixel_idx >= imshape_x {
        return;
    }

    // covs is [y, x, 2, 2]; the four matrix entries are contiguous after this offset
    let covs_offset = pixel_idy * covs.stride(0) + pixel_idx * covs.stride(1);

    let mut structure_tensor: Line<f32> = Line::empty(4);
    structure_tensor[0] = 0.0;
    structure_tensor[1] = 0.0;
    structure_tensor[2] = 0.0;
    structure_tensor[3] = 0.0;

    for i in 0..2 {
        for j in 0..2 {
            let x = (pixel_idx + j) as i32 - 1;
            let y = (pixel_idy + i) as i32 - 1;

            if (0 <= y && y < full_grads.shape(0) as i32)
                && (0 <= x && x < full_grads.shape(1) as i32)
            {
                // full_grads is [y, x, 2]: the x and y gradients are interleaved
                let full_grad_offset =
                    y as u32 * full_grads.stride(0) + x as u32 * full_grads.stride(1);
                let full_grad_x = full_grads[full_grad_offset + 0];
                let full_grad_y = full_grads[full_grad_offset + 1];

                structure_tensor[0] += full_grad_x * full_grad_x;
                structure_tensor[1] += full_grad_x * full_grad_y;
                structure_tensor[2] += full_grad_x * full_grad_y;
                structure_tensor[3] += full_grad_y * full_grad_y;
            }
        }
    }

    let mut l = Line::empty(2);
    let mut e1 = Line::empty(2);
    let mut e2 = Line::empty(2);
    let mut k = Line::empty(2);

    get_eigen_elmts_2x2(&structure_tensor, &mut l, &mut e1, &mut e2);
    compute_k(
        &mut k, l[0], l[1], k_detail, k_denoise, d_th, d_tr, k_stretch, k_shrink,
    );

    let k_1_sq = k[0] * k[0];
    let k_2_sq = k[1] * k[1];

    covs[covs_offset + 0] = k_1_sq * e1[0] * e1[0] + k_2_sq * e2[0] * e2[0];
    covs[covs_offset + 1] = k_1_sq * e1[0] * e1[1] + k_2_sq * e2[0] * e2[1];
    covs[covs_offset + 2] = k_1_sq * e1[0] * e1[1] + k_2_sq * e2[0] * e2[1];
    covs[covs_offset + 3] = k_1_sq * e1[1] * e1[1] + k_2_sq * e2[1] * e2[1];
}

///
/// GPU function solving the 2x2 system A*X = B with the analytical formula.
///
/// Parameters
/// ----------
/// A : Array[2, 2]
///
/// B : Array[2]
///
/// Returns
/// -------
/// None
///
#[cubecl::cube]
fn solve_2x2(a: &Line<f32>, b: &Line<f32>, x: &mut Line<f32>) {
    let det_a = a[0] * a[3] - a[1] * a[2];

    x[0] = (a[3] * b[0] - a[1] * b[1]) / det_a;
    x[1] = (a[0] * b[1] - a[2] * b[0]) / det_a;
}

///
/// Inverts the 2x2 array M.
///
/// Parameters
/// ----------
/// M : Array[2, 2]
///     Array to invert
/// M_i : Array[2, 2]
///     Output inverse
///
/// Returns
/// -------
/// None.
///
#[cubecl::cube]
fn invert_2x2(m: &Line<f32>, m_i: &mut Line<f32>) {
    let det = m[0] * m[3] - m[1] * m[2];

    if Abs::abs(det) > f32::new(f32::EPSILON) {
        let det_i = 1.0 / det;

        m_i[0] = m[3] * det_i;
        m_i[1] = -m[1] * det_i;
        m_i[2] = -m[2] * det_i;
        m_i[3] = m[0] * det_i;
    } else {
        // near-singular matrix: fall back to the identity
        m_i[0] = 1.0;
        m_i[1] = 0.0;
        m_i[2] = 0.0;
        m_i[3] = 1.0;
    }
}

///
/// Returns the two roots of the polynomial a*X^2 + b*X + c = 0 for real
/// a, b and c. The function only returns real roots: make sure they exist
/// before calling it. roots[0] contains the root with the largest modulus
/// and roots[1] the smallest.
///
/// Parameters
/// ----------
/// a : float
///
/// b : float
///
/// c : float
///
/// roots : Array[2]
///
/// Returns
/// -------
/// None
///
#[cubecl::cube]
fn get_real_polyroots_2(a: f32, b: f32, c: f32, roots: &mut Line<f32>) {
    // Numerical instabilities can cause delta to be slightly negative despite
    // the equation admitting 2 real roots.
    let delta_root = Sqrt::sqrt(Max::max(b * b - 4.0 * a * c, 0.0));
    let r1 = (-b + delta_root) / (2.0 * a);
    let r2 = (-b - delta_root) / (2.0 * a);

    let r1_abs: f32 = Abs::abs(r1);
    if r1_abs >= Abs::abs(r2) {
        roots[0] = r1;
        roots[1] = r2;
    } else {
        roots[0] = r2;
        roots[1] = r1;
    }
}

#[cubecl::cube]
fn get_eigen_val_2x2(m: &Line<f32>, l: &mut Line<f32>) {
    // characteristic polynomial: X^2 - tr(M)*X + det(M)
    let a = 1.0;
    let b = -(m[0] + m[3]);
    let c = m[0] * m[3] - m[1] * m[2];

    get_real_polyroots_2(a, b, c, l)
}

///
/// Returns the norm-1 eigenvectors associated with the eigenvalues l:
/// M.e1 = l1.e1 ; M.e2 = l2.e2
///
/// Parameters
/// ----------
/// M : Array[2, 2]
///     Real symmetric array for which the eigenvectors are to be determined
/// l : Array[2]
///     sorted eigenvalues
/// e1, e2 : Array[2]
///     computed orthogonal and normalized eigenvectors
///
/// Returns
/// -------
/// None.
///
#[cubecl::cube]
fn get_eigen_vect_2x2(m: &Line<f32>, l: &Line<f32>, e1: &mut Line<f32>, e2: &mut Line<f32>) {
    // 2x2 algorithm : https://en.wikipedia.org/wiki/Eigenvalue_algorithm
    if m[1] == 0.0 && m[0] == m[3] {
        // m is a multiple of the identity: pick 2 orthogonal eigenvectors.
        e1[0] = 1.0;
        e1[1] = 0.0;
        e2[0] = 0.0;
        e2[1] = 1.0;
    } else {
        // averaging the two formulas for e1, for increased reliability
        e1[0] = m[0] + m[1] - l[1];
        e1[1] = m[2] + m[3] - l[1];

        if e1[0] == 0.0 {
            e1[1] = 1.0;
            e2[0] = 1.0;
            e2[1] = 0.0;
        } else if e1[1] == 0.0 {
            e1[0] = 1.0;
            e2[0] = 0.0;
            e2[1] = 1.0;
        } else {
            let norm_ = Sqrt::sqrt(e1[0] * e1[0] + e1[1] * e1[1]);
            e1[0] /= norm_;
            e1[1] /= norm_;

            // e2 is e1 rotated by 90 degrees, oriented consistently with e1
            let sign = copysign(e1[0]);
            e2[1] = Abs::abs(e1[0]);
            e2[0] = -e1[1] * sign
        }
    }
}

#[cubecl::cube]
fn get_eigen_elmts_2x2(m: &Line<f32>, l: &mut Line<f32>, e1: &mut Line<f32>, e2: &mut Line<f32>) {
    get_eigen_val_2x2(m, l);
    get_eigen_vect_2x2(m, l, e1, e2);
}

///
/// Computes k_1 and k_2 based on lambda1, lambda2 and the constants.
///
/// Parameters
/// ----------
/// l1 : float
///     lambda1 (dominant eigenvalue)
/// l2 : float
///     lambda2 (second eigenvalue)
/// k : Array[2]
///     empty vector where k_1 and k_2 will be stored
/// k_detail : float
/// k_denoise : float
/// D_th : float
/// D_tr : float
/// k_stretch : float
/// k_shrink : float
///     Parameters used to compute k_1 and k_2, all detailed in the article.
///
#[cubecl::cube]
fn compute_k(
    k: &mut Line<f32>,
    l1: f32,
    l2: f32,
    k_detail: f32,
    k_denoise: f32,
    d_th: f32,
    d_tr: f32,
    k_stretch: f32,
    k_shrink: f32,
) {
    // When A is NaN, we fall back to this condition
    let mut k1 = 1.0f32;
    let mut k2 = 1.0f32;

    let a: f32 = 1.0 + Sqrt::sqrt((l1 - l2) / (l1 + l2));
    let d: f32 = clamp(1.0 - Sqrt::sqrt(l1) / d_tr + d_th, 0.0, 1.0);

    // This is a very aggressive way of driving anisotropy, but it works well so far.
    if a > 1.95 {
        k1 = 1.0 / k_shrink;
        k2 = k_stretch;
    }

    k[0] = k_detail * ((1.0 - d) * k1 + d * k_denoise);
    k[1] = k_detail * ((1.0 - d) * k2 + d * k_denoise);
}

#[cubecl::cube]
fn copysign(val: f32) -> f32 {
    if val < 0.0f32 {
        -(1.0f32)
    } else {
        1.0f32
    }
}

#[cubecl::cube]
fn clamp(a: f32, min: f32, max: f32) -> f32 {
    // Clamp::clamp(a, min, max)
    Min::min(Max::max(a, min), max)
}
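A scalar cross-check of the 2x2 eigenvalue logic (re-derived on the CPU, since the cube functions only run on device): the eigenvalues are the roots of X² - tr(M)·X + det(M).

#[cfg(test)]
mod tests {
    #[test]
    fn eigenvalues_of_a_diagonal_matrix() {
        // CPU mirror of get_eigen_val_2x2 for M = diag(3, 1).
        let m = [3.0f32, 0.0, 0.0, 1.0];
        let (b, c) = (-(m[0] + m[3]), m[0] * m[3] - m[1] * m[2]);
        let delta_root = (b * b - 4.0 * c).max(0.0).sqrt();
        let (l1, l2) = ((-b + delta_root) / 2.0, (-b - delta_root) / 2.0);
        assert_eq!((l1, l2), (3.0, 1.0)); // dominant eigenvalue first
    }
}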
6 src/lib.rs Normal file
@@ -0,0 +1,6 @@
pub mod alignment;
pub mod gat;
pub mod image;
pub mod kernels;
// pub mod merge;
pub mod misc;
411 src/merge.rs Normal file
@@ -0,0 +1,411 @@
use cubecl::prelude::Tensor;

use crate::image::{Alignments, GrayImage};

// NOTE: work-in-progress transliteration of the Numba/CUDA reference
// implementation; the `cuda.*` calls and untyped signatures are placeholders
// until this is ported to cubecl (the module is disabled in lib.rs).

///
/// Implementation of Alg. 11: AccumulationReference
/// Accumulates the reference frame into num and den, while considering
/// (if enabled) the accumulated robustness mask to enforce single frame SR if
/// necessary.
///
/// Parameters
/// ----------
/// ref_img : device Array[imshape_y, imshape_x]
///     Reference image J_1
/// kernels : device Array[imshape_y//2, imshape_x//2, 2, 2]
///     Covariance Matrices Omega_1
/// num : device Array[s*imshape_y, s*imshape_x, c]
///     Numerator of the accumulator
/// den : device Array[s*imshape_y, s*imshape_x, c]
///     Denominator of the accumulator
/// options : dict
///     verbose options.
/// params : dict
///     parameters (containing the zoom s).
/// acc_rob : [imshape_y, imshape_x], optional
///     accumulated robustness mask. The default is None.
///
/// Returns
/// -------
/// None.
///
pub fn merge_ref() {}

#[cubecl::cube]
fn accumulate_ref(
    ref_img: &Tensor<f32>,
    covs: &Tensor<f32>,
    bayer_mode: bool,
    iso_kernel: bool,
    scale: f32,
    cfa_pattern: &Array<f32>,
    num: &Tensor<f32>,
    den: &Tensor<f32>,
    acc_rob: &Tensor<f32>,
    robustness_denoise: bool,
    max_frame_count: u32,
    rad_max: u32,
    max_multiplier: u32,
) {
    let (output_pixel_idx, output_pixel_idy) = cuda.grid(2);
    let (output_size_y, output_size_x, _) = num.shape;
    let (input_size_y, input_size_x) = ref_img.shape;

    if output_pixel_idx >= output_size_x || output_pixel_idy >= output_size_y {
        return;
    }

    if bayer_mode {
        n_channels = 3;
        acc = cuda.local.array(3, dtype=DEFAULT_CUDA_FLOAT_TYPE);
        val = cuda.local.array(3, dtype=DEFAULT_CUDA_FLOAT_TYPE);
    } else {
        n_channels = 1;
        acc = cuda.local.array(1, dtype=DEFAULT_CUDA_FLOAT_TYPE);
        val = cuda.local.array(1, dtype=DEFAULT_CUDA_FLOAT_TYPE);
    }

    // Copying the CFA locally. We will read it 9 times, so it's worth it.
    let local_cfa = cuda.local.array((2, 2), uint8);
    for i in 0..2 {
        for j in 0..2 {
            local_cfa[i, j] = uint8(cfa_pattern[i, j]);
        }
    }

    let coarse_ref_sub_pos = cuda.local.array(2, dtype=DEFAULT_CUDA_FLOAT_TYPE); // y, x
    coarse_ref_sub_pos[0] = output_pixel_idy / scale;
    coarse_ref_sub_pos[1] = output_pixel_idx / scale;

    for chan in 0..n_channels {
        acc[chan] = 0.0;
        val[chan] = 0.0;
    }

    // computing the kernel
    // this is rather slow and could probably be sped up
    if !iso_kernel {
        let interpolated_cov = cuda.local.array((2, 2), dtype=DEFAULT_CUDA_FLOAT_TYPE);
        let cov_i = cuda.local.array((2, 2), dtype=DEFAULT_CUDA_FLOAT_TYPE);

        // fetching the 4 closest covs
        let close_covs = cuda.local.array((2, 2, 2, 2), DEFAULT_CUDA_FLOAT_TYPE);
        let grey_pos = cuda.local.array(2, DEFAULT_CUDA_FLOAT_TYPE);
        if bayer_mode {
            grey_pos[0] = (coarse_ref_sub_pos[0] - 0.5) / 2; // grey grid is offset and twice as sparse
            grey_pos[1] = (coarse_ref_sub_pos[1] - 0.5) / 2;
        } else {
            grey_pos[0] = coarse_ref_sub_pos[0]; // grey grid is exactly the coarse grid
            grey_pos[1] = coarse_ref_sub_pos[1];
        }

        // clipping the coordinates to stay in bound
        let floor_x = int(max(math.floor(grey_pos[1]), 0));
        let floor_y = int(max(math.floor(grey_pos[0]), 0));

        let ceil_x = min(floor_x + 1, covs.shape[1] - 1);
        let ceil_y = min(floor_y + 1, covs.shape[0] - 1);
        for i in 0..2 {
            for j in 0..2 {
                close_covs[0, 0, i, j] = covs[floor_y, floor_x, i, j];
                close_covs[0, 1, i, j] = covs[floor_y, ceil_x, i, j];
                close_covs[1, 0, i, j] = covs[ceil_y, floor_x, i, j];
                close_covs[1, 1, i, j] = covs[ceil_y, ceil_x, i, j];
            }
        }

        // interpolating covs
        interpolate_cov(close_covs, grey_pos, interpolated_cov);
        invert_2x2(interpolated_cov, cov_i);
    }

    // fetching acc robustness if required
    // Acc robustness is known for each raw pixel. An implicit interpolation is
    // done from LR to HR using nearest neighbour.
    if robustness_denoise {
        let local_acc_r = acc_rob[min(round(coarse_ref_sub_pos[0]), acc_rob.shape[0] - 1),
                                  min(round(coarse_ref_sub_pos[1]), acc_rob.shape[1] - 1)];

        additional_denoise_power = denoise_power_merge(local_acc_r, max_multiplier, max_frame_count);
        rad = denoise_range_merge(local_acc_r, rad_max, max_frame_count);
    } else {
        additional_denoise_power = 1;
        rad = 1;
    }

    let center_x = round(coarse_ref_sub_pos[1]);
    let center_y = round(coarse_ref_sub_pos[0]);
    for i in -rad..=rad {
        for j in -rad..=rad {
            let pixel_idx = center_x + j;
            let pixel_idy = center_y + i;

            // in-bound condition
            if (0 <= pixel_idx && pixel_idx < input_size_x) &&
                (0 <= pixel_idy && pixel_idy < input_size_y) {

                // checking whether the pixel is r, g or b
                let channel = if bayer_mode {
                    local_cfa[pixel_idy % 2, pixel_idx % 2]
                } else {
                    0
                };

                // By fetching the value now, we can compute the kernel weight
                // while it is fetched from global memory
                let c = ref_img[pixel_idy, pixel_idx];

                // computing the distance
                let dist_x = pixel_idx - coarse_ref_sub_pos[1];
                let dist_y = pixel_idy - coarse_ref_sub_pos[0];

                // Computing w
                let mut y = if iso_kernel {
                    max(0, 2 * (dist_x * dist_x + dist_y * dist_y))
                } else {
                    max(0, quad_mat_prod(cov_i, dist_x, dist_y))
                };

                // y can be slightly negative because of numerical precision;
                // it is clamped so that exp does not blow the error up.

                // This is equivalent to multiplying the covariance,
                // but at the cost of one scalar operation (instead of 4)
                y /= additional_denoise_power;

                let w = if bayer_mode {
                    exp(-0.5 * y)
                } else {
                    exp(-0.5 * 4.0 * y) // original kernel constants are designed for Bayer distances, not greys, hence x4
                };
                // -----
                val[channel] += c * w;
                acc[channel] += w;
            }
        }
    }

    if robustness_denoise && local_acc_r < max_frame_count {
        // Overwriting values to enforce single frame demosaicing
        for chan in 0..n_channels {
            num[output_pixel_idy, output_pixel_idx, chan] = val[chan];
            den[output_pixel_idy, output_pixel_idx, chan] = acc[chan];
        }
    } else {
        for chan in 0..n_channels {
            num[output_pixel_idy, output_pixel_idx, chan] += val[chan];
            den[output_pixel_idy, output_pixel_idx, chan] += acc[chan];
        }
    }
}

pub struct MergeConfig {
    pub scale: f32,
}

///
/// Implementation of Alg. 4: Accumulation
/// Accumulates comp_img (J_n, n>1) into num and den, based on the alignment
/// V_n, the covariance matrices Omega_n and the robustness mask estimated before.
/// The size of the merge result is adjustable with params['scale'].
///
/// Parameters
/// ----------
/// comp_imgs : device Array[imsize_y, imsize_x]
///     The non-reference image to merge (J_n)
/// alignments : device Array[n_tiles_y, n_tiles_x, 2]
///     The final estimation of the tiles' alignment V_n(p)
/// covs : device Array[imsize_y//2, imsize_x//2, 2, 2]
///     covariance matrices Omega_n
/// r : device Array[imsize_y, imsize_x]
///     Robustness mask r_n
/// num : device Array[s*imshape_y, s*imshape_x, c]
///     Numerator of the accumulator
/// den : device Array[s*imshape_y, s*imshape_x, c]
///     Denominator of the accumulator
///
/// options : Dict
///     Options to pass
/// params : Dict
///     parameters
///
/// Returns
/// -------
/// None
///
fn merge(comp_img: &GrayImage, alignments: &Alignments, covs: image::Tensor<f32, 4>, r, num, den, cfg: MergeConfig) {
    // scale = params['scale']

    // CFA_pattern = cuda.to_device(params['exif']['CFA Pattern'])
    // bayer_mode = params['mode'] == 'bayer'
    // iso_kernel = params['kernel'] == 'iso'
    // tile_size = params['tuning']['tileSize']

    // native_im_size = comp_img.shape

    // // casting to integer to account for floating scale
    // output_size = (round(scale*native_im_size[0]), round(scale*native_im_size[1]))

    // dispatching threads: 1 thread for 1 output pixel
    // threadsperblock = (DEFAULT_THREADS, DEFAULT_THREADS) # maximum, we may take less
    // blockspergrid_x = math.ceil(output_size[1]/threadsperblock[1])
    // blockspergrid_y = math.ceil(output_size[0]/threadsperblock[0])
    // blockspergrid = (blockspergrid_x, blockspergrid_y)

    // accumulate[blockspergrid, threadsperblock](
    //     comp_img, alignments, covs, r,
    //     bayer_mode, iso_kernel, scale, tile_size, CFA_pattern,
    //     num, den)
}

#[cubecl::cube]
fn accumulate(comp_img, alignments, covs, r,
              bayer_mode, iso_kernel, scale, tile_size, cfa_pattern,
              num, den) {
    let (output_pixel_idx, output_pixel_idy) = cuda.grid(2);

    let (output_size_y, output_size_x, _) = num.shape;
    let (input_size_y, input_size_x) = comp_img.shape;

    if output_pixel_idx >= output_size_x || output_pixel_idy >= output_size_y {
        return;
    }

    if bayer_mode {
        n_channels = 3;
        acc = cuda.local.array(3, dtype=DEFAULT_CUDA_FLOAT_TYPE);
        val = cuda.local.array(3, dtype=DEFAULT_CUDA_FLOAT_TYPE);
    } else {
        n_channels = 1;
        acc = cuda.local.array(1, dtype=DEFAULT_CUDA_FLOAT_TYPE);
        val = cuda.local.array(1, dtype=DEFAULT_CUDA_FLOAT_TYPE);
    }

    // Copying the CFA locally. We will read it 9 times, so it's worth it.
    let local_cfa = cuda.local.array((2, 2), uint8);
    for i in 0..2 {
        for j in 0..2 {
            local_cfa[i, j] = uint8(cfa_pattern[i, j]);
        }
    }

    let coarse_ref_sub_pos = cuda.local.array(2, dtype=DEFAULT_CUDA_FLOAT_TYPE); // y, x

    coarse_ref_sub_pos[0] = output_pixel_idy / scale;
    coarse_ref_sub_pos[1] = output_pixel_idx / scale;

    // fetch the flow as early as possible
    let local_optical_flow = cuda.local.array(2, dtype=DEFAULT_CUDA_FLOAT_TYPE);
    let patch_idy = int(coarse_ref_sub_pos[0] / tile_size);
    let patch_idx = int(coarse_ref_sub_pos[1] / tile_size);

    local_optical_flow[0] = alignments[patch_idy, patch_idx, 0];
    local_optical_flow[1] = alignments[patch_idy, patch_idx, 1];

    for chan in 0..n_channels {
        acc[chan] = 0.0;
        val[chan] = 0.0;
    }

    let patch_center_pos = cuda.local.array(2, DEFAULT_CUDA_FLOAT_TYPE); // y, x

    // fetching robustness
    // The robustness coefficient is known for every raw pixel, and implicitly
    // interpolated to HR using nearest neighbour interpolation.
    let y_r = clamp(round(coarse_ref_sub_pos[0]), 0, r.shape[0] - 1);
    let x_r = clamp(round(coarse_ref_sub_pos[1]), 0, r.shape[1] - 1);
    let local_r = r[y_r, x_r];

    patch_center_pos[1] = coarse_ref_sub_pos[1] + local_optical_flow[0];
    patch_center_pos[0] = coarse_ref_sub_pos[0] + local_optical_flow[1];

    // updating the in-bound condition
    if patch_center_pos[1] >= input_size_x || patch_center_pos[0] >= input_size_y {
        return;
    }

    // computing the kernel
    if !iso_kernel {
        let interpolated_cov = cuda.local.array((2, 2), dtype=DEFAULT_CUDA_FLOAT_TYPE);
        let cov_i = cuda.local.array((2, 2), dtype=DEFAULT_CUDA_FLOAT_TYPE);
        // fetching the 4 closest covs
        let close_covs = cuda.local.array((2, 2, 2, 2), DEFAULT_CUDA_FLOAT_TYPE);
        let grey_pos = cuda.local.array(2, DEFAULT_CUDA_FLOAT_TYPE);

        if bayer_mode {
            grey_pos[0] = (patch_center_pos[0] - 0.5) / 2; // grey grid is offset and twice as sparse
            grey_pos[1] = (patch_center_pos[1] - 0.5) / 2;
        } else {
            grey_pos[0] = patch_center_pos[0]; // grey grid is exactly the coarse grid
            grey_pos[1] = patch_center_pos[1];
        }

        // clipping the coordinates to stay in bound
        let floor_x = int(max(math.floor(grey_pos[1]), 0));
        let floor_y = int(max(math.floor(grey_pos[0]), 0));

        let ceil_x = min(floor_x + 1, covs.shape[1] - 1);
        let ceil_y = min(floor_y + 1, covs.shape[0] - 1);
        for i in 0..2 {
            for j in 0..2 {
                close_covs[0, 0, i, j] = covs[floor_y, floor_x, i, j];
                close_covs[0, 1, i, j] = covs[floor_y, ceil_x, i, j];
                close_covs[1, 0, i, j] = covs[ceil_y, floor_x, i, j];
                close_covs[1, 1, i, j] = covs[ceil_y, ceil_x, i, j];
            }
        }

        // interpolating covs at the desired spot
        interpolate_cov(close_covs, grey_pos, interpolated_cov);
        invert_2x2(interpolated_cov, cov_i);
    }

    let center_x = round(patch_center_pos[1]);
    let center_y = round(patch_center_pos[0]);

    for i in -1..=1 {
        for j in -1..=1 {
            let pixel_idx = center_x + j;
            let pixel_idy = center_y + i;

            // in-bound condition
            if (0 <= pixel_idx && pixel_idx < input_size_x) &&
                (0 <= pixel_idy && pixel_idy < input_size_y) {

                // checking whether the pixel is r, g or b
                let channel = if bayer_mode {
                    local_cfa[pixel_idy % 2, pixel_idx % 2]
                } else {
                    0
                };
                // By fetching the value now, we can compute the kernel weight
                // while it is fetched from global memory
                let c = comp_img[pixel_idy, pixel_idx];

                // computing the distance
                let dist_x = pixel_idx - patch_center_pos[1];
                let dist_y = pixel_idy - patch_center_pos[0];

                // Computing w
                let y = if iso_kernel {
                    max(0, 2 * (dist_x * dist_x + dist_y * dist_y))
                } else {
                    // y can be slightly negative because of numerical precision;
                    // it is clamped so that exp does not blow the error up.
                    max(0, quad_mat_prod(cov_i, dist_x, dist_y))
                };

                let w = exp(-0.5 * y);

                //---
                val[channel] += c * w * local_r;
                acc[channel] += w * local_r;
            }
        }
    }

    for chan in 0..n_channels {
        num[output_pixel_idy, output_pixel_idx, chan] += val[chan];
        den[output_pixel_idy, output_pixel_idx, chan] += acc[chan];
    }
}
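Both accumulation kernels feed the same resolution step: each output pixel keeps a weighted numerator and denominator, and the merged value is their ratio. A scalar sketch of that final division (hypothetical helper; the commit does not include it):

// num[p] accumulates sum(w_i * c_i), den[p] accumulates sum(w_i);
// the merged pixel is num[p] / den[p].
fn resolve_accumulator(num: &[f32], den: &[f32]) -> Vec<f32> {
    num.iter()
        .zip(den)
        .map(|(n, d)| if *d > 0.0 { n / d } else { 0.0 })
        .collect()
}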
5 src/misc.rs Normal file
@@ -0,0 +1,5 @@
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DistanceKind {
    L1,
    L2,
}