Decoder Safe Impl

2023-03-28 20:57:22 +04:00 · 2023-03-28 20:57:22 +04:00 · 85dee7d53d
commit 85dee7d53d
parent e2be112c9c
16 changed files with 1653 additions and 70 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@ -7,10 +7,24 @@ description = "NVIDIA Video Codec bindings"
 repository = "https://github.com/rust-av/nvidia-video-codec-rs"
 readme = "README.md"
 keywords = ["NVIDIA", "cuvid", "nvenc"]
+edition = "2021"

 [dependencies]
+anyhow = "1.0.69"
+arrayvec = "0.7.2"
+npp-sys = "0.0.1"
 nvidia-video-codec-sys = { version = "0.1.0", path = "nvidia-video-codec-sys" }
+parking_lot = "0.12.1"
+serde = { version = "1.0.155", features = ["derive"] }
+thiserror = "1.0.38"

 [workspace]
 members = ["nvidia-video-codec-sys"]

+[dev-dependencies]
+ffmpeg = {version = "5.1.1", package = "ffmpeg-next" }
+image = "0.24.5"
+mp4 = "0.13.0"
+mp4parse = "0.12.0"
+viuer = "0.6.2"
+
--- a/examples/demo.rs
+++ b/examples/demo.rs
@ -0,0 +1,193 @@
+use anyhow::bail;
+use nvidia_video_codec::cuda::context::CuContext;
+use nvidia_video_codec::cuda::device::CuDevice;
+use nvidia_video_codec::decoder::CuvidRect;
+use nvidia_video_codec::parser::CuvidVideoParser;
+use nvidia_video_codec_sys::cuvid::{
+    cuDriverGetVersion, cuInit, cudaVideoCodec_enum_cudaVideoCodec_H264,
+};
+use std::fs::File;
+use std::io::BufReader;
+
+/// Brings up the CUDA driver and returns a context created on device 0.
+///
+/// Along the way it prints the raw `cuInit` status, the driver version and the
+/// name and total memory of device 0. Panics if the device, its properties or
+/// the context cannot be obtained.
+fn init_and_version() -> CuContext {
+    let init_status = unsafe { cuInit(0) };
+    println!("{:?}", init_status);
+
+    let mut driver_version = 0;
+    // The status returned by the version query is not inspected.
+    unsafe {
+        cuDriverGetVersion(&mut driver_version as *mut i32);
+    }
+    println!("Cuda Version: {}", driver_version);
+
+    let device = CuDevice::new(0).unwrap();
+    let device_name = device.get_name().unwrap();
+    let total_mem_gb = device.get_total_mem().unwrap() as f64 / (1024 * 1024 * 1024) as f64;
+    println!("Using: {} {:.2}Gb", device_name, total_mem_gb);
+
+    CuContext::new(device, 0).unwrap()
+}
+
+/// Demo entry point: reads an MP4 file, feeds its H.264 samples to a
+/// `CuvidVideoParser` and prints every decoded frame to the terminal.
+///
+/// The input path is hard-coded below. Most failures panic via `unwrap`; only
+/// errors from the `mp4` crate calls that use `?` are returned.
+fn main() -> mp4::Result<()> {
+    let ctx = init_and_version();
+    let conf = viuer::Config {
+        // set offset
+        x: 0,
+        y: 0,
+        // set dimensions
+        width: Some(200),
+        height: Some(40),
+        ..Default::default()
+    };
+
+    let codec = nvidia_video_codec::decoder::CuvidCodec::H264;
+    let config = nvidia_video_codec::parser::Config {
+        crop: Some(CuvidRect {
+            left: 280,
+            top: 0,
+            right: 1000,
+            bottom: 720,
+        }),
+        // resize: Some((224, 224)),
+        ..Default::default()
+    };
+    // The callback receives one `Result` per frame. Both arms return `true`.
+    // NOTE(review): presumably `true` tells the parser to keep going — confirm
+    // against `CuvidVideoParser::new`.
+    let mut parser = CuvidVideoParser::new(codec, config, move |res| match res {
+        Ok(frame) => {
+            println!("decoded {}x{}", frame.width, frame.height);
+            // `from_vec` yields `None` when the buffer does not fit the given
+            // dimensions; such frames are silently skipped.
+            if let Some(buf) = image::RgbImage::from_vec(frame.width, frame.height, frame.data) {
+                let img = image::DynamicImage::ImageRgb8(buf);
+                if let Err(err) = viuer::print(&img, &conf) {
+                    println!("print image error: {}", err);
+                }
+            }
+
+            true
+        }
+        Err(err) => {
+            println!("err {}", err);
+            true
+        }
+    })
+    .unwrap();
+
+    let f = File::open("/data/home/andrey/video_test/test_h264.mp4").unwrap();
+    let size = f.metadata()?.len();
+    let reader = BufReader::new(f);
+
+    let mut mp4 = mp4::Mp4Reader::read_header(reader, size)?;
+
+    // Print boxes.
+    println!("major brand: {}", mp4.ftyp.major_brand);
+    println!("timescale: {}", mp4.moov.mvhd.timescale);
+
+    // Use available methods.
+    println!("size: {}", mp4.size());
+
+    let mut compatible_brands = String::new();
+    for brand in mp4.compatible_brands().iter() {
+        compatible_brands.push_str(&brand.to_string());
+        compatible_brands.push_str(",");
+    }
+    println!("compatible brands: {}", compatible_brands);
+    println!("duration: {:?}", mp4.duration());
+
+    // Track info.
+    // After the loop these hold the id and sample count of the last track
+    // visited; every track is printed but only that one is decoded.
+    let mut track_id = 0;
+    let mut sample_count = 0;
+
+    for track in mp4.tracks().values() {
+        track_id = track.track_id();
+        sample_count = track.sample_count();
+        println!(
+            "track: #{}({}) {} : {}; {}x{}; {:0.2}MBps; {}fps; {} samples; sps: {}, pps: {}; audio: {:?}",
+            track.track_id(),
+            track.language(),
+            track.track_type()?,
+            track.box_type()?,
+            track.width(),
+            track.height(),
+            track.bitrate() as f64 / (1024 * 1024) as f64,
+            track.frame_rate(),
+            track.sample_count(),
+            track.sequence_parameter_set().is_ok(),
+            track.picture_parameter_set().is_ok(),
+            track.audio_profile(),
+        );
+    }
+
+    // Feed the out-of-band SPS and PPS first, each prefixed with the 4-byte
+    // start code 00 00 00 01.
+    if let Some(track) = mp4.tracks().get(&track_id) {
+        if let Ok(sps) = track.sequence_parameter_set() {
+            let mut x: Vec<u8> = Vec::with_capacity(sps.len() + 4);
+            x.extend_from_slice(&[0, 0, 0, 1]);
+            x.extend_from_slice(sps);
+            println!("sps: {:?}", x);
+
+            parser.feed_data(&x, 0, false).unwrap();
+        }
+
+        if let Ok(pps) = track.picture_parameter_set() {
+            let mut x: Vec<u8> = Vec::with_capacity(pps.len() + 4);
+            x.extend_from_slice(&[0, 0, 0, 1]);
+            x.extend_from_slice(pps);
+            println!("pps: {:?}", x);
+
+            parser.feed_data(&x, 0, false).unwrap();
+        }
+    }
+
+    // Sample ids passed to `read_sample` are 1-based, hence `sample_idx + 1`.
+    for sample_idx in 0..sample_count {
+        let sample_id = sample_idx + 1;
+        let sample = mp4.read_sample(track_id, sample_id).unwrap();
+
+        if let Some(samp) = sample {
+            let mut data: Vec<u8> = samp.bytes.into();
+            convert_h264(&mut data).unwrap();
+
+            // NOTE(review): the divisor 24000.0 is hard-coded; presumably it
+            // stands in for the track timescale so the value comes out in
+            // milliseconds — confirm, and verify the unit `feed_data` expects.
+            // The last argument is true only for the final sample.
+            parser
+                .feed_data(
+                    &data,
+                    (((samp.start_time as f64 + samp.rendering_offset as f64) / 24000.0) * 1000.0)
+                        as i64,
+                    sample_idx == sample_count - 1,
+                )
+                .unwrap();
+        }
+    }
+
+    Ok(())
+}
+
+/// Rewrites a length-prefixed H.264 sample in place into Annex B framing.
+///
+/// `data` is read as a sequence of NAL units, each preceded by a 4-byte
+/// big-endian length. Every length prefix is overwritten with the start code
+/// `00 00 00 01`; the NAL payloads themselves are left untouched.
+///
+/// # Errors
+///
+/// Fails with "partial nal body" when a length prefix points past the end of
+/// the buffer, or when one to three stray bytes remain after the last NAL.
+/// Prefixes visited before the failure stay rewritten. An empty buffer is
+/// accepted.
+fn convert_h264(data: &mut [u8]) -> anyhow::Result<()> {
+    // TODO:
+    // * For each IDR frame, copy the SPS and PPS from the stream's
+    //   parameters, rather than depend on it being present in the frame
+    //   already. In-band parameters aren't guaranteed. This is awkward
+    //   with h264_reader v0.5's h264_reader::avcc::AvcDecoderRecord because it
+    //   strips off the NAL header byte from each parameter. The next major
+    //   version shouldn't do this.
+    // * Copy only the slice data. In particular, don't copy SEI, which confuses
+    //   Safari: <https://github.com/scottlamb/retina/issues/60#issuecomment-1178369955>
+
+    let mut i = 0;
+    // `i` never exceeds `data.len()` (enforced below), so this subtraction
+    // cannot underflow, unlike `data.len() - 3` on a buffer shorter than 3 bytes.
+    while data.len() - i >= 4 {
+        // Replace each NAL's length with the Annex B start code b"\x00\x00\x00\x01".
+        let bytes = &mut data[i..i + 4];
+        let nalu_length = u32::from_be_bytes(bytes.try_into().unwrap()) as usize;
+        bytes.copy_from_slice(&[0, 0, 0, 1]);
+
+        // Checked arithmetic: a hostile length must not wrap the cursor around.
+        i = match nalu_length
+            .checked_add(4)
+            .and_then(|step| i.checked_add(step))
+        {
+            Some(next) if next <= data.len() => next,
+            _ => bail!("partial nal body"),
+        };
+    }
+
+    // Fewer than four bytes left over cannot hold another length prefix.
+    if i < data.len() {
+        bail!("partial nal body");
+    }
+
+    Ok(())
+}
--- a/nvidia-video-codec-sys/Cargo.toml
+++ b/nvidia-video-codec-sys/Cargo.toml
@ -9,6 +9,6 @@ repository = "https://github.com/rust-av/nvidia-video-codec-rs"
 build = "build.rs"

 [build-dependencies]
-bindgen = "0.54.0"
+bindgen = "0.64"

 [dependencies]
--- a/nvidia-video-codec-sys/build.rs
+++ b/nvidia-video-codec-sys/build.rs
@ -6,7 +6,13 @@ use std::io::Write;
 use std::path::PathBuf;

 fn format_write(builder: bindgen::Builder, output: &str) {
-    let s = builder.generate()
+    let s = builder
+        .layout_tests(false)
+        // .rustified_non_exhaustive_enum("cudaVideoCodec_enum")
+        .rustified_non_exhaustive_enum("cuvidDecodeStatus_enum")
+        .rustified_non_exhaustive_enum("cudaVideoSurfaceFormat_enum")
+        .rustified_non_exhaustive_enum("cudaVideoDeinterlaceMode_enum")
+        .generate()
        .unwrap()
        .to_string()
        .replace("/**", "/*")
@ -24,6 +30,7 @@ fn format_write(builder: bindgen::Builder, output: &str) {

 fn common_builder() -> bindgen::Builder {
    bindgen::builder()
+        .constified_enum_module("*")
        .raw_line("#![allow(dead_code)]")
        .raw_line("#![allow(non_camel_case_types)]")
        .raw_line("#![allow(non_snake_case)]")
@ -39,15 +46,16 @@ fn find_dir(default: &'static str, env_key: &'static str) -> PathBuf {

 fn main() {
    let cuda_include = find_dir("/opt/cuda/include", "CUDA_INCLUDE_PATH");
-    let nvc_include = find_dir("/opt/nvidia-video-codec/include",
-                               "NVIDIA_VIDEO_CODEC_INCLUDE_PATH");
+    let nvc_include = find_dir(
+        "/opt/nvidia-video-codec/include",
+        "NVIDIA_VIDEO_CODEC_INCLUDE_PATH",
+    );

    // TODO support windows
    println!("cargo:rustc-link-lib=dylib={}", "cuda");
    println!("cargo:rustc-link-lib=dylib={}", "nvcuvid");
    println!("cargo:rustc-link-lib=dylib={}", "nvidia-encode");

-
    let cuda_builder = common_builder()
        .clang_arg(format!("-I{}", cuda_include.to_string_lossy()))
        .header(cuda_include.join("cuda.h").to_string_lossy());
@ -58,6 +66,7 @@ fn main() {
    let cuvid_builder = common_builder()
        .clang_arg(format!("-I{}", nvc_include.to_string_lossy()))
        .clang_arg(format!("-I{}", cuda_include.to_string_lossy()))
+        .derive_debug(true)
        .header(nvc_include.join("nvcuvid.h").to_string_lossy());

    format_write(cuvid_builder, "src/cuvid.rs");
--- a/nvidia-video-codec-sys/src/lib.rs
+++ b/nvidia-video-codec-sys/src/lib.rs
@ -1,6 +1,3 @@
-// TODO do w/out the unions?
-#![feature(untagged_unions)]
-
 pub mod cuda;
 pub mod cuvid;
 pub mod nvenc;
--- a/src/cuda/context.rs
+++ b/src/cuda/context.rs
@ -1,45 +1,55 @@
-use ffi::cuda::*;
+use nvidia_video_codec_sys::cuvid::{
+    cuCtxCreate_v2, cuCtxDestroy_v2, cuCtxGetApiVersion, cuCtxPopCurrent_v2, cuCtxPushCurrent_v2,
+    CUcontext,
+};

-use cuda::device::CuDevice;
+use crate::error::Error;
+
+use super::device::CuDevice;
+
+/// Guard handed out by `CuContext::make_current`.
+///
+/// `make_current` pushes the context with `cuCtxPushCurrent_v2`; dropping this
+/// guard issues the matching `cuCtxPopCurrent_v2`.
+pub struct ContextHandler<'a> {
+    // Borrow of the context the guard was created from; never read, it only
+    // ties the guard's lifetime to the context.
+    _ctx: &'a CuContext,
+}
+
+impl Drop for ContextHandler<'_> {
+    fn drop(&mut self) {
+        // A null out-pointer is passed and the result of the pop is discarded.
+        let _ = call!(cuCtxPopCurrent_v2(std::ptr::null_mut()));
+    }
+}

 pub struct CuContext {
-    context : CUcontext,
+    context: CUcontext,
 }

+unsafe impl Send for CuContext {}
+unsafe impl Sync for CuContext {}
+
 impl CuContext {
-    pub fn new(dev : CuDevice, flags : u32) -> Result<CuContext, CUresult> {
-        let mut ctx = CuContext { context : 0 };
-        let res = unsafe { cuCtxCreate(&mut ctx.context as *mut CUcontext, dev.device) }
+    pub fn new(dev: CuDevice, flags: u32) -> Result<CuContext, Error> {
+        let mut ctx = CuContext {
+            context: std::ptr::null_mut(),
+        };

-        wrap!(ctx, res)
+        call!(
+            cuCtxCreate_v2(&mut ctx.context as *mut CUcontext, flags, dev.device),
+            ctx
+        )
    }

-    pub fn get_api_version(&self) -> Result<u32, CUresult> {
+    pub fn get_api_version(&self) -> Result<u32, Error> {
        let mut ver = 0;
-        let res = unsafe { cuGetApiVersion(self.context, &mut ver as *mut u32)};
-
-        wrap!(ver, res)
+        call!(cuCtxGetApiVersion(self.context, &mut ver as *mut u32), ver)
    }

-    pub fn pop() -> Result<, CUresult> {
-        let res = unsafe { cuCtxPopCurrent
+    pub fn make_current(&self) -> Result<ContextHandler<'_>, Error> {
+        call!(cuCtxPushCurrent_v2(self.context))?;
+
+        Ok(ContextHandler { _ctx: self })
    }
 }

-/* TODO: leverage clone/drop/deref traits
-impl Clone for CuContext {
-
-}
-
-impl Deref for CuContext {
-
-}
-*/
-
 impl Drop for CuContext {
    fn drop(&mut self) {
-        unsafe {
-            cuCtxDestroy(self.context);
-        }
+        let _ = call!(cuCtxDestroy_v2(self.context));
    }
 }
--- a/src/cuda/device.rs
+++ b/src/cuda/device.rs
@ -1,59 +1,63 @@
-use std::os::raw::c_int;
 use std::os::raw::c_char;
+use std::os::raw::c_int;

 use ffi::cuda::*;
-use ffi::cuda::cudaError_enum::*;
+
+use crate::error::Error;

 pub struct CuDevice {
-    device: CUdevice,
+    pub device: CUdevice,
 }

 impl CuDevice {
-    pub fn new(ordinal: c_int) -> Result<CuDevice, CUresult> {
+    pub fn new(ordinal: c_int) -> Result<CuDevice, Error> {
        let mut d = CuDevice { device: 0 };
-        let res = unsafe { cuDeviceGet(&mut d.device as *mut i32, ordinal) };

-        wrap!(d, res)
+        call!(cuDeviceGet(&mut d.device as *mut i32, ordinal), d)
    }

-    pub fn get_attribute(&self, attr: CUdevice_attribute) -> Result<c_int, CUresult> {
+    pub fn get_attribute(&self, attr: CUdevice_attribute) -> Result<c_int, Error> {
        let mut pi = 0;
-        let res = unsafe { cuDeviceGetAttribute(&mut pi as *mut i32, attr, self.device) };

-        wrap!(pi, res)
+        call!(
+            cuDeviceGetAttribute(&mut pi as *mut i32, attr, self.device),
+            pi
+        )
    }

-    pub fn get_name(&self) -> Result<String, CUresult> {
+    /// Returns the device name reported by `cuDeviceGetName`.
+    ///
+    /// The driver fills a 256-byte buffer with a NUL-terminated string; only
+    /// the bytes before the terminator are returned, and any byte sequence
+    /// that is not valid UTF-8 is replaced rather than causing a panic.
+    pub fn get_name(&self) -> Result<String, Error> {
        let mut name = vec![0; 256];
-        let res = unsafe {
-            cuDeviceGetName(name.as_mut_ptr() as *mut c_char,
-                            name.len() as i32,
-                            self.device)
-        };
-        let val = String::from_utf8(name).unwrap();

-        wrap!(val, res)
+        call!(
+            cuDeviceGetName(
+                name.as_mut_ptr() as *mut c_char,
+                name.len() as i32,
+                self.device,
+            ),
+            {
+                // Cut at the first NUL so the padding of the fixed-size buffer
+                // does not end up in the returned string.
+                let len = name.iter().position(|&b| b == 0).unwrap_or(name.len());
+                String::from_utf8_lossy(&name[..len]).into_owned()
+            }
+        )
    }

-    pub fn get_total_mem(&self) -> Result<usize, CUresult> {
+    pub fn get_total_mem(&self) -> Result<usize, Error> {
        let mut val = 0;
-        let res = unsafe { cuDeviceTotalMem_v2(&mut val as *mut usize, self.device) };

-        wrap!(val, res)
+        call!(
+            cuDeviceTotalMem_v2(&mut val as *mut usize as *mut _, self.device),
+            val
+        )
    }
 }

-pub fn get_count() -> Result<c_int, CUresult> {
+pub fn get_count() -> Result<c_int, Error> {
    let mut val = 0;
-    let res = unsafe { cuDeviceGetCount(&mut val as *mut i32) };

-    wrap!(val, res)
+    call!(cuDeviceGetCount(&mut val as *mut i32), val)
 }

 #[cfg(test)]
 mod tests {
-    use ffi::cuda::cuInit;
    use super::*;
+    use ffi::cuda::cuInit;

    #[test]
    fn device_enum() {
@ -62,9 +66,11 @@ mod tests {
        for i in 0..get_count().unwrap() {
            let dev = CuDevice::new(i).unwrap();

-            println!("{} {}",
-                     dev.get_name().unwrap(),
-                     dev.get_total_mem().unwrap());
+            println!(
+                "{} {}",
+                dev.get_name().unwrap(),
+                dev.get_total_mem().unwrap()
+            );
        }
    }
 }
--- a/src/cuda/mod.rs
+++ b/src/cuda/mod.rs
@ -1 +1,60 @@
+use nvidia_video_codec_sys::cuvid::{
+    cudaVideoCodec, cudaVideoCodec_enum_cudaVideoCodec_AV1,
+    cudaVideoCodec_enum_cudaVideoCodec_H264, cudaVideoCodec_enum_cudaVideoCodec_H264_MVC,
+    cudaVideoCodec_enum_cudaVideoCodec_H264_SVC, cudaVideoCodec_enum_cudaVideoCodec_HEVC,
+    cudaVideoCodec_enum_cudaVideoCodec_JPEG, cudaVideoCodec_enum_cudaVideoCodec_MPEG1,
+    cudaVideoCodec_enum_cudaVideoCodec_MPEG2, cudaVideoCodec_enum_cudaVideoCodec_MPEG4,
+    cudaVideoCodec_enum_cudaVideoCodec_NV12, cudaVideoCodec_enum_cudaVideoCodec_NumCodecs,
+    cudaVideoCodec_enum_cudaVideoCodec_UYVY, cudaVideoCodec_enum_cudaVideoCodec_VC1,
+    cudaVideoCodec_enum_cudaVideoCodec_VP8, cudaVideoCodec_enum_cudaVideoCodec_VP9,
+    cudaVideoCodec_enum_cudaVideoCodec_YUV420, cudaVideoCodec_enum_cudaVideoCodec_YUYV,
+    cudaVideoCodec_enum_cudaVideoCodec_YV12,
+};
+
+pub mod context;
 pub mod device;
+pub mod stream;
+
+/// Display names for chroma formats, looked up by using the numeric chroma
+/// format value as an index (see `get_video_chroma_format_str`).
+pub const CUVID_CHROMA_FORMAT_NAMES: &[&str] =
+    &["YUV 400 (Monochrome)", "YUV 420", "YUV 422", "YUV 444"];
+
+/// Pairs of codec identifier and display name.
+///
+/// `get_video_codec_str` indexes this table directly for identifiers up to and
+/// including `NumCodecs`, so the leading entries must stay in identifier order;
+/// the entries after `NumCodecs` are found by linear search.
+pub const CUVID_VIDEO_CODECS: &[(u32, &str)] = &[
+    (cudaVideoCodec_enum_cudaVideoCodec_MPEG1, "MPEG-1"),
+    (cudaVideoCodec_enum_cudaVideoCodec_MPEG2, "MPEG-2"),
+    (cudaVideoCodec_enum_cudaVideoCodec_MPEG4, "MPEG-4 (ASP)"),
+    (cudaVideoCodec_enum_cudaVideoCodec_VC1, "VC-1/WMV"),
+    (cudaVideoCodec_enum_cudaVideoCodec_H264, "AVC/H.264"),
+    (cudaVideoCodec_enum_cudaVideoCodec_JPEG, "M-JPEG"),
+    (cudaVideoCodec_enum_cudaVideoCodec_H264_SVC, "H.264/SVC"),
+    (cudaVideoCodec_enum_cudaVideoCodec_H264_MVC, "H.264/MVC"),
+    (cudaVideoCodec_enum_cudaVideoCodec_HEVC, "H.265/HEVC"),
+    (cudaVideoCodec_enum_cudaVideoCodec_VP8, "VP8"),
+    (cudaVideoCodec_enum_cudaVideoCodec_VP9, "VP9"),
+    (cudaVideoCodec_enum_cudaVideoCodec_AV1, "AV1"),
+    (cudaVideoCodec_enum_cudaVideoCodec_NumCodecs, "Invalid"),
+    (cudaVideoCodec_enum_cudaVideoCodec_YUV420, "YUV  4:2:0"),
+    (cudaVideoCodec_enum_cudaVideoCodec_YV12, "YV12 4:2:0"),
+    (cudaVideoCodec_enum_cudaVideoCodec_NV12, "NV12 4:2:0"),
+    (cudaVideoCodec_enum_cudaVideoCodec_YUYV, "YUYV 4:2:2"),
+    (cudaVideoCodec_enum_cudaVideoCodec_UYVY, "UYVY 4:2:2"),
+];
+
+/// Looks up the display name for a chroma format value.
+///
+/// The value is used as an index into `CUVID_CHROMA_FORMAT_NAMES`; anything
+/// outside that table yields `"Unknown"`.
+// NOTE(review): the parameter is typed `cudaVideoCodec` although it indexes the
+// chroma-format table — presumably it should be a chroma format type; confirm.
+#[inline]
+pub fn get_video_chroma_format_str(codec: cudaVideoCodec) -> &'static str {
+    match CUVID_CHROMA_FORMAT_NAMES.get(codec as usize) {
+        Some(&name) => name,
+        None => "Unknown",
+    }
+}
+
+/// Returns the display name of a codec identifier, or `"Unknown"` when the
+/// identifier does not appear in `CUVID_VIDEO_CODECS`.
+#[inline]
+pub fn get_video_codec_str(codec: cudaVideoCodec) -> &'static str {
+    if codec > cudaVideoCodec_enum_cudaVideoCodec_NumCodecs {
+        // Identifiers above `NumCodecs` are not contiguous, so scan for them.
+        for &(id, name) in CUVID_VIDEO_CODECS {
+            if id == codec {
+                return name;
+            }
+        }
+
+        "Unknown"
+    } else {
+        // Identifiers up to `NumCodecs` double as the table index.
+        CUVID_VIDEO_CODECS[codec as usize].1
+    }
+}
--- a/src/cuda/stream.rs
+++ b/src/cuda/stream.rs
@ -0,0 +1,71 @@
+use nvidia_video_codec_sys::cuvid::{
+    cuMemcpy, cuMemcpy2DAsync_v2, cuMemcpyDtoH_v2, cuStreamCreate, cuStreamDestroy_v2,
+    cuStreamSynchronize, CUdeviceptr, CUmemorytype_enum_CU_MEMORYTYPE_DEVICE,
+    CUmemorytype_enum_CU_MEMORYTYPE_HOST, CUstream, CUstream_flags_enum_CU_STREAM_DEFAULT,
+    CUDA_MEMCPY2D,
+};
+
+use crate::error::Error;
+
+/// Owning wrapper around a CUDA stream handle.
+///
+/// The handle is created with `cuStreamCreate` in [`CuStream::new`] and
+/// released with `cuStreamDestroy_v2` when the wrapper is dropped.
+pub struct CuStream {
+    inner: CUstream,
+}
+
+impl CuStream {
+    /// Creates a new stream with the `CU_STREAM_DEFAULT` flag.
+    pub fn new() -> Result<Self, Error> {
+        let mut inner = std::ptr::null_mut();
+        call!(cuStreamCreate(
+            &mut inner,
+            CUstream_flags_enum_CU_STREAM_DEFAULT
+        ))?;
+
+        Ok(Self { inner })
+    }
+
+    /// Copies `size` bytes from device address `src` to host pointer `dst`
+    /// using `cuMemcpyDtoH_v2`.
+    ///
+    /// The stream handle is not passed to the driver call.
+    // NOTE(review): `dst` is a raw pointer accepted by a safe function — the
+    // caller presumably must guarantee it is valid for `size` bytes; confirm.
+    pub fn copy_dev_to_host(
+        &mut self,
+        dst: *mut (),
+        src: CUdeviceptr,
+        size: usize,
+    ) -> Result<(), Error> {
+        call!(cuMemcpyDtoH_v2(dst as _, src, size))
+    }
+
+    /// Enqueues a pitched 2D copy from device memory to host memory on this
+    /// stream via `cuMemcpy2DAsync_v2`.
+    ///
+    /// `size` is `(width in bytes, height in rows)`; `src_pitch` and
+    /// `dst_pitch` are the pitches of the source and destination.
+    // NOTE(review): the call is the async variant — presumably callers must
+    // `synchronize` before reading `dst`; confirm against the call sites.
+    pub fn copy2d_dev_to_host(
+        &mut self,
+        dst: *mut (),
+        src: CUdeviceptr,
+        src_pitch: usize,
+        dst_pitch: usize,
+        size: (usize, usize),
+    ) -> Result<(), Error> {
+        // Start from an all-zero descriptor and fill in only the used fields.
+        let mut m: CUDA_MEMCPY2D = unsafe { std::mem::zeroed() };
+
+        m.srcMemoryType = CUmemorytype_enum_CU_MEMORYTYPE_DEVICE;
+        m.srcDevice = src;
+        m.srcPitch = src_pitch;
+
+        m.dstMemoryType = CUmemorytype_enum_CU_MEMORYTYPE_HOST;
+        m.dstHost = dst as *mut _;
+        m.dstPitch = dst_pitch;
+
+        m.WidthInBytes = size.0;
+        m.Height = size.1;
+
+        call!(cuMemcpy2DAsync_v2(&m, self.inner))
+    }
+
+    /// Returns the raw stream handle for use in other driver calls.
+    pub(crate) fn inner(&mut self) -> CUstream {
+        self.inner
+    }
+
+    /// Calls `cuStreamSynchronize` on this stream.
+    pub fn synchronize(&self) -> Result<(), Error> {
+        call!(cuStreamSynchronize(self.inner))
+    }
+}
+
+impl Drop for CuStream {
+    fn drop(&mut self) {
+        // Errors from destroying the stream are ignored.
+        let _ = call!(cuStreamDestroy_v2(self.inner));
+    }
+}
--- a/src/decoder.rs
+++ b/src/decoder.rs
@ -0,0 +1,495 @@
+use arrayvec::ArrayVec;
+use nvidia_video_codec_sys::cuvid::{
+    cudaVideoCreateFlags_enum_cudaVideoCreate_PreferCUVID, cudaVideoDeinterlaceMode_enum,
+    cudaVideoSurfaceFormat_enum, cuvidCreateDecoder, cuvidDecodePicture, cuvidDestroyDecoder,
+    cuvidGetDecoderCaps, cuvidReconfigureDecoder, CUvideodecoder,
+    _CUVIDDECODECREATEINFO__bindgen_ty_1, _CUVIDDECODECREATEINFO__bindgen_ty_2,
+    cudaVideoChromaFormat, cudaVideoChromaFormat_enum_cudaVideoChromaFormat_420,
+    cudaVideoChromaFormat_enum_cudaVideoChromaFormat_422,
+    cudaVideoChromaFormat_enum_cudaVideoChromaFormat_444,
+    cudaVideoChromaFormat_enum_cudaVideoChromaFormat_Monochrome, cudaVideoCodec,
+    cudaVideoCodec_enum_cudaVideoCodec_NV12, cudaVideoCodec_enum_cudaVideoCodec_UYVY,
+    cudaVideoCodec_enum_cudaVideoCodec_YUV420, cudaVideoCodec_enum_cudaVideoCodec_YUYV,
+    cudaVideoCodec_enum_cudaVideoCodec_YV12, cuvidDecodeStatus_enum, CUVIDDECODECAPS,
+    CUVIDDECODECREATEINFO, CUVIDEOFORMAT, CUVIDPARSERDISPINFO, CUVIDPICPARAMS,
+    CUVIDRECONFIGUREDECODERINFO,
+};
+
+use crate::{
+    cuda::stream::CuStream,
+    error::{Error, ErrorKind},
+    frame::DecodedFrame,
+    mapping::CuvidMapping,
+};
+
+/// Role a plane plays inside a decoded surface.
+#[derive(Debug)]
+pub(crate) enum FramePlaneKind {
+    Luma,
+    ChromaNv12,
+    ChromaP016,
+}
+
+/// Geometry of a single plane: its role, sample size and dimensions.
+#[derive(Debug)]
+pub(crate) struct FramePlane {
+    kind: FramePlaneKind,
+    bytes_per_pixel: u8,
+    width: u32,
+    height: u32,
+}
+
+impl FramePlane {
+    /// Total number of bytes in the plane (row bytes times rows).
+    #[inline]
+    pub(crate) fn get_size(&self) -> usize {
+        self.surface_width() * self.surface_height()
+    }
+
+    /// Number of rows in the plane.
+    #[inline]
+    pub(crate) fn surface_height(&self) -> usize {
+        self.height as usize
+    }
+
+    /// Number of bytes in one row of the plane.
+    #[inline]
+    pub(crate) fn surface_width(&self) -> usize {
+        let bytes_per_pixel = usize::from(self.bytes_per_pixel);
+        self.width as usize * bytes_per_pixel
+    }
+
+    /// Whether this is the luma plane.
+    #[inline]
+    pub(crate) fn is_luma(&self) -> bool {
+        match self.kind {
+            FramePlaneKind::Luma => true,
+            FramePlaneKind::ChromaNv12 | FramePlaneKind::ChromaP016 => false,
+        }
+    }
+}
+
+/// Plane list and overall dimensions of a decoded frame.
+#[derive(Debug)]
+pub(crate) struct FrameLayout {
+    pub(crate) planes: ArrayVec<FramePlane, 3>,
+    pub(crate) width: u32,
+    pub(crate) height: u32,
+}
+
+impl FrameLayout {
+    /// Layout with a luma plane followed by an `ChromaNv12` plane.
+    fn new_nv12(width: u32, height: u32, lbpp: u8, cbpp: u8) -> FrameLayout {
+        Self::semi_planar(FramePlaneKind::ChromaNv12, width, height, lbpp, cbpp)
+    }
+
+    /// Layout with a luma plane followed by a `ChromaP016` plane.
+    fn new_p016(width: u32, height: u32, lbpp: u8, cbpp: u8) -> FrameLayout {
+        Self::semi_planar(FramePlaneKind::ChromaP016, width, height, lbpp, cbpp)
+    }
+
+    /// Builds the two-plane layout shared by both constructors: a full-height
+    /// luma plane and a chroma plane of the same width and half the height,
+    /// rounded up.
+    fn semi_planar(
+        chroma_kind: FramePlaneKind,
+        width: u32,
+        height: u32,
+        lbpp: u8,
+        cbpp: u8,
+    ) -> FrameLayout {
+        let luma = FramePlane {
+            kind: FramePlaneKind::Luma,
+            bytes_per_pixel: lbpp,
+            width,
+            height,
+        };
+        let chroma = FramePlane {
+            kind: chroma_kind,
+            bytes_per_pixel: cbpp,
+            width,
+            height: (height + 1) / 2,
+        };
+
+        let mut planes = ArrayVec::new();
+        planes.push(luma);
+        planes.push(chroma);
+
+        FrameLayout {
+            planes,
+            width,
+            height,
+        }
+    }
+}
+
+/// Owning wrapper around a CUVID decoder handle.
+///
+/// The handle is destroyed with `cuvidDestroyDecoder` when the wrapper is
+/// dropped.
+pub struct CuvidDecoder {
+    /// Raw decoder handle passed to the `cuvid*` calls.
+    pub(crate) inner: CUvideodecoder,
+    /// Configuration the decoder was created from.
+    pub(crate) format: CuvidDecoderConfig,
+    /// Stream created together with the decoder.
+    pub(crate) stream: CuStream,
+    /// Plane layout derived from the chosen output surface format.
+    pub(crate) frame_layout: FrameLayout,
+}
+
+impl Drop for CuvidDecoder {
+    fn drop(&mut self) {
+        // Errors from destroying the decoder are ignored.
+        let _ = call!(cuvidDestroyDecoder(self.inner));
+    }
+}
+
+/// Codec identifiers understood by the decoder.
+///
+/// Discriminants 0 through 12 are spelled out literally; the remaining
+/// variants take their value from the matching `cudaVideoCodec_enum_*`
+/// constant of the bindings.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+#[non_exhaustive]
+pub enum CuvidCodec {
+    /// MPEG1 Codec
+    Mpeg1 = 0,
+    /// MPEG2 Codec
+    Mpeg2 = 1,
+    /// MPEG4 Codec
+    Mpeg4 = 2,
+    /// VC1 Codec
+    Vc1 = 3,
+    /// H264 Codec
+    H264 = 4,
+    /// JPEG Codec
+    Jpeg = 5,
+    /// H264-SVC Codec
+    H264Svc = 6,
+    /// H264-MVC Codec
+    H264Mvc = 7,
+    /// HEVC Codec
+    Hevc = 8,
+    /// VP8 Codec
+    Vp8 = 9,
+    /// VP9 Codec
+    Vp9 = 10,
+    /// AV1 Codec
+    Av1 = 11,
+    /// Max codecs Codec
+    NumCodecs = 12,
+    /// Y,U,V (4:2:0) Codec
+    YUV420 = cudaVideoCodec_enum_cudaVideoCodec_YUV420 as _,
+    /// Y,V,U (4:2:0) Codec
+    YV12 = cudaVideoCodec_enum_cudaVideoCodec_YV12 as _,
+    /// Y,UV  (4:2:0) Codec
+    NV12 = cudaVideoCodec_enum_cudaVideoCodec_NV12 as _,
+    /// YUYV/YUY2 (4:2:2) Codec
+    YUYV = cudaVideoCodec_enum_cudaVideoCodec_YUYV as _,
+    /// UYVY (4:2:2) Codec
+    UYVY = cudaVideoCodec_enum_cudaVideoCodec_UYVY as _,
+}
+
+impl CuvidCodec {
+    /// The contiguous codecs, positioned so that the raw identifier (0..=11)
+    /// is also the index into this table.
+    const VARIANTS: &[CuvidCodec] = &[
+        CuvidCodec::Mpeg1,
+        CuvidCodec::Mpeg2,
+        CuvidCodec::Mpeg4,
+        CuvidCodec::Vc1,
+        CuvidCodec::H264,
+        CuvidCodec::Jpeg,
+        CuvidCodec::H264Svc,
+        CuvidCodec::H264Mvc,
+        CuvidCodec::Hevc,
+        CuvidCodec::Vp8,
+        CuvidCodec::Vp9,
+        CuvidCodec::Av1,
+    ];
+}
+
+impl From<cudaVideoCodec> for CuvidCodec {
+    /// Maps a raw codec identifier to its variant; identifiers that match no
+    /// variant become [`CuvidCodec::NumCodecs`].
+    fn from(value: cudaVideoCodec) -> Self {
+        // Contiguous identifiers are resolved by table position.
+        if let Some(&codec) = Self::VARIANTS.get(value as usize) {
+            return codec;
+        }
+
+        // The raw pixel-format codecs carry sparse identifiers; compare each.
+        [
+            Self::YUV420,
+            Self::YV12,
+            Self::NV12,
+            Self::YUYV,
+            Self::UYVY,
+        ]
+        .into_iter()
+        .find(|&codec| codec as cudaVideoCodec == value)
+        .unwrap_or(Self::NumCodecs)
+    }
+}
+
+/// Chroma subsampling of the coded video.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+#[non_exhaustive]
+pub enum CuvidChromaFormat {
+    /// YUV 4:0:0
+    Monochrome = 0,
+    /// YUV 4:2:0
+    Yuv420 = 1,
+    /// YUV 4:2:2
+    Yuv422 = 2,
+    /// YUV 4:4:4
+    Yuv444 = 3,
+}
+
+impl TryFrom<cudaVideoChromaFormat> for CuvidChromaFormat {
+    type Error = Error;
+
+    /// Converts a raw chroma format value, failing with
+    /// `ErrorKind::CuvidChromaFormatError` for values outside the four known
+    /// formats.
+    fn try_from(value: cudaVideoChromaFormat) -> Result<Self, Self::Error> {
+        let format = match value {
+            v if v == cudaVideoChromaFormat_enum_cudaVideoChromaFormat_Monochrome => {
+                Self::Monochrome
+            }
+            v if v == cudaVideoChromaFormat_enum_cudaVideoChromaFormat_420 => Self::Yuv420,
+            v if v == cudaVideoChromaFormat_enum_cudaVideoChromaFormat_422 => Self::Yuv422,
+            v if v == cudaVideoChromaFormat_enum_cudaVideoChromaFormat_444 => Self::Yuv444,
+            _ => return Err(ErrorKind::CuvidChromaFormatError.into()),
+        };
+
+        Ok(format)
+    }
+}
+
+/// Sample bit depth, stored as "bit depth minus 8".
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+#[non_exhaustive]
+pub enum BitDepthFormat {
+    Bit8 = 0,
+    Bit10 = 2,
+    Bit12 = 4,
+}
+
+impl BitDepthFormat {
+    /// Bytes needed to store one sample: one for 8-bit, two otherwise.
+    #[inline]
+    pub fn bytes_count(self) -> u8 {
+        match self {
+            Self::Bit8 => 1,
+            Self::Bit10 | Self::Bit12 => 2,
+        }
+    }
+}
+
+impl TryFrom<u8> for BitDepthFormat {
+    type Error = Error;
+
+    /// Converts a "bit depth minus 8" value (0, 2 or 4); any other value fails
+    /// with `ErrorKind::CuvidBitFormatError`.
+    fn try_from(value: u8) -> Result<Self, Self::Error> {
+        let depth = match value {
+            0 => Self::Bit8,
+            2 => Self::Bit10,
+            4 => Self::Bit12,
+            _ => return Err(ErrorKind::CuvidBitFormatError.into()),
+        };
+
+        Ok(depth)
+    }
+}
+
+/// Rectangle given by its edges, in pixels.
+///
+/// `right` and `bottom` are expected to be at least `left` and `top`.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub struct CuvidRect {
+    pub left: u16,
+    pub top: u16,
+    pub right: u16,
+    pub bottom: u16,
+}
+impl CuvidRect {
+    /// Horizontal extent, `right - left`.
+    ///
+    /// An inverted rectangle (`right < left`) yields 0 instead of underflowing.
+    #[inline]
+    pub fn width(self) -> u16 {
+        self.right.saturating_sub(self.left)
+    }
+
+    /// Vertical extent, `bottom - top`.
+    ///
+    /// An inverted rectangle (`bottom < top`) yields 0 instead of underflowing.
+    #[inline]
+    pub fn height(self) -> u16 {
+        self.bottom.saturating_sub(self.top)
+    }
+}
+
+/// Parameters a [`CuvidDecoder`] is created from.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub struct CuvidDecoderConfig {
+    pub codec: CuvidCodec,
+    pub chroma_format: CuvidChromaFormat,
+    pub bit_depth_luma: BitDepthFormat,
+    pub bit_depth_chroma: BitDepthFormat,
+    pub num_decode_surfaces: u32,
+    pub num_output_surfaces: u32,
+    pub coded_width: u32,
+    pub coded_height: u32,
+    pub progressive_sequence: bool,
+    /// Optional region of the coded picture to output.
+    pub crop: Option<CuvidRect>,
+    /// Optional output size as `(width, height)`.
+    pub resize: Option<(u16, u16)>,
+}
+
+impl CuvidDecoderConfig {
+    /// Size of the frames the decoder will output, as `(width, height)`.
+    ///
+    /// An explicit `resize` takes precedence, then the size of `crop`, and
+    /// finally the coded size.
+    #[inline]
+    pub fn target_dimension(&self) -> (u32, u32) {
+        if let Some((width, height)) = self.resize {
+            return (u32::from(width), u32::from(height));
+        }
+
+        match self.crop {
+            Some(rect) => (u32::from(rect.width()), u32::from(rect.height())),
+            None => (self.coded_width, self.coded_height),
+        }
+    }
+}
+
+impl<'a> TryFrom<&'a CUVIDEOFORMAT> for CuvidDecoderConfig {
+    type Error = Error;
+
+    /// Builds a configuration from the format reported for a stream.
+    ///
+    /// `crop` and `resize` start out unset and `num_output_surfaces` is fixed
+    /// at 2. Fails when the chroma format or either bit depth is not one of
+    /// the supported values.
+    fn try_from(value: &'a CUVIDEOFORMAT) -> Result<Self, Self::Error> {
+        let codec = CuvidCodec::from(value.codec);
+        let chroma_format = CuvidChromaFormat::try_from(value.chroma_format)?;
+        let bit_depth_luma = BitDepthFormat::try_from(value.bit_depth_luma_minus8)?;
+        let bit_depth_chroma = BitDepthFormat::try_from(value.bit_depth_chroma_minus8)?;
+
+        Ok(Self {
+            codec,
+            chroma_format,
+            bit_depth_luma,
+            bit_depth_chroma,
+            num_decode_surfaces: value.min_num_decode_surfaces as u32,
+            num_output_surfaces: 2,
+            coded_width: value.coded_width,
+            coded_height: value.coded_height,
+            progressive_sequence: value.progressive_sequence == 1,
+            crop: None,
+            resize: None,
+        })
+    }
+}
+
+impl CuvidDecoder {
+    /// Creates a hardware decoder for the stream described by `format`.
+    ///
+    /// The output size is `format.resize` when set, otherwise the size of
+    /// `format.crop`, otherwise the coded size. The output surface format is
+    /// picked from the chroma format and luma bit depth; when the device does
+    /// not support that choice, the first supported format among NV12, P016,
+    /// YUV444 and YUV444 16-bit is used instead.
+    ///
+    /// # Errors
+    ///
+    /// Fails when the capability query, the stream creation or the decoder
+    /// creation fails, and with `ErrorKind::CudaErrorInvalidValue` when the
+    /// device supports none of the candidate output formats.
+    ///
+    /// # Panics
+    ///
+    /// Panics (`unimplemented!`) when the selected output format is one of the
+    /// YUV444 variants, for which no frame layout exists yet.
+    pub fn new(format: CuvidDecoderConfig) -> Result<Self, Error> {
+        let caps = Self::caps(format.codec, format.chroma_format, format.bit_depth_luma)?;
+
+        let mut inner = std::ptr::null_mut();
+        // Start from an all-zero struct and fill in only the fields we use.
+        let mut config: CUVIDDECODECREATEINFO = unsafe { std::mem::zeroed() };
+
+        config.ulCreationFlags = cudaVideoCreateFlags_enum_cudaVideoCreate_PreferCUVID as _;
+        config.CodecType = format.codec as _;
+        config.ChromaFormat = format.chroma_format as _;
+        config.bitDepthMinus8 = format.bit_depth_luma as _;
+        config.ulNumDecodeSurfaces = format.num_decode_surfaces as _;
+        config.ulNumOutputSurfaces = format.num_output_surfaces as _;
+        config.ulWidth = format.coded_width as _;
+        config.ulHeight = format.coded_height as _;
+
+        if let Some(r) = format.crop {
+            config.display_area = _CUVIDDECODECREATEINFO__bindgen_ty_1 {
+                left: r.left as _,
+                top: r.top as _,
+                right: r.right as _,
+                bottom: r.bottom as _,
+            };
+
+            config.ulTargetWidth = r.width() as _;
+            config.ulTargetHeight = r.height() as _;
+        }
+
+        // An explicit resize overrides the size derived from the crop.
+        if let Some((width, height)) = format.resize {
+            config.ulTargetWidth = width as _;
+            config.ulTargetHeight = height as _;
+        }
+
+        if format.crop.is_none() && format.resize.is_none() {
+            config.ulTargetWidth = format.coded_width as _;
+            config.ulTargetHeight = format.coded_height as _;
+        }
+
+        config.DeinterlaceMode = if format.progressive_sequence {
+            cudaVideoDeinterlaceMode_enum::cudaVideoDeinterlaceMode_Weave
+        } else {
+            cudaVideoDeinterlaceMode_enum::cudaVideoDeinterlaceMode_Adaptive
+        };
+
+        // Anything deeper than 8 bits needs a 16-bit surface.
+        let high_bit_depth = format.bit_depth_luma as u32 > 0;
+        config.OutputFormat = match (format.chroma_format, high_bit_depth) {
+            (CuvidChromaFormat::Yuv420 | CuvidChromaFormat::Monochrome, true) => {
+                cudaVideoSurfaceFormat_enum::cudaVideoSurfaceFormat_P016
+            }
+            (CuvidChromaFormat::Yuv420 | CuvidChromaFormat::Monochrome, false) => {
+                cudaVideoSurfaceFormat_enum::cudaVideoSurfaceFormat_NV12
+            }
+            (CuvidChromaFormat::Yuv444, true) => {
+                cudaVideoSurfaceFormat_enum::cudaVideoSurfaceFormat_YUV444_16Bit
+            }
+            // 8-bit 4:4:4 uses the 8-bit surface, not the 16-bit one.
+            (CuvidChromaFormat::Yuv444, false) => {
+                cudaVideoSurfaceFormat_enum::cudaVideoSurfaceFormat_YUV444
+            }
+            (CuvidChromaFormat::Yuv422, _) => {
+                cudaVideoSurfaceFormat_enum::cudaVideoSurfaceFormat_NV12
+            }
+        };
+
+        // A set bit in the mask means the device can output that surface format.
+        let is_supported = |surface: cudaVideoSurfaceFormat_enum| {
+            caps.nOutputFormatMask & (1 << surface as u32) != 0
+        };
+
+        if !is_supported(config.OutputFormat) {
+            // Fall back to the first format the device actually supports.
+            let fallback = [
+                cudaVideoSurfaceFormat_enum::cudaVideoSurfaceFormat_NV12,
+                cudaVideoSurfaceFormat_enum::cudaVideoSurfaceFormat_P016,
+                cudaVideoSurfaceFormat_enum::cudaVideoSurfaceFormat_YUV444,
+                cudaVideoSurfaceFormat_enum::cudaVideoSurfaceFormat_YUV444_16Bit,
+            ]
+            .into_iter()
+            .find(|&surface| is_supported(surface));
+
+            config.OutputFormat = match fallback {
+                Some(surface) => surface,
+                None => return Err(ErrorKind::CudaErrorInvalidValue.into()),
+            };
+        }
+
+        let frame_layout = match config.OutputFormat {
+            cudaVideoSurfaceFormat_enum::cudaVideoSurfaceFormat_NV12 => FrameLayout::new_nv12(
+                config.ulTargetWidth as _,
+                config.ulTargetHeight as _,
+                format.bit_depth_luma.bytes_count(),
+                format.bit_depth_chroma.bytes_count(),
+            ),
+            cudaVideoSurfaceFormat_enum::cudaVideoSurfaceFormat_P016 => FrameLayout::new_p016(
+                config.ulTargetWidth as _,
+                config.ulTargetHeight as _,
+                format.bit_depth_luma.bytes_count(),
+                format.bit_depth_chroma.bytes_count(),
+            ),
+            _ => unimplemented!(),
+        };
+
+        // Create the stream before the decoder: if the stream fails there is
+        // no decoder handle to leak, and if the decoder fails the stream is
+        // released by its own `Drop`.
+        let stream = CuStream::new()?;
+
+        call!(
+            cuvidCreateDecoder(&mut inner, &mut config),
+            Self {
+                inner,
+                format,
+                stream,
+                frame_layout,
+            }
+        )
+    }
+
+    /// Reconfigures the wrapped decoder by calling `cuvidReconfigureDecoder`
+    /// with `params`.
+    ///
+    /// # Errors
+    /// Any status other than `CUDA_SUCCESS` is converted into [`Error`].
+    pub fn reconfigure(&mut self, params: &mut CUVIDRECONFIGUREDECODERINFO) -> Result<(), Error> {
+        call!(cuvidReconfigureDecoder(self.inner, params))
+    }
+
+    /// Misspelled alias of [`Self::reconfigure`], kept so that existing callers
+    /// continue to compile. Prefer `reconfigure` in new code.
+    pub fn recofigure(&mut self, params: &mut CUVIDRECONFIGUREDECODERINFO) -> Result<(), Error> {
+        self.reconfigure(params)
+    }
+
+    /// Submits one picture to the decoder by calling `cuvidDecodePicture` on the
+    /// wrapped handle.
+    ///
+    /// # Errors
+    /// Any status other than `CUDA_SUCCESS` is converted into [`Error`].
+    pub fn decode(&mut self, params: &mut CUVIDPICPARAMS) -> Result<(), Error> {
+        call!(cuvidDecodePicture(self.inner, params))
+    }
+
+    /// Creates a [`CuvidMapping`] for the picture described by `disp_info`.
+    ///
+    /// The returned mapping holds a mutable borrow of this decoder for as long
+    /// as it is alive.
+    pub fn map_frame(&mut self, disp_info: &CUVIDPARSERDISPINFO) -> Result<CuvidMapping, Error> {
+        CuvidMapping::new(self, disp_info)
+    }
+
+    /// Maps the picture described by `disp_info`, verifies that it decoded
+    /// successfully and returns its pixels copied to host memory.
+    ///
+    /// The decode status is queried *before* the RGB conversion and host copy,
+    /// so no conversion work is spent on pictures that failed to decode and a
+    /// conversion failure cannot hide the real decode status.
+    ///
+    /// The reported `width`/`height` come from `self.format.target_dimension()`
+    /// and the `timestamp` is taken from `disp_info`.
+    ///
+    /// # Errors
+    /// * mapping the frame or querying its status failed;
+    /// * the decode status is anything other than `cuvidDecodeStatus_Success`
+    ///   (each known status maps to its own [`ErrorKind`], unknown codes to
+    ///   `ErrorKind::CudaErrorUnknown`);
+    /// * the RGB conversion / host copy failed.
+    pub fn get_frame(
+        &mut self,
+        disp_info: &mut CUVIDPARSERDISPINFO,
+    ) -> Result<DecodedFrame, Error> {
+        let data = {
+            // The mapping borrows `self` mutably; keep it inside this scope so
+            // it is released before `self.format` is read below.
+            let mut mapping = self.map_frame(disp_info)?;
+
+            let outcome = match mapping.decode_status()? {
+                cuvidDecodeStatus_enum::cuvidDecodeStatus_Success => Ok(()),
+                cuvidDecodeStatus_enum::cuvidDecodeStatus_Error => {
+                    Err(ErrorKind::CuvidFrameDecodeError)
+                }
+                cuvidDecodeStatus_enum::cuvidDecodeStatus_Error_Concealed => {
+                    Err(ErrorKind::CuvidFrameDecodeCanceledError)
+                }
+                cuvidDecodeStatus_enum::cuvidDecodeStatus_InProgress => {
+                    Err(ErrorKind::CuvidFrameDecodeInProgress)
+                }
+                cuvidDecodeStatus_enum::cuvidDecodeStatus_Invalid => {
+                    Err(ErrorKind::CuvidFrameDecodeInvalid)
+                }
+
+                code => Err(ErrorKind::CudaErrorUnknown(code as _)),
+            };
+            outcome?;
+
+            // Only convert and download the picture once it is known to be good.
+            mapping.get_rgb_data_host()?
+        };
+
+        let (width, height) = self.format.target_dimension();
+
+        Ok(DecodedFrame {
+            data,
+            width,
+            height,
+            timestamp: disp_info.timestamp,
+        })
+    }
+
+    /// Queries the driver's decode capabilities for a codec / chroma format /
+    /// bit depth combination.
+    ///
+    /// The three arguments are written into an otherwise zero-initialised
+    /// `CUVIDDECODECAPS`, which `cuvidGetDecoderCaps` then fills in. The filled
+    /// structure is returned on success.
+    ///
+    /// # Errors
+    /// Any status other than `CUDA_SUCCESS` is converted into [`Error`].
+    pub fn caps(
+        codec: CuvidCodec,
+        chroma: CuvidChromaFormat,
+        bit_depth: BitDepthFormat,
+    ) -> Result<CUVIDDECODECAPS, Error> {
+        // Every field that is not part of the query starts out zeroed.
+        let mut query = CUVIDDECODECAPS {
+            eCodecType: codec as _,
+            eChromaFormat: chroma as _,
+            nBitDepthMinus8: bit_depth as _,
+            ..unsafe { std::mem::zeroed() }
+        };
+
+        call!(cuvidGetDecoderCaps(&mut query), query)
+    }
+}
--- a/src/error.rs
+++ b/src/error.rs
@ -0,0 +1,373 @@
+use nvidia_video_codec_sys::cuvid::{cuvidDecodeStatus, CUresult};
+use std::{backtrace::Backtrace, fmt};
+
+/// Classified failure reasons reported by this crate.
+///
+/// The `CudaError*` variants correspond to CUDA driver result codes (see the
+/// `From<CUresult>` conversion in this module); their messages are the
+/// driver's own descriptions. The `Cuvid*` variants describe decoder-level
+/// conditions and `NppError` carries a raw NPP status code.
+#[derive(Debug, thiserror::Error)]
+pub enum ErrorKind {
+    #[error("This indicates that one or more of the parameters passed to the API call\n is not within an acceptable range of values.")]
+    CudaErrorInvalidValue,
+
+    #[error("The API call failed because it was unable to allocate enough memory to\n perform the requested operation.")]
+    CudaErrorOutOfMemory,
+
+    #[error("This indicates that the CUDA driver has not been initialized with\n ::cuInit() or that initialization has failed.")]
+    CudaErrorNotInitialized,
+
+    #[error("This indicates that the CUDA driver is in the process of shutting down.")]
+    CudaErrorDeinitialized,
+    #[error("This indicates profiler is not initialized for this run. This can\n happen when the application is running with external profiling tools\n like visual profiler.")]
+    CudaErrorProfilerDisabled,
+    #[error("\\deprecated\n This error return is deprecated as of CUDA 5.0. It is no longer an error\n to attempt to enable/disable the profiling via ::cuProfilerStart or\n ::cuProfilerStop without initialization.")]
+    CudaErrorProfilerNotInitialized,
+    #[error("\\deprecated\n This error return is deprecated as of CUDA 5.0. It is no longer an error\n to call cuProfilerStart() when profiling is already enabled.")]
+    CudaErrorProfilerAlreadyStarted,
+    #[error("\\deprecated\n This error return is deprecated as of CUDA 5.0. It is no longer an error\n to call cuProfilerStop() when profiling is already disabled.")]
+    CudaErrorProfilerAlreadyStopped,
+
+    #[error("This indicates that the CUDA driver that the application has loaded is a\n stub library. Applications that run with the stub rather than a real\n driver loaded will result in CUDA API returning this error.")]
+    CudaErrorStubLibrary,
+    #[error("This indicates that requested CUDA device is unavailable at the current\n time. Devices are often unavailable due to use of\n ::CU_COMPUTEMODE_EXCLUSIVE_PROCESS or ::CU_COMPUTEMODE_PROHIBITED.")]
+    CudaErrorDeviceUnavailable,
+    #[error(
+        "This indicates that no CUDA-capable devices were detected by the installed\n CUDA driver."
+    )]
+    CudaErrorNoDevice,
+    #[error("This indicates that the device ordinal supplied by the user does not\n correspond to a valid CUDA device or that the action requested is\n invalid for the specified device.")]
+    CudaErrorInvalidDevice,
+    #[error("This error indicates that the Grid license is not applied.")]
+    CudaErrorDeviceNotLicensed,
+    #[error("This indicates that the device kernel image is invalid. This can also\n indicate an invalid CUDA module.")]
+    CudaErrorInvalidImage,
+    #[error("This most frequently indicates that there is no context bound to the\n current thread. This can also be returned if the context passed to an\n API call is not a valid handle (such as a context that has had\n ::cuCtxDestroy() invoked on it). This can also be returned if a user\n mixes different API versions (i.e. 3010 context with 3020 API calls).\n See ::cuCtxGetApiVersion() for more details.")]
+    CudaErrorInvalidContext,
+    #[error("This indicated that the context being supplied as a parameter to the\n API call was already the active context.\n \\deprecated\n This error return is deprecated as of CUDA 3.2. It is no longer an\n error to attempt to push the active context via ::cuCtxPushCurrent().")]
+    CudaErrorContextAlreadyCurrent,
+    #[error("This indicates that a map or register operation has failed.")]
+    CudaErrorMapFailed,
+    #[error("This indicates that an unmap or unregister operation has failed.")]
+    CudaErrorUnmapFailed,
+    #[error("This indicates that the specified array is currently mapped and thus\n cannot be destroyed.")]
+    CudaErrorArrayIsMapped,
+    #[error("This indicates that the resource is already mapped.")]
+    CudaErrorAlreadyMapped,
+    #[error("This indicates that there is no kernel image available that is suitable\n for the device. This can occur when a user specifies code generation\n options for a particular CUDA source file that do not include the\n corresponding device configuration.")]
+    CudaErrorNoBinaryForGPU,
+    #[error("This indicates that a resource has already been acquired.")]
+    CudaErrorAlreadyAcquired,
+    #[error("This indicates that a resource is not mapped.")]
+    CudaErrorNotMapped,
+    #[error("This indicates that a mapped resource is not available for access as an\n array.")]
+    CudaErrorNotMappedAsArray,
+    #[error("This indicates that a mapped resource is not available for access as a\n pointer.")]
+    CudaErrorNotMappedAsPointer,
+    #[error("This indicates that an uncorrectable ECC error was detected during\n execution.")]
+    CudaErrorEccUncorrectable,
+    #[error("This indicates that the ::CUlimit passed to the API call is not\n supported by the active device.")]
+    CudaErrorUnsupportedLimit,
+    #[error("This indicates that the ::CUcontext passed to the API call can\n only be bound to a single CPU thread at a time but is already\n bound to a CPU thread.")]
+    CudaErrorContextAlreadyInUse,
+    #[error("This indicates that peer access is not supported across the given\n devices.")]
+    CudaErrorPeerAccessUnsupported,
+    #[error("This indicates that a PTX JIT compilation failed.")]
+    CudaErrorInvalidPTX,
+    #[error("This indicates an error with OpenGL or DirectX context.")]
+    CudaErrorInvalidGraphicsContext,
+    #[error(
+        " This indicates that an uncorrectable NVLink error was detected during the\n execution."
+    )]
+    CudaErrorNvlinkUncorrectable,
+    #[error("This indicates that the PTX JIT compiler library was not found.")]
+    CudaErrorJitCompilerNotFound,
+    #[error("This indicates that the provided PTX was compiled with an unsupported toolchain.")]
+    CudaErrorUnsupportedPTXVersion,
+    #[error("This indicates that the PTX JIT compilation was disabled.")]
+    CudaErrorJitCompilationDisabled,
+    #[error("This indicates that the ::CUexecAffinityType passed to the API call is not\n supported by the active device.")]
+    CudaErrorUnsupportedExecAffinity,
+    #[error("This indicates that the device kernel source is invalid. This includes\n compilation/linker errors encountered in device code or user error.")]
+    CudaErrorInvalidSource,
+    #[error("This indicates that the file specified was not found.")]
+    CudaErrorFileNotFound,
+    #[error("This indicates that a link to a shared object failed to resolve.")]
+    CudaErrorSharedObjectSymbolNotFound,
+    #[error("This indicates that initialization of a shared object failed.")]
+    CudaErrorSharedObjectInitFailed,
+    #[error("This indicates that an OS call failed.")]
+    CudaErrorOperatingSystem,
+    #[error("This indicates that a resource handle passed to the API call was not\n valid. Resource handles are opaque types like ::CUstream and ::CUevent.")]
+    CudaErrorInvalidHandle,
+    #[error("This indicates that a resource required by the API call is not in a\n valid state to perform the requested operation.")]
+    CudaErrorIllegalState,
+    #[error("This indicates that a named symbol was not found. Examples of symbols\n are global/constant variable names, driver function names, texture names,\n and surface names.")]
+    CudaErrorNotFound,
+    #[error("This indicates that asynchronous operations issued previously have not\n completed yet. This result is not actually an error, but must be indicated\n differently than ::CUDA_SUCCESS (which indicates completion). Calls that\n may return this value include ::cuEventQuery() and ::cuStreamQuery().")]
+    CudaErrorNotReady,
+    #[error("While executing a kernel, the device encountered a\n load or store instruction on an invalid memory address.\n This leaves the process in an inconsistent state and any further CUDA work\n will return the same error. To continue using CUDA, the process must be terminated\n and relaunched.")]
+    CudaErrorIllegalAddress,
+    #[error("This indicates that a launch did not occur because it did not have\n appropriate resources. This error usually indicates that the user has\n attempted to pass too many arguments to the device kernel, or the\n kernel launch specifies too many threads for the kernel's register\n count. Passing arguments of the wrong size (i.e. a 64-bit pointer\n when a 32-bit int is expected) is equivalent to passing too many\n arguments and can also result in this error.")]
+    CudaErrorLaunchOutOfResources,
+    #[error("This indicates that the device kernel took too long to execute. This can\n only occur if timeouts are enabled - see the device attribute\n ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information.\n This leaves the process in an inconsistent state and any further CUDA work\n will return the same error. To continue using CUDA, the process must be terminated\n and relaunched.")]
+    CudaErrorLaunchTimeout,
+    #[error("This error indicates a kernel launch that uses an incompatible texturing\n mode.")]
+    CudaErrorLaunchIncompatibleTexturing,
+    #[error("This error indicates that a call to ::cuCtxEnablePeerAccess() is\n trying to re-enable peer access to a context which has already\n had peer access to it enabled.")]
+    CudaErrorPeerAccessAlreadyEnabled,
+    #[error("This error indicates that ::cuCtxDisablePeerAccess() is\n trying to disable peer access which has not been enabled yet\n via ::cuCtxEnablePeerAccess().")]
+    CudaErrorPeerAccessNotEnabled,
+    #[error("This error indicates that the primary context for the specified device\n has already been initialized.")]
+    CudaErrorPrimaryContextActive,
+    #[error("This error indicates that the context current to the calling thread\n has been destroyed using ::cuCtxDestroy, or is a primary context which\n has not yet been initialized.")]
+    CudaErrorContextIsDestroyed,
+    #[error("A device-side assert triggered during kernel execution. The context\n cannot be used anymore, and must be destroyed. All existing device\n memory allocations from this context are invalid and must be\n reconstructed if the program is to continue using CUDA.")]
+    CudaErrorAssert,
+    #[error("This error indicates that the hardware resources required to enable\n peer access have been exhausted for one or more of the devices\n passed to ::cuCtxEnablePeerAccess().")]
+    CudaErrorTooManyPeers,
+    #[error("This error indicates that the memory range passed to ::cuMemHostRegister()\n has already been registered.")]
+    CudaErrorHostMemoryAlreadyRegistered,
+    #[error("This error indicates that the pointer passed to ::cuMemHostUnregister()\n does not correspond to any currently registered memory region.")]
+    CudaErrorHostMemoryNotRegistered,
+    #[error("While executing a kernel, the device encountered a stack error.\n This can be due to stack corruption or exceeding the stack size limit.\n This leaves the process in an inconsistent state and any further CUDA work\n will return the same error. To continue using CUDA, the process must be terminated\n and relaunched.")]
+    CudaErrorHardwareStackError,
+    #[error("While executing a kernel, the device encountered an illegal instruction.\n This leaves the process in an inconsistent state and any further CUDA work\n will return the same error. To continue using CUDA, the process must be terminated\n and relaunched.")]
+    CudaErrorIllegalInstruction,
+    #[error("While executing a kernel, the device encountered a load or store instruction\n on a memory address which is not aligned.\n This leaves the process in an inconsistent state and any further CUDA work\n will return the same error. To continue using CUDA, the process must be terminated\n and relaunched.")]
+    CudaErrorMisalignedAddress,
+    #[error("While executing a kernel, the device encountered an instruction\n which can only operate on memory locations in certain address spaces\n (global, shared, or local), but was supplied a memory address not\n belonging to an allowed address space.\n This leaves the process in an inconsistent state and any further CUDA work\n will return the same error. To continue using CUDA, the process must be terminated\n and relaunched.")]
+    CudaErrorInvalidAddressSpace,
+    #[error("While executing a kernel, the device program counter wrapped its address space.\n This leaves the process in an inconsistent state and any further CUDA work\n will return the same error. To continue using CUDA, the process must be terminated\n and relaunched.")]
+    CudaErrorInvalidPC,
+    #[error("An exception occurred on the device while executing a kernel. Common\n causes include dereferencing an invalid device pointer and accessing\n out of bounds shared memory. Less common cases can be system specific - more\n information about these cases can be found in the system specific user guide.\n This leaves the process in an inconsistent state and any further CUDA work\n will return the same error. To continue using CUDA, the process must be terminated\n and relaunched.")]
+    CudaErrorLaunchFailed,
+    #[error("This error indicates that the number of blocks launched per grid for a kernel that was\n launched via either ::cuLaunchCooperativeKernel or ::cuLaunchCooperativeKernelMultiDevice\n exceeds the maximum number of blocks as allowed by ::cuOccupancyMaxActiveBlocksPerMultiprocessor\n or ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags times the number of multiprocessors\n as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT.")]
+    CudaErrorCooperativeLaunchTooLarge,
+    #[error("This error indicates that the attempted operation is not permitted.")]
+    CudaErrorNotPermitted,
+    #[error("This error indicates that the attempted operation is not supported\n on the current system or device.")]
+    CudaErrorNotSupported,
+    #[error("This error indicates that the system is not yet ready to start any CUDA\n work.  To continue using CUDA, verify the system configuration is in a\n valid state and all required driver daemons are actively running.\n More information about this error can be found in the system specific\n user guide.")]
+    CudaErrorSystemNotReady,
+    #[error("This error indicates that there is a mismatch between the versions of\n the display driver and the CUDA driver. Refer to the compatibility documentation\n for supported versions.")]
+    CudaErrorSystemDriverMismatch,
+    #[error("This error indicates that the system was upgraded to run with forward compatibility\n but the visible hardware detected by CUDA does not support this configuration.\n Refer to the compatibility documentation for the supported hardware matrix or ensure\n that only supported hardware is visible during initialization via the CUDA_VISIBLE_DEVICES\n environment variable.")]
+    CudaErrorCompatNotSupportedOnDevice,
+    #[error("This error indicates that the MPS client failed to connect to the MPS control daemon or the MPS server.")]
+    CudaErrorMpsConnectionFailed,
+    #[error("This error indicates that the remote procedural call between the MPS server and the MPS client failed.")]
+    CudaErrorMpsRpcFailure,
+    #[error("This error indicates that the MPS server is not ready to accept new MPS client requests.\n This error can be returned when the MPS server is in the process of recovering from a fatal failure.")]
+    CudaErrorMpsServerNotReady,
+    #[error("This error indicates that the hardware resources required to create MPS client have been exhausted.")]
+    CudaErrorMpsMaxClientsReached,
+    #[error("This error indicates the the hardware resources required to support device connections have been exhausted.")]
+    CudaErrorMpsMaxConnectionsReached,
+    #[error("This error indicates that the MPS client has been terminated by the server. To continue using CUDA, the process must be terminated and relaunched.")]
+    CudaErrorMpsClientTerminated,
+    #[error(
+        "This error indicates that the operation is not permitted when\n the stream is capturing."
+    )]
+    CudaErrorStreamCaptureUnsupported,
+    #[error("This error indicates that the current capture sequence on the stream\n has been invalidated due to a previous error.")]
+    CudaErrorStreamCaptureInvalidated,
+    #[error("This error indicates that the operation would have resulted in a merge\n of two independent capture sequences.")]
+    CudaErrorStreamCaptureMerge,
+    #[error("This error indicates that the capture was not initiated in this stream.")]
+    CudaErrorStreamCaptureUnmatched,
+    #[error("This error indicates that the capture sequence contains a fork that was\n not joined to the primary stream.")]
+    CudaErrorStreamCaptureUnjoined,
+    #[error("This error indicates that a dependency would have been created which\n crosses the capture sequence boundary. Only implicit in-stream ordering\n dependencies are allowed to cross the boundary.")]
+    CudaErrorStreamCaptureIsolation,
+    #[error("This error indicates a disallowed implicit dependency on a current capture\n sequence from cudaStreamLegacy.")]
+    CudaErrorStreamCaptureImplicit,
+    #[error("This error indicates that the operation is not permitted on an event which\n was last recorded in a capturing stream.")]
+    CudaErrorCapturedEvent,
+    #[error("A stream capture sequence not initiated with the ::CU_STREAM_CAPTURE_MODE_RELAXED\n argument to ::cuStreamBeginCapture was passed to ::cuStreamEndCapture in a\n different thread.")]
+    CudaErrorStreamCaptureWrongThread,
+    #[error("This error indicates that the timeout specified for the wait operation has lapsed.")]
+    CudaErrorTimeout,
+    #[error("This error indicates that the graph update was not performed because it included\n changes which violated constraints specific to instantiated graph update.")]
+    CudaErrorGraphExecUpdateFailure,
+    #[error("This indicates that an async error has occurred in a device outside of CUDA.\n If CUDA was waiting for an external device's signal before consuming shared data,\n the external device signaled an error indicating that the data is not valid for\n consumption. This leaves the process in an inconsistent state and any further CUDA\n work will return the same error. To continue using CUDA, the process must be\n terminated and relaunched.")]
+    CudaErrorExternalDevice,
+    #[error("Indicates a kernel launch error due to cluster misconfiguration.")]
+    CudaErrorInvalidClusterSize,
+    /// Any result code without a dedicated variant; the raw code is kept.
+    #[error("This indicates that an unknown internal error has occurred. (code {0})")]
+    CudaErrorUnknown(u32),
+
+    /// The decoder reported `cuvidDecodeStatus_Error` for the picture.
+    #[error("Frame decode error")]
+    CuvidFrameDecodeError,
+
+    /// The decoder reported `cuvidDecodeStatus_Error_Concealed` for the picture.
+    ///
+    /// The variant name says "Canceled" for historical reasons and is kept for
+    /// compatibility; the status it represents is a *concealed* decode error,
+    /// which is what the message states.
+    #[error("Frame decoded with errors that were concealed")]
+    CuvidFrameDecodeCanceledError,
+
+    /// The decoder reported `cuvidDecodeStatus_InProgress` for the picture.
+    #[error("Frame in queue to decode, try again.")]
+    CuvidFrameDecodeInProgress,
+
+    /// The decoder reported `cuvidDecodeStatus_Invalid` for the picture.
+    #[error("Frame decode invalid state")]
+    CuvidFrameDecodeInvalid,
+
+    /// A call into NPP returned a non-zero status.
+    #[error("NppError: {0}")]
+    NppError(NppError),
+
+    #[error("Cuvid Bit Format Error")]
+    CuvidBitFormatError,
+    #[error("Cuvid Chroma Format Error")]
+    CuvidChromaFormatError,
+}
+
+/// Translates a raw CUDA driver result code into the matching [`ErrorKind`].
+///
+/// Codes without an explicit arm are preserved in
+/// `ErrorKind::CudaErrorUnknown(code)`. There is no arm for `0`, so a success
+/// code passed here would also end up as `CudaErrorUnknown(0)`; callers are
+/// expected to convert only after detecting a failure.
+impl From<CUresult> for ErrorKind {
+    fn from(code: CUresult) -> ErrorKind {
+        match code {
+            1 => ErrorKind::CudaErrorInvalidValue,
+            2 => ErrorKind::CudaErrorOutOfMemory,
+            3 => ErrorKind::CudaErrorNotInitialized,
+            4 => ErrorKind::CudaErrorDeinitialized,
+            5 => ErrorKind::CudaErrorProfilerDisabled,
+            6 => ErrorKind::CudaErrorProfilerNotInitialized,
+            7 => ErrorKind::CudaErrorProfilerAlreadyStarted,
+            8 => ErrorKind::CudaErrorProfilerAlreadyStopped,
+            34 => ErrorKind::CudaErrorStubLibrary,
+            46 => ErrorKind::CudaErrorDeviceUnavailable,
+            100 => ErrorKind::CudaErrorNoDevice,
+            101 => ErrorKind::CudaErrorInvalidDevice,
+            102 => ErrorKind::CudaErrorDeviceNotLicensed,
+            200 => ErrorKind::CudaErrorInvalidImage,
+            201 => ErrorKind::CudaErrorInvalidContext,
+            202 => ErrorKind::CudaErrorContextAlreadyCurrent,
+            205 => ErrorKind::CudaErrorMapFailed,
+            206 => ErrorKind::CudaErrorUnmapFailed,
+            207 => ErrorKind::CudaErrorArrayIsMapped,
+            208 => ErrorKind::CudaErrorAlreadyMapped,
+            209 => ErrorKind::CudaErrorNoBinaryForGPU,
+            210 => ErrorKind::CudaErrorAlreadyAcquired,
+            211 => ErrorKind::CudaErrorNotMapped,
+            212 => ErrorKind::CudaErrorNotMappedAsArray,
+            213 => ErrorKind::CudaErrorNotMappedAsPointer,
+            214 => ErrorKind::CudaErrorEccUncorrectable,
+            215 => ErrorKind::CudaErrorUnsupportedLimit,
+            216 => ErrorKind::CudaErrorContextAlreadyInUse,
+            217 => ErrorKind::CudaErrorPeerAccessUnsupported,
+            218 => ErrorKind::CudaErrorInvalidPTX,
+            219 => ErrorKind::CudaErrorInvalidGraphicsContext,
+            220 => ErrorKind::CudaErrorNvlinkUncorrectable,
+            221 => ErrorKind::CudaErrorJitCompilerNotFound,
+            222 => ErrorKind::CudaErrorUnsupportedPTXVersion,
+            223 => ErrorKind::CudaErrorJitCompilationDisabled,
+            224 => ErrorKind::CudaErrorUnsupportedExecAffinity,
+            300 => ErrorKind::CudaErrorInvalidSource,
+            301 => ErrorKind::CudaErrorFileNotFound,
+            302 => ErrorKind::CudaErrorSharedObjectSymbolNotFound,
+            303 => ErrorKind::CudaErrorSharedObjectInitFailed,
+            304 => ErrorKind::CudaErrorOperatingSystem,
+            400 => ErrorKind::CudaErrorInvalidHandle,
+            401 => ErrorKind::CudaErrorIllegalState,
+            500 => ErrorKind::CudaErrorNotFound,
+            600 => ErrorKind::CudaErrorNotReady,
+            700 => ErrorKind::CudaErrorIllegalAddress,
+            701 => ErrorKind::CudaErrorLaunchOutOfResources,
+            702 => ErrorKind::CudaErrorLaunchTimeout,
+            703 => ErrorKind::CudaErrorLaunchIncompatibleTexturing,
+            704 => ErrorKind::CudaErrorPeerAccessAlreadyEnabled,
+            705 => ErrorKind::CudaErrorPeerAccessNotEnabled,
+            708 => ErrorKind::CudaErrorPrimaryContextActive,
+            709 => ErrorKind::CudaErrorContextIsDestroyed,
+            710 => ErrorKind::CudaErrorAssert,
+            711 => ErrorKind::CudaErrorTooManyPeers,
+            712 => ErrorKind::CudaErrorHostMemoryAlreadyRegistered,
+            713 => ErrorKind::CudaErrorHostMemoryNotRegistered,
+            714 => ErrorKind::CudaErrorHardwareStackError,
+            715 => ErrorKind::CudaErrorIllegalInstruction,
+            716 => ErrorKind::CudaErrorMisalignedAddress,
+            717 => ErrorKind::CudaErrorInvalidAddressSpace,
+            718 => ErrorKind::CudaErrorInvalidPC,
+            719 => ErrorKind::CudaErrorLaunchFailed,
+            720 => ErrorKind::CudaErrorCooperativeLaunchTooLarge,
+            800 => ErrorKind::CudaErrorNotPermitted,
+            801 => ErrorKind::CudaErrorNotSupported,
+            802 => ErrorKind::CudaErrorSystemNotReady,
+            803 => ErrorKind::CudaErrorSystemDriverMismatch,
+            804 => ErrorKind::CudaErrorCompatNotSupportedOnDevice,
+            805 => ErrorKind::CudaErrorMpsConnectionFailed,
+            806 => ErrorKind::CudaErrorMpsRpcFailure,
+            807 => ErrorKind::CudaErrorMpsServerNotReady,
+            808 => ErrorKind::CudaErrorMpsMaxClientsReached,
+            809 => ErrorKind::CudaErrorMpsMaxConnectionsReached,
+            810 => ErrorKind::CudaErrorMpsClientTerminated,
+            900 => ErrorKind::CudaErrorStreamCaptureUnsupported,
+            901 => ErrorKind::CudaErrorStreamCaptureInvalidated,
+            902 => ErrorKind::CudaErrorStreamCaptureMerge,
+            903 => ErrorKind::CudaErrorStreamCaptureUnmatched,
+            904 => ErrorKind::CudaErrorStreamCaptureUnjoined,
+            905 => ErrorKind::CudaErrorStreamCaptureIsolation,
+            906 => ErrorKind::CudaErrorStreamCaptureImplicit,
+            907 => ErrorKind::CudaErrorCapturedEvent,
+            908 => ErrorKind::CudaErrorStreamCaptureWrongThread,
+            909 => ErrorKind::CudaErrorTimeout,
+            910 => ErrorKind::CudaErrorGraphExecUpdateFailure,
+            911 => ErrorKind::CudaErrorExternalDevice,
+            912 => ErrorKind::CudaErrorInvalidClusterSize,
+            // Everything else keeps its numeric value for diagnostics.
+            code => ErrorKind::CudaErrorUnknown(code),
+        }
+    }
+}
+
+/// Failure reported by an NPP call.
+///
+/// Only the raw integer status is kept; no per-code classification is done.
+#[derive(Debug, thiserror::Error)]
+pub enum NppError {
+    /// Raw status code, rendered as `ErrorCode(<code>)`.
+    #[error("ErrorCode({0})")]
+    Default(i32),
+}
+
+/// Wraps a raw NPP status code without interpreting it.
+impl From<i32> for NppError {
+    fn from(code: i32) -> Self {
+        NppError::Default(code)
+    }
+}
+
+/// Crate-wide error type: an [`ErrorKind`] paired with the backtrace captured
+/// when the error was constructed.
+pub struct Error {
+    // What went wrong.
+    kind: ErrorKind,
+    // Captured in `Error::new`; printed by the `Debug` and `Display` impls.
+    backtrace: std::backtrace::Backtrace,
+}
+
+impl Error {
+    /// Wraps `kind` and captures a backtrace at the construction site.
+    ///
+    /// `Backtrace::capture` only records frames when backtraces are enabled
+    /// through the `RUST_BACKTRACE` / `RUST_LIB_BACKTRACE` environment
+    /// variables; otherwise the stored backtrace is a cheap disabled one.
+    pub fn new(kind: ErrorKind) -> Self {
+        Self {
+            kind,
+            backtrace: Backtrace::capture(),
+        }
+    }
+
+    /// Returns the classified failure reason, so callers can branch on it
+    /// (for example to retry on `ErrorKind::CuvidFrameDecodeInProgress`).
+    pub fn kind(&self) -> &ErrorKind {
+        &self.kind
+    }
+
+    /// Returns the backtrace captured when this error was created.
+    pub fn backtrace(&self) -> &Backtrace {
+        &self.backtrace
+    }
+}
+
+/// Lets `?` and `.into()` promote a bare [`ErrorKind`] to an [`Error`]; the
+/// backtrace is captured at the point of conversion.
+impl From<ErrorKind> for Error {
+    fn from(value: ErrorKind) -> Self {
+        Self::new(value)
+    }
+}
+
+/// Wraps an [`NppError`] in `ErrorKind::NppError` and promotes it to an
+/// [`Error`].
+impl From<NppError> for Error {
+    fn from(value: NppError) -> Self {
+        Self::new(ErrorKind::NppError(value))
+    }
+}
+
+/// Converts a raw CUDA result code into an [`Error`] by first classifying it
+/// with `ErrorKind::from`.
+impl From<CUresult> for Error {
+    fn from(value: CUresult) -> Self {
+        Self::new(ErrorKind::from(value))
+    }
+}
+
+/// Debug output: `<kind variant>: <kind message>` on the first line, the
+/// captured backtrace after it, each terminated by a newline.
+impl fmt::Debug for Error {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let Self { kind, backtrace } = self;
+        write!(f, "{kind:?}: {kind}\n{backtrace}\n")
+    }
+}
+
+/// Display output is the same text the `Debug` impl produces: the kind's
+/// variant name and message, followed by the captured backtrace.
+impl fmt::Display for Error {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        // Both impls emit identical text, so reuse the Debug formatter.
+        fmt::Debug::fmt(self, f)
+    }
+}
+// Marker impl so `Error` works with `?` into `Box<dyn std::error::Error>` and
+// similar consumers; no `source()` is exposed.
+impl std::error::Error for Error {}
--- a/src/frame.rs
+++ b/src/frame.rs
@ -0,0 +1,7 @@
+/// A decoded picture whose pixel data has been copied into host memory.
+#[derive(Debug, Clone)]
+pub struct DecodedFrame {
+    /// Presentation timestamp as delivered with the frame.
+    // NOTE(review): the unit/time base is not defined here — confirm with the producer.
+    pub timestamp: i64,
+    /// Raw pixel bytes.
+    // NOTE(review): presumably tightly packed 8-bit RGB (3 bytes per pixel) — confirm.
+    pub data: Vec<u8>,
+    /// Frame width in pixels.
+    pub width: u32,
+    /// Frame height in pixels.
+    pub height: u32,
+}
--- a/src/lib.rs
+++ b/src/lib.rs
@ -1,6 +1,16 @@
+use ffi::cuvid::cuInit;
+
 extern crate nvidia_video_codec_sys as ffi;

 #[macro_use]
-mod macros;
-
+pub mod macros;
 pub mod cuda;
+pub mod decoder;
+pub mod error;
+pub mod frame;
+pub mod mapping;
+pub mod parser;
+
+/// Initialises the CUDA driver API by calling `cuInit(0)`.
+///
+/// # Errors
+/// Any status other than `CUDA_SUCCESS` is converted into [`error::Error`].
+pub fn init_cuda() -> Result<(), error::Error> {
+    call!(cuInit(0))
+}
--- a/src/macros.rs
+++ b/src/macros.rs
@ -1,11 +1,38 @@
-macro_rules! wrap {
-    ($val:ident, $res:ident) => (
-        if $res == CUDA_SUCCESS {
-            Ok($val)
+// Runs a CUDA/CUVID FFI expression and turns its status into a `Result`.
+//
+// * `call!(expr)`       -> `Ok(())` when `expr` yields `CUDA_SUCCESS`.
+// * `call!(expr, val)`  -> `Ok(val)` when `expr` yields `CUDA_SUCCESS`;
+//                          `val` is only evaluated on success.
+//
+// Any other status is converted with `.into()` into `crate::error::Error`.
+// NOTE: `expr` is evaluated inside an `unsafe` block, so everything written
+// in the argument position is treated as unsafe code.
+macro_rules! call {
+    ($expr: expr) => {{
+        let res = unsafe { $expr };
+        if res == nvidia_video_codec_sys::cuvid::cudaError_enum_CUDA_SUCCESS {
+            std::result::Result::<_, $crate::error::Error>::Ok(())
         } else {
-            Err($res)
+            Err(res.into())
         }
-    )
+    }};
+
+    ($expr: expr, $val: expr) => {{
+        let res = unsafe { $expr };
+        if res == nvidia_video_codec_sys::cuvid::cudaError_enum_CUDA_SUCCESS {
+            std::result::Result::<_, $crate::error::Error>::Ok($val)
+        } else {
+            Err(res.into())
+        }
+    }};
 }
+// NPP counterpart of `call!`: runs an NPP FFI expression inside `unsafe` and
+// treats a status of `0` as success.
+//
+// * `npp_call!(expr)`      -> `Ok(())` on status `0`.
+// * `npp_call!(expr, val)` -> `Ok(val)` on status `0`; `val` is only
+//                             evaluated on success.
+//
+// Any non-zero status is wrapped in `crate::error::NppError` and converted
+// into `crate::error::Error`.
+macro_rules! npp_call {
+    ($expr: expr) => {{
+        let res = unsafe { $expr };
+        if res == 0 {
+            std::result::Result::<_, $crate::error::Error>::Ok(())
+        } else {
+            Err($crate::error::NppError::from(res).into())
+        }
+    }};

-
+    ($expr: expr, $val: expr) => {{
+        let res = unsafe { $expr };
+        if res == 0 {
+            std::result::Result::<_, $crate::error::Error>::Ok($val)
+        } else {
+            Err($crate::error::NppError::from(res).into())
+        }
+    }};
+}
--- a/src/mapping.rs
+++ b/src/mapping.rs
@ -0,0 +1,165 @@
+use nvidia_video_codec_sys::cuvid::{
+    cuvidDecodeStatus, cuvidGetDecodeStatus, cuvidMapVideoFrame64, cuvidUnmapVideoFrame64,
+    CUdeviceptr, CUVIDGETDECODESTATUS, CUVIDPARSERDISPINFO, CUVIDPROCPARAMS,
+};
+
+use crate::{decoder::CuvidDecoder, error::Error};
+
+/// A decoded picture mapped through `cuvidMapVideoFrame64`.
+///
+/// The mapping mutably borrows its decoder for its whole lifetime and calls
+/// `cuvidUnmapVideoFrame64` when dropped.
+pub struct CuvidMapping<'a> {
+    /// Decoder the picture was mapped from; also provides the stream and frame layout.
+    decoder: &'a mut CuvidDecoder,
+    /// Device address written by `cuvidMapVideoFrame64`.
+    device_pointer: CUdeviceptr,
+    /// `picture_index` of the display info this mapping was created from.
+    picture_index: i32,
+    /// Pitch written by `cuvidMapVideoFrame64`. NOTE(review): presumably in bytes — confirm.
+    pitch: u32,
+}
+
+/// Owner of an image buffer obtained from the NPP allocator; the buffer is
+/// released with `nppiFree` when this value is dropped.
+struct NppImage {
+    /// Pointer returned by the NPP allocation call.
+    dptr: *mut u8,
+    /// Line step reported by the allocator for `dptr`.
+    dpitch: i32,
+}
+
+impl Drop for NppImage {
+    /// Hands the buffer back to NPP via `nppiFree`.
+    fn drop(&mut self) {
+        // NOTE(review): `dptr` is passed through unchecked, so this relies on
+        // `nppiFree` tolerating a null pointer if the allocation failed — confirm.
+        unsafe { npp_sys::nppiFree(self.dptr as _) }
+    }
+}
+
+impl<'a> CuvidMapping<'a> {
+    /// Queries the decoder for the decode status of the mapped picture.
+    ///
+    /// # Errors
+    ///
+    /// Returns the converted status code when `cuvidGetDecodeStatus` fails.
+    pub fn decode_status(&mut self) -> Result<cuvidDecodeStatus, Error> {
+        // NOTE(review): assumes the all-zero bit pattern is a valid value for this
+        // FFI struct; it is only used as an out-parameter.
+        let mut status: CUVIDGETDECODESTATUS = unsafe { std::mem::zeroed() };
+
+        call!(
+            cuvidGetDecodeStatus(self.decoder.inner, self.picture_index, &mut status),
+            status.decodeStatus
+        )
+    }
+
+    /// Converts the mapped NV12 surface to packed 8-bit RGB on the device and
+    /// copies the result into a host buffer of `width * height * 3` bytes.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error when the NPP conversion, the device-to-host copy or the
+    /// stream synchronization fails.
+    pub fn get_rgb_data_host(&mut self) -> Result<Vec<u8>, Error> {
+        let size = self.decoder.frame_layout.width as usize
+            * self.decoder.frame_layout.height as usize
+            * 3;
+        // The chroma plane starts after the luma rows, with the row count rounded up to even.
+        let luma_height_offset = (self.decoder.frame_layout.height as u64 + 1) & !1u64;
+
+        let mut dpitch = 0i32;
+        let dptr = unsafe {
+            npp_sys::nppiMalloc_8u_C3(
+                self.decoder.frame_layout.width as _,
+                self.decoder.frame_layout.height as _,
+                &mut dpitch,
+            )
+        };
+
+        // Wrap the allocation immediately so every exit path below frees it.
+        // NOTE(review): a failed allocation is not checked here; presumably the
+        // conversion call below rejects a null destination — confirm.
+        let rgb_image = NppImage { dptr, dpitch };
+
+        // NOTE(review): the conversion is not explicitly tied to `decoder.stream`;
+        // confirm the copy below is ordered after it.
+        npp_call!(npp_sys::nppiNV12ToRGB_8u_P2C3R(
+            &[
+                self.device_pointer as *const u8,
+                (self.device_pointer + self.pitch as u64 * luma_height_offset) as *const u8,
+            ] as *const *const u8,
+            self.pitch as _,
+            rgb_image.dptr,
+            rgb_image.dpitch,
+            npp_sys::NppiSize {
+                width: self.decoder.frame_layout.width as _,
+                height: self.decoder.frame_layout.height as _,
+            },
+        ))?;
+
+        // Zero-initialize the host buffer: `Vec::set_len` over uninitialized
+        // capacity violates its safety contract.
+        let mut data = vec![0u8; size];
+
+        self.decoder.stream.copy2d_dev_to_host(
+            data.as_mut_ptr() as _,
+            rgb_image.dptr as _,
+            rgb_image.dpitch as _,
+            self.decoder.frame_layout.width as usize * 3,
+            (
+                self.decoder.frame_layout.width as usize * 3,
+                self.decoder.frame_layout.height as usize,
+            ),
+        )?;
+
+        // Wait for the copy before handing the buffer out (and before the
+        // device image is freed at the end of this scope).
+        self.decoder.stream.synchronize()?;
+        Ok(data)
+    }
+
+    /// Copies every plane of the mapped surface into one contiguous host buffer,
+    /// in the order of `frame_layout.planes`.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error when a device-to-host copy or the stream synchronization fails.
+    pub fn get_yuv_data_host(&mut self) -> Result<Vec<u8>, Error> {
+        let mut device_height_offset = 0;
+        let mut luma_height_offset = 0;
+        let mut offset = 0;
+        let total_size: usize = self
+            .decoder
+            .frame_layout
+            .planes
+            .iter()
+            .map(|x| x.get_size())
+            .sum();
+
+        // Zero-initialize the host buffer: slicing a `Vec` whose length was set
+        // over uninitialized capacity is undefined behaviour.
+        let mut blob = vec![0u8; total_size];
+
+        for plane in &self.decoder.frame_layout.planes {
+            if plane.is_luma() {
+                // Row stride between planes on the device: luma height rounded up to even.
+                luma_height_offset = (plane.surface_height() as u64 + 1) & !1u64;
+            }
+            let frame_size = plane.get_size();
+
+            self.decoder.stream.copy2d_dev_to_host(
+                blob[offset..offset + frame_size].as_mut_ptr() as _,
+                self.device_pointer + self.pitch as u64 * device_height_offset,
+                self.pitch as _,
+                plane.surface_width(),
+                (plane.surface_width(), plane.surface_height()),
+            )?;
+
+            offset += frame_size;
+            device_height_offset += luma_height_offset;
+        }
+
+        self.decoder.stream.synchronize()?;
+
+        Ok(blob)
+    }
+
+    /// Maps the picture described by `disp_info` on `decoder`'s stream.
+    ///
+    /// # Errors
+    ///
+    /// Returns the converted status code when `cuvidMapVideoFrame64` fails; no
+    /// mapping is constructed in that case, so nothing is unmapped.
+    pub(crate) fn new(
+        decoder: &'a mut CuvidDecoder,
+        disp_info: &CUVIDPARSERDISPINFO,
+    ) -> Result<Self, Error> {
+        // NOTE(review): assumes the all-zero bit pattern is a valid value for this
+        // FFI struct; the fields that matter are set below.
+        let mut vp_params: CUVIDPROCPARAMS = unsafe { std::mem::zeroed() };
+        vp_params.output_stream = decoder.stream.inner();
+        vp_params.progressive_frame = disp_info.progressive_frame;
+        vp_params.second_field = disp_info.repeat_first_field + 1;
+        vp_params.top_field_first = disp_info.top_field_first;
+        vp_params.unpaired_field = if disp_info.repeat_first_field < 0 {
+            1
+        } else {
+            0
+        };
+        let mut device_pointer: CUdeviceptr = 0;
+        let mut pitch = 0u32;
+
+        call!(
+            cuvidMapVideoFrame64(
+                decoder.inner,
+                disp_info.picture_index,
+                &mut device_pointer,
+                &mut pitch,
+                &mut vp_params,
+            ),
+            Self {
+                picture_index: disp_info.picture_index,
+                device_pointer,
+                pitch,
+                decoder,
+            }
+        )
+    }
+}
+
+impl<'a> Drop for CuvidMapping<'a> {
+    /// Releases the mapping with `cuvidUnmapVideoFrame64`.
+    fn drop(&mut self) {
+        // `drop` cannot report failure, so the result is explicitly discarded.
+        let _ = call!(cuvidUnmapVideoFrame64(
+            self.decoder.inner,
+            self.device_pointer
+        ));
+    }
+}
--- a/src/parser.rs
+++ b/src/parser.rs
@ -0,0 +1,147 @@
+use std::{ffi::c_void, sync::Arc};
+
+use nvidia_video_codec_sys::cuvid::{
+    cuvidCreateVideoParser, cuvidDestroyVideoParser, cuvidParseVideoData,
+    CUvideopacketflags_CUVID_PKT_ENDOFSTREAM, CUvideopacketflags_CUVID_PKT_TIMESTAMP,
+    CUvideoparser, CUVIDEOFORMAT, CUVIDPARSERDISPINFO, CUVIDPARSERPARAMS, CUVIDPICPARAMS,
+    CUVIDSOURCEDATAPACKET,
+};
+use parking_lot::Mutex;
+
+use crate::{
+    decoder::{CuvidCodec, CuvidDecoder, CuvidDecoderConfig, CuvidRect},
+    error::Error,
+    frame::DecodedFrame,
+};
+
+/// User-supplied adjustments applied to the decoder configuration that is
+/// derived from the stream format in the sequence callback.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
+pub struct Config {
+    /// Added to the number of decode surfaces derived from the stream format.
+    pub additional_decode_surfaces: u32,
+    /// Added to the number of output surfaces derived from the stream format.
+    pub additional_output_surfaces: u32,
+    /// Optional crop rectangle copied into the decoder configuration.
+    pub crop: Option<CuvidRect>,
+    /// Optional output size copied into the decoder configuration.
+    /// NOTE(review): presumably `(width, height)` — confirm against `CuvidDecoderConfig`.
+    pub resize: Option<(u16, u16)>,
+}
+
+/// State shared with the parser callbacks through `pUserData`.
+pub struct DecodingContext {
+    /// `None` until the sequence callback has created a decoder; replaced on
+    /// every later sequence callback.
+    decoder: Mutex<Option<CuvidDecoder>>,
+    /// Adjustments applied to each decoder configuration.
+    config: Config,
+    /// Called from the display callback with each frame (or the error from
+    /// fetching it); returning `false` makes that callback return `0`.
+    on_frame: Box<dyn Fn(Result<DecodedFrame, Error>) -> bool>,
+}
+
+/// Wrapper around a `CUvideoparser` handle that is destroyed on drop.
+pub struct CuvidVideoParser {
+    /// Handle created by `cuvidCreateVideoParser`.
+    inner: CUvideoparser,
+    /// Keeps the callback state alive; its address is registered as `pUserData`.
+    context: Arc<DecodingContext>,
+}
+
+// NOTE(review): this asserts `Send` for the whole parser, including the raw
+// `CUvideoparser` handle and `context.on_frame`, which is a `Box<dyn Fn(..)>`
+// without a `Send` bound. Confirm callers never pass a non-`Send` closure, or
+// tighten the bound on `CuvidVideoParser::new`.
+unsafe impl Send for CuvidVideoParser {}
+
+impl CuvidVideoParser {
+    /// Creates a parser for `codec` whose callbacks build a decoder on demand
+    /// and forward every displayed frame to `on_frame`.
+    ///
+    /// `on_frame` receives the frame or the error from fetching it; returning
+    /// `false` makes the display callback return `0` to the parser.
+    ///
+    /// # Errors
+    ///
+    /// Returns the converted status code when `cuvidCreateVideoParser` fails.
+    pub fn new<F: Fn(Result<DecodedFrame, Error>) -> bool + 'static>(
+        codec: CuvidCodec,
+        config: Config,
+        on_frame: F,
+    ) -> Result<Self, Error> {
+        let context = Arc::new(DecodingContext {
+            decoder: Mutex::new(None),
+            on_frame: Box::new(on_frame),
+            config,
+        });
+
+        // NOTE(review): assumes the all-zero bit pattern is a valid value for this
+        // FFI struct (null callbacks, zeroed counters).
+        let mut params: CUVIDPARSERPARAMS = unsafe { std::mem::zeroed() };
+        params.CodecType = codec as _;
+        params.ulMaxNumDecodeSurfaces = 1;
+        // The callbacks read the context through this pointer; the `Arc` stored
+        // in `Self` keeps it alive for as long as the parser exists.
+        params.pUserData = Arc::as_ptr(&context) as *mut c_void;
+        params.pfnSequenceCallback = Some(Self::seq_cb);
+        params.pfnDecodePicture = Some(Self::decode_cb);
+        params.pfnDisplayPicture = Some(Self::display_cb);
+
+        let mut inner = std::ptr::null_mut();
+        call!(
+            cuvidCreateVideoParser(&mut inner, &mut params),
+            Self { inner, context }
+        )
+    }
+
+    /// Feeds one packet of bitstream data, tagged with `timestamp`, to the parser.
+    ///
+    /// The end-of-stream flag is set when `last_one` is true or `data` is empty.
+    ///
+    /// # Errors
+    ///
+    /// Returns the converted status code when `cuvidParseVideoData` fails.
+    pub fn feed_data(&mut self, data: &[u8], timestamp: i64, last_one: bool) -> Result<(), Error> {
+        let mut flags = 0u32;
+        flags |= CUvideopacketflags_CUVID_PKT_TIMESTAMP;
+
+        if last_one || data.is_empty() {
+            flags |= CUvideopacketflags_CUVID_PKT_ENDOFSTREAM;
+        }
+
+        let mut packet = CUVIDSOURCEDATAPACKET {
+            flags: flags as _,
+            payload_size: data.len() as _,
+            payload: data.as_ptr(),
+            timestamp,
+        };
+
+        call!(cuvidParseVideoData(self.inner, &mut packet))
+    }
+
+    /// Sequence callback: (re)creates the decoder for the reported stream format.
+    ///
+    /// Returns `min_num_decode_surfaces` on success and `0` on any failure.
+    unsafe extern "C" fn seq_cb(this: *mut c_void, format: *mut CUVIDEOFORMAT) -> i32 {
+        let format = unsafe { &*format };
+        let this = unsafe { &*(this as *const DecodingContext) };
+        let mut lock = this.decoder.lock();
+        // Report an unsupported format as a failure instead of panicking:
+        // a panic must not escape an `extern "C"` callback.
+        let mut config = match CuvidDecoderConfig::try_from(format) {
+            Ok(config) => config,
+            Err(err) => {
+                println!("error: {:?}", err);
+                return 0;
+            }
+        };
+        config.crop = this.config.crop;
+        config.resize = this.config.resize;
+        config.num_decode_surfaces += this.config.additional_decode_surfaces;
+        config.num_output_surfaces += this.config.additional_output_surfaces;
+
+        match CuvidDecoder::new(config) {
+            Ok(decoder) => {
+                println!("decoder created with format: \n{:#?}", format);
+                lock.replace(decoder);
+                format.min_num_decode_surfaces.into()
+            }
+            Err(err) => {
+                println!("error: {}", err);
+                0
+            }
+        }
+    }
+
+    /// Decode callback: forwards the picture parameters to the decoder.
+    ///
+    /// Returns `1` on success and `0` when decoding fails or no decoder exists yet.
+    unsafe extern "C" fn decode_cb(this: *mut c_void, params: *mut CUVIDPICPARAMS) -> i32 {
+        let params = unsafe { &mut *params };
+        let this = unsafe { &*(this as *const DecodingContext) };
+        let mut lock = this.decoder.lock();
+        if let Some(decoder) = &mut *lock {
+            match decoder.decode(params) {
+                Ok(_) => 1,
+                Err(err) => {
+                    println!("decoder error: {}", err);
+
+                    0
+                }
+            }
+        } else {
+            println!("decoder not initialized !");
+            0
+        }
+    }
+
+    /// Display callback: fetches the frame and passes it to `on_frame`.
+    ///
+    /// Returns `1` when `on_frame` returns `true`, and `0` when it returns
+    /// `false` or no decoder exists yet.
+    // NOTE(review): a panic inside `on_frame` would escape this `extern "C"`
+    // function; consider guarding the call if user closures may panic.
+    unsafe extern "C" fn display_cb(this: *mut c_void, disp_info: *mut CUVIDPARSERDISPINFO) -> i32 {
+        let disp_info = unsafe { &mut *disp_info };
+        let this = unsafe { &*(this as *const DecodingContext) };
+        let mut lock = this.decoder.lock();
+
+        if let Some(decoder) = &mut *lock {
+            if !(this.on_frame)(decoder.get_frame(disp_info)) {
+                return 0;
+            }
+
+            1
+        } else {
+            println!("decoder not initialized !");
+            0
+        }
+    }
+}
+
+impl Drop for CuvidVideoParser {
+    /// Destroys the parser handle with `cuvidDestroyVideoParser`.
+    fn drop(&mut self) {
+        // `drop` cannot report failure, so the result is explicitly discarded.
+        let _ = call!(cuvidDestroyVideoParser(self.inner));
+    }
+}