RDM Support RGB16 to YUV16

awxkee · Nov 23, 2024 · 7266a48 · 7266a48
1 parent a46135c
commit 7266a48
Show file tree

Hide file tree

Showing 4 changed files with 224 additions and 22 deletions.
diff --git a/app/src/main.rs b/app/src/main.rs
@@ -112,8 +112,7 @@ fn main() {
     //
     // println!("rgb_to_yuv_nv12 time: {:?}", start_time.elapsed());
     //
-    // let end_time = Instant::now().sub(start_time);
-    // println!("Forward time: {:?}", end_time);
+    println!("Forward time: {:?}", start_time.elapsed());
     // // //
     // let full_size = if width % 2 == 0 {
     //     2 * width as usize * height as usize

diff --git a/src/neon/mod.rs b/src/neon/mod.rs
@@ -57,7 +57,7 @@ mod yuy2_to_yuv;
 pub use rgb_to_y::neon_rgb_to_y_row;
 pub use rgb_to_ycgco::neon_rgb_to_ycgco_row;
 pub use rgb_to_ycgco_r::neon_rgb_to_ycgcor_row;
-pub use rgb_to_yuv_p16::neon_rgba_to_yuv_p16;
+pub(crate) use rgb_to_yuv_p16::{neon_rgba_to_yuv_p16, neon_rgba_to_yuv_p16_rdm};
 pub(crate) use rgba_to_nv::{neon_rgbx_to_nv_row, neon_rgbx_to_nv_row_rdm};
 pub(crate) use rgba_to_yuv::{neon_rgba_to_yuv, neon_rgba_to_yuv_rdm};
 pub use y_p16_to_rgba16::neon_y_p16_to_rgba16_row;

diff --git a/src/neon/rgb_to_yuv_p16.rs b/src/neon/rgb_to_yuv_p16.rs
@@ -33,7 +33,7 @@ use crate::yuv_support::{
 use crate::{YuvBytesPacking, YuvEndianness};
 use std::arch::aarch64::*;
 
-pub unsafe fn neon_rgba_to_yuv_p16<
+pub(crate) unsafe fn neon_rgba_to_yuv_p16<
     const ORIGIN_CHANNELS: u8,
     const SAMPLING: u8,
     const ENDIANNESS: u8,
@@ -83,6 +83,10 @@ pub unsafe fn neon_rgba_to_yuv_p16<
 
     let v_shift_count = vdupq_n_s16(16 - BIT_DEPTH as i16);
 
+    let i_bias_y = vdupq_n_u16(range.bias_y as u16);
+    let i_cap_y = vdupq_n_u16(range.range_y as u16 + range.bias_y as u16);
+    let i_cap_uv = vdupq_n_u16(range.bias_y as u16 + range.range_uv as u16);
+
     while cx + 8 < width {
         let r_values;
         let g_values;
@@ -137,9 +141,15 @@ pub unsafe fn neon_rgba_to_yuv_p16<
             vget_low_s16(v_yb),
         );
 
-        let mut y_vl = vcombine_u16(
-            vqshrun_n_s32::<PRECISION>(y_l),
-            vqshrun_n_s32::<PRECISION>(y_h),
+        let mut y_vl = vminq_u16(
+            vmaxq_u16(
+                vcombine_u16(
+                    vqshrun_n_s32::<PRECISION>(y_l),
+                    vqshrun_n_s32::<PRECISION>(y_h),
+                ),
+                i_bias_y,
+            ),
+            i_cap_y,
         );
 
         if bytes_position == YuvBytesPacking::MostSignificantBytes {
@@ -173,9 +183,15 @@ pub unsafe fn neon_rgba_to_yuv_p16<
                 vget_low_s16(v_cb_b),
             );
 
-            let mut cb_vl = vcombine_u16(
-                vqshrun_n_s32::<PRECISION>(cb_l),
-                vqshrun_n_s32::<PRECISION>(cb_h),
+            let mut cb_vl = vminq_u16(
+                vmaxq_u16(
+                    vcombine_u16(
+                        vqshrun_n_s32::<PRECISION>(cb_l),
+                        vqshrun_n_s32::<PRECISION>(cb_h),
+                    ),
+                    i_bias_y,
+                ),
+                i_cap_uv,
             );
 
             let mut cr_h = vmlal_high_s16(uv_bias, vreinterpretq_s16_u16(r_values), v_cr_r);
@@ -198,9 +214,15 @@ pub unsafe fn neon_rgba_to_yuv_p16<
                 vget_low_s16(v_cr_b),
             );
 
-            let mut cr_vl = vcombine_u16(
-                vqshrun_n_s32::<PRECISION>(cr_l),
-                vqshrun_n_s32::<PRECISION>(cr_h),
+            let mut cr_vl = vminq_u16(
+                vmaxq_u16(
+                    vcombine_u16(
+                        vqshrun_n_s32::<PRECISION>(cr_l),
+                        vqshrun_n_s32::<PRECISION>(cr_h),
+                    ),
+                    i_bias_y,
+                ),
+                i_cap_uv,
             );
 
             match chroma_subsampling {
@@ -247,3 +269,169 @@ pub unsafe fn neon_rgba_to_yuv_p16<
 
     ProcessedOffset { ux, cx }
 }
+
+#[target_feature(enable = "rdm")] // body is compiled with the `rdm` feature; the visible caller picks this variant only when `rdm` is detected at runtime and BIT_DEPTH <= 12
+pub(crate) unsafe fn neon_rgba_to_yuv_p16_rdm< // RGB(A)/BGR(A) u16 pixels -> Y, U, V u16 planes, 8 pixels per step, using 16-bit `vqrdmlahq_s16` accumulation
+    const ORIGIN_CHANNELS: u8, // converted into `YuvSourceChannels` below
+    const SAMPLING: u8, // converted into `YuvChromaSubsampling` below
+    const ENDIANNESS: u8, // converted into `YuvEndianness` below
+    const BYTES_POSITION: u8, // converted into `YuvBytesPacking` below
+    const PRECISION: i32, // not referenced anywhere in this body
+    const BIT_DEPTH: usize, // only used to derive the MSB-packing shift (16 - BIT_DEPTH)
+>(
+    transform: &CbCrForwardTransform<i32>, // integer coefficients; each one is narrowed to i16 with `as` below
+    range: &YuvChromaRange, // supplies bias_y, bias_uv, range_y, range_uv
+    y_plane: &mut [u16], // written through a raw pointer at index `cx`
+    u_plane: &mut [u16], // written through a raw pointer at index `ux`
+    v_plane: &mut [u16], // written through a raw pointer at index `ux`
+    rgba: &[u16], // read without bounds checks starting at `cx * channels`
+    start_cx: usize, // initial pixel position
+    start_ux: usize, // initial chroma position
+    width: usize, // loop bound, in pixels
+    compute_uv_row: bool, // when false only the Y plane is written and `ux` is not advanced
+) -> ProcessedOffset { // returns the `cx`/`ux` positions reached when the vector loop stops
+    let chroma_subsampling: YuvChromaSubsampling = SAMPLING.into();
+    let source_channels: YuvSourceChannels = ORIGIN_CHANNELS.into();
+    let endianness: YuvEndianness = ENDIANNESS.into();
+    let bytes_position: YuvBytesPacking = BYTES_POSITION.into();
+    let channels = source_channels.get_channels_count(); // u16 elements per pixel, used to index `rgba`
+
+    let bias_y = range.bias_y as i16;
+    let bias_uv = range.bias_uv as i16;
+
+    let y_ptr = y_plane.as_mut_ptr();
+    let u_ptr = u_plane.as_mut_ptr();
+    let v_ptr = v_plane.as_mut_ptr();
+
+    let y_bias = vdupq_n_s16(bias_y); // initial accumulator for luma
+    let uv_bias = vdupq_n_s16(bias_uv); // initial accumulator for both chroma channels
+    let v_yr = vdupq_n_s16(transform.yr as i16); // `as i16` truncates silently; assumes coefficients fit in i16 -- TODO confirm
+    let v_yg = vdupq_n_s16(transform.yg as i16);
+    let v_yb = vdupq_n_s16(transform.yb as i16);
+    let v_cb_r = vdupq_n_s16(transform.cb_r as i16);
+    let v_cb_g = vdupq_n_s16(transform.cb_g as i16);
+    let v_cb_b = vdupq_n_s16(transform.cb_b as i16);
+    let v_cr_r = vdupq_n_s16(transform.cr_r as i16);
+    let v_cr_g = vdupq_n_s16(transform.cr_g as i16);
+    let v_cr_b = vdupq_n_s16(transform.cr_b as i16);
+
+    let i_bias_y = vdupq_n_s16(range.bias_y as i16); // lower clamp bound, applied to Y and to both chroma channels
+    let i_cap_y = vdupq_n_s16(range.range_y as i16 + range.bias_y as i16); // upper clamp bound for Y
+    let i_cap_uv = vdupq_n_s16(range.bias_y as i16 + range.range_uv as i16); // upper clamp bound for chroma; built from bias_y (not bias_uv) plus range_uv
+
+    let mut cx = start_cx;
+    let mut ux = start_ux;
+
+    let v_shift_count = vdupq_n_s16(16 - BIT_DEPTH as i16); // left-shift amount used for MostSignificantBytes packing
+
+    while cx + 8 < width { // strict `<`: exits once 8 or fewer pixels remain; the remainder is reported through the returned `cx`
+        let mut r_values;
+        let mut g_values;
+        let mut b_values;
+
+        let src_ptr = rgba.get_unchecked(cx * channels..); // unchecked slicing: the caller must guarantee `rgba` covers `width * channels` elements
+
+        match source_channels {
+            YuvSourceChannels::Rgb | YuvSourceChannels::Bgr => {
+                let rgb_values = vld3q_u16(src_ptr.as_ptr()); // de-interleaving load: 8 pixels x 3 channels
+                if source_channels == YuvSourceChannels::Rgb {
+                    r_values = rgb_values.0;
+                    g_values = rgb_values.1;
+                    b_values = rgb_values.2;
+                } else {
+                    r_values = rgb_values.2; // BGR: first and third channels swapped
+                    g_values = rgb_values.1;
+                    b_values = rgb_values.0;
+                }
+            }
+            YuvSourceChannels::Rgba => {
+                let rgb_values = vld4q_u16(src_ptr.as_ptr()); // de-interleaving load: 8 pixels x 4 channels; the fourth (.3) is never read
+                r_values = rgb_values.0;
+                g_values = rgb_values.1;
+                b_values = rgb_values.2;
+            }
+            YuvSourceChannels::Bgra => {
+                let rgb_values = vld4q_u16(src_ptr.as_ptr()); // same 4-channel load; the fourth (.3) is never read
+                r_values = rgb_values.2;
+                g_values = rgb_values.1;
+                b_values = rgb_values.0;
+            }
+        }
+
+        r_values = vshlq_n_u16::<3>(r_values); // fixed pre-scale by 2^3 before the doubling high-half multiply
+        g_values = vshlq_n_u16::<3>(g_values); // NOTE(review): with samples <= 12 bits the shifted value stays below 2^15, so the s16 reinterpret stays non-negative -- presumably why the caller gates on BIT_DEPTH <= 12; confirm
+        b_values = vshlq_n_u16::<3>(b_values); // NOTE(review): the constant 3 is independent of PRECISION -- confirm it matches the coefficient scale
+
+        let mut y_values = vqrdmlahq_s16(y_bias, vreinterpretq_s16_u16(r_values), v_yr); // saturating rounding doubling multiply-accumulate (high half): bias + R*yr
+        y_values = vqrdmlahq_s16(y_values, vreinterpretq_s16_u16(g_values), v_yg); // + G*yg
+        y_values = vqrdmlahq_s16(y_values, vreinterpretq_s16_u16(b_values), v_yb); // + B*yb
+
+        let mut y_vl = vreinterpretq_u16_s16(vminq_s16(vmaxq_s16(y_values, i_bias_y), i_cap_y)); // clamp to [bias_y, bias_y + range_y], then view as u16
+
+        if bytes_position == YuvBytesPacking::MostSignificantBytes {
+            y_vl = vshlq_u16(y_vl, v_shift_count); // shift left by (16 - BIT_DEPTH)
+        }
+
+        if endianness == YuvEndianness::BigEndian {
+            y_vl = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(y_vl))); // swap the two bytes of every 16-bit lane
+        }
+
+        vst1q_u16(y_ptr.add(cx), y_vl); // store 8 luma samples
+
+        if compute_uv_row {
+            let mut cb_h = vqrdmlahq_s16(uv_bias, vreinterpretq_s16_u16(r_values), v_cb_r); // Cb accumulator: bias_uv + R*cb_r
+            cb_h = vqrdmlahq_s16(cb_h, vreinterpretq_s16_u16(g_values), v_cb_g);
+            cb_h = vqrdmlahq_s16(cb_h, vreinterpretq_s16_u16(b_values), v_cb_b);
+
+            let mut cb_vl = vreinterpretq_u16_s16(vminq_s16(vmaxq_s16(cb_h, i_bias_y), i_cap_uv)); // clamp to [bias_y, bias_y + range_uv]
+
+            let mut cr_h = vqrdmlahq_s16(uv_bias, vreinterpretq_s16_u16(r_values), v_cr_r); // Cr accumulator: bias_uv + R*cr_r
+            cr_h = vqrdmlahq_s16(cr_h, vreinterpretq_s16_u16(g_values), v_cr_g);
+            cr_h = vqrdmlahq_s16(cr_h, vreinterpretq_s16_u16(b_values), v_cr_b);
+
+            let mut cr_vl = vreinterpretq_u16_s16(vminq_s16(vmaxq_s16(cr_h, i_bias_y), i_cap_uv)); // same clamp bounds as Cb
+
+            match chroma_subsampling {
+                YuvChromaSubsampling::Yuv420 | YuvChromaSubsampling::Yuv422 => {
+                    let mut cb_s = vrshrn_n_u32::<1>(vpaddlq_u16(cb_vl)); // widen-add adjacent lane pairs, then rounding halve: 8 samples -> 4 horizontal averages
+                    let mut cr_s = vrshrn_n_u32::<1>(vpaddlq_u16(cr_vl));
+
+                    if bytes_position == YuvBytesPacking::MostSignificantBytes {
+                        cb_s = vshl_u16(cb_s, vget_low_s16(v_shift_count)); // 4-lane form of the MSB shift
+                        cr_s = vshl_u16(cr_s, vget_low_s16(v_shift_count));
+                    }
+
+                    if endianness == YuvEndianness::BigEndian {
+                        cb_s = vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(cb_s))); // per-lane byte swap
+                        cr_s = vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(cr_s)));
+                    }
+
+                    vst1_u16(u_ptr.add(ux), cb_s); // store 4 chroma samples per plane
+                    vst1_u16(v_ptr.add(ux), cr_s);
+
+                    ux += 4; // half as many chroma samples as pixels consumed
+                }
+                YuvChromaSubsampling::Yuv444 => {
+                    if bytes_position == YuvBytesPacking::MostSignificantBytes {
+                        cb_vl = vshlq_u16(cb_vl, v_shift_count);
+                        cr_vl = vshlq_u16(cr_vl, v_shift_count);
+                    }
+
+                    if endianness == YuvEndianness::BigEndian {
+                        cb_vl = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(cb_vl)));
+                        cr_vl = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(cr_vl)));
+                    }
+
+                    vst1q_u16(u_ptr.add(ux), cb_vl); // store 8 chroma samples per plane, no horizontal averaging
+                    vst1q_u16(v_ptr.add(ux), cr_vl);
+
+                    ux += 8; // one chroma sample per pixel
+                }
+            }
+        }
+
+        cx += 8; // advance by the 8 pixels just converted
+    }
+
+    ProcessedOffset { ux, cx } // positions where the vector loop stopped
+}
diff --git a/src/rgb_to_yuv_p16.rs b/src/rgb_to_yuv_p16.rs
@@ -28,7 +28,7 @@
  */
 use crate::internals::ProcessedOffset;
 #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
-use crate::neon::neon_rgba_to_yuv_p16;
+use crate::neon::{neon_rgba_to_yuv_p16, neon_rgba_to_yuv_p16_rdm};
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 use crate::sse::sse_rgba_to_yuv_p16;
 use crate::yuv_error::check_rgba_destination;
@@ -111,6 +111,28 @@ fn rgbx_to_yuv_ant<
 
     #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
     let mut _use_sse = std::arch::is_x86_feature_detected!("sse4.1");
+    #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
+    let is_rdm_available = std::arch::is_aarch64_feature_detected!("rdm");
+    #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
+    let neon_wide_row_handler = if is_rdm_available && BIT_DEPTH <= 12 {
+        neon_rgba_to_yuv_p16_rdm::<
+            ORIGIN_CHANNELS,
+            SAMPLING,
+            ENDIANNESS,
+            BYTES_POSITION,
+            PRECISION,
+            BIT_DEPTH,
+        >
+    } else {
+        neon_rgba_to_yuv_p16::<
+            ORIGIN_CHANNELS,
+            SAMPLING,
+            ENDIANNESS,
+            BYTES_POSITION,
+            PRECISION,
+            BIT_DEPTH,
+        >
+    };
 
     #[allow(unused_variables)]
     let process_wide_row = |_y_plane: &mut [u16],
@@ -148,14 +170,7 @@ fn rgbx_to_yuv_ant<
 
         #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
         unsafe {
-            _offset = neon_rgba_to_yuv_p16::<
-                ORIGIN_CHANNELS,
-                SAMPLING,
-                ENDIANNESS,
-                BYTES_POSITION,
-                PRECISION,
-                BIT_DEPTH,
-            >(
+            _offset = neon_wide_row_handler(
                 &transform,
                 &range,
                 _y_plane,