YUV 4:0:0 RDM NEON

awxkee · Nov 28, 2024 · 1a57d75 · 1a57d75
1 parent 42c4a5e
commit 1a57d75
Show file tree

Hide file tree

Showing 3 changed files with 93 additions and 9 deletions.
diff --git a/src/neon/mod.rs b/src/neon/mod.rs
@@ -64,7 +64,7 @@ pub(crate) use rgb_to_yuv_p16::{neon_rgba_to_yuv_p16, neon_rgba_to_yuv_p16_rdm};
 pub(crate) use rgba_to_nv::{neon_rgbx_to_nv_row, neon_rgbx_to_nv_row_rdm};
 pub(crate) use rgba_to_yuv::{neon_rgba_to_yuv, neon_rgba_to_yuv_rdm};
 pub(crate) use rgba_to_yuv420::{neon_rgba_to_yuv420, neon_rgba_to_yuv_rdm420};
-pub(crate) use y_p16_to_rgba16::neon_y_p16_to_rgba16_row;
+pub(crate) use y_p16_to_rgba16::{neon_y_p16_to_rgba16_row, neon_y_p16_to_rgba16_rdm};
 pub(crate) use y_to_rgb::{neon_y_to_rgb_row, neon_y_to_rgb_row_rdm};
 pub(crate) use ycgco_to_rgb::neon_ycgco_to_rgb_row;
 pub(crate) use ycgco_to_rgb_alpha::neon_ycgco_to_rgb_alpha_row;
@@ -84,4 +84,4 @@ pub(crate) use yuv_to_rgba420::{neon_yuv_to_rgba_row420, neon_yuv_to_rgba_row_rd
 pub(crate) use yuv_to_rgba_alpha::{neon_yuv_to_rgba_alpha, neon_yuv_to_rgba_alpha_rdm};
 pub(crate) use yuv_to_yuy2::yuv_to_yuy2_neon_impl;
 pub(crate) use yuy2_to_rgb::yuy2_to_rgb_neon;
-pub(crate) use yuy2_to_yuv::yuy2_to_yuv_neon_impl;
+pub(crate) use yuy2_to_yuv::yuy2_to_yuv_neon_impl;
diff --git a/src/neon/y_p16_to_rgba16.rs b/src/neon/y_p16_to_rgba16.rs
@@ -34,6 +34,86 @@ use crate::yuv_support::{
     CbCrInverseTransform, YuvBytesPacking, YuvChromaRange, YuvEndianness, YuvSourceChannels,
 };
 
+#[target_feature(enable = "rdm")]
+pub(crate) unsafe fn neon_y_p16_to_rgba16_rdm<
+    const DESTINATION_CHANNELS: u8,
+    const ENDIANNESS: u8,
+    const BYTES_POSITION: u8,
+>(
+    y_ld_ptr: *const u16,
+    rgba: *mut u16,
+    width: u32,
+    range: &YuvChromaRange,
+    transform: &CbCrInverseTransform<i32>,
+    start_cx: usize,
+    bit_depth: usize,
+) -> ProcessedOffset {
+    let destination_channels: YuvSourceChannels = DESTINATION_CHANNELS.into();
+    let channels = destination_channels.get_channels_count();
+    let endianness: YuvEndianness = ENDIANNESS.into();
+    let bytes_position: YuvBytesPacking = BYTES_POSITION.into();
+    let dst_ptr = rgba;
+
+    let y_corr = vdupq_n_s16(range.bias_y as i16);
+    let v_min_values = vdupq_n_s16(0i16);
+    let v_alpha = vdupq_n_u16((1 << bit_depth) - 1);
+    let v_msb_shift = vdupq_n_s16(bit_depth as i16 - 16);
+
+    let mut cx = start_cx;
+
+    const V_SCALE: i32 = 2;
+
+    while cx + 8 < width as usize {
+        let y_values: int16x8_t;
+
+        match endianness {
+            YuvEndianness::BigEndian => {
+                let mut y_u_values = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(
+                    vld1q_u16(y_ld_ptr.add(cx)),
+                )));
+                if bytes_position == YuvBytesPacking::MostSignificantBytes {
+                    y_u_values = vshlq_u16(y_u_values, v_msb_shift);
+                }
+                y_values = vsubq_s16(vreinterpretq_s16_u16(y_u_values), y_corr);
+            }
+            YuvEndianness::LittleEndian => {
+                let mut y_vl = vld1q_u16(y_ld_ptr.add(cx));
+                if bytes_position == YuvBytesPacking::MostSignificantBytes {
+                    y_vl = vshlq_u16(y_vl, v_msb_shift);
+                }
+                y_values = vsubq_s16(vreinterpretq_s16_u16(y_vl), y_corr);
+            }
+        }
+
+        let y_high = vqrdmulhq_n_s16(vshlq_n_s16::<V_SCALE>(y_values), transform.y_coef as i16);
+
+        let r_values = vreinterpretq_u16_s16(vmaxq_s16(y_high, v_min_values));
+
+        match destination_channels {
+            YuvSourceChannels::Rgb => {
+                let dst_pack = uint16x8x3_t(r_values, r_values, r_values);
+                vst3q_u16(dst_ptr.add(cx * channels), dst_pack);
+            }
+            YuvSourceChannels::Bgr => {
+                let dst_pack = uint16x8x3_t(r_values, r_values, r_values);
+                vst3q_u16(dst_ptr.add(cx * channels), dst_pack);
+            }
+            YuvSourceChannels::Rgba => {
+                let dst_pack = uint16x8x4_t(r_values, r_values, r_values, v_alpha);
+                vst4q_u16(dst_ptr.add(cx * channels), dst_pack);
+            }
+            YuvSourceChannels::Bgra => {
+                let dst_pack = uint16x8x4_t(r_values, r_values, r_values, v_alpha);
+                vst4q_u16(dst_ptr.add(cx * channels), dst_pack);
+            }
+        }
+
+        cx += 8;
+    }
+
+    ProcessedOffset { cx, ux: 0 }
+}
+
 pub(crate) unsafe fn neon_y_p16_to_rgba16_row<
     const DESTINATION_CHANNELS: u8,
     const ENDIANNESS: u8,

diff --git a/src/y_p16_to_rgb16.rs b/src/y_p16_to_rgb16.rs
@@ -28,7 +28,7 @@
  */
 use crate::built_coefficients::get_built_inverse_transform;
 #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
-use crate::neon::neon_y_p16_to_rgba16_row;
+use crate::neon::{neon_y_p16_to_rgba16_rdm, neon_y_p16_to_rgba16_row};
 use crate::yuv_support::*;
 use crate::{YuvError, YuvGrayImage};
 #[cfg(feature = "rayon")]
@@ -92,6 +92,15 @@ fn yuv400_p16_to_rgbx<
             .zip(image.y_plane.chunks_exact(image.y_stride as usize));
     }
 
+    #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
+    let is_rdm_available = std::arch::is_aarch64_feature_detected!("rdm");
+    #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
+    let neon_wide_handler = if is_rdm_available {
+        neon_y_p16_to_rgba16_rdm::<DESTINATION_CHANNELS, ENDIANNESS, BYTES_POSITION>
+    } else {
+        neon_y_p16_to_rgba16_row::<DESTINATION_CHANNELS, ENDIANNESS, BYTES_POSITION, PRECISION>
+    };
+
     match range {
         YuvRange::Limited => {
             iter.for_each(|(rgba16, y_plane)| {
@@ -101,12 +110,7 @@ fn yuv400_p16_to_rgbx<
                 #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
                 {
                     unsafe {
-                        let offset = neon_y_p16_to_rgba16_row::<
-                            DESTINATION_CHANNELS,
-                            ENDIANNESS,
-                            BYTES_POSITION,
-                            PRECISION,
-                        >(
+                        let offset = neon_wide_handler(
                             y_plane.as_ptr(),
                             rgba16.as_mut_ptr(),
                             image.width,