Merge remote-tracking branch 'origin/dev' into dev

# Conflicts:
#	src/from_identity.rs
awxkee · Nov 28, 2024 · cbf3888 · cbf3888
2 parents 24b97ef + ad64a2a
commit cbf3888
Show file tree

Hide file tree

Showing 11 changed files with 175 additions and 242 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -14,6 +14,7 @@ categories = ["multimedia::images", "multimedia::video"]
 homepage = "https://github.com/awxkee/yuvutils-rs"
 repository = "https://github.com/awxkee/yuvutils-rs"
 exclude = ["*.jpg", "assets/*", "*.png"]
+rust-version = "1.73.0"
 
 [dependencies]
 num-traits = "0.2.19"

diff --git a/src/from_identity.rs b/src/from_identity.rs
@@ -41,158 +41,7 @@ use rayon::iter::{IndexedParallelIterator, ParallelIterator};
 #[cfg(feature = "rayon")]
 use rayon::prelude::{ParallelSlice, ParallelSliceMut};
 use std::fmt::Debug;
-use std::marker::PhantomData;
-
-struct WideRowGbrProcessor<T> {
-    _phantom: PhantomData<T>,
-    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-    _use_sse: bool,
-    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-    _use_avx: bool,
-}
-
-impl<T> Default for WideRowGbrProcessor<T> {
-    fn default() -> Self {
-        WideRowGbrProcessor {
-            _phantom: PhantomData,
-            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-            _use_sse: std::arch::is_x86_feature_detected!("sse4.1"),
-            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-            _use_avx: std::arch::is_x86_feature_detected!("avx2"),
-        }
-    }
-}
-
-struct WideRowGbrLimitedProcessor<T> {
-    _phantom: PhantomData<T>,
-    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-    _use_sse: bool,
-    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-    _use_avx: bool,
-}
-
-impl<T> Default for WideRowGbrLimitedProcessor<T> {
-    fn default() -> Self {
-        WideRowGbrLimitedProcessor {
-            _phantom: PhantomData,
-            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-            _use_sse: std::arch::is_x86_feature_detected!("sse4.1"),
-            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-            _use_avx: std::arch::is_x86_feature_detected!("avx2"),
-        }
-    }
-}
-
-trait FullRangeWideRow<V> {
-    fn handle_row<const DEST: u8>(
-        &self,
-        g_plane: &[V],
-        b_plane: &[V],
-        r_plane: &[V],
-        rgba: &mut [V],
-        start_cx: usize,
-        width: usize,
-    ) -> usize;
-}
-
-trait LimitedRangeWideRow<V> {
-    fn handle_row<const DEST: u8, const BIT_DEPTH: usize>(
-        &self,
-        g_plane: &[V],
-        b_plane: &[V],
-        r_plane: &[V],
-        rgba: &mut [V],
-        start_cx: usize,
-        width: usize,
-        y_bias: i32,
-        y_coeff: i32,
-    ) -> usize;
-}
-
-impl FullRangeWideRow<u8> for WideRowGbrProcessor<u8> {
-    fn handle_row<const DEST: u8>(
-        &self,
-        g_plane: &[u8],
-        b_plane: &[u8],
-        r_plane: &[u8],
-        rgba: &mut [u8],
-        start_cx: usize,
-        width: usize,
-    ) -> usize {
-        let mut _cx = start_cx;
-        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-        {
-            if self._use_avx {
-                _cx = avx_yuv_to_rgba_row_full::<DEST>(g_plane, b_plane, r_plane, rgba, _cx, width);
-            }
-            if self._use_sse {
-                _cx = sse_yuv_to_rgba_row_full::<DEST>(g_plane, b_plane, r_plane, rgba, _cx, width);
-            }
-        }
-        _cx
-    }
-}
-
-impl FullRangeWideRow<u16> for WideRowGbrProcessor<u16> {
-    fn handle_row<const DEST: u8>(
-        &self,
-        _g_plane: &[u16],
-        _b_plane: &[u16],
-        _r_plane: &[u16],
-        _rgba: &mut [u16],
-        _start_cx: usize,
-        _width: usize,
-    ) -> usize {
-        let mut _cx = 0;
-        _cx
-    }
-}
-
-impl LimitedRangeWideRow<u8> for WideRowGbrLimitedProcessor<u8> {
-    fn handle_row<const DEST: u8, const BIT_DEPTH: usize>(
-        &self,
-        g_plane: &[u8],
-        b_plane: &[u8],
-        r_plane: &[u8],
-        rgba: &mut [u8],
-        start_cx: usize,
-        width: usize,
-        y_bias: i32,
-        y_coeff: i32,
-    ) -> usize {
-        let mut _cx = start_cx;
-        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-        {
-            if self._use_avx {
-                _cx = avx_yuv_to_rgba_row_limited::<DEST>(
-                    g_plane, b_plane, r_plane, rgba, _cx, width, y_bias, y_coeff,
-                );
-            }
-            if self._use_sse {
-                _cx = sse_yuv_to_rgba_row_limited::<DEST>(
-                    g_plane, b_plane, r_plane, rgba, _cx, width, y_bias, y_coeff,
-                );
-            }
-        }
-        _cx
-    }
-}
-
-impl LimitedRangeWideRow<u16> for WideRowGbrLimitedProcessor<u16> {
-    fn handle_row<const DEST: u8, const BIT_DEPTH: usize>(
-        &self,
-        _g_plane: &[u16],
-        _b_plane: &[u16],
-        _r_plane: &[u16],
-        _rgba: &mut [u16],
-        _start_cx: usize,
-        _width: usize,
-        _y_bias: i32,
-        _y_coeff: i32,
-    ) -> usize {
-        0
-    }
-}
+use std::mem::size_of;
 
 #[inline]
 fn gbr_to_rgbx_impl<

diff --git a/src/from_identity_alpha.rs b/src/from_identity_alpha.rs
@@ -37,6 +37,7 @@ use rayon::iter::{IndexedParallelIterator, ParallelIterator};
 #[cfg(feature = "rayon")]
 use rayon::prelude::{ParallelSlice, ParallelSliceMut};
 use std::fmt::Debug;
+use std::mem::size_of;
 
 #[inline]
 fn gbr_to_rgbx_alpha_impl<

diff --git a/src/images.rs b/src/images.rs
@@ -34,6 +34,7 @@ use crate::YuvError;
 use std::fmt::Debug;
 
 #[derive(Debug)]
+/// Shared storage type
 pub enum BufferStoreMut<'a, T: Copy + Debug> {
     Borrowed(&'a mut [T]),
     Owned(Vec<T>),

diff --git a/src/neon/mod.rs b/src/neon/mod.rs
@@ -64,7 +64,7 @@ pub(crate) use rgb_to_yuv_p16::{neon_rgba_to_yuv_p16, neon_rgba_to_yuv_p16_rdm};
 pub(crate) use rgba_to_nv::{neon_rgbx_to_nv_row, neon_rgbx_to_nv_row_rdm};
 pub(crate) use rgba_to_yuv::{neon_rgba_to_yuv, neon_rgba_to_yuv_rdm};
 pub(crate) use rgba_to_yuv420::{neon_rgba_to_yuv420, neon_rgba_to_yuv_rdm420};
-pub(crate) use y_p16_to_rgba16::neon_y_p16_to_rgba16_row;
+pub(crate) use y_p16_to_rgba16::{neon_y_p16_to_rgba16_rdm, neon_y_p16_to_rgba16_row};
 pub(crate) use y_to_rgb::{neon_y_to_rgb_row, neon_y_to_rgb_row_rdm};
 pub(crate) use ycgco_to_rgb::neon_ycgco_to_rgb_row;
 pub(crate) use ycgco_to_rgb_alpha::neon_ycgco_to_rgb_alpha_row;

diff --git a/src/neon/y_p16_to_rgba16.rs b/src/neon/y_p16_to_rgba16.rs
@@ -30,18 +30,17 @@
 use std::arch::aarch64::*;
 
 use crate::internals::ProcessedOffset;
-use crate::yuv_support::{
-    CbCrInverseTransform, YuvBytesPacking, YuvChromaRange, YuvEndianness, YuvSourceChannels,
-};
+use crate::neon::neon_simd_support::vldq_s16_endian;
+use crate::yuv_support::{CbCrInverseTransform, YuvChromaRange, YuvSourceChannels};
 
-pub(crate) unsafe fn neon_y_p16_to_rgba16_row<
+#[target_feature(enable = "rdm")]
+pub(crate) unsafe fn neon_y_p16_to_rgba16_rdm<
     const DESTINATION_CHANNELS: u8,
     const ENDIANNESS: u8,
     const BYTES_POSITION: u8,
-    const PRECISION: i32,
 >(
-    y_ld_ptr: *const u16,
-    rgba: *mut u16,
+    y_ld_ptr: &[u16],
+    rgba: &mut [u16],
     width: u32,
     range: &YuvChromaRange,
     transform: &CbCrInverseTransform<i32>,
@@ -50,66 +49,143 @@ pub(crate) unsafe fn neon_y_p16_to_rgba16_row<
 ) -> ProcessedOffset {
     let destination_channels: YuvSourceChannels = DESTINATION_CHANNELS.into();
     let channels = destination_channels.get_channels_count();
-    let endianness: YuvEndianness = ENDIANNESS.into();
-    let bytes_position: YuvBytesPacking = BYTES_POSITION.into();
     let dst_ptr = rgba;
 
-    let y_corr = vdupq_n_s16(range.bias_y as i16);
-    let v_luma_coeff = vdupq_n_s16(transform.y_coef as i16);
+    let y_corr = vdupq_n_u16(range.bias_y as u16);
     let v_min_values = vdupq_n_s16(0i16);
     let v_alpha = vdupq_n_u16((1 << bit_depth) - 1);
     let v_msb_shift = vdupq_n_s16(bit_depth as i16 - 16);
 
     let mut cx = start_cx;
 
+    const V_SCALE: i32 = 2;
+
     while cx + 8 < width as usize {
-        let y_values: int16x8_t;
-
-        match endianness {
-            YuvEndianness::BigEndian => {
-                let mut y_u_values = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(
-                    vld1q_u16(y_ld_ptr.add(cx)),
-                )));
-                if bytes_position == YuvBytesPacking::MostSignificantBytes {
-                    y_u_values = vshlq_u16(y_u_values, v_msb_shift);
-                }
-                y_values = vsubq_s16(vreinterpretq_s16_u16(y_u_values), y_corr);
+        let y_values: int16x8_t = vreinterpretq_s16_u16(vqsubq_u16(
+            vreinterpretq_u16_s16(vldq_s16_endian::<ENDIANNESS, BYTES_POSITION>(
+                y_ld_ptr.get_unchecked(cx..).as_ptr(),
+                v_msb_shift,
+            )),
+            y_corr,
+        ));
+
+        let y_high = vqrdmulhq_n_s16(vshlq_n_s16::<V_SCALE>(y_values), transform.y_coef as i16);
+
+        let r_values = vminq_u16(
+            vreinterpretq_u16_s16(vmaxq_s16(y_high, v_min_values)),
+            v_alpha,
+        );
+
+        match destination_channels {
+            YuvSourceChannels::Rgb => {
+                let dst_pack = uint16x8x3_t(r_values, r_values, r_values);
+                vst3q_u16(
+                    dst_ptr.get_unchecked_mut(cx * channels..).as_mut_ptr(),
+                    dst_pack,
+                );
+            }
+            YuvSourceChannels::Bgr => {
+                let dst_pack = uint16x8x3_t(r_values, r_values, r_values);
+                vst3q_u16(
+                    dst_ptr.get_unchecked_mut(cx * channels..).as_mut_ptr(),
+                    dst_pack,
+                );
+            }
+            YuvSourceChannels::Rgba => {
+                let dst_pack = uint16x8x4_t(r_values, r_values, r_values, v_alpha);
+                vst4q_u16(
+                    dst_ptr.get_unchecked_mut(cx * channels..).as_mut_ptr(),
+                    dst_pack,
+                );
             }
-            YuvEndianness::LittleEndian => {
-                let mut y_vl = vld1q_u16(y_ld_ptr.add(cx));
-                if bytes_position == YuvBytesPacking::MostSignificantBytes {
-                    y_vl = vshlq_u16(y_vl, v_msb_shift);
-                }
-                y_values = vsubq_s16(vreinterpretq_s16_u16(y_vl), y_corr);
+            YuvSourceChannels::Bgra => {
+                let dst_pack = uint16x8x4_t(r_values, r_values, r_values, v_alpha);
+                vst4q_u16(
+                    dst_ptr.get_unchecked_mut(cx * channels..).as_mut_ptr(),
+                    dst_pack,
+                );
             }
         }
 
+        cx += 8;
+    }
+
+    ProcessedOffset { cx, ux: 0 }
+}
+
+pub(crate) unsafe fn neon_y_p16_to_rgba16_row<
+    const DESTINATION_CHANNELS: u8,
+    const ENDIANNESS: u8,
+    const BYTES_POSITION: u8,
+    const PRECISION: i32,
+>(
+    y_ld_ptr: &[u16],
+    rgba: &mut [u16],
+    width: u32,
+    range: &YuvChromaRange,
+    transform: &CbCrInverseTransform<i32>,
+    start_cx: usize,
+    bit_depth: usize,
+) -> ProcessedOffset {
+    let destination_channels: YuvSourceChannels = DESTINATION_CHANNELS.into();
+    let channels = destination_channels.get_channels_count();
+    let dst_ptr = rgba;
+
+    let y_corr = vdupq_n_u16(range.bias_y as u16);
+    let v_luma_coeff = vdupq_n_s16(transform.y_coef as i16);
+    let v_alpha = vdupq_n_u16((1 << bit_depth) - 1);
+    let v_max_values = vdupq_n_s32((1 << bit_depth) - 1);
+    let v_msb_shift = vdupq_n_s16(bit_depth as i16 - 16);
+
+    let mut cx = start_cx;
+
+    while cx + 8 < width as usize {
+        let y_values: int16x8_t = vreinterpretq_s16_u16(vqsubq_u16(
+            vreinterpretq_u16_s16(vldq_s16_endian::<ENDIANNESS, BYTES_POSITION>(
+                y_ld_ptr.get_unchecked(cx..).as_ptr(),
+                v_msb_shift,
+            )),
+            y_corr,
+        ));
+
         let y_high = vmull_high_s16(y_values, v_luma_coeff);
 
-        let r_high = vrshrn_n_s32::<PRECISION>(y_high);
+        let r_high = vqmovun_s32(vminq_s32(vrshrq_n_s32::<PRECISION>(y_high), v_max_values));
 
         let y_low = vmull_s16(vget_low_s16(y_values), vget_low_s16(v_luma_coeff));
 
-        let r_low = vrshrn_n_s32::<PRECISION>(y_low);
+        let r_low = vqmovun_s32(vminq_s32(vrshrq_n_s32::<PRECISION>(y_low), v_max_values));
 
-        let r_values = vreinterpretq_u16_s16(vmaxq_s16(vcombine_s16(r_low, r_high), v_min_values));
+        let r_values = vcombine_u16(r_low, r_high);
 
         match destination_channels {
             YuvSourceChannels::Rgb => {
                 let dst_pack = uint16x8x3_t(r_values, r_values, r_values);
-                vst3q_u16(dst_ptr.add(cx * channels), dst_pack);
+                vst3q_u16(
+                    dst_ptr.get_unchecked_mut(cx * channels..).as_mut_ptr(),
+                    dst_pack,
+                );
             }
             YuvSourceChannels::Bgr => {
                 let dst_pack = uint16x8x3_t(r_values, r_values, r_values);
-                vst3q_u16(dst_ptr.add(cx * channels), dst_pack);
+                vst3q_u16(
+                    dst_ptr.get_unchecked_mut(cx * channels..).as_mut_ptr(),
+                    dst_pack,
+                );
             }
             YuvSourceChannels::Rgba => {
                 let dst_pack = uint16x8x4_t(r_values, r_values, r_values, v_alpha);
-                vst4q_u16(dst_ptr.add(cx * channels), dst_pack);
+                vst4q_u16(
+                    dst_ptr.get_unchecked_mut(cx * channels..).as_mut_ptr(),
+                    dst_pack,
+                );
             }
             YuvSourceChannels::Bgra => {
                 let dst_pack = uint16x8x4_t(r_values, r_values, r_values, v_alpha);
-                vst4q_u16(dst_ptr.add(cx * channels), dst_pack);
+                vst4q_u16(
+                    dst_ptr.get_unchecked_mut(cx * channels..).as_mut_ptr(),
+                    dst_pack,
+                );
             }
         }