From 24540fa17367af87a3365c8857df4279dd9d22fb Mon Sep 17 00:00:00 2001
From: Radzivon Bartoshyk
Date: Sat, 23 Nov 2024 11:55:32 +0000
Subject: [PATCH] AVX AYUV16 -> RGBA16

---
 src/avx2/mod.rs                     |   2 +
 src/avx2/yuv_p16_to_rgb16_alpha.rs  | 271 ++++++++++++++++++++++++++++
 src/avx512bw/ycgco_to_rgba_alpha.rs |   5 +-
 src/yuv_p16_rgba16_alpha.rs         |  34 +++-
 src/yuv_p16_rgba_p16.rs             |   1 +
 5 files changed, 310 insertions(+), 3 deletions(-)
 create mode 100644 src/avx2/yuv_p16_to_rgb16_alpha.rs

diff --git a/src/avx2/mod.rs b/src/avx2/mod.rs
index 633644c..71d132e 100644
--- a/src/avx2/mod.rs
+++ b/src/avx2/mod.rs
@@ -37,6 +37,7 @@ mod ycgco_to_rgb;
 mod ycgco_to_rgba_alpha;
 mod yuv_nv_to_rgba;
 mod yuv_p16_to_rgb16;
+mod yuv_p16_to_rgb16_alpha;
 mod yuv_to_rgba;
 mod yuv_to_rgba_alpha;
 mod yuv_to_yuv2;
@@ -51,6 +52,7 @@ pub(crate) use ycgco_to_rgb::avx2_ycgco_to_rgb_row;
 pub(crate) use ycgco_to_rgba_alpha::avx2_ycgco_to_rgba_alpha;
 pub(crate) use yuv_nv_to_rgba::avx2_yuv_nv_to_rgba_row;
 pub(crate) use yuv_p16_to_rgb16::avx_yuv_p16_to_rgba_row;
+pub(crate) use yuv_p16_to_rgb16_alpha::avx_yuv_p16_to_rgba_alpha_row;
 pub(crate) use yuv_to_rgba::avx2_yuv_to_rgba_row;
 pub(crate) use yuv_to_rgba_alpha::avx2_yuv_to_rgba_alpha;
 pub(crate) use yuv_to_yuv2::yuv_to_yuy2_avx2_row;
diff --git a/src/avx2/yuv_p16_to_rgb16_alpha.rs b/src/avx2/yuv_p16_to_rgb16_alpha.rs
new file mode 100644
index 0000000..9188615
--- /dev/null
+++ b/src/avx2/yuv_p16_to_rgb16_alpha.rs
@@ -0,0 +1,271 @@
+/*
+ * Copyright (c) Radzivon Bartoshyk, 10/2024. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+use crate::avx2::avx2_utils::{_mm256_interleave_rgba_epi16, shuffle};
+use crate::internals::ProcessedOffset;
+use crate::yuv_support::{
+    CbCrInverseTransform, YuvBytesPacking, YuvChromaRange, YuvChromaSubsampling, YuvEndianness,
+    YuvSourceChannels,
+};
+#[cfg(target_arch = "x86")]
+use std::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use std::arch::x86_64::*;
+
+pub(crate) unsafe fn avx_yuv_p16_to_rgba_alpha_row<
+    const DESTINATION_CHANNELS: u8,
+    const SAMPLING: u8,
+    const ENDIANNESS: u8,
+    const BYTES_POSITION: u8,
+    const BIT_DEPTH: usize,
+    const PRECISION: i32,
+>(
+    y_plane: &[u16],
+    u_plane: &[u16],
+    v_plane: &[u16],
+    a_plane: &[u16],
+    bgra: &mut [u16],
+    width: u32,
+    range: &YuvChromaRange,
+    transform: &CbCrInverseTransform<i32>,
+    start_cx: usize,
+    start_ux: usize,
+) -> ProcessedOffset {
+    unsafe {
+        avx_yuv_p16_to_rgba_row_alpha_impl::<
+            DESTINATION_CHANNELS,
+            SAMPLING,
+            ENDIANNESS,
+            BYTES_POSITION,
+            BIT_DEPTH,
+            PRECISION,
+        >(
+            y_plane, u_plane, v_plane, a_plane, bgra, width, range, transform, start_cx, start_ux,
+        )
+    }
+}
+
+#[target_feature(enable = "avx2")]
+unsafe fn avx_yuv_p16_to_rgba_row_alpha_impl<
+    const DESTINATION_CHANNELS: u8,
+    const SAMPLING: u8,
+    const ENDIANNESS: u8,
+    const BYTES_POSITION: u8,
+    const BIT_DEPTH: usize,
+    const PRECISION: i32,
+>(
+    y_plane: &[u16],
+    u_plane: &[u16],
+    v_plane: &[u16],
+    a_plane: &[u16],
+    bgra: &mut [u16],
+    width: u32,
+    range: &YuvChromaRange,
+    transform: &CbCrInverseTransform<i32>,
+    start_cx: usize,
+    start_ux: usize,
+) -> ProcessedOffset {
+    let destination_channels: YuvSourceChannels = DESTINATION_CHANNELS.into();
+    assert!(
+        destination_channels == YuvSourceChannels::Rgba
+            || destination_channels == YuvSourceChannels::Bgra
+    );
+    let channels = destination_channels.get_channels_count();
+    let chroma_subsampling: YuvChromaSubsampling = SAMPLING.into();
+    let endianness: YuvEndianness = ENDIANNESS.into();
+    let bytes_position: YuvBytesPacking = BYTES_POSITION.into();
+    let cr_coef = transform.cr_coef;
+    let cb_coef = transform.cb_coef;
+    let y_coef = transform.y_coef;
+    let g_coef_1 = transform.g_coeff_1;
+    let g_coef_2 = transform.g_coeff_2;
+
+    let bias_y = range.bias_y as i32;
+    let bias_uv = range.bias_uv as i32;
+
+    let dst_ptr = bgra;
+
+    let v_max_colors = _mm256_set1_epi16((1i16 << BIT_DEPTH as i16) - 1);
+
+    let y_corr = _mm256_set1_epi16(bias_y as i16);
+    let uv_corr = _mm256_set1_epi16(bias_uv as i16);
+    let v_luma_coeff = _mm256_set1_epi16((y_coef as i16) << 1);
+    let v_cr_coeff = _mm256_set1_epi16((cr_coef as i16) << 1);
+    let v_cb_coeff = _mm256_set1_epi16((cb_coef as i16) << 1);
+    let zeros = _mm256_setzero_si256();
+    let v_g_coeff_1 = _mm256_set1_epi16(-((g_coef_1 as i16) << 1));
+    let v_g_coeff_2 = _mm256_set1_epi16(-((g_coef_2 as i16) << 1));
+
+    let mut cx = start_cx;
+    let mut ux = start_ux;
+
+    let v_big_shift_count = _mm_set1_epi64x(16i64 - BIT_DEPTH as i64);
+
+    // Swaps the two bytes of every 16-bit lane (big-endian -> little-endian);
+    // byte order inside a lane is lane-local, so no cross-lane fixup is needed.
+    let big_endian_shuffle_flag = _mm256_setr_epi8(
+        1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10,
+        13, 12, 15, 14,
+    );
+    let big_endian_shuffle_flag_sse =
+        _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+
+    while cx + 16 < width as usize {
+        let dst_ptr = dst_ptr.get_unchecked_mut(cx * channels..);
+
+        let mut y_vl = _mm256_loadu_si256(y_plane.get_unchecked(cx..).as_ptr() as *const __m256i);
+        if endianness == YuvEndianness::BigEndian {
+            y_vl = _mm256_shuffle_epi8(y_vl, big_endian_shuffle_flag);
+        }
+        if bytes_position == YuvBytesPacking::MostSignificantBytes {
+            y_vl = _mm256_srl_epi16(y_vl, v_big_shift_count);
+        }
+        let mut y_values = _mm256_sub_epi16(y_vl, y_corr);
+
+        let mut u_values;
+        let mut v_values;
+
+        match chroma_subsampling {
+            YuvChromaSubsampling::Yuv420 | YuvChromaSubsampling::Yuv422 => {
+                let mut u_vals =
+                    _mm_loadu_si128(u_plane.get_unchecked(ux..).as_ptr() as *const __m128i);
+                let mut v_vals =
+                    _mm_loadu_si128(v_plane.get_unchecked(ux..).as_ptr() as *const __m128i);
+
+                if endianness == YuvEndianness::BigEndian {
+                    u_vals = _mm_shuffle_epi8(u_vals, big_endian_shuffle_flag_sse);
+                    v_vals = _mm_shuffle_epi8(v_vals, big_endian_shuffle_flag_sse);
+                }
+                if bytes_position == YuvBytesPacking::MostSignificantBytes {
+                    u_vals = _mm_srl_epi16(u_vals, v_big_shift_count);
+                    v_vals = _mm_srl_epi16(v_vals, v_big_shift_count);
+                }
+
+                let u_expanded = _mm256_set_m128i(u_vals, u_vals); // [A7, A6, ..., A0 | A7, A6, ..., A0]
+                let v_expanded = _mm256_set_m128i(v_vals, v_vals); // [A7, A6, ..., A0 | A7, A6, ..., A0]
+                // Move A0..A3 into the low 128-bit lane and A4..A7 into the
+                // high lane first, then duplicate each chroma sample for the
+                // pair of luma samples it covers: [A0, A0, A1, A1, ..., A7, A7].
+                const MASK: i32 = shuffle(3, 1, 2, 0);
+                let u_perm = _mm256_permute4x64_epi64::<MASK>(u_expanded);
+                let v_perm = _mm256_permute4x64_epi64::<MASK>(v_expanded);
+                let u_vl = _mm256_unpacklo_epi16(u_perm, u_perm);
+                let v_vl = _mm256_unpacklo_epi16(v_perm, v_perm);
+
+                u_values = _mm256_sub_epi16(u_vl, uv_corr);
+                v_values = _mm256_sub_epi16(v_vl, uv_corr);
+            }
+            YuvChromaSubsampling::Yuv444 => {
+                let mut u_vals =
+                    _mm256_loadu_si256(u_plane.get_unchecked(ux..).as_ptr() as *const __m256i);
+                let mut v_vals =
+                    _mm256_loadu_si256(v_plane.get_unchecked(ux..).as_ptr() as *const __m256i);
+
+                if endianness == YuvEndianness::BigEndian {
+                    u_vals = _mm256_shuffle_epi8(u_vals, big_endian_shuffle_flag);
+                    v_vals = _mm256_shuffle_epi8(v_vals, big_endian_shuffle_flag);
+                }
+                if bytes_position == YuvBytesPacking::MostSignificantBytes {
+                    u_vals = _mm256_srl_epi16(u_vals, v_big_shift_count);
+                    v_vals = _mm256_srl_epi16(v_vals, v_big_shift_count);
+                }
+                u_values = _mm256_sub_epi16(u_vals, uv_corr);
+                v_values = _mm256_sub_epi16(v_vals, uv_corr);
+            }
+        }
+
+        // Samples are pre-shifted left by 3 and the coefficients were
+        // pre-doubled, so _mm256_mulhi_epi16 (a >> 16 high-half multiply)
+        // evaluates value * coef >> 12.
+        u_values = _mm256_slli_epi16::<3>(u_values);
+        v_values = _mm256_slli_epi16::<3>(v_values);
+        y_values = _mm256_slli_epi16::<3>(y_values);
+
+        let y_vals = _mm256_mulhi_epi16(y_values, v_luma_coeff);
+
+        let r_vals = _mm256_add_epi16(y_vals, _mm256_mulhi_epi16(v_values, v_cr_coeff));
+        let b_vals = _mm256_add_epi16(y_vals, _mm256_mulhi_epi16(u_values, v_cb_coeff));
+        let g_vals = _mm256_add_epi16(
+            _mm256_add_epi16(y_vals, _mm256_mulhi_epi16(v_values, v_g_coeff_1)),
+            _mm256_mulhi_epi16(u_values, v_g_coeff_2),
+        );
+
+        let r_values = _mm256_min_epu16(_mm256_max_epi16(r_vals, zeros), v_max_colors);
+        let g_values = _mm256_min_epu16(_mm256_max_epi16(g_vals, zeros), v_max_colors);
+        let b_values = _mm256_min_epu16(_mm256_max_epi16(b_vals, zeros), v_max_colors);
+
+        let a_values = _mm256_loadu_si256(a_plane.get_unchecked(cx..).as_ptr() as *const __m256i);
+
+        match destination_channels {
+            YuvSourceChannels::Rgb | YuvSourceChannels::Bgr => {
+                unreachable!("This method can't be called with Rgb/Bgr");
+            }
+            YuvSourceChannels::Rgba => {
+                let dst_pack = _mm256_interleave_rgba_epi16(r_values, g_values, b_values, a_values);
+                _mm256_storeu_si256(dst_ptr.as_mut_ptr() as *mut __m256i, dst_pack.0);
+                _mm256_storeu_si256(
+                    dst_ptr.get_unchecked_mut(16..).as_mut_ptr() as *mut __m256i,
+                    dst_pack.1,
+                );
+                _mm256_storeu_si256(
+                    dst_ptr.get_unchecked_mut(32..).as_mut_ptr() as *mut __m256i,
+                    dst_pack.2,
+                );
+                _mm256_storeu_si256(
+                    dst_ptr.get_unchecked_mut(48..).as_mut_ptr() as *mut __m256i,
+                    dst_pack.3,
+                );
+            }
+            YuvSourceChannels::Bgra => {
+                let dst_pack = _mm256_interleave_rgba_epi16(b_values, g_values, r_values, a_values);
+                _mm256_storeu_si256(dst_ptr.as_mut_ptr() as *mut __m256i, dst_pack.0);
+                _mm256_storeu_si256(
+                    dst_ptr.get_unchecked_mut(16..).as_mut_ptr() as *mut __m256i,
+                    dst_pack.1,
+                );
+                _mm256_storeu_si256(
+                    dst_ptr.get_unchecked_mut(32..).as_mut_ptr() as *mut __m256i,
+                    dst_pack.2,
+                );
+                _mm256_storeu_si256(
+                    dst_ptr.get_unchecked_mut(48..).as_mut_ptr() as *mut __m256i,
+                    dst_pack.3,
+                );
+            }
+        }
+
+        cx += 16;
+        match chroma_subsampling {
+            YuvChromaSubsampling::Yuv420 | YuvChromaSubsampling::Yuv422 => {
+                ux += 8;
+            }
+            YuvChromaSubsampling::Yuv444 => {
+                ux += 16;
+            }
+        }
+    }
+
+    ProcessedOffset { cx, ux }
+}
diff --git a/src/avx512bw/ycgco_to_rgba_alpha.rs b/src/avx512bw/ycgco_to_rgba_alpha.rs
index 6c3a9d2..b035805 100644
--- a/src/avx512bw/ycgco_to_rgba_alpha.rs
+++ b/src/avx512bw/ycgco_to_rgba_alpha.rs
@@ -36,7 +36,10 @@ use std::arch::x86::*;
 use std::arch::x86_64::*;
 
 #[target_feature(enable = "avx512bw")]
-pub(crate) unsafe fn avx512_ycgco_to_rgba_alpha<const DESTINATION_CHANNELS: u8, const SAMPLING: u8>(
+pub(crate) unsafe fn avx512_ycgco_to_rgba_alpha<
+    const DESTINATION_CHANNELS: u8,
+    const SAMPLING: u8,
+>(
     range: &YuvChromaRange,
     y_plane: &[u8],
     cg_plane: &[u8],
diff --git a/src/yuv_p16_rgba16_alpha.rs b/src/yuv_p16_rgba16_alpha.rs
index 4047084..1fb0b2f 100644
--- a/src/yuv_p16_rgba16_alpha.rs
+++ b/src/yuv_p16_rgba16_alpha.rs
@@ -26,6 +26,10 @@
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +use crate::avx2::avx_yuv_p16_to_rgba_alpha_row; +#[allow(unused_imports)] +use crate::internals::ProcessedOffset; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] use crate::neon::{neon_yuv_p16_to_rgba16_alpha_row, neon_yuv_p16_to_rgba16_alpha_row_rdm}; use crate::numerics::{qrshr, to_ne}; @@ -115,6 +119,8 @@ fn yuv_p16_to_image_alpha_ant< }; #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] let _use_sse = std::arch::is_x86_feature_detected!("sse4.1"); + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + let use_avx = std::arch::is_x86_feature_detected!("avx2"); let process_wide_row = |_y_plane: &[u16], _u_plane: &[u16], @@ -143,6 +149,30 @@ fn yuv_p16_to_image_alpha_ant< #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { unsafe { + let mut v_offset = ProcessedOffset { cx: 0, ux: 0 }; + if use_avx && BIT_DEPTH <= 12 { + let offset = avx_yuv_p16_to_rgba_alpha_row::< + DESTINATION_CHANNELS, + SAMPLING, + ENDIANNESS, + BYTES_POSITION, + BIT_DEPTH, + PRECISION, + >( + _y_plane, + _u_plane, + _v_plane, + _a_plane, + _rgba, + image.width, + &range, + &i_transform, + v_offset.cx, + v_offset.ux, + ); + v_offset = offset; + _cx = offset.cx; + } if _use_sse && BIT_DEPTH <= 12 { let offset = sse_yuv_p16_to_rgba_alpha_row::< DESTINATION_CHANNELS, @@ -160,8 +190,8 @@ fn yuv_p16_to_image_alpha_ant< image.width, &range, &i_transform, - 0, - 0, + v_offset.cx, + v_offset.ux, ); _cx = offset.cx; } diff --git a/src/yuv_p16_rgba_p16.rs b/src/yuv_p16_rgba_p16.rs index ba33368..cb21ca5 100644 --- a/src/yuv_p16_rgba_p16.rs +++ b/src/yuv_p16_rgba_p16.rs @@ -28,6 +28,7 @@ */ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] use crate::avx2::avx_yuv_p16_to_rgba_row; +#[allow(unused_imports)] use crate::internals::ProcessedOffset; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] use crate::neon::{neon_yuv_p16_to_rgba16_row, neon_yuv_p16_to_rgba16_row_rdm};
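
A review note, not part of the patch: the kernel's fixed-point arithmetic is easy to sanity-check in scalar code. The sketch below models one 16-bit lane of _mm256_mulhi_epi16 under the assumptions this patch appears to make, namely that the transform coefficients are quantized to 12 fractional bits (Q12); the y_coef value is a made-up BT.709-style constant for illustration, not one taken from the crate.

    // One lane of _mm256_mulhi_epi16: the high 16 bits of the signed
    // 32-bit product, i.e. (a * b) >> 16.
    fn mulhi_epi16(a: i16, b: i16) -> i16 {
        (((a as i32) * (b as i32)) >> 16) as i16
    }

    fn main() {
        let y: i16 = 512; // hypothetical bias-corrected 10-bit luma sample
        let y_coef: i16 = 4769; // hypothetical Q12 coefficient, ~1.164 * 4096

        // What the kernel computes: sample << 3, coefficient << 1, then
        // the >> 16 high-half multiply.
        let kernel_style = mulhi_epi16(y << 3, y_coef << 1);

        // Plain Q12 reference: (y * coef) >> 12.
        let reference = (((y as i32) * (y_coef as i32)) >> 12) as i16;

        // (y << 3) * (coef << 1) = y * coef * 16, and >> 16 cancels that
        // factor of 16, so both agree whenever the pre-shifts don't overflow.
        assert_eq!(kernel_style, reference);
        println!("y * coef >> 12 = {kernel_style}");
    }

This is also why the dispatcher in yuv_p16_rgba16_alpha.rs gates the new path behind BIT_DEPTH <= 12: a 12-bit sample shifted left by 3 still fits in a signed 16-bit lane, while deeper samples would overflow the pre-shift.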