From 24540fa17367af87a3365c8857df4279dd9d22fb Mon Sep 17 00:00:00 2001
From: Radzivon Bartoshyk
Date: Sat, 23 Nov 2024 11:55:32 +0000
Subject: [PATCH] AVX AYUV16 -> RGBA16

---
 src/avx2/mod.rs                     |   2 +
 src/avx2/yuv_p16_to_rgb16_alpha.rs  | 271 ++++++++++++++++++++++++++++
 src/avx512bw/ycgco_to_rgba_alpha.rs |   5 +-
 src/yuv_p16_rgba16_alpha.rs         |  34 +++-
 src/yuv_p16_rgba_p16.rs             |   1 +
 5 files changed, 310 insertions(+), 3 deletions(-)
 create mode 100644 src/avx2/yuv_p16_to_rgb16_alpha.rs

diff --git a/src/avx2/mod.rs b/src/avx2/mod.rs
index 633644c..71d132e 100644
--- a/src/avx2/mod.rs
+++ b/src/avx2/mod.rs
@@ -37,6 +37,7 @@ mod ycgco_to_rgb;
 mod ycgco_to_rgba_alpha;
 mod yuv_nv_to_rgba;
 mod yuv_p16_to_rgb16;
+mod yuv_p16_to_rgb16_alpha;
 mod yuv_to_rgba;
 mod yuv_to_rgba_alpha;
 mod yuv_to_yuv2;
@@ -51,6 +52,7 @@ pub(crate) use ycgco_to_rgb::avx2_ycgco_to_rgb_row;
 pub(crate) use ycgco_to_rgba_alpha::avx2_ycgco_to_rgba_alpha;
 pub(crate) use yuv_nv_to_rgba::avx2_yuv_nv_to_rgba_row;
 pub(crate) use yuv_p16_to_rgb16::avx_yuv_p16_to_rgba_row;
+pub(crate) use yuv_p16_to_rgb16_alpha::avx_yuv_p16_to_rgba_alpha_row;
 pub(crate) use yuv_to_rgba::avx2_yuv_to_rgba_row;
 pub(crate) use yuv_to_rgba_alpha::avx2_yuv_to_rgba_alpha;
 pub(crate) use yuv_to_yuv2::yuv_to_yuy2_avx2_row;
diff --git a/src/avx2/yuv_p16_to_rgb16_alpha.rs b/src/avx2/yuv_p16_to_rgb16_alpha.rs
new file mode 100644
index 0000000..9188615
--- /dev/null
+++ b/src/avx2/yuv_p16_to_rgb16_alpha.rs
@@ -0,0 +1,271 @@
+/*
+ * Copyright (c) Radzivon Bartoshyk, 10/2024. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+use crate::avx2::avx2_utils::{_mm256_interleave_rgba_epi16, shuffle};
+use crate::internals::ProcessedOffset;
+use crate::yuv_support::{
+    CbCrInverseTransform, YuvBytesPacking, YuvChromaRange, YuvChromaSubsampling, YuvEndianness,
+    YuvSourceChannels,
+};
+#[cfg(target_arch = "x86")]
+use std::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use std::arch::x86_64::*;
+
+pub(crate) unsafe fn avx_yuv_p16_to_rgba_alpha_row<
+    const DESTINATION_CHANNELS: u8,
+    const SAMPLING: u8,
+    const ENDIANNESS: u8,
+    const BYTES_POSITION: u8,
+    const BIT_DEPTH: usize,
+    const PRECISION: i32,
+>(
+    y_plane: &[u16],
+    u_plane: &[u16],
+    v_plane: &[u16],
+    a_plane: &[u16],
+    bgra: &mut [u16],
+    width: u32,
+    range: &YuvChromaRange,
+    transform: &CbCrInverseTransform<i32>,
+    start_cx: usize,
+    start_ux: usize,
+) -> ProcessedOffset {
+    unsafe {
+        avx_yuv_p16_to_rgba_row_alpha_impl::<
+            DESTINATION_CHANNELS,
+            SAMPLING,
+            ENDIANNESS,
+            BYTES_POSITION,
+            BIT_DEPTH,
+            PRECISION,
+        >(
+            y_plane, u_plane, v_plane, a_plane, bgra, width, range, transform, start_cx, start_ux,
+        )
+    }
+}
+
+#[target_feature(enable = "avx2")]
+unsafe fn avx_yuv_p16_to_rgba_row_alpha_impl<
+    const DESTINATION_CHANNELS: u8,
+    const SAMPLING: u8,
+    const ENDIANNESS: u8,
+    const BYTES_POSITION: u8,
+    const BIT_DEPTH: usize,
+    const PRECISION: i32,
+>(
+    y_plane: &[u16],
+    u_plane: &[u16],
+    v_plane: &[u16],
+    a_plane: &[u16],
+    bgra: &mut [u16],
+    width: u32,
+    range: &YuvChromaRange,
+    transform: &CbCrInverseTransform<i32>,
+    start_cx: usize,
+    start_ux: usize,
+) -> ProcessedOffset {
+    let destination_channels: YuvSourceChannels = DESTINATION_CHANNELS.into();
+    assert!(
+        destination_channels == YuvSourceChannels::Rgba
+            || destination_channels == YuvSourceChannels::Bgra
+    );
+    let channels = destination_channels.get_channels_count();
+    let chroma_subsampling: YuvChromaSubsampling = SAMPLING.into();
+    let endianness: YuvEndianness = ENDIANNESS.into();
+    let bytes_position: YuvBytesPacking = BYTES_POSITION.into();
+    let cr_coef = transform.cr_coef;
+    let cb_coef = transform.cb_coef;
+    let y_coef = transform.y_coef;
+    let g_coef_1 = transform.g_coeff_1;
+    let g_coef_2 = transform.g_coeff_2;
+
+    let bias_y = range.bias_y as i32;
+    let bias_uv = range.bias_uv as i32;
+
+    let dst_ptr = bgra;
+
+    let v_max_colors = _mm256_set1_epi16((1i16 << BIT_DEPTH as i16) - 1);
+
+    let y_corr = _mm256_set1_epi16(bias_y as i16);
+    let uv_corr = _mm256_set1_epi16(bias_uv as i16);
+    let v_luma_coeff = _mm256_set1_epi16((y_coef as i16) << 1);
+    let v_cr_coeff = _mm256_set1_epi16((cr_coef as i16) << 1);
+    let v_cb_coeff = _mm256_set1_epi16((cb_coef as i16) << 1);
+    let zeros = _mm256_setzero_si256();
+    let v_g_coeff_1 = _mm256_set1_epi16(-((g_coef_1 as i16) << 1));
+    let v_g_coeff_2 = _mm256_set1_epi16(-((g_coef_2 as i16) << 1));
+
+    let mut cx = start_cx;
+    let mut ux = start_ux;
+
+    let v_big_shift_count = _mm_set1_epi64x(16i64 - BIT_DEPTH as i64);
+
+    // Swaps the two bytes of every 16-bit lane (big-endian -> little-endian);
+    // byte order inside a lane is lane-local, so no cross-lane fixup is needed.
+    let big_endian_shuffle_flag = _mm256_setr_epi8(
+        1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10,
+        13, 12, 15, 14,
+    );
+    let big_endian_shuffle_flag_sse =
+        _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+
+    while cx + 16 < width as usize {
+        let dst_ptr = dst_ptr.get_unchecked_mut(cx * channels..);
+
+        let mut y_vl = _mm256_loadu_si256(y_plane.get_unchecked(cx..).as_ptr() as *const __m256i);
+        if endianness == YuvEndianness::BigEndian {
+            y_vl = _mm256_shuffle_epi8(y_vl, big_endian_shuffle_flag);
+        }
+        if bytes_position == YuvBytesPacking::MostSignificantBytes {
+            y_vl = _mm256_srl_epi16(y_vl, v_big_shift_count);
+        }
+        let mut y_values = _mm256_sub_epi16(y_vl, y_corr);
+
+        let mut u_values;
+        let mut v_values;
+
+        match chroma_subsampling {
+            YuvChromaSubsampling::Yuv420 | YuvChromaSubsampling::Yuv422 => {
+                let mut u_vals =
+                    _mm_loadu_si128(u_plane.get_unchecked(ux..).as_ptr() as *const __m128i);
+                let mut v_vals =
+                    _mm_loadu_si128(v_plane.get_unchecked(ux..).as_ptr() as *const __m128i);
+
+                if endianness == YuvEndianness::BigEndian {
+                    u_vals = _mm_shuffle_epi8(u_vals, big_endian_shuffle_flag_sse);
+                    v_vals = _mm_shuffle_epi8(v_vals, big_endian_shuffle_flag_sse);
+                }
+                if bytes_position == YuvBytesPacking::MostSignificantBytes {
+                    u_vals = _mm_srl_epi16(u_vals, v_big_shift_count);
+                    v_vals = _mm_srl_epi16(v_vals, v_big_shift_count);
+                }
+
+                let u_expanded = _mm256_set_m128i(u_vals, u_vals); // [A7, A6, ..., A0 | A7, A6, ..., A0]
+                let v_expanded = _mm256_set_m128i(v_vals, v_vals); // [A7, A6, ..., A0 | A7, A6, ..., A0]
+                // Move A0..A3 into the low 128-bit lane and A4..A7 into the
+                // high lane first, then duplicate each chroma sample for the
+                // pair of luma samples it covers: [A0, A0, A1, A1, ..., A7, A7].
+                const MASK: i32 = shuffle(3, 1, 2, 0);
+                let u_perm = _mm256_permute4x64_epi64::<MASK>(u_expanded);
+                let v_perm = _mm256_permute4x64_epi64::<MASK>(v_expanded);
+                let u_vl = _mm256_unpacklo_epi16(u_perm, u_perm);
+                let v_vl = _mm256_unpacklo_epi16(v_perm, v_perm);
+
+                u_values = _mm256_sub_epi16(u_vl, uv_corr);
+                v_values = _mm256_sub_epi16(v_vl, uv_corr);
+            }
+            YuvChromaSubsampling::Yuv444 => {
+                let mut u_vals =
+                    _mm256_loadu_si256(u_plane.get_unchecked(ux..).as_ptr() as *const __m256i);
+                let mut v_vals =
+                    _mm256_loadu_si256(v_plane.get_unchecked(ux..).as_ptr() as *const __m256i);
+
+                if endianness == YuvEndianness::BigEndian {
+                    u_vals = _mm256_shuffle_epi8(u_vals, big_endian_shuffle_flag);
+                    v_vals = _mm256_shuffle_epi8(v_vals, big_endian_shuffle_flag);
+                }
+                if bytes_position == YuvBytesPacking::MostSignificantBytes {
+                    u_vals = _mm256_srl_epi16(u_vals, v_big_shift_count);
+                    v_vals = _mm256_srl_epi16(v_vals, v_big_shift_count);
+                }
+                u_values = _mm256_sub_epi16(u_vals, uv_corr);
+                v_values = _mm256_sub_epi16(v_vals, uv_corr);
+            }
+        }
+
+        // Samples are pre-shifted left by 3 and the coefficients were
+        // pre-doubled, so _mm256_mulhi_epi16 (a >> 16 high-half multiply)
+        // evaluates value * coef >> 12.
+        u_values = _mm256_slli_epi16::<3>(u_values);
+        v_values = _mm256_slli_epi16::<3>(v_values);
+        y_values = _mm256_slli_epi16::<3>(y_values);
+
+        let y_vals = _mm256_mulhi_epi16(y_values, v_luma_coeff);
+
+        let r_vals = _mm256_add_epi16(y_vals, _mm256_mulhi_epi16(v_values, v_cr_coeff));
+        let b_vals = _mm256_add_epi16(y_vals, _mm256_mulhi_epi16(u_values, v_cb_coeff));
+        let g_vals = _mm256_add_epi16(
+            _mm256_add_epi16(y_vals, _mm256_mulhi_epi16(v_values, v_g_coeff_1)),
+            _mm256_mulhi_epi16(u_values, v_g_coeff_2),
+        );
+
+        let r_values = _mm256_min_epu16(_mm256_max_epi16(r_vals, zeros), v_max_colors);
+        let g_values = _mm256_min_epu16(_mm256_max_epi16(g_vals, zeros), v_max_colors);
+        let b_values = _mm256_min_epu16(_mm256_max_epi16(b_vals, zeros), v_max_colors);
+
+        let a_values = _mm256_loadu_si256(a_plane.get_unchecked(cx..).as_ptr() as *const __m256i);
+
+        match destination_channels {
+            YuvSourceChannels::Rgb | YuvSourceChannels::Bgr => {
+                unreachable!("This method can't be called with Rgb/Bgr");
+            }
+            YuvSourceChannels::Rgba => {
+                let dst_pack = _mm256_interleave_rgba_epi16(r_values, g_values, b_values, a_values);
+                _mm256_storeu_si256(dst_ptr.as_mut_ptr() as *mut __m256i, dst_pack.0);
+                _mm256_storeu_si256(
+                    dst_ptr.get_unchecked_mut(16..).as_mut_ptr() as *mut __m256i,
+                    dst_pack.1,
+                );
+                _mm256_storeu_si256(
+                    dst_ptr.get_unchecked_mut(32..).as_mut_ptr() as *mut __m256i,
+                    dst_pack.2,
+                );
+                _mm256_storeu_si256(
+                    dst_ptr.get_unchecked_mut(48..).as_mut_ptr() as *mut __m256i,
+                    dst_pack.3,
+                );
+            }
+            YuvSourceChannels::Bgra => {
+                let dst_pack = _mm256_interleave_rgba_epi16(b_values, g_values, r_values, a_values);
+                _mm256_storeu_si256(dst_ptr.as_mut_ptr() as *mut __m256i, dst_pack.0);
+                _mm256_storeu_si256(
+                    dst_ptr.get_unchecked_mut(16..).as_mut_ptr() as *mut __m256i,
+                    dst_pack.1,
+                );
+                _mm256_storeu_si256(
+                    dst_ptr.get_unchecked_mut(32..).as_mut_ptr() as *mut __m256i,
+                    dst_pack.2,
+                );
+                _mm256_storeu_si256(
+                    dst_ptr.get_unchecked_mut(48..).as_mut_ptr() as *mut __m256i,
+                    dst_pack.3,
+                );
+            }
+        }
+
+        cx += 16;
+        match chroma_subsampling {
+            YuvChromaSubsampling::Yuv420 | YuvChromaSubsampling::Yuv422 => {
+                ux += 8;
+            }
+            YuvChromaSubsampling::Yuv444 => {
+                ux += 16;
+            }
+        }
+    }
+
+    ProcessedOffset { cx, ux }
+}
diff --git a/src/avx512bw/ycgco_to_rgba_alpha.rs b/src/avx512bw/ycgco_to_rgba_alpha.rs
index 6c3a9d2..b035805 100644
--- a/src/avx512bw/ycgco_to_rgba_alpha.rs
+++ b/src/avx512bw/ycgco_to_rgba_alpha.rs
@@ -36,7 +36,10 @@ use std::arch::x86::*;
 use std::arch::x86_64::*;
 
 #[target_feature(enable = "avx512bw")]
-pub(crate) unsafe fn avx512_ycgco_to_rgba_alpha<const DESTINATION_CHANNELS: u8, const SAMPLING: u8>(
+pub(crate) unsafe fn avx512_ycgco_to_rgba_alpha<
+    const DESTINATION_CHANNELS: u8,
+    const SAMPLING: u8,
+>(
     range: &YuvChromaRange,
     y_plane: &[u8],
     cg_plane: &[u8],
diff --git a/src/yuv_p16_rgba16_alpha.rs b/src/yuv_p16_rgba16_alpha.rs
index 4047084..1fb0b2f 100644
--- a/src/yuv_p16_rgba16_alpha.rs
+++ b/src/yuv_p16_rgba16_alpha.rs
@@ -26,6 +26,10 @@
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +use crate::avx2::avx_yuv_p16_to_rgba_alpha_row; +#[allow(unused_imports)] +use crate::internals::ProcessedOffset; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] use crate::neon::{neon_yuv_p16_to_rgba16_alpha_row, neon_yuv_p16_to_rgba16_alpha_row_rdm}; use crate::numerics::{qrshr, to_ne}; @@ -115,6 +119,8 @@ fn yuv_p16_to_image_alpha_ant< }; #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] let _use_sse = std::arch::is_x86_feature_detected!("sse4.1"); + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + let use_avx = std::arch::is_x86_feature_detected!("avx2"); let process_wide_row = |_y_plane: &[u16], _u_plane: &[u16], @@ -143,6 +149,30 @@ fn yuv_p16_to_image_alpha_ant< #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { unsafe { + let mut v_offset = ProcessedOffset { cx: 0, ux: 0 }; + if use_avx && BIT_DEPTH <= 12 { + let offset = avx_yuv_p16_to_rgba_alpha_row::< + DESTINATION_CHANNELS, + SAMPLING, + ENDIANNESS, + BYTES_POSITION, + BIT_DEPTH, + PRECISION, + >( + _y_plane, + _u_plane, + _v_plane, + _a_plane, + _rgba, + image.width, + &range, + &i_transform, + v_offset.cx, + v_offset.ux, + ); + v_offset = offset; + _cx = offset.cx; + } if _use_sse && BIT_DEPTH <= 12 { let offset = sse_yuv_p16_to_rgba_alpha_row::< DESTINATION_CHANNELS, @@ -160,8 +190,8 @@ fn yuv_p16_to_image_alpha_ant< image.width, &range, &i_transform, - 0, - 0, + v_offset.cx, + v_offset.ux, ); _cx = offset.cx; } diff --git a/src/yuv_p16_rgba_p16.rs b/src/yuv_p16_rgba_p16.rs index ba33368..cb21ca5 100644 --- a/src/yuv_p16_rgba_p16.rs +++ b/src/yuv_p16_rgba_p16.rs @@ -28,6 +28,7 @@ */ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] use crate::avx2::avx_yuv_p16_to_rgba_row; +#[allow(unused_imports)] use crate::internals::ProcessedOffset; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] use crate::neon::{neon_yuv_p16_to_rgba16_row, neon_yuv_p16_to_rgba16_row_rdm};
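
A review note, not part of the patch: the kernel's fixed-point arithmetic is easy to sanity-check in scalar code. The sketch below models one 16-bit lane of _mm256_mulhi_epi16 under the assumptions this patch appears to make, namely that the transform coefficients are quantized to 12 fractional bits (Q12); the y_coef value is a made-up BT.709-style constant for illustration, not one taken from the crate.

    // One lane of _mm256_mulhi_epi16: the high 16 bits of the signed
    // 32-bit product, i.e. (a * b) >> 16.
    fn mulhi_epi16(a: i16, b: i16) -> i16 {
        (((a as i32) * (b as i32)) >> 16) as i16
    }

    fn main() {
        let y: i16 = 512; // hypothetical bias-corrected 10-bit luma sample
        let y_coef: i16 = 4769; // hypothetical Q12 coefficient, ~1.164 * 4096

        // What the kernel computes: sample << 3, coefficient << 1, then
        // the >> 16 high-half multiply.
        let kernel_style = mulhi_epi16(y << 3, y_coef << 1);

        // Plain Q12 reference: (y * coef) >> 12.
        let reference = (((y as i32) * (y_coef as i32)) >> 12) as i16;

        // (y << 3) * (coef << 1) = y * coef * 16, and >> 16 cancels that
        // factor of 16, so both agree whenever the pre-shifts don't overflow.
        assert_eq!(kernel_style, reference);
        println!("y * coef >> 12 = {kernel_style}");
    }

This is also why the dispatcher in yuv_p16_rgba16_alpha.rs gates the new path behind BIT_DEPTH <= 12: a 12-bit sample shifted left by 3 still fits in a signed 16-bit lane, while deeper samples would overflow the pre-shift.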