From 8d360d92c780a43e72aa7e84a3daf9372cb34ff3 Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Mon, 25 Nov 2024 20:09:20 +0000 Subject: [PATCH] AVX, SSE encoding/decoding improvements --- README.md | 6 +- app/benches/yuv8/main.rs | 100 ++++++---- src/avx2/mod.rs | 2 + src/avx2/rgb_to_nv.rs | 51 ++--- src/avx2/rgb_to_y.rs | 14 +- src/avx2/rgba_to_yuv.rs | 65 +++---- src/avx2/rgba_to_yuv420.rs | 350 ++++++++++++++++++++++++++++++++++ src/avx2/yuv_nv_to_rgba.rs | 49 ++--- src/avx2/yuv_nv_to_rgba420.rs | 34 ++-- src/avx2/yuv_to_rgba.rs | 22 +-- src/avx2/yuv_to_rgba420.rs | 34 ++-- src/avx2/yuv_to_rgba_alpha.rs | 47 ++--- src/rgba_to_yuv.rs | 44 ++++- src/sse/mod.rs | 2 + src/sse/rgb_to_nv.rs | 51 +++-- src/sse/rgb_to_y.rs | 13 +- src/sse/rgba_to_yuv.rs | 66 ++++--- src/sse/rgba_to_yuv420.rs | 319 +++++++++++++++++++++++++++++++ src/sse/yuv_nv_to_rgba.rs | 71 +++---- src/sse/yuv_nv_to_rgba420.rs | 50 ++--- src/sse/yuv_to_rgba.rs | 41 ++-- src/sse/yuv_to_rgba420.rs | 50 ++--- src/sse/yuv_to_rgba_alpha.rs | 49 ++--- 23 files changed, 1137 insertions(+), 393 deletions(-) create mode 100644 src/avx2/rgba_to_yuv420.rs create mode 100644 src/sse/rgba_to_yuv420.rs diff --git a/README.md b/README.md index 3644016..72c10f5 100644 --- a/README.md +++ b/README.md @@ -76,9 +76,9 @@ Tests performed on the image 5763x3842 | | time(NEON) | Time(AVX) | |------------------------|:----------:|:---------:| -| utils RGB->YUV 4:2:0 | 3.48ms | 6.14ms | +| utils RGB->YUV 4:2:0 | 3.48ms | 3.64ms | | libyuv RGB->YUV 4:2:0 | 3.58ms | 33.87ms | -| utils RGBA->YUV 4:2:0 | 4.32ms | 7.34ms | +| utils RGBA->YUV 4:2:0 | 4.32ms | 5.74ms | | libyuv RGBA->YUV 4:2:0 | 4.87ms | 23.48ms | | utils RGBA->YUV 4:2:2 | 4.83ms | 7.08ms | | libyuv RGBA->YUV 4:2:2 | 5.90ms | 35.23ms | @@ -90,7 +90,7 @@ Tests performed on the image 5763x3842 |------------------------|:----------:|:---------:| | utils YUV NV12->RGB | 3.86ms | 6.48ms | | libyuv YUV NV12->RGB | 5.20ms | 45.28ms | -| utils YUV 4:2:0->RGB | 3.28ms | 5.44ms | +| utils YUV 4:2:0->RGB | 3.28ms | 5.34ms | | libyuv YUV 4:2:0->RGB | 5.70ms | 44.95ms | | utils YUV 4:2:0->RGBA | 3.82ms | 5.98ms | | libyuv YUV 4:2:0->RGBA | 6.13ms | 6.88ms | diff --git a/app/benches/yuv8/main.rs b/app/benches/yuv8/main.rs index 4408cf1..c510271 100644 --- a/app/benches/yuv8/main.rs +++ b/app/benches/yuv8/main.rs @@ -26,7 +26,7 @@ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ - +use std::alloc::Layout; use criterion::{criterion_group, criterion_main, Criterion}; use image::{GenericImageView, ImageReader}; use yuv_sys::{ @@ -99,25 +99,36 @@ pub fn criterion_benchmark(c: &mut Criterion) { }); c.bench_function("libyuv RGB -> YUV 4:2:0", |b| { - let mut test_planar = YuvPlanarImageMut::::alloc( - dimensions.0, - dimensions.1, - YuvChromaSubsampling::Yuv420, - ); - b.iter(|| unsafe { - rs_RGB24ToI420( - src_bytes.as_ptr(), - stride as i32, - test_planar.y_plane.borrow_mut().as_mut_ptr(), - test_planar.y_stride as i32, - test_planar.u_plane.borrow_mut().as_mut_ptr(), - test_planar.u_stride as i32, - test_planar.v_plane.borrow_mut().as_mut_ptr(), - test_planar.v_stride as i32, - test_planar.width as i32, - test_planar.height as i32, - ); - }) + unsafe { + let layout_rgb = Layout::from_size_align(dimensions.0 as usize * dimensions.1 as usize * 3, 16).unwrap(); + let layout_y = Layout::from_size_align(dimensions.0 as usize * dimensions.1 as usize, 16).unwrap(); + let layout_uv = Layout::from_size_align((dimensions.0 as usize + 1) / 2 * (dimensions.1 as usize + 1) / 2, 16).unwrap(); + let target_y = std::alloc::alloc(layout_y); + let target_u = std::alloc::alloc(layout_uv); + let target_v = std::alloc::alloc(layout_uv); + let source_rgb = std::alloc::alloc(layout_rgb); + for (x, src) in src_bytes.iter().enumerate() { + *source_rgb.add(x) = *src; + } + b.iter(|| { + rs_RGB24ToI420( + source_rgb, + stride as i32, + target_y, + dimensions.0 as i32, + target_u, + (dimensions.0 as i32 + 1) / 2, + target_v, + (dimensions.0 as i32 + 1) / 2, + dimensions.0 as i32, + dimensions.1 as i32, + ); + }); + std::alloc::dealloc(target_y, layout_y); + std::alloc::dealloc(target_u, layout_uv); + std::alloc::dealloc(target_v, layout_uv); + std::alloc::dealloc(source_rgb, layout_rgb); + } }); c.bench_function("yuvutils RGBA -> YUV 4:2:0", |b| { @@ -139,25 +150,36 @@ pub fn criterion_benchmark(c: &mut Criterion) { }); c.bench_function("libyuv RGBA -> YUV 4:2:0", |b| { - let mut test_planar = YuvPlanarImageMut::::alloc( - dimensions.0, - dimensions.1, - YuvChromaSubsampling::Yuv420, - ); - b.iter(|| unsafe { - rs_ABGRToI420( - rgba_image.as_ptr(), - dimensions.0 as i32 * 4i32, - test_planar.y_plane.borrow_mut().as_mut_ptr(), - test_planar.y_stride as i32, - test_planar.u_plane.borrow_mut().as_mut_ptr(), - test_planar.u_stride as i32, - test_planar.v_plane.borrow_mut().as_mut_ptr(), - test_planar.v_stride as i32, - test_planar.width as i32, - test_planar.height as i32, - ); - }) + unsafe { + let layout_rgba = Layout::from_size_align(dimensions.0 as usize * dimensions.1 as usize * 4, 16).unwrap(); + let layout_y = Layout::from_size_align(dimensions.0 as usize * dimensions.1 as usize, 16).unwrap(); + let layout_uv = Layout::from_size_align((dimensions.0 as usize + 1) / 2 * (dimensions.1 as usize + 1) / 2, 16).unwrap(); + let target_y = std::alloc::alloc(layout_y); + let target_u = std::alloc::alloc(layout_uv); + let target_v = std::alloc::alloc(layout_uv); + let source_rgb = std::alloc::alloc(layout_rgba); + for (x, src) in src_bytes.iter().enumerate() { + *source_rgb.add(x) = *src; + } + b.iter(|| { + rs_ABGRToI420( + source_rgb, + dimensions.0 as i32 * 4i32, + target_y, + dimensions.0 as i32, + target_u, + (dimensions.0 as i32 + 1) / 2, + target_v, + (dimensions.0 as i32 + 1) / 2, + dimensions.0 as i32, + dimensions.1 as i32, + ); + }); + std::alloc::dealloc(target_y, layout_y); + std::alloc::dealloc(target_u, layout_uv); + std::alloc::dealloc(target_v, layout_uv); + 
std::alloc::dealloc(source_rgb, layout_rgba); + } }); c.bench_function("yuvutils RGBA -> YUV 4:2:2", |b| { diff --git a/src/avx2/mod.rs b/src/avx2/mod.rs index 07bbd26..7f67999 100644 --- a/src/avx2/mod.rs +++ b/src/avx2/mod.rs @@ -45,6 +45,7 @@ mod yuv_to_rgba_alpha; mod yuv_to_yuv2; mod yuy2_to_rgb; mod yuy2_to_yuv; +mod rgba_to_yuv420; pub(crate) use rgb_to_nv::avx2_rgba_to_nv; pub(crate) use rgb_to_y::avx2_rgb_to_y_row; @@ -62,3 +63,4 @@ pub(crate) use yuv_to_rgba_alpha::avx2_yuv_to_rgba_alpha; pub(crate) use yuv_to_yuv2::yuv_to_yuy2_avx2_row; pub(crate) use yuy2_to_rgb::yuy2_to_rgb_avx; pub(crate) use yuy2_to_yuv::yuy2_to_yuv_avx; +pub(crate) use rgba_to_yuv420::avx2_rgba_to_yuv420; \ No newline at end of file diff --git a/src/avx2/rgb_to_nv.rs b/src/avx2/rgb_to_nv.rs index 051d135..efb9add 100644 --- a/src/avx2/rgb_to_nv.rs +++ b/src/avx2/rgb_to_nv.rs @@ -94,7 +94,8 @@ unsafe fn avx2_rgba_to_nv_impl< let mut uv_x = start_ux; const V_SHR: i32 = 3; - const V_SCALE: i32 = 7; + const V_SCALE: i32 = 6; + let rounding_const_bias: i16 = 1 << (V_SHR - 1); let bias_y = range.bias_y as i16 * (1 << V_SHR) + rounding_const_bias; let bias_uv = range.bias_uv as i16 * (1 << V_SHR) + rounding_const_bias; @@ -180,10 +181,10 @@ unsafe fn avx2_rgba_to_nv_impl< y_bias, _mm256_add_epi16( _mm256_add_epi16( - _mm256_mulhi_epi16(r_low, v_yr), - _mm256_mulhi_epi16(g_low, v_yg), + _mm256_mulhrs_epi16(r_low, v_yr), + _mm256_mulhrs_epi16(g_low, v_yg), ), - _mm256_mulhi_epi16(b_low, v_yb), + _mm256_mulhrs_epi16(b_low, v_yb), ), )), i_cap_y, @@ -197,10 +198,10 @@ unsafe fn avx2_rgba_to_nv_impl< y_bias, _mm256_add_epi16( _mm256_add_epi16( - _mm256_mulhi_epi16(r_high, v_yr), - _mm256_mulhi_epi16(g_high, v_yg), + _mm256_mulhrs_epi16(r_high, v_yr), + _mm256_mulhrs_epi16(g_high, v_yg), ), - _mm256_mulhi_epi16(b_high, v_yb), + _mm256_mulhrs_epi16(b_high, v_yb), ), )), i_cap_y, @@ -218,10 +219,10 @@ unsafe fn avx2_rgba_to_nv_impl< uv_bias, _mm256_add_epi16( _mm256_add_epi16( - _mm256_mulhi_epi16(r_low, v_cb_r), - _mm256_mulhi_epi16(g_low, v_cb_g), + _mm256_mulhrs_epi16(r_low, v_cb_r), + _mm256_mulhrs_epi16(g_low, v_cb_g), ), - _mm256_mulhi_epi16(b_low, v_cb_b), + _mm256_mulhrs_epi16(b_low, v_cb_b), ), )), i_cap_uv, @@ -234,10 +235,10 @@ unsafe fn avx2_rgba_to_nv_impl< uv_bias, _mm256_add_epi16( _mm256_add_epi16( - _mm256_mulhi_epi16(r_low, v_cr_r), - _mm256_mulhi_epi16(g_low, v_cr_g), + _mm256_mulhrs_epi16(r_low, v_cr_r), + _mm256_mulhrs_epi16(g_low, v_cr_g), ), - _mm256_mulhi_epi16(b_low, v_cr_b), + _mm256_mulhrs_epi16(b_low, v_cr_b), ), )), i_cap_uv, @@ -250,10 +251,10 @@ unsafe fn avx2_rgba_to_nv_impl< uv_bias, _mm256_add_epi16( _mm256_add_epi16( - _mm256_mulhi_epi16(r_high, v_cb_r), - _mm256_mulhi_epi16(g_high, v_cb_g), + _mm256_mulhrs_epi16(r_high, v_cb_r), + _mm256_mulhrs_epi16(g_high, v_cb_g), ), - _mm256_mulhi_epi16(b_high, v_cb_b), + _mm256_mulhrs_epi16(b_high, v_cb_b), ), )), i_cap_uv, @@ -266,10 +267,10 @@ unsafe fn avx2_rgba_to_nv_impl< uv_bias, _mm256_add_epi16( _mm256_add_epi16( - _mm256_mulhi_epi16(r_high, v_cr_r), - _mm256_mulhi_epi16(g_high, v_cr_g), + _mm256_mulhrs_epi16(r_high, v_cr_r), + _mm256_mulhrs_epi16(g_high, v_cr_g), ), - _mm256_mulhi_epi16(b_high, v_cr_b), + _mm256_mulhrs_epi16(b_high, v_cr_b), ), )), i_cap_uv, @@ -300,10 +301,10 @@ unsafe fn avx2_rgba_to_nv_impl< uv_bias, _mm256_add_epi16( _mm256_add_epi16( - _mm256_mulhi_epi16(r1, v_cb_r), - _mm256_mulhi_epi16(g1, v_cb_g), + _mm256_mulhrs_epi16(r1, v_cb_r), + _mm256_mulhrs_epi16(g1, v_cb_g), ), - _mm256_mulhi_epi16(b1, v_cb_b), + 
_mm256_mulhrs_epi16(b1, v_cb_b), ), )), i_cap_uv, @@ -316,10 +317,10 @@ unsafe fn avx2_rgba_to_nv_impl< uv_bias, _mm256_add_epi16( _mm256_add_epi16( - _mm256_mulhi_epi16(r1, v_cr_r), - _mm256_mulhi_epi16(g1, v_cr_g), + _mm256_mulhrs_epi16(r1, v_cr_r), + _mm256_mulhrs_epi16(g1, v_cr_g), ), - _mm256_mulhi_epi16(b1, v_cr_b), + _mm256_mulhrs_epi16(b1, v_cr_b), ), )), i_cap_uv, diff --git a/src/avx2/rgb_to_y.rs b/src/avx2/rgb_to_y.rs index 4091873..29cd446 100644 --- a/src/avx2/rgb_to_y.rs +++ b/src/avx2/rgb_to_y.rs @@ -67,7 +67,7 @@ pub(crate) unsafe fn avx2_rgb_to_y_row_impl( let mut cx = start_cx; const V_SHR: i32 = 3; - const V_SCALE: i32 = 7; + const V_SCALE: i32 = 6; let rounding_const_bias: i16 = 1 << (V_SHR - 1); let bias_y = range.bias_y as i16 * (1 << V_SHR) + rounding_const_bias; @@ -144,10 +144,10 @@ pub(crate) unsafe fn avx2_rgb_to_y_row_impl( y_bias, _mm256_add_epi16( _mm256_add_epi16( - _mm256_mulhi_epi16(r_low, v_yr), - _mm256_mulhi_epi16(g_low, v_yg), + _mm256_mulhrs_epi16(r_low, v_yr), + _mm256_mulhrs_epi16(g_low, v_yg), ), - _mm256_mulhi_epi16(b_low, v_yb), + _mm256_mulhrs_epi16(b_low, v_yb), ), )), i_cap_y, @@ -161,10 +161,10 @@ pub(crate) unsafe fn avx2_rgb_to_y_row_impl( y_bias, _mm256_add_epi16( _mm256_add_epi16( - _mm256_mulhi_epi16(r_high, v_yr), - _mm256_mulhi_epi16(g_high, v_yg), + _mm256_mulhrs_epi16(r_high, v_yr), + _mm256_mulhrs_epi16(g_high, v_yg), ), - _mm256_mulhi_epi16(b_high, v_yb), + _mm256_mulhrs_epi16(b_high, v_yb), ), )), i_cap_y, diff --git a/src/avx2/rgba_to_yuv.rs b/src/avx2/rgba_to_yuv.rs index 93f8b8c..25c46f4 100644 --- a/src/avx2/rgba_to_yuv.rs +++ b/src/avx2/rgba_to_yuv.rs @@ -49,20 +49,10 @@ pub(crate) fn avx2_rgba_to_yuv( start_cx: usize, start_ux: usize, width: usize, - compute_uv_row: bool, ) -> ProcessedOffset { unsafe { avx2_rgba_to_yuv_impl::( - transform, - range, - y_plane, - u_plane, - v_plane, - rgba, - start_cx, - start_ux, - width, - compute_uv_row, + transform, range, y_plane, u_plane, v_plane, rgba, start_cx, start_ux, width, ) } } @@ -78,7 +68,6 @@ unsafe fn avx2_rgba_to_yuv_impl( start_cx: usize, start_ux: usize, width: usize, - compute_uv_row: bool, ) -> ProcessedOffset { let chroma_subsampling: YuvChromaSubsampling = SAMPLING.into(); let source_channels: YuvSourceChannels = ORIGIN_CHANNELS.into(); @@ -93,7 +82,7 @@ unsafe fn avx2_rgba_to_yuv_impl( let mut uv_x = start_ux; const V_SHR: i32 = 3; - const V_SCALE: i32 = 7; + const V_SCALE: i32 = 6; let rounding_const_bias: i16 = 1 << (V_SHR - 1); let bias_y = range.bias_y as i16 * (1 << V_SHR) + rounding_const_bias; let bias_uv = range.bias_uv as i16 * (1 << V_SHR) + rounding_const_bias; @@ -179,10 +168,10 @@ unsafe fn avx2_rgba_to_yuv_impl( y_bias, _mm256_add_epi16( _mm256_add_epi16( - _mm256_mulhi_epi16(r_low, v_yr), - _mm256_mulhi_epi16(g_low, v_yg), + _mm256_mulhrs_epi16(r_low, v_yr), + _mm256_mulhrs_epi16(g_low, v_yg), ), - _mm256_mulhi_epi16(b_low, v_yb), + _mm256_mulhrs_epi16(b_low, v_yb), ), )), i_cap_y, @@ -196,10 +185,10 @@ unsafe fn avx2_rgba_to_yuv_impl( y_bias, _mm256_add_epi16( _mm256_add_epi16( - _mm256_mulhi_epi16(r_high, v_yr), - _mm256_mulhi_epi16(g_high, v_yg), + _mm256_mulhrs_epi16(r_high, v_yr), + _mm256_mulhrs_epi16(g_high, v_yg), ), - _mm256_mulhi_epi16(b_high, v_yb), + _mm256_mulhrs_epi16(b_high, v_yb), ), )), i_cap_y, @@ -217,10 +206,10 @@ unsafe fn avx2_rgba_to_yuv_impl( uv_bias, _mm256_add_epi16( _mm256_add_epi16( - _mm256_mulhi_epi16(r_low, v_cb_r), - _mm256_mulhi_epi16(g_low, v_cb_g), + _mm256_mulhrs_epi16(r_low, v_cb_r), + _mm256_mulhrs_epi16(g_low, 
v_cb_g), ), - _mm256_mulhi_epi16(b_low, v_cb_b), + _mm256_mulhrs_epi16(b_low, v_cb_b), ), )), i_cap_uv, @@ -233,10 +222,10 @@ unsafe fn avx2_rgba_to_yuv_impl( uv_bias, _mm256_add_epi16( _mm256_add_epi16( - _mm256_mulhi_epi16(r_low, v_cr_r), - _mm256_mulhi_epi16(g_low, v_cr_g), + _mm256_mulhrs_epi16(r_low, v_cr_r), + _mm256_mulhrs_epi16(g_low, v_cr_g), ), - _mm256_mulhi_epi16(b_low, v_cr_b), + _mm256_mulhrs_epi16(b_low, v_cr_b), ), )), i_cap_uv, @@ -249,10 +238,10 @@ unsafe fn avx2_rgba_to_yuv_impl( uv_bias, _mm256_add_epi16( _mm256_add_epi16( - _mm256_mulhi_epi16(r_high, v_cb_r), - _mm256_mulhi_epi16(g_high, v_cb_g), + _mm256_mulhrs_epi16(r_high, v_cb_r), + _mm256_mulhrs_epi16(g_high, v_cb_g), ), - _mm256_mulhi_epi16(b_high, v_cb_b), + _mm256_mulhrs_epi16(b_high, v_cb_b), ), )), i_cap_uv, @@ -265,10 +254,10 @@ unsafe fn avx2_rgba_to_yuv_impl( uv_bias, _mm256_add_epi16( _mm256_add_epi16( - _mm256_mulhi_epi16(r_high, v_cr_r), - _mm256_mulhi_epi16(g_high, v_cr_g), + _mm256_mulhrs_epi16(r_high, v_cr_r), + _mm256_mulhrs_epi16(g_high, v_cr_g), ), - _mm256_mulhi_epi16(b_high, v_cr_b), + _mm256_mulhrs_epi16(b_high, v_cr_b), ), )), i_cap_uv, @@ -283,7 +272,7 @@ unsafe fn avx2_rgba_to_yuv_impl( _mm256_storeu_si256(v_ptr.add(uv_x) as *mut __m256i, cr); uv_x += 32; } else if chroma_subsampling == YuvChromaSubsampling::Yuv422 - || (chroma_subsampling == YuvChromaSubsampling::Yuv420 && compute_uv_row) + || (chroma_subsampling == YuvChromaSubsampling::Yuv420) { let r1 = _mm256_avg_epu16(r_low, r_high); let g1 = _mm256_avg_epu16(g_low, g_high); @@ -294,10 +283,10 @@ unsafe fn avx2_rgba_to_yuv_impl( uv_bias, _mm256_add_epi16( _mm256_add_epi16( - _mm256_mulhi_epi16(r1, v_cb_r), - _mm256_mulhi_epi16(g1, v_cb_g), + _mm256_mulhrs_epi16(r1, v_cb_r), + _mm256_mulhrs_epi16(g1, v_cb_g), ), - _mm256_mulhi_epi16(b1, v_cb_b), + _mm256_mulhrs_epi16(b1, v_cb_b), ), )), i_cap_uv, @@ -310,10 +299,10 @@ unsafe fn avx2_rgba_to_yuv_impl( uv_bias, _mm256_add_epi16( _mm256_add_epi16( - _mm256_mulhi_epi16(r1, v_cr_r), - _mm256_mulhi_epi16(g1, v_cr_g), + _mm256_mulhrs_epi16(r1, v_cr_r), + _mm256_mulhrs_epi16(g1, v_cr_g), ), - _mm256_mulhi_epi16(b1, v_cr_b), + _mm256_mulhrs_epi16(b1, v_cr_b), ), )), i_cap_uv, diff --git a/src/avx2/rgba_to_yuv420.rs b/src/avx2/rgba_to_yuv420.rs new file mode 100644 index 0000000..f03618d --- /dev/null +++ b/src/avx2/rgba_to_yuv420.rs @@ -0,0 +1,350 @@ +/* + * Copyright (c) Radzivon Bartoshyk, 10/2024. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +use crate::avx2::avx2_utils::{ + _mm256_deinterleave_rgba_epi8, avx2_deinterleave_rgb, avx2_pack_u16, +}; +use crate::internals::ProcessedOffset; +use crate::yuv_support::{ + CbCrForwardTransform, YuvChromaRange, YuvSourceChannels, +}; +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +pub(crate) fn avx2_rgba_to_yuv420( + transform: &CbCrForwardTransform, + range: &YuvChromaRange, + y_plane0: &mut [u8], + y_plane1: &mut [u8], + u_plane: &mut [u8], + v_plane: &mut [u8], + rgba0: &[u8], + rgba1: &[u8], + start_cx: usize, + start_ux: usize, + width: usize, +) -> ProcessedOffset { + unsafe { + avx2_rgba_to_yuv_impl420::( + transform, range, y_plane0, y_plane1, u_plane, v_plane, rgba0, rgba1, start_cx, + start_ux, width, + ) + } +} + +#[target_feature(enable = "avx2")] +unsafe fn avx2_rgba_to_yuv_impl420( + transform: &CbCrForwardTransform, + range: &YuvChromaRange, + y_plane0: &mut [u8], + y_plane1: &mut [u8], + u_plane: &mut [u8], + v_plane: &mut [u8], + rgba0: &[u8], + rgba1: &[u8], + start_cx: usize, + start_ux: usize, + width: usize, +) -> ProcessedOffset { + let source_channels: YuvSourceChannels = ORIGIN_CHANNELS.into(); + let channels = source_channels.get_channels_count(); + + let u_ptr = u_plane.as_mut_ptr(); + let v_ptr = v_plane.as_mut_ptr(); + + let mut cx = start_cx; + let mut uv_x = start_ux; + + const V_SHR: i32 = 3; + const V_SCALE: i32 = 6; + let rounding_const_bias: i16 = 1 << (V_SHR - 1); + let bias_y = range.bias_y as i16 * (1 << V_SHR) + rounding_const_bias; + let bias_uv = range.bias_uv as i16 * (1 << V_SHR) + rounding_const_bias; + + let i_bias_y = _mm256_set1_epi16(range.bias_y as i16); + let i_cap_y = _mm256_set1_epi16(range.range_y as i16 + range.bias_y as i16); + let i_cap_uv = _mm256_set1_epi16(range.bias_y as i16 + range.range_uv as i16); + + let y_bias = _mm256_set1_epi16(bias_y); + let uv_bias = _mm256_set1_epi16(bias_uv); + let v_yr = _mm256_set1_epi16(transform.yr as i16); + let v_yg = _mm256_set1_epi16(transform.yg as i16); + let v_yb = _mm256_set1_epi16(transform.yb as i16); + let v_cb_r = _mm256_set1_epi16(transform.cb_r as i16); + let v_cb_g = _mm256_set1_epi16(transform.cb_g as i16); + let v_cb_b = _mm256_set1_epi16(transform.cb_b as i16); + let v_cr_r = _mm256_set1_epi16(transform.cr_r as i16); + let v_cr_g = _mm256_set1_epi16(transform.cr_g as i16); + let v_cr_b = _mm256_set1_epi16(transform.cr_b as i16); + + while cx + 32 < width { + let (r_values0, g_values0, b_values0); + let (r_values1, g_values1, b_values1); + + let px = cx * channels; + + match source_channels { + YuvSourceChannels::Rgb | YuvSourceChannels::Bgr => { + let source_ptr0 = rgba0.get_unchecked(px..).as_ptr(); + let row_1 = _mm256_loadu_si256(source_ptr0 as *const __m256i); + let row_2 = _mm256_loadu_si256(source_ptr0.add(32) as *const __m256i); + let row_3 = _mm256_loadu_si256(source_ptr0.add(64) as *const __m256i); + + let (it1, it2, it3) = avx2_deinterleave_rgb(row_1, row_2, row_3); + 
if source_channels == YuvSourceChannels::Rgb { + r_values0 = it1; + g_values0 = it2; + b_values0 = it3; + } else { + r_values0 = it3; + g_values0 = it2; + b_values0 = it1; + } + + let source_ptr1 = rgba1.get_unchecked(px..).as_ptr(); + let row_11 = _mm256_loadu_si256(source_ptr1 as *const __m256i); + let row_21 = _mm256_loadu_si256(source_ptr1.add(32) as *const __m256i); + let row_31 = _mm256_loadu_si256(source_ptr1.add(64) as *const __m256i); + + let (it11, it21, it31) = avx2_deinterleave_rgb(row_11, row_21, row_31); + if source_channels == YuvSourceChannels::Rgb { + r_values1 = it11; + g_values1 = it21; + b_values1 = it31; + } else { + r_values1 = it31; + g_values1 = it21; + b_values1 = it11; + } + } + YuvSourceChannels::Rgba | YuvSourceChannels::Bgra => { + let source_ptr0 = rgba0.get_unchecked(px..).as_ptr(); + let row_1 = _mm256_loadu_si256(source_ptr0 as *const __m256i); + let row_2 = _mm256_loadu_si256(source_ptr0.add(32) as *const __m256i); + let row_3 = _mm256_loadu_si256(source_ptr0.add(64) as *const __m256i); + let row_4 = _mm256_loadu_si256(source_ptr0.add(96) as *const __m256i); + + let (it1, it2, it3, _) = _mm256_deinterleave_rgba_epi8(row_1, row_2, row_3, row_4); + if source_channels == YuvSourceChannels::Rgba { + r_values0 = it1; + g_values0 = it2; + b_values0 = it3; + } else { + r_values0 = it3; + g_values0 = it2; + b_values0 = it1; + } + + let source_ptr1 = rgba1.get_unchecked(px..).as_ptr(); + let row_11 = _mm256_loadu_si256(source_ptr1 as *const __m256i); + let row_21 = _mm256_loadu_si256(source_ptr1.add(32) as *const __m256i); + let row_31 = _mm256_loadu_si256(source_ptr1.add(64) as *const __m256i); + let row_41 = _mm256_loadu_si256(source_ptr1.add(96) as *const __m256i); + + let (it1, it2, it3, _) = _mm256_deinterleave_rgba_epi8(row_11, row_21, row_31, row_41); + if source_channels == YuvSourceChannels::Rgba { + r_values1 = it1; + g_values1 = it2; + b_values1 = it3; + } else { + r_values1 = it3; + g_values1 = it2; + b_values1 = it1; + } + } + } + + let r0_low = + _mm256_slli_epi16::<V_SCALE>(_mm256_cvtepu8_epi16(_mm256_castsi256_si128(r_values0))); + let r0_high = _mm256_slli_epi16::<V_SCALE>(_mm256_cvtepu8_epi16( + _mm256_extracti128_si256::<1>(r_values0), + )); + let g0_low = + _mm256_slli_epi16::<V_SCALE>(_mm256_cvtepu8_epi16(_mm256_castsi256_si128(g_values0))); + let g0_high = _mm256_slli_epi16::<V_SCALE>(_mm256_cvtepu8_epi16( + _mm256_extracti128_si256::<1>(g_values0), + )); + let b0_low = + _mm256_slli_epi16::<V_SCALE>(_mm256_cvtepu8_epi16(_mm256_castsi256_si128(b_values0))); + let b0_high = _mm256_slli_epi16::<V_SCALE>(_mm256_cvtepu8_epi16( + _mm256_extracti128_si256::<1>(b_values0), + )); + + let y0_l = _mm256_max_epi16( + _mm256_min_epi16( + _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16( + y_bias, + _mm256_add_epi16( + _mm256_add_epi16( + _mm256_mulhrs_epi16(r0_low, v_yr), + _mm256_mulhrs_epi16(g0_low, v_yg), + ), + _mm256_mulhrs_epi16(b0_low, v_yb), + ), + )), + i_cap_y, + ), + i_bias_y, + ); + + let y0_h = _mm256_max_epi16( + _mm256_min_epi16( + _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16( + y_bias, + _mm256_add_epi16( + _mm256_add_epi16( + _mm256_mulhrs_epi16(r0_high, v_yr), + _mm256_mulhrs_epi16(g0_high, v_yg), + ), + _mm256_mulhrs_epi16(b0_high, v_yb), + ), + )), + i_cap_y, + ), + i_bias_y, + ); + + let r1_low = + _mm256_slli_epi16::<V_SCALE>(_mm256_cvtepu8_epi16(_mm256_castsi256_si128(r_values1))); + let r1_high = _mm256_slli_epi16::<V_SCALE>(_mm256_cvtepu8_epi16( + _mm256_extracti128_si256::<1>(r_values1), + )); + let g1_low = + _mm256_slli_epi16::<V_SCALE>(_mm256_cvtepu8_epi16(_mm256_castsi256_si128(g_values1))); + let g1_high =
_mm256_slli_epi16::(_mm256_cvtepu8_epi16( + _mm256_extracti128_si256::<1>(g_values1), + )); + let b1_low = + _mm256_slli_epi16::(_mm256_cvtepu8_epi16(_mm256_castsi256_si128(b_values1))); + let b1_high = _mm256_slli_epi16::(_mm256_cvtepu8_epi16( + _mm256_extracti128_si256::<1>(b_values1), + )); + + let y1_l = _mm256_max_epi16( + _mm256_min_epi16( + _mm256_srai_epi16::(_mm256_add_epi16( + y_bias, + _mm256_add_epi16( + _mm256_add_epi16( + _mm256_mulhrs_epi16(r1_low, v_yr), + _mm256_mulhrs_epi16(g1_low, v_yg), + ), + _mm256_mulhrs_epi16(b1_low, v_yb), + ), + )), + i_cap_y, + ), + i_bias_y, + ); + + let y1_h = _mm256_max_epi16( + _mm256_min_epi16( + _mm256_srai_epi16::(_mm256_add_epi16( + y_bias, + _mm256_add_epi16( + _mm256_add_epi16( + _mm256_mulhrs_epi16(r1_high, v_yr), + _mm256_mulhrs_epi16(g1_high, v_yg), + ), + _mm256_mulhrs_epi16(b1_high, v_yb), + ), + )), + i_cap_y, + ), + i_bias_y, + ); + + let y0_yuv = avx2_pack_u16(y0_l, y0_h); + let y1_yuv = avx2_pack_u16(y1_l, y1_h); + + _mm256_storeu_si256( + y_plane0.get_unchecked_mut(cx..).as_mut_ptr() as *mut __m256i, + y0_yuv, + ); + _mm256_storeu_si256( + y_plane1.get_unchecked_mut(cx..).as_mut_ptr() as *mut __m256i, + y1_yuv, + ); + + let r_uv = _mm256_avg_epu16(r0_low, r0_high); + let g_uv = _mm256_avg_epu16(g0_low, g0_high); + let b_uv = _mm256_avg_epu16(b0_low, b0_high); + let cb = _mm256_max_epi16( + _mm256_min_epi16( + _mm256_srai_epi16::(_mm256_add_epi16( + uv_bias, + _mm256_add_epi16( + _mm256_add_epi16( + _mm256_mulhrs_epi16(r_uv, v_cb_r), + _mm256_mulhrs_epi16(g_uv, v_cb_g), + ), + _mm256_mulhrs_epi16(b_uv, v_cb_b), + ), + )), + i_cap_uv, + ), + i_bias_y, + ); + let cr = _mm256_max_epi16( + _mm256_min_epi16( + _mm256_srai_epi16::(_mm256_add_epi16( + uv_bias, + _mm256_add_epi16( + _mm256_add_epi16( + _mm256_mulhrs_epi16(r_uv, v_cr_r), + _mm256_mulhrs_epi16(g_uv, v_cr_g), + ), + _mm256_mulhrs_epi16(b_uv, v_cr_b), + ), + )), + i_cap_uv, + ), + i_bias_y, + ); + + let cb = avx2_pack_u16(cb, cb); + let cr = avx2_pack_u16(cr, cr); + + _mm_storeu_si128( + u_ptr.add(uv_x) as *mut _ as *mut __m128i, + _mm256_castsi256_si128(cb), + ); + _mm_storeu_si128( + v_ptr.add(uv_x) as *mut _ as *mut __m128i, + _mm256_castsi256_si128(cr), + ); + uv_x += 16; + + cx += 32; + } + + ProcessedOffset { cx, ux: uv_x } +} diff --git a/src/avx2/yuv_nv_to_rgba.rs b/src/avx2/yuv_nv_to_rgba.rs index 723ce3e..0274e53 100644 --- a/src/avx2/yuv_nv_to_rgba.rs +++ b/src/avx2/yuv_nv_to_rgba.rs @@ -84,6 +84,9 @@ unsafe fn avx2_yuv_nv_to_rgba_row_impl< let uv_ptr = uv_plane.as_ptr(); let rgba_ptr = rgba.as_mut_ptr(); + const SCALE: i32 = 6; + const V_SHR: i32 = 3; + let y_corr = _mm256_set1_epi8(range.bias_y as i8); let uv_corr = _mm256_set1_epi16(range.bias_uv as i16); let v_luma_coeff = _mm256_set1_epi16(transform.y_coef as i16); @@ -92,7 +95,7 @@ unsafe fn avx2_yuv_nv_to_rgba_row_impl< let v_g_coeff_1 = _mm256_set1_epi16(transform.g_coeff_1 as i16); let v_g_coeff_2 = _mm256_set1_epi16(transform.g_coeff_2 as i16); let v_alpha = _mm256_set1_epi8(255u8 as i8); - let rounding_const = _mm256_set1_epi16(1 << 2); + let rounding_const = _mm256_set1_epi16(1 << (V_SHR - 1)); while cx + 32 < width { let y_values = @@ -148,58 +151,58 @@ unsafe fn avx2_yuv_nv_to_rgba_row_impl< } let u_high = - _mm256_slli_epi16::<7>(_mm256_sub_epi16(_mm256_cvtepu8_epi16(u_high_u8), uv_corr)); + _mm256_slli_epi16::(_mm256_sub_epi16(_mm256_cvtepu8_epi16(u_high_u8), uv_corr)); let v_high = - _mm256_slli_epi16::<7>(_mm256_sub_epi16(_mm256_cvtepu8_epi16(v_high_u8), uv_corr)); - let y_high = 
_mm256_mulhi_epi16( - _mm256_slli_epi16::<7>(_mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>( + _mm256_slli_epi16::(_mm256_sub_epi16(_mm256_cvtepu8_epi16(v_high_u8), uv_corr)); + let y_high = _mm256_mulhrs_epi16( + _mm256_slli_epi16::(_mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>( y_values, ))), v_luma_coeff, ); - let r_high = _mm256_srli_epi16::<3>(_mm256_add_epi16( - _mm256_add_epi16(y_high, _mm256_mulhi_epi16(v_high, v_cr_coeff)), + let r_high = _mm256_srli_epi16::(_mm256_add_epi16( + _mm256_add_epi16(y_high, _mm256_mulhrs_epi16(v_high, v_cr_coeff)), rounding_const, )); - let b_high = _mm256_srli_epi16::<3>(_mm256_add_epi16( - _mm256_add_epi16(y_high, _mm256_mulhi_epi16(u_high, v_cb_coeff)), + let b_high = _mm256_srli_epi16::(_mm256_add_epi16( + _mm256_add_epi16(y_high, _mm256_mulhrs_epi16(u_high, v_cb_coeff)), rounding_const, )); - let g_high = _mm256_srli_epi16::<3>(_mm256_add_epi16( + let g_high = _mm256_srli_epi16::(_mm256_add_epi16( _mm256_sub_epi16( y_high, _mm256_add_epi16( - _mm256_mulhi_epi16(v_high, v_g_coeff_1), - _mm256_mulhi_epi16(u_high, v_g_coeff_2), + _mm256_mulhrs_epi16(v_high, v_g_coeff_1), + _mm256_mulhrs_epi16(u_high, v_g_coeff_2), ), ), rounding_const, )); let u_low = - _mm256_slli_epi16::<7>(_mm256_sub_epi16(_mm256_cvtepu8_epi16(u_low_u8), uv_corr)); + _mm256_slli_epi16::(_mm256_sub_epi16(_mm256_cvtepu8_epi16(u_low_u8), uv_corr)); let v_low = - _mm256_slli_epi16::<7>(_mm256_sub_epi16(_mm256_cvtepu8_epi16(v_low_u8), uv_corr)); - let y_low = _mm256_mulhi_epi16( - _mm256_slli_epi16::<7>(_mm256_cvtepu8_epi16(_mm256_castsi256_si128(y_values))), + _mm256_slli_epi16::(_mm256_sub_epi16(_mm256_cvtepu8_epi16(v_low_u8), uv_corr)); + let y_low = _mm256_mulhrs_epi16( + _mm256_slli_epi16::(_mm256_cvtepu8_epi16(_mm256_castsi256_si128(y_values))), v_luma_coeff, ); - let r_low = _mm256_srli_epi16::<3>(_mm256_add_epi16( - _mm256_add_epi16(y_low, _mm256_mulhi_epi16(v_low, v_cr_coeff)), + let r_low = _mm256_srli_epi16::(_mm256_add_epi16( + _mm256_add_epi16(y_low, _mm256_mulhrs_epi16(v_low, v_cr_coeff)), rounding_const, )); - let b_low = _mm256_srli_epi16::<3>(_mm256_add_epi16( - _mm256_add_epi16(y_low, _mm256_mulhi_epi16(u_low, v_cb_coeff)), + let b_low = _mm256_srli_epi16::(_mm256_add_epi16( + _mm256_add_epi16(y_low, _mm256_mulhrs_epi16(u_low, v_cb_coeff)), rounding_const, )); - let g_low = _mm256_srli_epi16::<3>(_mm256_add_epi16( + let g_low = _mm256_srli_epi16::(_mm256_add_epi16( _mm256_sub_epi16( y_low, _mm256_add_epi16( - _mm256_mulhi_epi16(v_low, v_g_coeff_1), - _mm256_mulhi_epi16(u_low, v_g_coeff_2), + _mm256_mulhrs_epi16(v_low, v_g_coeff_1), + _mm256_mulhrs_epi16(u_low, v_g_coeff_2), ), ), rounding_const, diff --git a/src/avx2/yuv_nv_to_rgba420.rs b/src/avx2/yuv_nv_to_rgba420.rs index 2b8c4f4..82637f8 100644 --- a/src/avx2/yuv_nv_to_rgba420.rs +++ b/src/avx2/yuv_nv_to_rgba420.rs @@ -75,7 +75,7 @@ unsafe fn avx2_yuv_nv_to_rgba_row_impl420(_mm256_sub_epi16(_mm256_cvtepu8_epi16(u_high_u8), uv_corr)); let v_high = _mm256_slli_epi16::(_mm256_sub_epi16(_mm256_cvtepu8_epi16(v_high_u8), uv_corr)); - let y_high0 = _mm256_mulhi_epi16( + let y_high0 = _mm256_mulhrs_epi16( _mm256_slli_epi16::(_mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>( y_values0, ))), v_luma_coeff, ); - let y_high1 = _mm256_mulhi_epi16( + let y_high1 = _mm256_mulhrs_epi16( _mm256_slli_epi16::(_mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>( y_values1, ))), @@ -138,16 +138,16 @@ unsafe fn avx2_yuv_nv_to_rgba_row_impl420(_mm256_add_epi16( - _mm256_add_epi16(y_high0, _mm256_mulhi_epi16(v_high, 
v_cr_coeff)), + _mm256_add_epi16(y_high0, _mm256_mulhrs_epi16(v_high, v_cr_coeff)), rounding_const, )); let b_high0 = _mm256_srli_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_high0, _mm256_mulhi_epi16(u_high, v_cb_coeff)), + _mm256_add_epi16(y_high0, _mm256_mulhrs_epi16(u_high, v_cb_coeff)), rounding_const, )); let g_high0 = _mm256_srli_epi16::(_mm256_add_epi16( @@ -155,11 +155,11 @@ unsafe fn avx2_yuv_nv_to_rgba_row_impl420(_mm256_add_epi16( - _mm256_add_epi16(y_high1, _mm256_mulhi_epi16(v_high, v_cr_coeff)), + _mm256_add_epi16(y_high1, _mm256_mulhrs_epi16(v_high, v_cr_coeff)), rounding_const, )); let b_high1 = _mm256_srli_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_high1, _mm256_mulhi_epi16(u_high, v_cb_coeff)), + _mm256_add_epi16(y_high1, _mm256_mulhrs_epi16(u_high, v_cb_coeff)), rounding_const, )); let g_high1 = _mm256_srli_epi16::(_mm256_add_epi16( @@ -171,26 +171,26 @@ unsafe fn avx2_yuv_nv_to_rgba_row_impl420(_mm256_sub_epi16(_mm256_cvtepu8_epi16(u_low_u8), uv_corr)); let v_low = _mm256_slli_epi16::(_mm256_sub_epi16(_mm256_cvtepu8_epi16(v_low_u8), uv_corr)); - let y_low0 = _mm256_mulhi_epi16( + let y_low0 = _mm256_mulhrs_epi16( _mm256_slli_epi16::(_mm256_cvtepu8_epi16(_mm256_castsi256_si128(y_values0))), v_luma_coeff, ); - let y_low1 = _mm256_mulhi_epi16( + let y_low1 = _mm256_mulhrs_epi16( _mm256_slli_epi16::(_mm256_cvtepu8_epi16(_mm256_castsi256_si128(y_values1))), v_luma_coeff, ); let g_coeff_lo = _mm256_add_epi16( - _mm256_mulhi_epi16(v_low, v_g_coeff_1), - _mm256_mulhi_epi16(u_low, v_g_coeff_2), + _mm256_mulhrs_epi16(v_low, v_g_coeff_1), + _mm256_mulhrs_epi16(u_low, v_g_coeff_2), ); let r_low0 = _mm256_srli_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_low0, _mm256_mulhi_epi16(v_low, v_cr_coeff)), + _mm256_add_epi16(y_low0, _mm256_mulhrs_epi16(v_low, v_cr_coeff)), rounding_const, )); let b_low0 = _mm256_srli_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_low0, _mm256_mulhi_epi16(u_low, v_cb_coeff)), + _mm256_add_epi16(y_low0, _mm256_mulhrs_epi16(u_low, v_cb_coeff)), rounding_const, )); let g_low0 = _mm256_srli_epi16::(_mm256_add_epi16( @@ -199,11 +199,11 @@ unsafe fn avx2_yuv_nv_to_rgba_row_impl420(_mm256_add_epi16( - _mm256_add_epi16(y_low1, _mm256_mulhi_epi16(v_low, v_cr_coeff)), + _mm256_add_epi16(y_low1, _mm256_mulhrs_epi16(v_low, v_cr_coeff)), rounding_const, )); let b_low1 = _mm256_srli_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_low1, _mm256_mulhi_epi16(u_low, v_cb_coeff)), + _mm256_add_epi16(y_low1, _mm256_mulhrs_epi16(u_low, v_cb_coeff)), rounding_const, )); let g_low1 = _mm256_srli_epi16::(_mm256_add_epi16( diff --git a/src/avx2/yuv_to_rgba.rs b/src/avx2/yuv_to_rgba.rs index 1937351..36711b3 100644 --- a/src/avx2/yuv_to_rgba.rs +++ b/src/avx2/yuv_to_rgba.rs @@ -87,7 +87,7 @@ unsafe fn avx2_yuv_to_rgba_row_impl(_mm256_sub_epi16(u_high_u16, uv_corr)); let v_high = _mm256_slli_epi16::(_mm256_sub_epi16(v_high_u16, uv_corr)); - let y_high = _mm256_mulhi_epi16( + let y_high = _mm256_mulhrs_epi16( _mm256_slli_epi16::(_mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>( y_values, ))), @@ -129,19 +129,19 @@ unsafe fn avx2_yuv_to_rgba_row_impl(_mm256_add_epi16( - _mm256_add_epi16(y_high, _mm256_mulhi_epi16(v_high, v_cr_coeff)), + _mm256_add_epi16(y_high, _mm256_mulhrs_epi16(v_high, v_cr_coeff)), rounding_const, )); let b_high = _mm256_srai_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_high, _mm256_mulhi_epi16(u_high, v_cb_coeff)), + _mm256_add_epi16(y_high, _mm256_mulhrs_epi16(u_high, v_cb_coeff)), rounding_const, )); let g_high = _mm256_srai_epi16::(_mm256_add_epi16( 
_mm256_sub_epi16( y_high, _mm256_add_epi16( - _mm256_mulhi_epi16(v_high, v_g_coeff_1), - _mm256_mulhi_epi16(u_high, v_g_coeff_2), + _mm256_mulhrs_epi16(v_high, v_g_coeff_1), + _mm256_mulhrs_epi16(u_high, v_g_coeff_2), ), ), rounding_const, @@ -149,25 +149,25 @@ unsafe fn avx2_yuv_to_rgba_row_impl(_mm256_sub_epi16(u_low_u16, uv_corr)); let v_low = _mm256_slli_epi16::(_mm256_sub_epi16(v_low_u16, uv_corr)); - let y_low = _mm256_mulhi_epi16( + let y_low = _mm256_mulhrs_epi16( _mm256_slli_epi16::(_mm256_cvtepu8_epi16(_mm256_castsi256_si128(y_values))), v_luma_coeff, ); let r_low = _mm256_srai_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_low, _mm256_mulhi_epi16(v_low, v_cr_coeff)), + _mm256_add_epi16(y_low, _mm256_mulhrs_epi16(v_low, v_cr_coeff)), rounding_const, )); let b_low = _mm256_srai_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_low, _mm256_mulhi_epi16(u_low, v_cb_coeff)), + _mm256_add_epi16(y_low, _mm256_mulhrs_epi16(u_low, v_cb_coeff)), rounding_const, )); let g_low = _mm256_srai_epi16::(_mm256_add_epi16( _mm256_sub_epi16( y_low, _mm256_add_epi16( - _mm256_mulhi_epi16(v_low, v_g_coeff_1), - _mm256_mulhi_epi16(u_low, v_g_coeff_2), + _mm256_mulhrs_epi16(v_low, v_g_coeff_1), + _mm256_mulhrs_epi16(u_low, v_g_coeff_2), ), ), rounding_const, diff --git a/src/avx2/yuv_to_rgba420.rs b/src/avx2/yuv_to_rgba420.rs index aa62cf5..f85b167 100644 --- a/src/avx2/yuv_to_rgba420.rs +++ b/src/avx2/yuv_to_rgba420.rs @@ -87,7 +87,7 @@ unsafe fn avx2_yuv_to_rgba_row_impl420( let v_g_coeff_2 = _mm256_set1_epi16(transform.g_coeff_2 as i16); let v_alpha = _mm256_set1_epi8(255u8 as i8); - const SCALE: i32 = 7; + const SCALE: i32 = 6; const V_SHR: i32 = 3; let rounding_const = _mm256_set1_epi16(1 << (V_SHR - 1)); @@ -112,13 +112,13 @@ unsafe fn avx2_yuv_to_rgba_row_impl420( let u_high = _mm256_slli_epi16::(_mm256_sub_epi16(u_high_u16, uv_corr)); let v_high = _mm256_slli_epi16::(_mm256_sub_epi16(v_high_u16, uv_corr)); - let y_high0 = _mm256_mulhi_epi16( + let y_high0 = _mm256_mulhrs_epi16( _mm256_slli_epi16::(_mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>( y_values0, ))), v_luma_coeff, ); - let y_high1 = _mm256_mulhi_epi16( + let y_high1 = _mm256_mulhrs_epi16( _mm256_slli_epi16::(_mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>( y_values1, ))), @@ -126,16 +126,16 @@ unsafe fn avx2_yuv_to_rgba_row_impl420( ); let g_coeff_hi = _mm256_add_epi16( - _mm256_mulhi_epi16(v_high, v_g_coeff_1), - _mm256_mulhi_epi16(u_high, v_g_coeff_2), + _mm256_mulhrs_epi16(v_high, v_g_coeff_1), + _mm256_mulhrs_epi16(u_high, v_g_coeff_2), ); let r_high0 = _mm256_srai_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_high0, _mm256_mulhi_epi16(v_high, v_cr_coeff)), + _mm256_add_epi16(y_high0, _mm256_mulhrs_epi16(v_high, v_cr_coeff)), rounding_const, )); let b_high0 = _mm256_srai_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_high0, _mm256_mulhi_epi16(u_high, v_cb_coeff)), + _mm256_add_epi16(y_high0, _mm256_mulhrs_epi16(u_high, v_cb_coeff)), rounding_const, )); let g_high0 = _mm256_srai_epi16::(_mm256_add_epi16( @@ -144,11 +144,11 @@ unsafe fn avx2_yuv_to_rgba_row_impl420( )); let r_high1 = _mm256_srai_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_high1, _mm256_mulhi_epi16(v_high, v_cr_coeff)), + _mm256_add_epi16(y_high1, _mm256_mulhrs_epi16(v_high, v_cr_coeff)), rounding_const, )); let b_high1 = _mm256_srai_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_high1, _mm256_mulhi_epi16(u_high, v_cb_coeff)), + _mm256_add_epi16(y_high1, _mm256_mulhrs_epi16(u_high, v_cb_coeff)), rounding_const, )); let g_high1 = 
_mm256_srai_epi16::(_mm256_add_epi16( @@ -158,26 +158,26 @@ unsafe fn avx2_yuv_to_rgba_row_impl420( let u_low = _mm256_slli_epi16::(_mm256_sub_epi16(u_low_u16, uv_corr)); let v_low = _mm256_slli_epi16::(_mm256_sub_epi16(v_low_u16, uv_corr)); - let y_low0 = _mm256_mulhi_epi16( + let y_low0 = _mm256_mulhrs_epi16( _mm256_slli_epi16::(_mm256_cvtepu8_epi16(_mm256_castsi256_si128(y_values0))), v_luma_coeff, ); - let y_low1 = _mm256_mulhi_epi16( + let y_low1 = _mm256_mulhrs_epi16( _mm256_slli_epi16::(_mm256_cvtepu8_epi16(_mm256_castsi256_si128(y_values1))), v_luma_coeff, ); let g_coeff_lo = _mm256_add_epi16( - _mm256_mulhi_epi16(v_low, v_g_coeff_1), - _mm256_mulhi_epi16(u_low, v_g_coeff_2), + _mm256_mulhrs_epi16(v_low, v_g_coeff_1), + _mm256_mulhrs_epi16(u_low, v_g_coeff_2), ); let r_low0 = _mm256_srai_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_low0, _mm256_mulhi_epi16(v_low, v_cr_coeff)), + _mm256_add_epi16(y_low0, _mm256_mulhrs_epi16(v_low, v_cr_coeff)), rounding_const, )); let b_low0 = _mm256_srai_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_low0, _mm256_mulhi_epi16(u_low, v_cb_coeff)), + _mm256_add_epi16(y_low0, _mm256_mulhrs_epi16(u_low, v_cb_coeff)), rounding_const, )); let g_low0 = _mm256_srai_epi16::(_mm256_add_epi16( @@ -186,11 +186,11 @@ unsafe fn avx2_yuv_to_rgba_row_impl420( )); let r_low1 = _mm256_srai_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_low1, _mm256_mulhi_epi16(v_low, v_cr_coeff)), + _mm256_add_epi16(y_low1, _mm256_mulhrs_epi16(v_low, v_cr_coeff)), rounding_const, )); let b_low1 = _mm256_srai_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_low1, _mm256_mulhi_epi16(u_low, v_cb_coeff)), + _mm256_add_epi16(y_low1, _mm256_mulhrs_epi16(u_low, v_cb_coeff)), rounding_const, )); let g_low1 = _mm256_srai_epi16::(_mm256_add_epi16( diff --git a/src/avx2/yuv_to_rgba_alpha.rs b/src/avx2/yuv_to_rgba_alpha.rs index 141f440..a33bee4 100644 --- a/src/avx2/yuv_to_rgba_alpha.rs +++ b/src/avx2/yuv_to_rgba_alpha.rs @@ -93,6 +93,9 @@ unsafe fn avx2_yuv_to_rgba_alpha_impl(_mm256_sub_epi16(u_high_u16, uv_corr)); - let v_high = _mm256_slli_epi16::<7>(_mm256_sub_epi16(v_high_u16, uv_corr)); - let y_high = _mm256_mulhi_epi16( - _mm256_slli_epi16::<7>(_mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>( + let u_high = _mm256_slli_epi16::(_mm256_sub_epi16(u_high_u16, uv_corr)); + let v_high = _mm256_slli_epi16::(_mm256_sub_epi16(v_high_u16, uv_corr)); + let y_high = _mm256_mulhrs_epi16( + _mm256_slli_epi16::(_mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>( y_values, ))), v_luma_coeff, ); - let r_high = _mm256_srai_epi16::<3>(_mm256_add_epi16( - _mm256_add_epi16(y_high, _mm256_mulhi_epi16(v_high, v_cr_coeff)), + let r_high = _mm256_srai_epi16::(_mm256_add_epi16( + _mm256_add_epi16(y_high, _mm256_mulhrs_epi16(v_high, v_cr_coeff)), rounding_const, )); let b_high = _mm256_srai_epi16::<3>(_mm256_add_epi16( - _mm256_add_epi16(y_high, _mm256_mulhi_epi16(u_high, v_cb_coeff)), + _mm256_add_epi16(y_high, _mm256_mulhrs_epi16(u_high, v_cb_coeff)), rounding_const, )); - let g_high = _mm256_srai_epi16::<3>(_mm256_add_epi16( + let g_high = _mm256_srai_epi16::(_mm256_add_epi16( _mm256_sub_epi16( y_high, _mm256_add_epi16( - _mm256_mulhi_epi16(v_high, v_g_coeff_1), - _mm256_mulhi_epi16(u_high, v_g_coeff_2), + _mm256_mulhrs_epi16(v_high, v_g_coeff_1), + _mm256_mulhrs_epi16(u_high, v_g_coeff_2), ), ), rounding_const, )); - let u_low = _mm256_slli_epi16::<7>(_mm256_sub_epi16(u_low_u16, uv_corr)); - let v_low = _mm256_slli_epi16::<7>(_mm256_sub_epi16(v_low_u16, uv_corr)); - let y_low = _mm256_mulhi_epi16( - 
_mm256_slli_epi16::<7>(_mm256_cvtepu8_epi16(_mm256_castsi256_si128(y_values))), + let u_low = _mm256_slli_epi16::(_mm256_sub_epi16(u_low_u16, uv_corr)); + let v_low = _mm256_slli_epi16::(_mm256_sub_epi16(v_low_u16, uv_corr)); + let y_low = _mm256_mulhrs_epi16( + _mm256_slli_epi16::(_mm256_cvtepu8_epi16(_mm256_castsi256_si128(y_values))), v_luma_coeff, ); - let r_low = _mm256_srai_epi16::<3>(_mm256_add_epi16( - _mm256_add_epi16(y_low, _mm256_mulhi_epi16(v_low, v_cr_coeff)), + let r_low = _mm256_srai_epi16::(_mm256_add_epi16( + _mm256_add_epi16(y_low, _mm256_mulhrs_epi16(v_low, v_cr_coeff)), rounding_const, )); - let b_low = _mm256_srai_epi16::<3>(_mm256_add_epi16( - _mm256_add_epi16(y_low, _mm256_mulhi_epi16(u_low, v_cb_coeff)), + let b_low = _mm256_srai_epi16::(_mm256_add_epi16( + _mm256_add_epi16(y_low, _mm256_mulhrs_epi16(u_low, v_cb_coeff)), rounding_const, )); - let g_low = _mm256_srai_epi16::<3>(_mm256_add_epi16( + let g_low = _mm256_srai_epi16::(_mm256_add_epi16( _mm256_sub_epi16( y_low, _mm256_add_epi16( - _mm256_mulhi_epi16(v_low, v_g_coeff_1), - _mm256_mulhi_epi16(u_low, v_g_coeff_2), + _mm256_mulhrs_epi16(v_low, v_g_coeff_1), + _mm256_mulhrs_epi16(u_low, v_g_coeff_2), ), ), rounding_const, diff --git a/src/rgba_to_yuv.rs b/src/rgba_to_yuv.rs index 227d009..a9253b2 100644 --- a/src/rgba_to_yuv.rs +++ b/src/rgba_to_yuv.rs @@ -27,7 +27,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -use crate::avx2::avx2_rgba_to_yuv; +use crate::avx2::{avx2_rgba_to_yuv, avx2_rgba_to_yuv420}; #[cfg(all( any(target_arch = "x86", target_arch = "x86_64"), feature = "nightly_avx512" @@ -40,7 +40,7 @@ use crate::neon::{ neon_rgba_to_yuv, neon_rgba_to_yuv420, neon_rgba_to_yuv_rdm, neon_rgba_to_yuv_rdm420, }; #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -use crate::sse::sse_rgba_to_yuv_row; +use crate::sse::{sse_rgba_to_yuv_row, sse_rgba_to_yuv_row420}; use crate::yuv_error::check_rgba_destination; #[allow(unused_imports)] use crate::yuv_support::*; @@ -156,7 +156,6 @@ fn rgbx_to_yuv8( _offset.cx, _offset.ux, planar_image.width as usize, - compute_uv_row, ); _offset = processed_offset; } @@ -172,7 +171,6 @@ fn rgbx_to_yuv8( _offset.cx, _offset.ux, planar_image.width as usize, - compute_uv_row, ); _offset = processed_offset; } @@ -220,6 +218,42 @@ fn rgbx_to_yuv8( ); _offset = offset; } + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + if use_avx { + let processed_offset = avx2_rgba_to_yuv420::( + &transform, + &range, + _y_plane0, + _y_plane1, + _u_plane, + _v_plane, + _rgba0, + _rgba1, + _offset.cx, + _offset.ux, + planar_image.width as usize, + ); + _offset = processed_offset; + } + + if use_sse { + let processed_offset = sse_rgba_to_yuv_row420::( + &transform, + &range, + _y_plane0, + _y_plane1, + _u_plane, + _v_plane, + _rgba0, + _rgba1, + _offset.cx, + _offset.ux, + planar_image.width as usize, + ); + _offset = processed_offset; + } + } _offset }; @@ -343,7 +377,7 @@ fn rgbx_to_yuv8( let b11 = src11[src_chans.get_b_channel_offset()] as i32; let y_11 = (r11 * transform.yr + g11 * transform.yg + b11 * transform.yb + bias_y) >> PRECISION; - y_dst0[1] = y_11.max(i_bias_y).min(i_cap_y) as u8; + y_dst1[1] = y_11.max(i_bias_y).min(i_cap_y) as u8; let ruv = (r00 + r01 + 1) >> 1; let guv = (g00 + g01 + 1) >> 1; diff --git a/src/sse/mod.rs b/src/sse/mod.rs index 68f0981..15be6cb 100644 --- a/src/sse/mod.rs +++ b/src/sse/mod.rs @@ -50,6 +50,7 @@ mod yuv_to_rgba_alpha; mod yuv_to_yuy2; mod 
yuy2_to_rgb; mod yuy2_to_yuv; +mod rgba_to_yuv420; pub(crate) use rgb_to_nv::sse_rgba_to_nv_row; pub(crate) use rgb_to_y::sse_rgb_to_y; @@ -72,3 +73,4 @@ pub(crate) use yuv_to_rgba_alpha::sse_yuv_to_rgba_alpha_row; pub(crate) use yuv_to_yuy2::yuv_to_yuy2_sse; pub(crate) use yuy2_to_rgb::yuy2_to_rgb_sse; pub(crate) use yuy2_to_yuv::yuy2_to_yuv_sse; +pub(crate) use rgba_to_yuv420::sse_rgba_to_yuv_row420; \ No newline at end of file diff --git a/src/sse/rgb_to_nv.rs b/src/sse/rgb_to_nv.rs index a2182a3..2fedfae 100644 --- a/src/sse/rgb_to_nv.rs +++ b/src/sse/rgb_to_nv.rs @@ -96,7 +96,7 @@ unsafe fn sse_rgba_to_nv_row_impl< let mut uv_x = start_ux; const V_SHR: i32 = 3; - const V_SCALE: i32 = 7; + const V_SCALE: i32 = 6; let rounding_const_bias: i16 = 1 << (V_SHR - 1); let bias_y = range.bias_y as i16 * (1 << V_SHR) + rounding_const_bias; let bias_uv = range.bias_uv as i16 * (1 << V_SHR) + rounding_const_bias; @@ -174,8 +174,8 @@ unsafe fn sse_rgba_to_nv_row_impl< _mm_srai_epi16::(_mm_add_epi16( y_bias, _mm_add_epi16( - _mm_add_epi16(_mm_mulhi_epi16(r_low, v_yr), _mm_mulhi_epi16(g_low, v_yg)), - _mm_mulhi_epi16(b_low, v_yb), + _mm_add_epi16(_mm_mulhrs_epi16(r_low, v_yr), _mm_mulhrs_epi16(g_low, v_yg)), + _mm_mulhrs_epi16(b_low, v_yb), ), )), i_cap_y, @@ -188,8 +188,11 @@ unsafe fn sse_rgba_to_nv_row_impl< _mm_srai_epi16::(_mm_add_epi16( y_bias, _mm_add_epi16( - _mm_add_epi16(_mm_mulhi_epi16(r_high, v_yr), _mm_mulhi_epi16(g_high, v_yg)), - _mm_mulhi_epi16(b_high, v_yb), + _mm_add_epi16( + _mm_mulhrs_epi16(r_high, v_yr), + _mm_mulhrs_epi16(g_high, v_yg), + ), + _mm_mulhrs_epi16(b_high, v_yb), ), )), i_cap_y, @@ -207,10 +210,10 @@ unsafe fn sse_rgba_to_nv_row_impl< uv_bias, _mm_add_epi16( _mm_add_epi16( - _mm_mulhi_epi16(r_low, v_cb_r), - _mm_mulhi_epi16(g_low, v_cb_g), + _mm_mulhrs_epi16(r_low, v_cb_r), + _mm_mulhrs_epi16(g_low, v_cb_g), ), - _mm_mulhi_epi16(b_low, v_cb_b), + _mm_mulhrs_epi16(b_low, v_cb_b), ), )), i_cap_uv, @@ -223,10 +226,10 @@ unsafe fn sse_rgba_to_nv_row_impl< uv_bias, _mm_add_epi16( _mm_add_epi16( - _mm_mulhi_epi16(r_low, v_cr_r), - _mm_mulhi_epi16(g_low, v_cr_g), + _mm_mulhrs_epi16(r_low, v_cr_r), + _mm_mulhrs_epi16(g_low, v_cr_g), ), - _mm_mulhi_epi16(b_low, v_cr_b), + _mm_mulhrs_epi16(b_low, v_cr_b), ), )), i_cap_uv, @@ -239,10 +242,10 @@ unsafe fn sse_rgba_to_nv_row_impl< uv_bias, _mm_add_epi16( _mm_add_epi16( - _mm_mulhi_epi16(r_high, v_cb_r), - _mm_mulhi_epi16(g_high, v_cb_g), + _mm_mulhrs_epi16(r_high, v_cb_r), + _mm_mulhrs_epi16(g_high, v_cb_g), ), - _mm_mulhi_epi16(b_high, v_cb_b), + _mm_mulhrs_epi16(b_high, v_cb_b), ), )), i_cap_uv, @@ -255,10 +258,10 @@ unsafe fn sse_rgba_to_nv_row_impl< uv_bias, _mm_add_epi16( _mm_add_epi16( - _mm_mulhi_epi16(r_high, v_cr_r), - _mm_mulhi_epi16(g_high, v_cr_g), + _mm_mulhrs_epi16(r_high, v_cr_r), + _mm_mulhrs_epi16(g_high, v_cr_g), ), - _mm_mulhi_epi16(b_high, v_cr_b), + _mm_mulhrs_epi16(b_high, v_cr_b), ), )), i_cap_uv, @@ -294,8 +297,11 @@ unsafe fn sse_rgba_to_nv_row_impl< _mm_srai_epi16::(_mm_add_epi16( uv_bias, _mm_add_epi16( - _mm_add_epi16(_mm_mulhi_epi16(r1, v_cb_r), _mm_mulhi_epi16(g1, v_cb_g)), - _mm_mulhi_epi16(b1, v_cb_b), + _mm_add_epi16( + _mm_mulhrs_epi16(r1, v_cb_r), + _mm_mulhrs_epi16(g1, v_cb_g), + ), + _mm_mulhrs_epi16(b1, v_cb_b), ), )), i_cap_uv, @@ -308,8 +314,11 @@ unsafe fn sse_rgba_to_nv_row_impl< _mm_srai_epi16::(_mm_add_epi16( uv_bias, _mm_add_epi16( - _mm_add_epi16(_mm_mulhi_epi16(r1, v_cr_r), _mm_mulhi_epi16(g1, v_cr_g)), - _mm_mulhi_epi16(b1, v_cr_b), + _mm_add_epi16( + _mm_mulhrs_epi16(r1, v_cr_r), 
+ _mm_mulhrs_epi16(g1, v_cr_g), + ), + _mm_mulhrs_epi16(b1, v_cr_b), ), )), i_cap_uv, diff --git a/src/sse/rgb_to_y.rs b/src/sse/rgb_to_y.rs index f7116d2..5f0f046 100644 --- a/src/sse/rgb_to_y.rs +++ b/src/sse/rgb_to_y.rs @@ -65,7 +65,7 @@ unsafe fn sse_rgb_to_y_impl( let mut cx = start_cx; const V_SHR: i32 = 3; - const V_SCALE: i32 = 7; + const V_SCALE: i32 = 6; let rounding_const_bias: i16 = 1 << (V_SHR - 1); let bias_y = range.bias_y as i16 * (1 << V_SHR) + rounding_const_bias; @@ -133,8 +133,8 @@ unsafe fn sse_rgb_to_y_impl( _mm_srai_epi16::(_mm_add_epi16( y_bias, _mm_add_epi16( - _mm_add_epi16(_mm_mulhi_epi16(r_low, v_yr), _mm_mulhi_epi16(g_low, v_yg)), - _mm_mulhi_epi16(b_low, v_yb), + _mm_add_epi16(_mm_mulhrs_epi16(r_low, v_yr), _mm_mulhrs_epi16(g_low, v_yg)), + _mm_mulhrs_epi16(b_low, v_yb), ), )), i_cap_y, @@ -147,8 +147,11 @@ unsafe fn sse_rgb_to_y_impl( _mm_srai_epi16::(_mm_add_epi16( y_bias, _mm_add_epi16( - _mm_add_epi16(_mm_mulhi_epi16(r_high, v_yr), _mm_mulhi_epi16(g_high, v_yg)), - _mm_mulhi_epi16(b_high, v_yb), + _mm_add_epi16( + _mm_mulhrs_epi16(r_high, v_yr), + _mm_mulhrs_epi16(g_high, v_yg), + ), + _mm_mulhrs_epi16(b_high, v_yb), ), )), i_cap_y, diff --git a/src/sse/rgba_to_yuv.rs b/src/sse/rgba_to_yuv.rs index de648d5..be395ac 100644 --- a/src/sse/rgba_to_yuv.rs +++ b/src/sse/rgba_to_yuv.rs @@ -47,20 +47,10 @@ pub(crate) fn sse_rgba_to_yuv_row start_cx: usize, start_ux: usize, width: usize, - compute_uv_row: bool, ) -> ProcessedOffset { unsafe { sse_rgba_to_yuv_row_impl::( - transform, - range, - y_plane, - u_plane, - v_plane, - rgba, - start_cx, - start_ux, - width, - compute_uv_row, + transform, range, y_plane, u_plane, v_plane, rgba, start_cx, start_ux, width, ) } } @@ -76,7 +66,6 @@ unsafe fn sse_rgba_to_yuv_row_impl ProcessedOffset { let chroma_subsampling: YuvChromaSubsampling = SAMPLING.into(); let source_channels: YuvSourceChannels = ORIGIN_CHANNELS.into(); @@ -91,7 +80,7 @@ unsafe fn sse_rgba_to_yuv_row_impl(_mm_add_epi16( y_bias, _mm_add_epi16( - _mm_add_epi16(_mm_mulhi_epi16(r_low, v_yr), _mm_mulhi_epi16(g_low, v_yg)), - _mm_mulhi_epi16(b_low, v_yb), + _mm_add_epi16(_mm_mulhrs_epi16(r_low, v_yr), _mm_mulhrs_epi16(g_low, v_yg)), + _mm_mulhrs_epi16(b_low, v_yb), ), )), i_cap_y, @@ -183,8 +172,11 @@ unsafe fn sse_rgba_to_yuv_row_impl(_mm_add_epi16( y_bias, _mm_add_epi16( - _mm_add_epi16(_mm_mulhi_epi16(r_high, v_yr), _mm_mulhi_epi16(g_high, v_yg)), - _mm_mulhi_epi16(b_high, v_yb), + _mm_add_epi16( + _mm_mulhrs_epi16(r_high, v_yr), + _mm_mulhrs_epi16(g_high, v_yg), + ), + _mm_mulhrs_epi16(b_high, v_yb), ), )), i_cap_y, @@ -202,10 +194,10 @@ unsafe fn sse_rgba_to_yuv_row_impl(_mm_add_epi16( uv_bias, _mm_add_epi16( - _mm_add_epi16(_mm_mulhi_epi16(r1, v_cb_r), _mm_mulhi_epi16(g1, v_cb_g)), - _mm_mulhi_epi16(b1, v_cb_b), + _mm_add_epi16( + _mm_mulhrs_epi16(r1, v_cb_r), + _mm_mulhrs_epi16(g1, v_cb_g), + ), + _mm_mulhrs_epi16(b1, v_cb_b), ), )), i_cap_uv, @@ -294,8 +289,11 @@ unsafe fn sse_rgba_to_yuv_row_impl(_mm_add_epi16( uv_bias, _mm_add_epi16( - _mm_add_epi16(_mm_mulhi_epi16(r1, v_cr_r), _mm_mulhi_epi16(g1, v_cr_g)), - _mm_mulhi_epi16(b1, v_cr_b), + _mm_add_epi16( + _mm_mulhrs_epi16(r1, v_cr_r), + _mm_mulhrs_epi16(g1, v_cr_g), + ), + _mm_mulhrs_epi16(b1, v_cr_b), ), )), i_cap_uv, diff --git a/src/sse/rgba_to_yuv420.rs b/src/sse/rgba_to_yuv420.rs new file mode 100644 index 0000000..b53a6c7 --- /dev/null +++ b/src/sse/rgba_to_yuv420.rs @@ -0,0 +1,319 @@ +/* + * Copyright (c) Radzivon Bartoshyk, 10/2024. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +use crate::internals::ProcessedOffset; +use crate::sse::sse_support::{sse_deinterleave_rgb, sse_deinterleave_rgba}; +use crate::yuv_support::{CbCrForwardTransform, YuvChromaRange, YuvSourceChannels}; +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +pub(crate) fn sse_rgba_to_yuv_row420( + transform: &CbCrForwardTransform, + range: &YuvChromaRange, + y_plane0: &mut [u8], + y_plane1: &mut [u8], + u_plane: &mut [u8], + v_plane: &mut [u8], + rgba0: &[u8], + rgba1: &[u8], + start_cx: usize, + start_ux: usize, + width: usize, +) -> ProcessedOffset { + unsafe { + sse_rgba_to_yuv_row_impl420::( + transform, range, y_plane0, y_plane1, u_plane, v_plane, rgba0, rgba1, start_cx, + start_ux, width, + ) + } +} + +#[target_feature(enable = "sse4.1")] +unsafe fn sse_rgba_to_yuv_row_impl420( + transform: &CbCrForwardTransform, + range: &YuvChromaRange, + y_plane0: &mut [u8], + y_plane1: &mut [u8], + u_plane: &mut [u8], + v_plane: &mut [u8], + rgba0: &[u8], + rgba1: &[u8], + start_cx: usize, + start_ux: usize, + width: usize, +) -> ProcessedOffset { + let source_channels: YuvSourceChannels = ORIGIN_CHANNELS.into(); + let channels = source_channels.get_channels_count(); + + let u_ptr = u_plane.as_mut_ptr(); + let v_ptr = v_plane.as_mut_ptr(); + + let mut cx = start_cx; + let mut uv_x = start_ux; + + const V_SHR: i32 = 3; + const V_SCALE: i32 = 6; + let rounding_const_bias: i16 = 1 << (V_SHR - 1); + let bias_y = range.bias_y as i16 * (1 << V_SHR) + rounding_const_bias; + let bias_uv = range.bias_uv as i16 * (1 << V_SHR) + rounding_const_bias; + + let i_bias_y = _mm_set1_epi16(range.bias_y as i16); + let i_cap_y = _mm_set1_epi16(range.range_y as i16 + range.bias_y as i16); + let i_cap_uv = _mm_set1_epi16(range.bias_y as i16 + range.range_uv as i16); + + let zeros = _mm_setzero_si128(); + + let y_bias = _mm_set1_epi16(bias_y); + let uv_bias = _mm_set1_epi16(bias_uv); + let v_yr = _mm_set1_epi16(transform.yr as i16); + let v_yg = 
_mm_set1_epi16(transform.yg as i16); + let v_yb = _mm_set1_epi16(transform.yb as i16); + let v_cb_r = _mm_set1_epi16(transform.cb_r as i16); + let v_cb_g = _mm_set1_epi16(transform.cb_g as i16); + let v_cb_b = _mm_set1_epi16(transform.cb_b as i16); + let v_cr_r = _mm_set1_epi16(transform.cr_r as i16); + let v_cr_g = _mm_set1_epi16(transform.cr_g as i16); + let v_cr_b = _mm_set1_epi16(transform.cr_b as i16); + + while cx + 16 < width { + let (r_values0, g_values0, b_values0); + let (r_values1, g_values1, b_values1); + + let px = cx * channels; + + match source_channels { + YuvSourceChannels::Rgb | YuvSourceChannels::Bgr => { + let row_start0 = rgba0.get_unchecked(px..).as_ptr(); + let row_1 = _mm_loadu_si128(row_start0 as *const __m128i); + let row_2 = _mm_loadu_si128(row_start0.add(16) as *const __m128i); + let row_3 = _mm_loadu_si128(row_start0.add(32) as *const __m128i); + + let (it1, it2, it3) = sse_deinterleave_rgb(row_1, row_2, row_3); + if source_channels == YuvSourceChannels::Rgb { + r_values0 = it1; + g_values0 = it2; + b_values0 = it3; + } else { + r_values0 = it3; + g_values0 = it2; + b_values0 = it1; + } + + let row_start1 = rgba1.get_unchecked(px..).as_ptr(); + let row_11 = _mm_loadu_si128(row_start1 as *const __m128i); + let row_21 = _mm_loadu_si128(row_start1.add(16) as *const __m128i); + let row_31 = _mm_loadu_si128(row_start1.add(32) as *const __m128i); + + let (it11, it21, it31) = sse_deinterleave_rgb(row_11, row_21, row_31); + if source_channels == YuvSourceChannels::Rgb { + r_values1 = it11; + g_values1 = it21; + b_values1 = it31; + } else { + r_values1 = it31; + g_values1 = it21; + b_values1 = it11; + } + } + YuvSourceChannels::Rgba | YuvSourceChannels::Bgra => { + let row_start0 = rgba0.get_unchecked(px..).as_ptr(); + let row_1 = _mm_loadu_si128(row_start0 as *const __m128i); + let row_2 = _mm_loadu_si128(row_start0.add(16) as *const __m128i); + let row_3 = _mm_loadu_si128(row_start0.add(32) as *const __m128i); + let row_4 = _mm_loadu_si128(row_start0.add(48) as *const __m128i); + + let (it1, it2, it3, _) = sse_deinterleave_rgba(row_1, row_2, row_3, row_4); + if source_channels == YuvSourceChannels::Rgba { + r_values0 = it1; + g_values0 = it2; + b_values0 = it3; + } else { + r_values0 = it3; + g_values0 = it2; + b_values0 = it1; + } + + let row_start1 = rgba1.get_unchecked(px..).as_ptr(); + let row_11 = _mm_loadu_si128(row_start1 as *const __m128i); + let row_21 = _mm_loadu_si128(row_start1.add(16) as *const __m128i); + let row_31 = _mm_loadu_si128(row_start1.add(32) as *const __m128i); + let row_41 = _mm_loadu_si128(row_start1.add(48) as *const __m128i); + + let (it11, it21, it31, _) = sse_deinterleave_rgba(row_11, row_21, row_31, row_41); + if source_channels == YuvSourceChannels::Rgba { + r_values1 = it11; + g_values1 = it21; + b_values1 = it31; + } else { + r_values1 = it31; + g_values1 = it21; + b_values1 = it11; + } + } + } + + let r0_low = _mm_slli_epi16::(_mm_cvtepu8_epi16(r_values0)); + let r0_high = _mm_slli_epi16::(_mm_unpackhi_epi8(r_values0, zeros)); + let g0_low = _mm_slli_epi16::(_mm_cvtepu8_epi16(g_values0)); + let g0_high = _mm_slli_epi16::(_mm_unpackhi_epi8(g_values0, zeros)); + let b0_low = _mm_slli_epi16::(_mm_cvtepu8_epi16(b_values0)); + let b0_high = _mm_slli_epi16::(_mm_unpackhi_epi8(b_values0, zeros)); + + let y0_l = _mm_max_epi16( + _mm_min_epi16( + _mm_srai_epi16::(_mm_add_epi16( + y_bias, + _mm_add_epi16( + _mm_add_epi16( + _mm_mulhrs_epi16(r0_low, v_yr), + _mm_mulhrs_epi16(g0_low, v_yg), + ), + _mm_mulhrs_epi16(b0_low, v_yb), + ), + )), + 
i_cap_y, + ), + i_bias_y, + ); + + let y0_h = _mm_max_epi16( + _mm_min_epi16( + _mm_srai_epi16::(_mm_add_epi16( + y_bias, + _mm_add_epi16( + _mm_add_epi16( + _mm_mulhrs_epi16(r0_high, v_yr), + _mm_mulhrs_epi16(g0_high, v_yg), + ), + _mm_mulhrs_epi16(b0_high, v_yb), + ), + )), + i_cap_y, + ), + i_bias_y, + ); + + let r1_low = _mm_slli_epi16::(_mm_cvtepu8_epi16(r_values1)); + let r1_high = _mm_slli_epi16::(_mm_unpackhi_epi8(r_values1, zeros)); + let g1_low = _mm_slli_epi16::(_mm_cvtepu8_epi16(g_values1)); + let g1_high = _mm_slli_epi16::(_mm_unpackhi_epi8(g_values1, zeros)); + let b1_low = _mm_slli_epi16::(_mm_cvtepu8_epi16(b_values1)); + let b1_high = _mm_slli_epi16::(_mm_unpackhi_epi8(b_values1, zeros)); + + let y1_l = _mm_max_epi16( + _mm_min_epi16( + _mm_srai_epi16::(_mm_add_epi16( + y_bias, + _mm_add_epi16( + _mm_add_epi16( + _mm_mulhrs_epi16(r1_low, v_yr), + _mm_mulhrs_epi16(g1_low, v_yg), + ), + _mm_mulhrs_epi16(b1_low, v_yb), + ), + )), + i_cap_y, + ), + i_bias_y, + ); + + let y1_h = _mm_max_epi16( + _mm_min_epi16( + _mm_srai_epi16::(_mm_add_epi16( + y_bias, + _mm_add_epi16( + _mm_add_epi16( + _mm_mulhrs_epi16(r1_high, v_yr), + _mm_mulhrs_epi16(g1_high, v_yg), + ), + _mm_mulhrs_epi16(b1_high, v_yb), + ), + )), + i_cap_y, + ), + i_bias_y, + ); + + let y0_yuv = _mm_packus_epi16(y0_l, y0_h); + let y1_yuv = _mm_packus_epi16(y1_l, y1_h); + + _mm_storeu_si128( + y_plane0.get_unchecked_mut(cx..).as_mut_ptr() as *mut __m128i, + y0_yuv, + ); + _mm_storeu_si128( + y_plane1.get_unchecked_mut(cx..).as_mut_ptr() as *mut __m128i, + y1_yuv, + ); + + let r1 = _mm_avg_epu16(r0_low, r0_high); + let g1 = _mm_avg_epu16(g0_low, g0_high); + let b1 = _mm_avg_epu16(b0_low, b0_high); + + let cbk = _mm_max_epi16( + _mm_min_epi16( + _mm_srai_epi16::(_mm_add_epi16( + uv_bias, + _mm_add_epi16( + _mm_add_epi16(_mm_mulhrs_epi16(r1, v_cb_r), _mm_mulhrs_epi16(g1, v_cb_g)), + _mm_mulhrs_epi16(b1, v_cb_b), + ), + )), + i_cap_uv, + ), + i_bias_y, + ); + + let crk = _mm_max_epi16( + _mm_min_epi16( + _mm_srai_epi16::(_mm_add_epi16( + uv_bias, + _mm_add_epi16( + _mm_add_epi16(_mm_mulhrs_epi16(r1, v_cr_r), _mm_mulhrs_epi16(g1, v_cr_g)), + _mm_mulhrs_epi16(b1, v_cr_b), + ), + )), + i_cap_uv, + ), + i_bias_y, + ); + + let cb = _mm_packus_epi16(cbk, cbk); + let cr = _mm_packus_epi16(crk, crk); + + std::ptr::copy_nonoverlapping(&cb as *const _ as *const u8, u_ptr.add(uv_x), 8); + std::ptr::copy_nonoverlapping(&cr as *const _ as *const u8, v_ptr.add(uv_x), 8); + uv_x += 8; + cx += 16; + } + + ProcessedOffset { cx, ux: uv_x } +} diff --git a/src/sse/yuv_nv_to_rgba.rs b/src/sse/yuv_nv_to_rgba.rs index 92663c7..eeddc51 100644 --- a/src/sse/yuv_nv_to_rgba.rs +++ b/src/sse/yuv_nv_to_rgba.rs @@ -86,6 +86,9 @@ unsafe fn sse_yuv_nv_to_rgba_impl< let uv_ptr = uv_plane.as_ptr(); let rgba_ptr = rgba.as_mut_ptr(); + const SCALE: i32 = 6; + const V_SHR: i32 = 3; + let y_corr = _mm_set1_epi8(range.bias_y as i8); let uv_corr = _mm_set1_epi16(range.bias_uv as i16); let v_luma_coeff = _mm_set1_epi16(transform.y_coef as i16); @@ -94,7 +97,7 @@ unsafe fn sse_yuv_nv_to_rgba_impl< let v_g_coeff_1 = _mm_set1_epi16(transform.g_coeff_1 as i16); let v_g_coeff_2 = _mm_set1_epi16(transform.g_coeff_2 as i16); let v_alpha = _mm_set1_epi8(255u8 as i8); - let rounding_const = _mm_set1_epi16(1 << 2); + let rounding_const = _mm_set1_epi16(1 << (V_SHR - 1)); let zeros = _mm_setzero_si128(); @@ -151,53 +154,53 @@ unsafe fn sse_yuv_nv_to_rgba_impl< } } - let u_high = _mm_slli_epi16::<7>(_mm_sub_epi16(u_high_u16, uv_corr)); - let v_high = 
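Note the bookkeeping at the end of the new rgba_to_yuv420 loop above: cx advances by 16 while uv_x advances by 8, so 16 luma samples per row pair yield only 8 Cb and 8 Cr samples, and the same chroma row serves both luma rows. That is the 4:2:0 geometry the rest of the crate assumes, sketched below (helper name illustrative):

// Byte sizes of the Y plane and of each chroma plane for a 4:2:0 image;
// odd dimensions round up because chroma must still cover the last column/row.
fn yuv420_plane_sizes(width: usize, height: usize) -> (usize, usize) {
    let chroma = ((width + 1) / 2) * ((height + 1) / 2);
    (width * height, chroma)
}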
_mm_slli_epi16::<7>(_mm_sub_epi16(v_high_u16, uv_corr)); - let y_high = _mm_mulhi_epi16( - _mm_slli_epi16::<7>(_mm_unpackhi_epi8(y_values, zeros)), + let u_high = _mm_slli_epi16::(_mm_sub_epi16(u_high_u16, uv_corr)); + let v_high = _mm_slli_epi16::(_mm_sub_epi16(v_high_u16, uv_corr)); + let y_high = _mm_mulhrs_epi16( + _mm_slli_epi16::(_mm_unpackhi_epi8(y_values, zeros)), v_luma_coeff, ); - let r_high = _mm_srai_epi16::<3>(_mm_add_epi16( - _mm_add_epi16(y_high, _mm_mulhi_epi16(v_high, v_cr_coeff)), + let r_high = _mm_srai_epi16::(_mm_add_epi16( + _mm_add_epi16(y_high, _mm_mulhrs_epi16(v_high, v_cr_coeff)), rounding_const, )); - let b_high = _mm_srai_epi16::<3>(_mm_add_epi16( - _mm_add_epi16(y_high, _mm_mulhi_epi16(u_high, v_cb_coeff)), + let b_high = _mm_srai_epi16::(_mm_add_epi16( + _mm_add_epi16(y_high, _mm_mulhrs_epi16(u_high, v_cb_coeff)), rounding_const, )); - let g_high = _mm_srai_epi16::<3>(_mm_add_epi16( + let g_high = _mm_srai_epi16::(_mm_add_epi16( _mm_sub_epi16( y_high, _mm_add_epi16( - _mm_mulhi_epi16(v_high, v_g_coeff_1), - _mm_mulhi_epi16(u_high, v_g_coeff_2), + _mm_mulhrs_epi16(v_high, v_g_coeff_1), + _mm_mulhrs_epi16(u_high, v_g_coeff_2), ), ), rounding_const, )); - let u_low = _mm_slli_epi16::<7>(_mm_sub_epi16(u_low_u16, uv_corr)); - let v_low = _mm_slli_epi16::<7>(_mm_sub_epi16(v_low_u16, uv_corr)); - let y_low = _mm_mulhi_epi16( - _mm_slli_epi16::<7>(_mm_cvtepu8_epi16(y_values)), + let u_low = _mm_slli_epi16::(_mm_sub_epi16(u_low_u16, uv_corr)); + let v_low = _mm_slli_epi16::(_mm_sub_epi16(v_low_u16, uv_corr)); + let y_low = _mm_mulhrs_epi16( + _mm_slli_epi16::(_mm_cvtepu8_epi16(y_values)), v_luma_coeff, ); - let r_low = _mm_srai_epi16::<3>(_mm_add_epi16( - _mm_add_epi16(y_low, _mm_mulhi_epi16(v_low, v_cr_coeff)), + let r_low = _mm_srai_epi16::(_mm_add_epi16( + _mm_add_epi16(y_low, _mm_mulhrs_epi16(v_low, v_cr_coeff)), rounding_const, )); - let b_low = _mm_srai_epi16::<3>(_mm_add_epi16( - _mm_add_epi16(y_low, _mm_mulhi_epi16(u_low, v_cb_coeff)), + let b_low = _mm_srai_epi16::(_mm_add_epi16( + _mm_add_epi16(y_low, _mm_mulhrs_epi16(u_low, v_cb_coeff)), rounding_const, )); - let g_low = _mm_srai_epi16::<3>(_mm_add_epi16( + let g_low = _mm_srai_epi16::(_mm_add_epi16( _mm_sub_epi16( y_low, _mm_add_epi16( - _mm_mulhi_epi16(v_low, v_g_coeff_1), - _mm_mulhi_epi16(u_low, v_g_coeff_2), + _mm_mulhrs_epi16(v_low, v_g_coeff_1), + _mm_mulhrs_epi16(u_low, v_g_coeff_2), ), ), rounding_const, @@ -290,27 +293,27 @@ unsafe fn sse_yuv_nv_to_rgba_impl< } } - let u_low = _mm_slli_epi16::<7>(_mm_sub_epi16(u_low_u16, uv_corr)); - let v_low = _mm_slli_epi16::<7>(_mm_sub_epi16(v_low_u16, uv_corr)); - let y_low = _mm_mulhi_epi16( - _mm_slli_epi16::<7>(_mm_cvtepu8_epi16(y_values)), + let u_low = _mm_slli_epi16::(_mm_sub_epi16(u_low_u16, uv_corr)); + let v_low = _mm_slli_epi16::(_mm_sub_epi16(v_low_u16, uv_corr)); + let y_low = _mm_mulhrs_epi16( + _mm_slli_epi16::(_mm_cvtepu8_epi16(y_values)), v_luma_coeff, ); - let r_low = _mm_srai_epi16::<3>(_mm_add_epi16( - _mm_add_epi16(y_low, _mm_mulhi_epi16(v_low, v_cr_coeff)), + let r_low = _mm_srai_epi16::(_mm_add_epi16( + _mm_add_epi16(y_low, _mm_mulhrs_epi16(v_low, v_cr_coeff)), rounding_const, )); - let b_low = _mm_srai_epi16::<3>(_mm_add_epi16( - _mm_add_epi16(y_low, _mm_mulhi_epi16(u_low, v_cb_coeff)), + let b_low = _mm_srai_epi16::(_mm_add_epi16( + _mm_add_epi16(y_low, _mm_mulhrs_epi16(u_low, v_cb_coeff)), rounding_const, )); - let g_low = _mm_srai_epi16::<3>(_mm_add_epi16( + let g_low = _mm_srai_epi16::(_mm_add_epi16( _mm_sub_epi16( y_low, 
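The decode hunks follow the same fixed-point scheme in reverse: the Y samples (reduced by y_corr where they are loaded earlier in the function) and the re-centred chroma are pre-shifted by SCALE, multiplied with _mm_mulhrs_epi16, and the per-channel sums are shifted back by V_SHR with a 1 << (V_SHR - 1) rounding term before unsigned-saturating packing. A scalar sketch of one pixel (helper names are illustrative; the coefficient parameters correspond to the transform fields used above):

const SCALE: i32 = 6;
const V_SHR: i32 = 3;

fn mulhrs(a: i16, b: i16) -> i32 {
    ((a as i32 * b as i32) + (1 << 14)) >> 15
}

// Mirrors the r/g/b lane math above for a single sample.
fn yuv_to_rgb(
    y: u8, u: u8, v: u8,
    bias_y: i16, bias_uv: i16,
    y_coef: i16, cr_coeff: i16, cb_coeff: i16, g_coeff_1: i16, g_coeff_2: i16,
) -> (u8, u8, u8) {
    let rounding = 1 << (V_SHR - 1);
    let luma = mulhrs((y as i16 - bias_y).max(0) << SCALE, y_coef);
    let cb = (u as i16 - bias_uv) << SCALE;
    let cr = (v as i16 - bias_uv) << SCALE;
    let r = (luma + mulhrs(cr, cr_coeff) + rounding) >> V_SHR;
    let b = (luma + mulhrs(cb, cb_coeff) + rounding) >> V_SHR;
    let g = (luma - (mulhrs(cr, g_coeff_1) + mulhrs(cb, g_coeff_2)) + rounding) >> V_SHR;
    (r.clamp(0, 255) as u8, g.clamp(0, 255) as u8, b.clamp(0, 255) as u8)
}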
_mm_add_epi16( - _mm_mulhi_epi16(v_low, v_g_coeff_1), - _mm_mulhi_epi16(u_low, v_g_coeff_2), + _mm_mulhrs_epi16(v_low, v_g_coeff_1), + _mm_mulhrs_epi16(u_low, v_g_coeff_2), ), ), rounding_const, diff --git a/src/sse/yuv_nv_to_rgba420.rs b/src/sse/yuv_nv_to_rgba420.rs index 87006ff..5d3689f 100644 --- a/src/sse/yuv_nv_to_rgba420.rs +++ b/src/sse/yuv_nv_to_rgba420.rs @@ -77,7 +77,7 @@ unsafe fn sse_yuv_nv_to_rgba_impl420(_mm_sub_epi16(u_high_u16, uv_corr)); let v_high = _mm_slli_epi16::(_mm_sub_epi16(v_high_u16, uv_corr)); - let y_high0 = _mm_mulhi_epi16( + let y_high0 = _mm_mulhrs_epi16( _mm_slli_epi16::(_mm_unpackhi_epi8(y_values0, zeros)), v_luma_coeff, ); - let y_high1 = _mm_mulhi_epi16( + let y_high1 = _mm_mulhrs_epi16( _mm_slli_epi16::(_mm_unpackhi_epi8(y_values1, zeros)), v_luma_coeff, ); let g_coeff_hi = _mm_add_epi16( - _mm_mulhi_epi16(v_high, v_g_coeff_1), - _mm_mulhi_epi16(u_high, v_g_coeff_2), + _mm_mulhrs_epi16(v_high, v_g_coeff_1), + _mm_mulhrs_epi16(u_high, v_g_coeff_2), ); let r_high0 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_high0, _mm_mulhi_epi16(v_high, v_cr_coeff)), + _mm_add_epi16(y_high0, _mm_mulhrs_epi16(v_high, v_cr_coeff)), rounding_const, )); let b_high0 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_high0, _mm_mulhi_epi16(u_high, v_cb_coeff)), + _mm_add_epi16(y_high0, _mm_mulhrs_epi16(u_high, v_cb_coeff)), rounding_const, )); let g_high0 = _mm_srai_epi16::(_mm_add_epi16( @@ -156,11 +156,11 @@ unsafe fn sse_yuv_nv_to_rgba_impl420(_mm_add_epi16( - _mm_add_epi16(y_high1, _mm_mulhi_epi16(v_high, v_cr_coeff)), + _mm_add_epi16(y_high1, _mm_mulhrs_epi16(v_high, v_cr_coeff)), rounding_const, )); let b_high1 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_high1, _mm_mulhi_epi16(u_high, v_cb_coeff)), + _mm_add_epi16(y_high1, _mm_mulhrs_epi16(u_high, v_cb_coeff)), rounding_const, )); let g_high1 = _mm_srai_epi16::(_mm_add_epi16( @@ -170,26 +170,26 @@ unsafe fn sse_yuv_nv_to_rgba_impl420(_mm_sub_epi16(u_low_u16, uv_corr)); let v_low = _mm_slli_epi16::(_mm_sub_epi16(v_low_u16, uv_corr)); - let y_low0 = _mm_mulhi_epi16( + let y_low0 = _mm_mulhrs_epi16( _mm_slli_epi16::(_mm_cvtepu8_epi16(y_values0)), v_luma_coeff, ); - let y_low1 = _mm_mulhi_epi16( + let y_low1 = _mm_mulhrs_epi16( _mm_slli_epi16::(_mm_cvtepu8_epi16(y_values1)), v_luma_coeff, ); let g_coeff_lo = _mm_add_epi16( - _mm_mulhi_epi16(v_low, v_g_coeff_1), - _mm_mulhi_epi16(u_low, v_g_coeff_2), + _mm_mulhrs_epi16(v_low, v_g_coeff_1), + _mm_mulhrs_epi16(u_low, v_g_coeff_2), ); let r_low0 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_low0, _mm_mulhi_epi16(v_low, v_cr_coeff)), + _mm_add_epi16(y_low0, _mm_mulhrs_epi16(v_low, v_cr_coeff)), rounding_const, )); let b_low0 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_low0, _mm_mulhi_epi16(u_low, v_cb_coeff)), + _mm_add_epi16(y_low0, _mm_mulhrs_epi16(u_low, v_cb_coeff)), rounding_const, )); let g_low0 = _mm_srai_epi16::(_mm_add_epi16( @@ -197,11 +197,11 @@ unsafe fn sse_yuv_nv_to_rgba_impl420(_mm_add_epi16( - _mm_add_epi16(y_low1, _mm_mulhi_epi16(v_low, v_cr_coeff)), + _mm_add_epi16(y_low1, _mm_mulhrs_epi16(v_low, v_cr_coeff)), rounding_const, )); let b_low1 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_low1, _mm_mulhi_epi16(u_low, v_cb_coeff)), + _mm_add_epi16(y_low1, _mm_mulhrs_epi16(u_low, v_cb_coeff)), rounding_const, )); let g_low1 = _mm_srai_epi16::(_mm_add_epi16( @@ -317,26 +317,26 @@ unsafe fn sse_yuv_nv_to_rgba_impl420(_mm_sub_epi16(u_low_u16, uv_corr)); let v_low = _mm_slli_epi16::(_mm_sub_epi16(v_low_u16, uv_corr)); - let y_low0 = 
_mm_mulhi_epi16( + let y_low0 = _mm_mulhrs_epi16( _mm_slli_epi16::(_mm_cvtepu8_epi16(y_values0)), v_luma_coeff, ); - let y_low1 = _mm_mulhi_epi16( + let y_low1 = _mm_mulhrs_epi16( _mm_slli_epi16::(_mm_cvtepu8_epi16(y_values1)), v_luma_coeff, ); let g_coeff_lo = _mm_add_epi16( - _mm_mulhi_epi16(v_low, v_g_coeff_1), - _mm_mulhi_epi16(u_low, v_g_coeff_2), + _mm_mulhrs_epi16(v_low, v_g_coeff_1), + _mm_mulhrs_epi16(u_low, v_g_coeff_2), ); let r_low0 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_low0, _mm_mulhi_epi16(v_low, v_cr_coeff)), + _mm_add_epi16(y_low0, _mm_mulhrs_epi16(v_low, v_cr_coeff)), rounding_const, )); let b_low0 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_low0, _mm_mulhi_epi16(u_low, v_cb_coeff)), + _mm_add_epi16(y_low0, _mm_mulhrs_epi16(u_low, v_cb_coeff)), rounding_const, )); let g_low0 = _mm_srai_epi16::(_mm_add_epi16( @@ -345,11 +345,11 @@ unsafe fn sse_yuv_nv_to_rgba_impl420(_mm_add_epi16( - _mm_add_epi16(y_low1, _mm_mulhi_epi16(v_low, v_cr_coeff)), + _mm_add_epi16(y_low1, _mm_mulhrs_epi16(v_low, v_cr_coeff)), rounding_const, )); let b_low1 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_low1, _mm_mulhi_epi16(u_low, v_cb_coeff)), + _mm_add_epi16(y_low1, _mm_mulhrs_epi16(u_low, v_cb_coeff)), rounding_const, )); let g_low1 = _mm_srai_epi16::(_mm_add_epi16( diff --git a/src/sse/yuv_to_rgba.rs b/src/sse/yuv_to_rgba.rs index fb58366..0ba8153 100644 --- a/src/sse/yuv_to_rgba.rs +++ b/src/sse/yuv_to_rgba.rs @@ -80,6 +80,9 @@ unsafe fn sse_yuv_to_rgba_row_impl(_mm_sub_epi16(u_high_u16, uv_corr)); - let v_high = _mm_slli_epi16::<7>(_mm_sub_epi16(v_high_u16, uv_corr)); + let u_high = _mm_slli_epi16::(_mm_sub_epi16(u_high_u16, uv_corr)); + let v_high = _mm_slli_epi16::(_mm_sub_epi16(v_high_u16, uv_corr)); let y_high = _mm_mulhi_epi16( - _mm_slli_epi16::<7>(_mm_unpackhi_epi8(y_values, zeros)), + _mm_slli_epi16::(_mm_unpackhi_epi8(y_values, zeros)), v_luma_coeff, ); - let r_high = _mm_srai_epi16::<3>(_mm_add_epi16( + let r_high = _mm_srai_epi16::(_mm_add_epi16( _mm_add_epi16(y_high, _mm_mulhi_epi16(v_high, v_cr_coeff)), rounding_const, )); - let b_high = _mm_srai_epi16::<3>(_mm_add_epi16( + let b_high = _mm_srai_epi16::(_mm_add_epi16( _mm_add_epi16(y_high, _mm_mulhi_epi16(u_high, v_cb_coeff)), rounding_const, )); - let g_high = _mm_srai_epi16::<3>(_mm_add_epi16( + let g_high = _mm_srai_epi16::(_mm_add_epi16( _mm_sub_epi16( y_high, _mm_add_epi16( @@ -145,22 +148,22 @@ unsafe fn sse_yuv_to_rgba_row_impl(_mm_sub_epi16(u_low_u16, uv_corr)); - let v_low = _mm_slli_epi16::<7>(_mm_sub_epi16(v_low_u16, uv_corr)); + let u_low = _mm_slli_epi16::(_mm_sub_epi16(u_low_u16, uv_corr)); + let v_low = _mm_slli_epi16::(_mm_sub_epi16(v_low_u16, uv_corr)); let y_low = _mm_mulhi_epi16( - _mm_slli_epi16::<7>(_mm_cvtepu8_epi16(y_values)), + _mm_slli_epi16::(_mm_cvtepu8_epi16(y_values)), v_luma_coeff, ); - let r_low = _mm_srai_epi16::<3>(_mm_add_epi16( + let r_low = _mm_srai_epi16::(_mm_add_epi16( _mm_add_epi16(y_low, _mm_mulhi_epi16(v_low, v_cr_coeff)), rounding_const, )); - let b_low = _mm_srai_epi16::<3>(_mm_add_epi16( + let b_low = _mm_srai_epi16::(_mm_add_epi16( _mm_add_epi16(y_low, _mm_mulhi_epi16(u_low, v_cb_coeff)), rounding_const, )); - let g_low = _mm_srai_epi16::<3>(_mm_add_epi16( + let g_low = _mm_srai_epi16::(_mm_add_epi16( _mm_sub_epi16( y_low, _mm_add_epi16( @@ -247,22 +250,22 @@ unsafe fn sse_yuv_to_rgba_row_impl(_mm_sub_epi16(u_low_u16, uv_corr)); - let v_low = _mm_slli_epi16::<7>(_mm_sub_epi16(v_low_u16, uv_corr)); + let u_low = _mm_slli_epi16::(_mm_sub_epi16(u_low_u16, 
uv_corr)); + let v_low = _mm_slli_epi16::(_mm_sub_epi16(v_low_u16, uv_corr)); let y_low = _mm_mulhi_epi16( - _mm_slli_epi16::<7>(_mm_cvtepu8_epi16(y_values)), + _mm_slli_epi16::(_mm_cvtepu8_epi16(y_values)), v_luma_coeff, ); - let r_low = _mm_srai_epi16::<3>(_mm_add_epi16( + let r_low = _mm_srai_epi16::(_mm_add_epi16( _mm_add_epi16(y_low, _mm_mulhi_epi16(v_low, v_cr_coeff)), rounding_const, )); - let b_low = _mm_srai_epi16::<3>(_mm_add_epi16( + let b_low = _mm_srai_epi16::(_mm_add_epi16( _mm_add_epi16(y_low, _mm_mulhi_epi16(u_low, v_cb_coeff)), rounding_const, )); - let g_low = _mm_srai_epi16::<3>(_mm_add_epi16( + let g_low = _mm_srai_epi16::(_mm_add_epi16( _mm_sub_epi16( y_low, _mm_add_epi16( diff --git a/src/sse/yuv_to_rgba420.rs b/src/sse/yuv_to_rgba420.rs index 2bb1461..bf45971 100644 --- a/src/sse/yuv_to_rgba420.rs +++ b/src/sse/yuv_to_rgba420.rs @@ -80,7 +80,7 @@ unsafe fn sse_yuv_to_rgba_row_impl420( let u_ptr = u_plane.as_ptr(); let v_ptr = v_plane.as_ptr(); - const SCALE: i32 = 7; + const SCALE: i32 = 6; const V_SHR: i32 = 3; let y_corr = _mm_set1_epi8(range.bias_y as i8); @@ -116,26 +116,26 @@ unsafe fn sse_yuv_to_rgba_row_impl420( let u_high = _mm_slli_epi16::(_mm_sub_epi16(u_high_u16, uv_corr)); let v_high = _mm_slli_epi16::(_mm_sub_epi16(v_high_u16, uv_corr)); - let y_high0 = _mm_mulhi_epi16( + let y_high0 = _mm_mulhrs_epi16( _mm_slli_epi16::(_mm_unpackhi_epi8(y_values0, zeros)), v_luma_coeff, ); - let y_high1 = _mm_mulhi_epi16( + let y_high1 = _mm_mulhrs_epi16( _mm_slli_epi16::(_mm_unpackhi_epi8(y_values1, zeros)), v_luma_coeff, ); let g_coeff_hi = _mm_add_epi16( - _mm_mulhi_epi16(v_high, v_g_coeff_1), - _mm_mulhi_epi16(u_high, v_g_coeff_2), + _mm_mulhrs_epi16(v_high, v_g_coeff_1), + _mm_mulhrs_epi16(u_high, v_g_coeff_2), ); let r_high0 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_high0, _mm_mulhi_epi16(v_high, v_cr_coeff)), + _mm_add_epi16(y_high0, _mm_mulhrs_epi16(v_high, v_cr_coeff)), rounding_const, )); let b_high0 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_high0, _mm_mulhi_epi16(u_high, v_cb_coeff)), + _mm_add_epi16(y_high0, _mm_mulhrs_epi16(u_high, v_cb_coeff)), rounding_const, )); let g_high0 = _mm_srai_epi16::(_mm_add_epi16( @@ -144,11 +144,11 @@ unsafe fn sse_yuv_to_rgba_row_impl420( )); let r_high1 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_high1, _mm_mulhi_epi16(v_high, v_cr_coeff)), + _mm_add_epi16(y_high1, _mm_mulhrs_epi16(v_high, v_cr_coeff)), rounding_const, )); let b_high1 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_high1, _mm_mulhi_epi16(u_high, v_cb_coeff)), + _mm_add_epi16(y_high1, _mm_mulhrs_epi16(u_high, v_cb_coeff)), rounding_const, )); let g_high1 = _mm_srai_epi16::(_mm_add_epi16( @@ -158,26 +158,26 @@ unsafe fn sse_yuv_to_rgba_row_impl420( let u_low = _mm_slli_epi16::(_mm_sub_epi16(u_low_u16, uv_corr)); let v_low = _mm_slli_epi16::(_mm_sub_epi16(v_low_u16, uv_corr)); - let y_low0 = _mm_mulhi_epi16( + let y_low0 = _mm_mulhrs_epi16( _mm_slli_epi16::(_mm_cvtepu8_epi16(y_values0)), v_luma_coeff, ); - let y_low1 = _mm_mulhi_epi16( + let y_low1 = _mm_mulhrs_epi16( _mm_slli_epi16::(_mm_cvtepu8_epi16(y_values1)), v_luma_coeff, ); let g_coeff_lo = _mm_add_epi16( - _mm_mulhi_epi16(v_low, v_g_coeff_1), - _mm_mulhi_epi16(u_low, v_g_coeff_2), + _mm_mulhrs_epi16(v_low, v_g_coeff_1), + _mm_mulhrs_epi16(u_low, v_g_coeff_2), ); let r_low0 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_low0, _mm_mulhi_epi16(v_low, v_cr_coeff)), + _mm_add_epi16(y_low0, _mm_mulhrs_epi16(v_low, v_cr_coeff)), rounding_const, )); let b_low0 = 
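The reason SCALE drops from 7 to 6 wherever _mm_mulhi_epi16 is replaced by _mm_mulhrs_epi16: mulhi keeps the high word of the 32-bit product (a truncating shift right by 16), while mulhrs shifts right by 15 with rounding, so halving the pre-shift keeps the results on the same scale while gaining round-to-nearest behaviour. A small check of that equivalence (the coefficient value is illustrative):

fn mulhi(a: i16, b: i16) -> i16 {
    ((a as i32 * b as i32) >> 16) as i16
}

fn mulhrs(a: i16, b: i16) -> i16 {
    (((a as i32 * b as i32) + (1 << 14)) >> 15) as i16
}

fn main() {
    let (x, c) = (100i16, 5000i16);
    // Same scale, but the new form rounds where the old one truncated:
    assert_eq!(mulhi(x << 7, c), 976); // (12_800 * 5_000) >> 16
    assert_eq!(mulhrs(x << 6, c), 977); // (6_400 * 5_000 + 16_384) >> 15
}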
_mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_low0, _mm_mulhi_epi16(u_low, v_cb_coeff)), + _mm_add_epi16(y_low0, _mm_mulhrs_epi16(u_low, v_cb_coeff)), rounding_const, )); let g_low0 = _mm_srai_epi16::(_mm_add_epi16( @@ -186,11 +186,11 @@ unsafe fn sse_yuv_to_rgba_row_impl420( )); let r_low1 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_low1, _mm_mulhi_epi16(v_low, v_cr_coeff)), + _mm_add_epi16(y_low1, _mm_mulhrs_epi16(v_low, v_cr_coeff)), rounding_const, )); let b_low1 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_low1, _mm_mulhi_epi16(u_low, v_cb_coeff)), + _mm_add_epi16(y_low1, _mm_mulhrs_epi16(u_low, v_cb_coeff)), rounding_const, )); let g_low1 = _mm_srai_epi16::(_mm_add_epi16( @@ -304,26 +304,26 @@ unsafe fn sse_yuv_to_rgba_row_impl420( let u_low = _mm_slli_epi16::(_mm_sub_epi16(u_low_u16, uv_corr)); let v_low = _mm_slli_epi16::(_mm_sub_epi16(v_low_u16, uv_corr)); - let y_low0 = _mm_mulhi_epi16( + let y_low0 = _mm_mulhrs_epi16( _mm_slli_epi16::(_mm_cvtepu8_epi16(y_values0)), v_luma_coeff, ); - let y_low1 = _mm_mulhi_epi16( + let y_low1 = _mm_mulhrs_epi16( _mm_slli_epi16::(_mm_cvtepu8_epi16(y_values1)), v_luma_coeff, ); let g_coeff = _mm_add_epi16( - _mm_mulhi_epi16(v_low, v_g_coeff_1), - _mm_mulhi_epi16(u_low, v_g_coeff_2), + _mm_mulhrs_epi16(v_low, v_g_coeff_1), + _mm_mulhrs_epi16(u_low, v_g_coeff_2), ); let r_low0 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_low0, _mm_mulhi_epi16(v_low, v_cr_coeff)), + _mm_add_epi16(y_low0, _mm_mulhrs_epi16(v_low, v_cr_coeff)), rounding_const, )); let b_low0 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_low0, _mm_mulhi_epi16(u_low, v_cb_coeff)), + _mm_add_epi16(y_low0, _mm_mulhrs_epi16(u_low, v_cb_coeff)), rounding_const, )); let g_low0 = _mm_srai_epi16::(_mm_add_epi16( @@ -332,11 +332,11 @@ unsafe fn sse_yuv_to_rgba_row_impl420( )); let r_low1 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_low1, _mm_mulhi_epi16(v_low, v_cr_coeff)), + _mm_add_epi16(y_low1, _mm_mulhrs_epi16(v_low, v_cr_coeff)), rounding_const, )); let b_low1 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_low1, _mm_mulhi_epi16(u_low, v_cb_coeff)), + _mm_add_epi16(y_low1, _mm_mulhrs_epi16(u_low, v_cb_coeff)), rounding_const, )); let g_low1 = _mm_srai_epi16::(_mm_add_epi16( diff --git a/src/sse/yuv_to_rgba_alpha.rs b/src/sse/yuv_to_rgba_alpha.rs index 444e06e..bfefd6d 100644 --- a/src/sse/yuv_to_rgba_alpha.rs +++ b/src/sse/yuv_to_rgba_alpha.rs @@ -94,6 +94,9 @@ unsafe fn sse_yuv_to_rgba_alpha_row_impl(_mm_sub_epi16(u_high_u16, uv_corr)); - let v_high = _mm_slli_epi16::<7>(_mm_sub_epi16(v_high_u16, uv_corr)); - let y_high = _mm_mulhi_epi16( - _mm_slli_epi16::<7>(_mm_unpackhi_epi8(y_values, zeros)), + let u_high = _mm_slli_epi16::(_mm_sub_epi16(u_high_u16, uv_corr)); + let v_high = _mm_slli_epi16::(_mm_sub_epi16(v_high_u16, uv_corr)); + let y_high = _mm_mulhrs_epi16( + _mm_slli_epi16::(_mm_unpackhi_epi8(y_values, zeros)), v_luma_coeff, ); - let r_high = _mm_srai_epi16::<3>(_mm_add_epi16( - _mm_add_epi16(y_high, _mm_mulhi_epi16(v_high, v_cr_coeff)), + let r_high = _mm_srai_epi16::(_mm_add_epi16( + _mm_add_epi16(y_high, _mm_mulhrs_epi16(v_high, v_cr_coeff)), rounding_const, )); - let b_high = _mm_srai_epi16::<3>(_mm_adds_epi16( - _mm_add_epi16(y_high, _mm_mulhi_epi16(u_high, v_cb_coeff)), + let b_high = _mm_srai_epi16::(_mm_adds_epi16( + _mm_add_epi16(y_high, _mm_mulhrs_epi16(u_high, v_cb_coeff)), rounding_const, )); - let g_high = _mm_srai_epi16::<3>(_mm_add_epi16( + let g_high = _mm_srai_epi16::(_mm_add_epi16( _mm_sub_epi16( y_high, _mm_add_epi16( - 
_mm_mulhi_epi16(v_high, v_g_coeff_1), - _mm_mulhi_epi16(u_high, v_g_coeff_2), + _mm_mulhrs_epi16(v_high, v_g_coeff_1), + _mm_mulhrs_epi16(u_high, v_g_coeff_2), ), ), rounding_const, )); - let u_low = _mm_slli_epi16::<7>(_mm_sub_epi16(u_low_u16, uv_corr)); - let v_low = _mm_slli_epi16::<7>(_mm_sub_epi16(v_low_u16, uv_corr)); - let y_low = _mm_mulhi_epi16( - _mm_slli_epi16::<7>(_mm_cvtepu8_epi16(y_values)), + let u_low = _mm_slli_epi16::(_mm_sub_epi16(u_low_u16, uv_corr)); + let v_low = _mm_slli_epi16::(_mm_sub_epi16(v_low_u16, uv_corr)); + let y_low = _mm_mulhrs_epi16( + _mm_slli_epi16::(_mm_cvtepu8_epi16(y_values)), v_luma_coeff, ); - let r_low = _mm_srai_epi16::<3>(_mm_add_epi16( - _mm_add_epi16(y_low, _mm_mulhi_epi16(v_low, v_cr_coeff)), + let r_low = _mm_srai_epi16::(_mm_add_epi16( + _mm_add_epi16(y_low, _mm_mulhrs_epi16(v_low, v_cr_coeff)), rounding_const, )); - let b_low = _mm_srai_epi16::<3>(_mm_add_epi16( - _mm_add_epi16(y_low, _mm_mulhi_epi16(u_low, v_cb_coeff)), + let b_low = _mm_srai_epi16::(_mm_add_epi16( + _mm_add_epi16(y_low, _mm_mulhrs_epi16(u_low, v_cb_coeff)), rounding_const, )); - let g_low = _mm_srai_epi16::<3>(_mm_add_epi16( + let g_low = _mm_srai_epi16::(_mm_add_epi16( _mm_sub_epi16( y_low, _mm_add_epi16( - _mm_mulhi_epi16(v_low, v_g_coeff_1), - _mm_mulhi_epi16(u_low, v_g_coeff_2), + _mm_mulhrs_epi16(v_low, v_g_coeff_1), + _mm_mulhrs_epi16(u_low, v_g_coeff_2), ), ), rounding_const,
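A straightforward way to sanity-check the precision of the mulhrs-based path is to compare a scalar model of it against a floating-point reference. The sketch below uses BT.601 limited-range luma coefficients quantized to 1/4096 steps; both the coefficients and the one-code-value tolerance are illustrative rather than taken from the crate's tests:

fn luma_fixed(r: u8, g: u8, b: u8) -> u8 {
    // 0.299, 0.587, 0.114 scaled by 219/255 and by 4096,
    // for the << 6, mulhrs, >> 3 pipeline used in the patch.
    let (yr, yg, yb) = (1052i32, 2065i32, 401i32);
    let mulhrs = |a: i32, c: i32| ((a * c) + (1 << 14)) >> 15;
    let bias = 16 * 8 + 4; // bias_y << V_SHR plus the rounding term
    let y = (bias
        + mulhrs((r as i32) << 6, yr)
        + mulhrs((g as i32) << 6, yg)
        + mulhrs((b as i32) << 6, yb))
        >> 3;
    y.clamp(16, 235) as u8
}

fn luma_float(r: u8, g: u8, b: u8) -> u8 {
    let y = 16.0 + (219.0 / 255.0) * (0.299 * r as f64 + 0.587 * g as f64 + 0.114 * b as f64);
    y.round().clamp(16.0, 235.0) as u8
}

fn main() {
    let max_err = (0..=255u8)
        .map(|v| (luma_fixed(v, v, v) as i32 - luma_float(v, v, v) as i32).abs())
        .max()
        .unwrap();
    assert!(max_err <= 1, "fixed-point luma drifted by more than one code value");
}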