diff --git a/README.md b/README.md index 72c10f5..e7ff754 100644 --- a/README.md +++ b/README.md @@ -78,7 +78,7 @@ Tests performed on the image 5763x3842 |------------------------|:----------:|:---------:| | utils RGB->YUV 4:2:0 | 3.48ms | 3.64ms | | libyuv RGB->YUV 4:2:0 | 3.58ms | 33.87ms | -| utils RGBA->YUV 4:2:0 | 4.32ms | 5.74ms | +| utils RGBA->YUV 4:2:0 | 4.32ms | 5.47ms | | libyuv RGBA->YUV 4:2:0 | 4.87ms | 23.48ms | | utils RGBA->YUV 4:2:2 | 4.83ms | 7.08ms | | libyuv RGBA->YUV 4:2:2 | 5.90ms | 35.23ms | @@ -88,9 +88,9 @@ Tests performed on the image 5763x3842 | | time(NEON) | Time(AVX) | |------------------------|:----------:|:---------:| -| utils YUV NV12->RGB | 3.86ms | 6.48ms | +| utils YUV NV12->RGB | 3.86ms | 6.24ms | | libyuv YUV NV12->RGB | 5.20ms | 45.28ms | -| utils YUV 4:2:0->RGB | 3.28ms | 5.34ms | +| utils YUV 4:2:0->RGB | 3.28ms | 5.25ms | | libyuv YUV 4:2:0->RGB | 5.70ms | 44.95ms | | utils YUV 4:2:0->RGBA | 3.82ms | 5.98ms | | libyuv YUV 4:2:0->RGBA | 6.13ms | 6.88ms | diff --git a/app/benches/yuv8/main.rs b/app/benches/yuv8/main.rs index c510271..8dd9c50 100644 --- a/app/benches/yuv8/main.rs +++ b/app/benches/yuv8/main.rs @@ -26,9 +26,9 @@ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -use std::alloc::Layout; use criterion::{criterion_group, criterion_main, Criterion}; use image::{GenericImageView, ImageReader}; +use std::alloc::Layout; use yuv_sys::{ rs_ABGRToI420, rs_ABGRToJ422, rs_I420ToABGR, rs_I420ToRGB24, rs_I422ToABGR, rs_I444ToABGR, rs_NV21ToABGR, rs_RGB24ToI420, @@ -98,37 +98,41 @@ pub fn criterion_benchmark(c: &mut Criterion) { }) }); - c.bench_function("libyuv RGB -> YUV 4:2:0", |b| { - unsafe { - let layout_rgb = Layout::from_size_align(dimensions.0 as usize * dimensions.1 as usize * 3, 16).unwrap(); - let layout_y = Layout::from_size_align(dimensions.0 as usize * dimensions.1 as usize, 16).unwrap(); - let layout_uv = Layout::from_size_align((dimensions.0 as usize + 1) / 2 * (dimensions.1 as usize + 1) / 2, 16).unwrap(); - let target_y = std::alloc::alloc(layout_y); - let target_u = std::alloc::alloc(layout_uv); - let target_v = std::alloc::alloc(layout_uv); - let source_rgb = std::alloc::alloc(layout_rgb); - for (x, src) in src_bytes.iter().enumerate() { - *source_rgb.add(x) = *src; - } - b.iter(|| { - rs_RGB24ToI420( - source_rgb, - stride as i32, - target_y, - dimensions.0 as i32, - target_u, - (dimensions.0 as i32 + 1) / 2, - target_v, - (dimensions.0 as i32 + 1) / 2, - dimensions.0 as i32, - dimensions.1 as i32, - ); - }); - std::alloc::dealloc(target_y, layout_y); - std::alloc::dealloc(target_u, layout_uv); - std::alloc::dealloc(target_v, layout_uv); - std::alloc::dealloc(source_rgb, layout_rgb); + c.bench_function("libyuv RGB -> YUV 4:2:0", |b| unsafe { + let layout_rgb = + Layout::from_size_align(dimensions.0 as usize * dimensions.1 as usize * 3, 16).unwrap(); + let layout_y = + Layout::from_size_align(dimensions.0 as usize * dimensions.1 as usize, 16).unwrap(); + let layout_uv = Layout::from_size_align( + (dimensions.0 as usize + 1) / 2 * (dimensions.1 as usize + 1) / 2, + 16, + ) + .unwrap(); + let target_y = std::alloc::alloc(layout_y); + let target_u = std::alloc::alloc(layout_uv); + let target_v = std::alloc::alloc(layout_uv); + let source_rgb = std::alloc::alloc(layout_rgb); + for (x, src) in src_bytes.iter().enumerate() { + *source_rgb.add(x) = *src; } + b.iter(|| { + rs_RGB24ToI420( + source_rgb, + stride as i32, + target_y, + dimensions.0 as i32, + target_u, + (dimensions.0 as i32 + 1) / 2, + target_v, + (dimensions.0 as i32 + 1) / 2, + dimensions.0 as i32, + dimensions.1 as i32, + ); + }); + std::alloc::dealloc(target_y, layout_y); + std::alloc::dealloc(target_u, layout_uv); + std::alloc::dealloc(target_v, layout_uv); + std::alloc::dealloc(source_rgb, layout_rgb); }); c.bench_function("yuvutils RGBA -> YUV 4:2:0", |b| { @@ -149,37 +153,41 @@ pub fn criterion_benchmark(c: &mut Criterion) { }) }); - c.bench_function("libyuv RGBA -> YUV 4:2:0", |b| { - unsafe { - let layout_rgba = Layout::from_size_align(dimensions.0 as usize * dimensions.1 as usize * 4, 16).unwrap(); - let layout_y = Layout::from_size_align(dimensions.0 as usize * dimensions.1 as usize, 16).unwrap(); - let layout_uv = Layout::from_size_align((dimensions.0 as usize + 1) / 2 * (dimensions.1 as usize + 1) / 2, 16).unwrap(); - let target_y = std::alloc::alloc(layout_y); - let target_u = std::alloc::alloc(layout_uv); - let target_v = std::alloc::alloc(layout_uv); - let source_rgb = std::alloc::alloc(layout_rgba); - for (x, src) in src_bytes.iter().enumerate() { - *source_rgb.add(x) = *src; - } - b.iter(|| { - rs_ABGRToI420( - source_rgb, - dimensions.0 as i32 * 4i32, - target_y, - dimensions.0 as i32, - target_u, - (dimensions.0 as i32 + 1) / 2, - target_v, - (dimensions.0 as i32 + 1) / 2, - dimensions.0 as i32, - dimensions.1 as i32, - ); - }); - std::alloc::dealloc(target_y, layout_y); - std::alloc::dealloc(target_u, layout_uv); - std::alloc::dealloc(target_v, layout_uv); - std::alloc::dealloc(source_rgb, layout_rgba); - } + c.bench_function("libyuv RGBA -> YUV 4:2:0", |b| unsafe { + let layout_rgba = + Layout::from_size_align(dimensions.0 as usize * dimensions.1 as usize * 4, 16).unwrap(); + let layout_y = + Layout::from_size_align(dimensions.0 as usize * dimensions.1 as usize, 16).unwrap(); + let layout_uv = Layout::from_size_align( + (dimensions.0 as usize + 1) / 2 * (dimensions.1 as usize + 1) / 2, + 16, + ) + .unwrap(); + let target_y = std::alloc::alloc(layout_y); + let target_u = std::alloc::alloc(layout_uv); + let target_v = std::alloc::alloc(layout_uv); + let source_rgb = std::alloc::alloc(layout_rgba); + for (x, src) in src_bytes.iter().enumerate() { + *source_rgb.add(x) = *src; + } + b.iter(|| { + rs_ABGRToI420( + source_rgb, + dimensions.0 as i32 * 4i32, + target_y, + dimensions.0 as i32, + target_u, + (dimensions.0 as i32 + 1) / 2, + target_v, + (dimensions.0 as i32 + 1) / 2, + dimensions.0 as i32, + dimensions.1 as i32, + ); + }); + std::alloc::dealloc(target_y, layout_y); + std::alloc::dealloc(target_u, layout_uv); + std::alloc::dealloc(target_v, layout_uv); + std::alloc::dealloc(source_rgb, layout_rgba); }); c.bench_function("yuvutils RGBA -> YUV 4:2:2", |b| { diff --git a/src/avx2/mod.rs b/src/avx2/mod.rs index 7f67999..e3811da 100644 --- a/src/avx2/mod.rs +++ b/src/avx2/mod.rs @@ -33,6 +33,7 @@ mod rgb_to_nv; mod rgb_to_y; mod rgb_to_ycgco; mod rgba_to_yuv; +mod rgba_to_yuv420; mod ycgco_to_rgb; mod ycgco_to_rgba_alpha; mod yuv_nv_to_rgba; @@ -45,12 +46,12 @@ mod yuv_to_rgba_alpha; mod yuv_to_yuv2; mod yuy2_to_rgb; mod yuy2_to_yuv; -mod rgba_to_yuv420; pub(crate) use rgb_to_nv::avx2_rgba_to_nv; pub(crate) use rgb_to_y::avx2_rgb_to_y_row; pub(crate) use rgb_to_ycgco::avx2_rgb_to_ycgco_row; pub(crate) use rgba_to_yuv::avx2_rgba_to_yuv; +pub(crate) use rgba_to_yuv420::avx2_rgba_to_yuv420; pub(crate) use ycgco_to_rgb::avx2_ycgco_to_rgb_row; pub(crate) use ycgco_to_rgba_alpha::avx2_ycgco_to_rgba_alpha; pub(crate) use yuv_nv_to_rgba::avx2_yuv_nv_to_rgba_row; @@ -63,4 +64,3 @@ pub(crate) use yuv_to_rgba_alpha::avx2_yuv_to_rgba_alpha; pub(crate) use yuv_to_yuv2::yuv_to_yuy2_avx2_row; pub(crate) use yuy2_to_rgb::yuy2_to_rgb_avx; pub(crate) use yuy2_to_yuv::yuy2_to_yuv_avx; -pub(crate) use rgba_to_yuv420::avx2_rgba_to_yuv420; \ No newline at end of file diff --git a/src/avx2/rdp_rgba_to_yuv.rs b/src/avx2/rdp_rgba_to_yuv.rs deleted file mode 100644 index 35621c2..0000000 --- a/src/avx2/rdp_rgba_to_yuv.rs +++ /dev/null @@ -1,279 +0,0 @@ -/* - * Copyright (c) Radzivon Bartoshyk, 10/2024. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, - * are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -use crate::avx2::avx2_utils::{_mm256_deinterleave_rgba_epi8, avx2_deinterleave_rgb}; -use crate::internals::ProcessedOffset; -use crate::yuv_support::{CbCrForwardTransform, YuvSourceChannels}; -#[cfg(target_arch = "x86")] -use std::arch::x86::*; -#[cfg(target_arch = "x86_64")] -use std::arch::x86_64::*; - -pub fn rdp_avx2_rgba_to_yuv( - transform: &CbCrForwardTransform, - y_plane: &mut [u16], - u_plane: &mut [u16], - v_plane: &mut [u16], - rgba: &[u8], - start_cx: usize, - width: usize, -) -> ProcessedOffset { - unsafe { - rdp_avx2_rgba_to_yuv_impl::( - transform, y_plane, u_plane, v_plane, rgba, start_cx, width, - ) - } -} - -#[target_feature(enable = "avx2")] -unsafe fn rdp_avx2_rgba_to_yuv_impl( - transform: &CbCrForwardTransform, - y_plane: &mut [u16], - u_plane: &mut [u16], - v_plane: &mut [u16], - rgba: &[u8], - start_cx: usize, - width: usize, -) -> ProcessedOffset { - let source_channels: YuvSourceChannels = ORIGIN_CHANNELS.into(); - let channels = source_channels.get_channels_count(); - - let y_ptr = y_plane; - let u_ptr = u_plane; - let v_ptr = v_plane; - let rgba_ptr = rgba.as_ptr(); - - let mut cx = start_cx; - - const V_SCALE: i32 = 7; - - let i_bias_y = _mm256_set1_epi16(-4095); - let i_cap_y = _mm256_set1_epi16(4096); - - let y_bias = _mm256_set1_epi16(-4096); - let uv_bias = _mm256_set1_epi16(0); - let v_yr = _mm256_set1_epi16(transform.yr as i16); - let v_yg = _mm256_set1_epi16(transform.yg as i16); - let v_yb = _mm256_set1_epi16(transform.yb as i16); - let v_cb_r = _mm256_set1_epi16(transform.cb_r as i16); - let v_cb_g = _mm256_set1_epi16(transform.cb_g as i16); - let v_cb_b = _mm256_set1_epi16(transform.cb_b as i16); - let v_cr_r = _mm256_set1_epi16(transform.cr_r as i16); - let v_cr_g = _mm256_set1_epi16(transform.cr_g as i16); - let v_cr_b = _mm256_set1_epi16(transform.cr_b as i16); - - while cx + 32 < width { - let (r_values, g_values, b_values); - - let px = cx * channels; - - match source_channels { - YuvSourceChannels::Rgb | YuvSourceChannels::Bgr => { - let source_ptr = rgba_ptr.add(px); - let row_1 = _mm256_loadu_si256(source_ptr as *const __m256i); - let row_2 = _mm256_loadu_si256(source_ptr.add(32) as *const __m256i); - let row_3 = _mm256_loadu_si256(source_ptr.add(64) as *const __m256i); - - let (it1, it2, it3) = avx2_deinterleave_rgb(row_1, row_2, row_3); - if source_channels == YuvSourceChannels::Rgb { - r_values = it1; - g_values = it2; - b_values = it3; - } else { - r_values = it3; - g_values = it2; - b_values = it1; - } - } - YuvSourceChannels::Rgba | YuvSourceChannels::Bgra => { - let source_ptr = rgba_ptr.add(px); - let row_1 = _mm256_loadu_si256(source_ptr as *const __m256i); - let row_2 = _mm256_loadu_si256(source_ptr.add(32) as *const __m256i); - let row_3 = _mm256_loadu_si256(source_ptr.add(64) as *const __m256i); - let row_4 = _mm256_loadu_si256(source_ptr.add(96) as *const __m256i); - - let (it1, it2, it3, _) = _mm256_deinterleave_rgba_epi8(row_1, row_2, row_3, row_4); - if source_channels == YuvSourceChannels::Rgba { - r_values = it1; - g_values = it2; - b_values = it3; - } else { - r_values = it3; - g_values = it2; - b_values = it1; - } - } - } - - let r_low = - _mm256_slli_epi16::(_mm256_cvtepu8_epi16(_mm256_castsi256_si128(r_values))); - let r_high = _mm256_slli_epi16::(_mm256_cvtepu8_epi16( - _mm256_extracti128_si256::<1>(r_values), - )); - let g_low = - _mm256_slli_epi16::(_mm256_cvtepu8_epi16(_mm256_castsi256_si128(g_values))); - let g_high = _mm256_slli_epi16::(_mm256_cvtepu8_epi16( - _mm256_extracti128_si256::<1>(g_values), - )); - let b_low = - _mm256_slli_epi16::(_mm256_cvtepu8_epi16(_mm256_castsi256_si128(b_values))); - let b_high = _mm256_slli_epi16::(_mm256_cvtepu8_epi16( - _mm256_extracti128_si256::<1>(b_values), - )); - - let y_l = _mm256_max_epi16( - _mm256_min_epi16( - _mm256_add_epi16( - y_bias, - _mm256_add_epi16( - _mm256_add_epi16( - _mm256_mulhi_epi16(r_low, v_yr), - _mm256_mulhi_epi16(g_low, v_yg), - ), - _mm256_mulhi_epi16(b_low, v_yb), - ), - ), - i_cap_y, - ), - i_bias_y, - ); - - let y_h = _mm256_max_epi16( - _mm256_min_epi16( - _mm256_add_epi16( - y_bias, - _mm256_add_epi16( - _mm256_add_epi16( - _mm256_mulhi_epi16(r_high, v_yr), - _mm256_mulhi_epi16(g_high, v_yg), - ), - _mm256_mulhi_epi16(b_high, v_yb), - ), - ), - i_cap_y, - ), - i_bias_y, - ); - - _mm256_storeu_si256( - y_ptr.get_unchecked_mut(cx..).as_mut_ptr() as *mut __m256i, - y_l, - ); - _mm256_storeu_si256( - y_ptr.get_unchecked_mut((cx + 16)..).as_mut_ptr() as *mut __m256i, - y_h, - ); - - let cb_l = _mm256_max_epi16( - _mm256_min_epi16( - _mm256_add_epi16( - uv_bias, - _mm256_add_epi16( - _mm256_add_epi16( - _mm256_mulhi_epi16(r_low, v_cb_r), - _mm256_mulhi_epi16(g_low, v_cb_g), - ), - _mm256_mulhi_epi16(b_low, v_cb_b), - ), - ), - i_cap_y, - ), - i_bias_y, - ); - let cr_l = _mm256_max_epi16( - _mm256_min_epi16( - _mm256_add_epi16( - uv_bias, - _mm256_add_epi16( - _mm256_add_epi16( - _mm256_mulhi_epi16(r_low, v_cr_r), - _mm256_mulhi_epi16(g_low, v_cr_g), - ), - _mm256_mulhi_epi16(b_low, v_cr_b), - ), - ), - i_cap_y, - ), - i_bias_y, - ); - let cb_h = _mm256_max_epi16( - _mm256_min_epi16( - _mm256_add_epi16( - uv_bias, - _mm256_add_epi16( - _mm256_add_epi16( - _mm256_mulhi_epi16(r_high, v_cb_r), - _mm256_mulhi_epi16(g_high, v_cb_g), - ), - _mm256_mulhi_epi16(b_high, v_cb_b), - ), - ), - i_cap_y, - ), - i_bias_y, - ); - let cr_h = _mm256_max_epi16( - _mm256_min_epi16( - _mm256_add_epi16( - uv_bias, - _mm256_add_epi16( - _mm256_add_epi16( - _mm256_mulhi_epi16(r_high, v_cr_r), - _mm256_mulhi_epi16(g_high, v_cr_g), - ), - _mm256_mulhi_epi16(b_high, v_cr_b), - ), - ), - i_cap_y, - ), - i_bias_y, - ); - - _mm256_storeu_si256( - u_ptr.get_unchecked_mut(cx..).as_mut_ptr() as *mut __m256i, - cb_l, - ); - _mm256_storeu_si256( - u_ptr.get_unchecked_mut((cx + 16)..).as_mut_ptr() as *mut __m256i, - cb_h, - ); - _mm256_storeu_si256( - v_ptr.get_unchecked_mut(cx..).as_mut_ptr() as *mut __m256i, - cr_l, - ); - _mm256_storeu_si256( - v_ptr.get_unchecked_mut((cx + 16)..).as_mut_ptr() as *mut __m256i, - cr_h, - ); - - cx += 32; - } - - ProcessedOffset { cx, ux: cx } -} diff --git a/src/avx2/rgb_to_nv.rs b/src/avx2/rgb_to_nv.rs index efb9add..bbba3d9 100644 --- a/src/avx2/rgb_to_nv.rs +++ b/src/avx2/rgb_to_nv.rs @@ -93,12 +93,10 @@ unsafe fn avx2_rgba_to_nv_impl< let mut cx = start_cx; let mut uv_x = start_ux; - const V_SHR: i32 = 3; - const V_SCALE: i32 = 6; + const V_SCALE: i32 = 3; - let rounding_const_bias: i16 = 1 << (V_SHR - 1); - let bias_y = range.bias_y as i16 * (1 << V_SHR) + rounding_const_bias; - let bias_uv = range.bias_uv as i16 * (1 << V_SHR) + rounding_const_bias; + let bias_y = range.bias_y as i16; + let bias_uv = range.bias_uv as i16; let i_bias_y = _mm256_set1_epi16(range.bias_y as i16); let i_cap_y = _mm256_set1_epi16(range.range_y as i16 + range.bias_y as i16); @@ -177,7 +175,7 @@ unsafe fn avx2_rgba_to_nv_impl< let y_l = _mm256_max_epi16( _mm256_min_epi16( - _mm256_srai_epi16::(_mm256_add_epi16( + _mm256_add_epi16( y_bias, _mm256_add_epi16( _mm256_add_epi16( @@ -186,7 +184,7 @@ unsafe fn avx2_rgba_to_nv_impl< ), _mm256_mulhrs_epi16(b_low, v_yb), ), - )), + ), i_cap_y, ), i_bias_y, @@ -194,7 +192,7 @@ unsafe fn avx2_rgba_to_nv_impl< let y_h = _mm256_max_epi16( _mm256_min_epi16( - _mm256_srai_epi16::(_mm256_add_epi16( + _mm256_add_epi16( y_bias, _mm256_add_epi16( _mm256_add_epi16( @@ -203,7 +201,7 @@ unsafe fn avx2_rgba_to_nv_impl< ), _mm256_mulhrs_epi16(b_high, v_yb), ), - )), + ), i_cap_y, ), i_bias_y, @@ -215,7 +213,7 @@ unsafe fn avx2_rgba_to_nv_impl< if chroma_subsampling == YuvChromaSubsampling::Yuv444 { let cb_l = _mm256_max_epi16( _mm256_min_epi16( - _mm256_srai_epi16::(_mm256_add_epi16( + (_mm256_add_epi16( uv_bias, _mm256_add_epi16( _mm256_add_epi16( @@ -231,7 +229,7 @@ unsafe fn avx2_rgba_to_nv_impl< ); let cr_l = _mm256_max_epi16( _mm256_min_epi16( - _mm256_srai_epi16::(_mm256_add_epi16( + (_mm256_add_epi16( uv_bias, _mm256_add_epi16( _mm256_add_epi16( @@ -247,7 +245,7 @@ unsafe fn avx2_rgba_to_nv_impl< ); let cb_h = _mm256_max_epi16( _mm256_min_epi16( - _mm256_srai_epi16::(_mm256_add_epi16( + (_mm256_add_epi16( uv_bias, _mm256_add_epi16( _mm256_add_epi16( @@ -263,7 +261,7 @@ unsafe fn avx2_rgba_to_nv_impl< ); let cr_h = _mm256_max_epi16( _mm256_min_epi16( - _mm256_srai_epi16::(_mm256_add_epi16( + (_mm256_add_epi16( uv_bias, _mm256_add_epi16( _mm256_add_epi16( @@ -297,7 +295,7 @@ unsafe fn avx2_rgba_to_nv_impl< let b1 = _mm256_avg_epu16(b_low, b_high); let cb = _mm256_max_epi16( _mm256_min_epi16( - _mm256_srai_epi16::(_mm256_add_epi16( + (_mm256_add_epi16( uv_bias, _mm256_add_epi16( _mm256_add_epi16( @@ -313,7 +311,7 @@ unsafe fn avx2_rgba_to_nv_impl< ); let cr = _mm256_max_epi16( _mm256_min_epi16( - _mm256_srai_epi16::(_mm256_add_epi16( + (_mm256_add_epi16( uv_bias, _mm256_add_epi16( _mm256_add_epi16( diff --git a/src/avx2/rgb_to_y.rs b/src/avx2/rgb_to_y.rs index 29cd446..2dcf7d8 100644 --- a/src/avx2/rgb_to_y.rs +++ b/src/avx2/rgb_to_y.rs @@ -66,10 +66,8 @@ pub(crate) unsafe fn avx2_rgb_to_y_row_impl( let mut cx = start_cx; - const V_SHR: i32 = 3; - const V_SCALE: i32 = 6; - let rounding_const_bias: i16 = 1 << (V_SHR - 1); - let bias_y = range.bias_y as i16 * (1 << V_SHR) + rounding_const_bias; + const V_SCALE: i32 = 3; + let bias_y = range.bias_y as i16; let i_bias_y = _mm256_set1_epi16(range.bias_y as i16); let i_cap_y = _mm256_set1_epi16(range.range_y as i16 + range.bias_y as i16); @@ -140,7 +138,7 @@ pub(crate) unsafe fn avx2_rgb_to_y_row_impl( let y_l = _mm256_max_epi16( _mm256_min_epi16( - _mm256_srai_epi16::(_mm256_add_epi16( + _mm256_add_epi16( y_bias, _mm256_add_epi16( _mm256_add_epi16( @@ -149,7 +147,7 @@ pub(crate) unsafe fn avx2_rgb_to_y_row_impl( ), _mm256_mulhrs_epi16(b_low, v_yb), ), - )), + ), i_cap_y, ), i_bias_y, @@ -157,7 +155,7 @@ pub(crate) unsafe fn avx2_rgb_to_y_row_impl( let y_h = _mm256_max_epi16( _mm256_min_epi16( - _mm256_srai_epi16::(_mm256_add_epi16( + _mm256_add_epi16( y_bias, _mm256_add_epi16( _mm256_add_epi16( @@ -166,7 +164,7 @@ pub(crate) unsafe fn avx2_rgb_to_y_row_impl( ), _mm256_mulhrs_epi16(b_high, v_yb), ), - )), + ), i_cap_y, ), i_bias_y, diff --git a/src/avx2/rgba_to_yuv.rs b/src/avx2/rgba_to_yuv.rs index 25c46f4..658f5d2 100644 --- a/src/avx2/rgba_to_yuv.rs +++ b/src/avx2/rgba_to_yuv.rs @@ -81,11 +81,9 @@ unsafe fn avx2_rgba_to_yuv_impl( let mut cx = start_cx; let mut uv_x = start_ux; - const V_SHR: i32 = 3; const V_SCALE: i32 = 6; - let rounding_const_bias: i16 = 1 << (V_SHR - 1); - let bias_y = range.bias_y as i16 * (1 << V_SHR) + rounding_const_bias; - let bias_uv = range.bias_uv as i16 * (1 << V_SHR) + rounding_const_bias; + let bias_y = range.bias_y as i16; + let bias_uv = range.bias_uv as i16; let i_bias_y = _mm256_set1_epi16(range.bias_y as i16); let i_cap_y = _mm256_set1_epi16(range.range_y as i16 + range.bias_y as i16); @@ -164,7 +162,7 @@ unsafe fn avx2_rgba_to_yuv_impl( let y_l = _mm256_max_epi16( _mm256_min_epi16( - _mm256_srai_epi16::(_mm256_add_epi16( + _mm256_add_epi16( y_bias, _mm256_add_epi16( _mm256_add_epi16( @@ -173,7 +171,7 @@ unsafe fn avx2_rgba_to_yuv_impl( ), _mm256_mulhrs_epi16(b_low, v_yb), ), - )), + ), i_cap_y, ), i_bias_y, @@ -181,7 +179,7 @@ unsafe fn avx2_rgba_to_yuv_impl( let y_h = _mm256_max_epi16( _mm256_min_epi16( - _mm256_srai_epi16::(_mm256_add_epi16( + _mm256_add_epi16( y_bias, _mm256_add_epi16( _mm256_add_epi16( @@ -190,7 +188,7 @@ unsafe fn avx2_rgba_to_yuv_impl( ), _mm256_mulhrs_epi16(b_high, v_yb), ), - )), + ), i_cap_y, ), i_bias_y, @@ -202,7 +200,7 @@ unsafe fn avx2_rgba_to_yuv_impl( if chroma_subsampling == YuvChromaSubsampling::Yuv444 { let cb_l = _mm256_max_epi16( _mm256_min_epi16( - _mm256_srai_epi16::(_mm256_add_epi16( + _mm256_add_epi16( uv_bias, _mm256_add_epi16( _mm256_add_epi16( @@ -211,14 +209,14 @@ unsafe fn avx2_rgba_to_yuv_impl( ), _mm256_mulhrs_epi16(b_low, v_cb_b), ), - )), + ), i_cap_uv, ), i_bias_y, ); let cr_l = _mm256_max_epi16( _mm256_min_epi16( - _mm256_srai_epi16::(_mm256_add_epi16( + _mm256_add_epi16( uv_bias, _mm256_add_epi16( _mm256_add_epi16( @@ -227,14 +225,14 @@ unsafe fn avx2_rgba_to_yuv_impl( ), _mm256_mulhrs_epi16(b_low, v_cr_b), ), - )), + ), i_cap_uv, ), i_bias_y, ); let cb_h = _mm256_max_epi16( _mm256_min_epi16( - _mm256_srai_epi16::(_mm256_add_epi16( + _mm256_add_epi16( uv_bias, _mm256_add_epi16( _mm256_add_epi16( @@ -243,14 +241,14 @@ unsafe fn avx2_rgba_to_yuv_impl( ), _mm256_mulhrs_epi16(b_high, v_cb_b), ), - )), + ), i_cap_uv, ), i_bias_y, ); let cr_h = _mm256_max_epi16( _mm256_min_epi16( - _mm256_srai_epi16::(_mm256_add_epi16( + _mm256_add_epi16( uv_bias, _mm256_add_epi16( _mm256_add_epi16( @@ -259,7 +257,7 @@ unsafe fn avx2_rgba_to_yuv_impl( ), _mm256_mulhrs_epi16(b_high, v_cr_b), ), - )), + ), i_cap_uv, ), i_bias_y, @@ -279,7 +277,7 @@ unsafe fn avx2_rgba_to_yuv_impl( let b1 = _mm256_avg_epu16(b_low, b_high); let cb = _mm256_max_epi16( _mm256_min_epi16( - _mm256_srai_epi16::(_mm256_add_epi16( + _mm256_add_epi16( uv_bias, _mm256_add_epi16( _mm256_add_epi16( @@ -288,14 +286,14 @@ unsafe fn avx2_rgba_to_yuv_impl( ), _mm256_mulhrs_epi16(b1, v_cb_b), ), - )), + ), i_cap_uv, ), i_bias_y, ); let cr = _mm256_max_epi16( _mm256_min_epi16( - _mm256_srai_epi16::(_mm256_add_epi16( + _mm256_add_epi16( uv_bias, _mm256_add_epi16( _mm256_add_epi16( @@ -304,7 +302,7 @@ unsafe fn avx2_rgba_to_yuv_impl( ), _mm256_mulhrs_epi16(b1, v_cr_b), ), - )), + ), i_cap_uv, ), i_bias_y, diff --git a/src/avx2/rgba_to_yuv420.rs b/src/avx2/rgba_to_yuv420.rs index f03618d..deb281c 100644 --- a/src/avx2/rgba_to_yuv420.rs +++ b/src/avx2/rgba_to_yuv420.rs @@ -31,9 +31,7 @@ use crate::avx2::avx2_utils::{ _mm256_deinterleave_rgba_epi8, avx2_deinterleave_rgb, avx2_pack_u16, }; use crate::internals::ProcessedOffset; -use crate::yuv_support::{ - CbCrForwardTransform, YuvChromaRange, YuvSourceChannels, -}; +use crate::yuv_support::{CbCrForwardTransform, YuvChromaRange, YuvSourceChannels}; #[cfg(target_arch = "x86")] use std::arch::x86::*; #[cfg(target_arch = "x86_64")] @@ -83,11 +81,9 @@ unsafe fn avx2_rgba_to_yuv_impl420( let mut cx = start_cx; let mut uv_x = start_ux; - const V_SHR: i32 = 3; - const V_SCALE: i32 = 6; - let rounding_const_bias: i16 = 1 << (V_SHR - 1); - let bias_y = range.bias_y as i16 * (1 << V_SHR) + rounding_const_bias; - let bias_uv = range.bias_uv as i16 * (1 << V_SHR) + rounding_const_bias; + const V_SCALE: i32 = 3; + let bias_y = range.bias_y as i16; + let bias_uv = range.bias_uv as i16; let i_bias_y = _mm256_set1_epi16(range.bias_y as i16); let i_cap_y = _mm256_set1_epi16(range.range_y as i16 + range.bias_y as i16); @@ -169,7 +165,8 @@ unsafe fn avx2_rgba_to_yuv_impl420( let row_31 = _mm256_loadu_si256(source_ptr1.add(64) as *const __m256i); let row_41 = _mm256_loadu_si256(source_ptr1.add(96) as *const __m256i); - let (it1, it2, it3, _) = _mm256_deinterleave_rgba_epi8(row_11, row_21, row_31, row_41); + let (it1, it2, it3, _) = + _mm256_deinterleave_rgba_epi8(row_11, row_21, row_31, row_41); if source_channels == YuvSourceChannels::Rgba { r_values1 = it1; g_values1 = it2; @@ -200,7 +197,7 @@ unsafe fn avx2_rgba_to_yuv_impl420( let y0_l = _mm256_max_epi16( _mm256_min_epi16( - _mm256_srai_epi16::(_mm256_add_epi16( + _mm256_add_epi16( y_bias, _mm256_add_epi16( _mm256_add_epi16( @@ -209,7 +206,7 @@ unsafe fn avx2_rgba_to_yuv_impl420( ), _mm256_mulhrs_epi16(b0_low, v_yb), ), - )), + ), i_cap_y, ), i_bias_y, @@ -217,7 +214,7 @@ unsafe fn avx2_rgba_to_yuv_impl420( let y0_h = _mm256_max_epi16( _mm256_min_epi16( - _mm256_srai_epi16::(_mm256_add_epi16( + _mm256_add_epi16( y_bias, _mm256_add_epi16( _mm256_add_epi16( @@ -226,7 +223,7 @@ unsafe fn avx2_rgba_to_yuv_impl420( ), _mm256_mulhrs_epi16(b0_high, v_yb), ), - )), + ), i_cap_y, ), i_bias_y, @@ -250,7 +247,7 @@ unsafe fn avx2_rgba_to_yuv_impl420( let y1_l = _mm256_max_epi16( _mm256_min_epi16( - _mm256_srai_epi16::(_mm256_add_epi16( + _mm256_add_epi16( y_bias, _mm256_add_epi16( _mm256_add_epi16( @@ -259,7 +256,7 @@ unsafe fn avx2_rgba_to_yuv_impl420( ), _mm256_mulhrs_epi16(b1_low, v_yb), ), - )), + ), i_cap_y, ), i_bias_y, @@ -267,7 +264,7 @@ unsafe fn avx2_rgba_to_yuv_impl420( let y1_h = _mm256_max_epi16( _mm256_min_epi16( - _mm256_srai_epi16::(_mm256_add_epi16( + _mm256_add_epi16( y_bias, _mm256_add_epi16( _mm256_add_epi16( @@ -276,7 +273,7 @@ unsafe fn avx2_rgba_to_yuv_impl420( ), _mm256_mulhrs_epi16(b1_high, v_yb), ), - )), + ), i_cap_y, ), i_bias_y, @@ -299,7 +296,7 @@ unsafe fn avx2_rgba_to_yuv_impl420( let b_uv = _mm256_avg_epu16(b0_low, b0_high); let cb = _mm256_max_epi16( _mm256_min_epi16( - _mm256_srai_epi16::(_mm256_add_epi16( + _mm256_add_epi16( uv_bias, _mm256_add_epi16( _mm256_add_epi16( @@ -308,14 +305,14 @@ unsafe fn avx2_rgba_to_yuv_impl420( ), _mm256_mulhrs_epi16(b_uv, v_cb_b), ), - )), + ), i_cap_uv, ), i_bias_y, ); let cr = _mm256_max_epi16( _mm256_min_epi16( - _mm256_srai_epi16::(_mm256_add_epi16( + _mm256_add_epi16( uv_bias, _mm256_add_epi16( _mm256_add_epi16( @@ -324,7 +321,7 @@ unsafe fn avx2_rgba_to_yuv_impl420( ), _mm256_mulhrs_epi16(b_uv, v_cr_b), ), - )), + ), i_cap_uv, ), i_bias_y, diff --git a/src/avx2/yuv_nv_to_rgba.rs b/src/avx2/yuv_nv_to_rgba.rs index 0274e53..1b86fd6 100644 --- a/src/avx2/yuv_nv_to_rgba.rs +++ b/src/avx2/yuv_nv_to_rgba.rs @@ -84,8 +84,7 @@ unsafe fn avx2_yuv_nv_to_rgba_row_impl< let uv_ptr = uv_plane.as_ptr(); let rgba_ptr = rgba.as_mut_ptr(); - const SCALE: i32 = 6; - const V_SHR: i32 = 3; + const SCALE: i32 = 3; let y_corr = _mm256_set1_epi8(range.bias_y as i8); let uv_corr = _mm256_set1_epi16(range.bias_uv as i16); @@ -95,7 +94,6 @@ unsafe fn avx2_yuv_nv_to_rgba_row_impl< let v_g_coeff_1 = _mm256_set1_epi16(transform.g_coeff_1 as i16); let v_g_coeff_2 = _mm256_set1_epi16(transform.g_coeff_2 as i16); let v_alpha = _mm256_set1_epi8(255u8 as i8); - let rounding_const = _mm256_set1_epi16(1 << (V_SHR - 1)); while cx + 32 < width { let y_values = @@ -161,24 +159,15 @@ unsafe fn avx2_yuv_nv_to_rgba_row_impl< v_luma_coeff, ); - let r_high = _mm256_srli_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_high, _mm256_mulhrs_epi16(v_high, v_cr_coeff)), - rounding_const, - )); - let b_high = _mm256_srli_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_high, _mm256_mulhrs_epi16(u_high, v_cb_coeff)), - rounding_const, - )); - let g_high = _mm256_srli_epi16::(_mm256_add_epi16( - _mm256_sub_epi16( - y_high, - _mm256_add_epi16( - _mm256_mulhrs_epi16(v_high, v_g_coeff_1), - _mm256_mulhrs_epi16(u_high, v_g_coeff_2), - ), + let r_high = _mm256_add_epi16(y_high, _mm256_mulhrs_epi16(v_high, v_cr_coeff)); + let b_high = _mm256_add_epi16(y_high, _mm256_mulhrs_epi16(u_high, v_cb_coeff)); + let g_high = _mm256_sub_epi16( + y_high, + _mm256_add_epi16( + _mm256_mulhrs_epi16(v_high, v_g_coeff_1), + _mm256_mulhrs_epi16(u_high, v_g_coeff_2), ), - rounding_const, - )); + ); let u_low = _mm256_slli_epi16::(_mm256_sub_epi16(_mm256_cvtepu8_epi16(u_low_u8), uv_corr)); @@ -189,24 +178,15 @@ unsafe fn avx2_yuv_nv_to_rgba_row_impl< v_luma_coeff, ); - let r_low = _mm256_srli_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_low, _mm256_mulhrs_epi16(v_low, v_cr_coeff)), - rounding_const, - )); - let b_low = _mm256_srli_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_low, _mm256_mulhrs_epi16(u_low, v_cb_coeff)), - rounding_const, - )); - let g_low = _mm256_srli_epi16::(_mm256_add_epi16( - _mm256_sub_epi16( - y_low, - _mm256_add_epi16( - _mm256_mulhrs_epi16(v_low, v_g_coeff_1), - _mm256_mulhrs_epi16(u_low, v_g_coeff_2), - ), + let r_low = _mm256_add_epi16(y_low, _mm256_mulhrs_epi16(v_low, v_cr_coeff)); + let b_low = _mm256_add_epi16(y_low, _mm256_mulhrs_epi16(u_low, v_cb_coeff)); + let g_low = _mm256_sub_epi16( + y_low, + _mm256_add_epi16( + _mm256_mulhrs_epi16(v_low, v_g_coeff_1), + _mm256_mulhrs_epi16(u_low, v_g_coeff_2), ), - rounding_const, - )); + ); let r_values = avx2_pack_u16(r_low, r_high); let g_values = avx2_pack_u16(g_low, g_high); diff --git a/src/avx2/yuv_nv_to_rgba420.rs b/src/avx2/yuv_nv_to_rgba420.rs index 82637f8..7cc721a 100644 --- a/src/avx2/yuv_nv_to_rgba420.rs +++ b/src/avx2/yuv_nv_to_rgba420.rs @@ -75,8 +75,7 @@ unsafe fn avx2_yuv_nv_to_rgba_row_impl420(_mm256_add_epi16( - _mm256_add_epi16(y_high0, _mm256_mulhrs_epi16(v_high, v_cr_coeff)), - rounding_const, - )); - let b_high0 = _mm256_srli_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_high0, _mm256_mulhrs_epi16(u_high, v_cb_coeff)), - rounding_const, - )); - let g_high0 = _mm256_srli_epi16::(_mm256_add_epi16( - _mm256_sub_epi16(y_high0, g_coeff_hi), - rounding_const, - )); - let r_high1 = _mm256_srli_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_high1, _mm256_mulhrs_epi16(v_high, v_cr_coeff)), - rounding_const, - )); - let b_high1 = _mm256_srli_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_high1, _mm256_mulhrs_epi16(u_high, v_cb_coeff)), - rounding_const, - )); - let g_high1 = _mm256_srli_epi16::(_mm256_add_epi16( - _mm256_sub_epi16(y_high1, g_coeff_hi), - rounding_const, - )); + let r_high0 = _mm256_add_epi16(y_high0, _mm256_mulhrs_epi16(v_high, v_cr_coeff)); + let b_high0 = _mm256_add_epi16(y_high0, _mm256_mulhrs_epi16(u_high, v_cb_coeff)); + let g_high0 = _mm256_sub_epi16(y_high0, g_coeff_hi); + let r_high1 = _mm256_add_epi16(y_high1, _mm256_mulhrs_epi16(v_high, v_cr_coeff)); + let b_high1 = _mm256_add_epi16(y_high1, _mm256_mulhrs_epi16(u_high, v_cb_coeff)); + let g_high1 = _mm256_sub_epi16(y_high1, g_coeff_hi); let u_low = _mm256_slli_epi16::(_mm256_sub_epi16(_mm256_cvtepu8_epi16(u_low_u8), uv_corr)); @@ -185,31 +165,13 @@ unsafe fn avx2_yuv_nv_to_rgba_row_impl420(_mm256_add_epi16( - _mm256_add_epi16(y_low0, _mm256_mulhrs_epi16(v_low, v_cr_coeff)), - rounding_const, - )); - let b_low0 = _mm256_srli_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_low0, _mm256_mulhrs_epi16(u_low, v_cb_coeff)), - rounding_const, - )); - let g_low0 = _mm256_srli_epi16::(_mm256_add_epi16( - _mm256_sub_epi16(y_low0, g_coeff_lo), - rounding_const, - )); - - let r_low1 = _mm256_srli_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_low1, _mm256_mulhrs_epi16(v_low, v_cr_coeff)), - rounding_const, - )); - let b_low1 = _mm256_srli_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_low1, _mm256_mulhrs_epi16(u_low, v_cb_coeff)), - rounding_const, - )); - let g_low1 = _mm256_srli_epi16::(_mm256_add_epi16( - _mm256_sub_epi16(y_low1, g_coeff_lo), - rounding_const, - )); + let r_low0 = _mm256_add_epi16(y_low0, _mm256_mulhrs_epi16(v_low, v_cr_coeff)); + let b_low0 = _mm256_add_epi16(y_low0, _mm256_mulhrs_epi16(u_low, v_cb_coeff)); + let g_low0 = _mm256_sub_epi16(y_low0, g_coeff_lo); + + let r_low1 = _mm256_add_epi16(y_low1, _mm256_mulhrs_epi16(v_low, v_cr_coeff)); + let b_low1 = _mm256_add_epi16(y_low1, _mm256_mulhrs_epi16(u_low, v_cb_coeff)); + let g_low1 = _mm256_sub_epi16(y_low1, g_coeff_lo); let r_values0 = avx2_pack_u16(r_low0, r_high0); let g_values0 = avx2_pack_u16(g_low0, g_high0); diff --git a/src/avx2/yuv_to_rgba.rs b/src/avx2/yuv_to_rgba.rs index 36711b3..27f54c7 100644 --- a/src/avx2/yuv_to_rgba.rs +++ b/src/avx2/yuv_to_rgba.rs @@ -87,10 +87,7 @@ unsafe fn avx2_yuv_to_rgba_row_impl(_mm256_add_epi16( - _mm256_add_epi16(y_high, _mm256_mulhrs_epi16(v_high, v_cr_coeff)), - rounding_const, - )); - let b_high = _mm256_srai_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_high, _mm256_mulhrs_epi16(u_high, v_cb_coeff)), - rounding_const, - )); - let g_high = _mm256_srai_epi16::(_mm256_add_epi16( - _mm256_sub_epi16( - y_high, - _mm256_add_epi16( - _mm256_mulhrs_epi16(v_high, v_g_coeff_1), - _mm256_mulhrs_epi16(u_high, v_g_coeff_2), - ), + let r_high = _mm256_add_epi16(y_high, _mm256_mulhrs_epi16(v_high, v_cr_coeff)); + let b_high = _mm256_add_epi16(y_high, _mm256_mulhrs_epi16(u_high, v_cb_coeff)); + let g_high = _mm256_sub_epi16( + y_high, + _mm256_add_epi16( + _mm256_mulhrs_epi16(v_high, v_g_coeff_1), + _mm256_mulhrs_epi16(u_high, v_g_coeff_2), ), - rounding_const, - )); + ); let u_low = _mm256_slli_epi16::(_mm256_sub_epi16(u_low_u16, uv_corr)); let v_low = _mm256_slli_epi16::(_mm256_sub_epi16(v_low_u16, uv_corr)); @@ -154,24 +142,15 @@ unsafe fn avx2_yuv_to_rgba_row_impl(_mm256_add_epi16( - _mm256_add_epi16(y_low, _mm256_mulhrs_epi16(v_low, v_cr_coeff)), - rounding_const, - )); - let b_low = _mm256_srai_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_low, _mm256_mulhrs_epi16(u_low, v_cb_coeff)), - rounding_const, - )); - let g_low = _mm256_srai_epi16::(_mm256_add_epi16( - _mm256_sub_epi16( - y_low, - _mm256_add_epi16( - _mm256_mulhrs_epi16(v_low, v_g_coeff_1), - _mm256_mulhrs_epi16(u_low, v_g_coeff_2), - ), + let r_low = _mm256_add_epi16(y_low, _mm256_mulhrs_epi16(v_low, v_cr_coeff)); + let b_low = _mm256_add_epi16(y_low, _mm256_mulhrs_epi16(u_low, v_cb_coeff)); + let g_low = _mm256_sub_epi16( + y_low, + _mm256_add_epi16( + _mm256_mulhrs_epi16(v_low, v_g_coeff_1), + _mm256_mulhrs_epi16(u_low, v_g_coeff_2), ), - rounding_const, - )); + ); let r_values = avx2_pack_u16(r_low, r_high); let g_values = avx2_pack_u16(g_low, g_high); diff --git a/src/avx2/yuv_to_rgba420.rs b/src/avx2/yuv_to_rgba420.rs index f85b167..686afa2 100644 --- a/src/avx2/yuv_to_rgba420.rs +++ b/src/avx2/yuv_to_rgba420.rs @@ -87,10 +87,7 @@ unsafe fn avx2_yuv_to_rgba_row_impl420( let v_g_coeff_2 = _mm256_set1_epi16(transform.g_coeff_2 as i16); let v_alpha = _mm256_set1_epi8(255u8 as i8); - const SCALE: i32 = 6; - const V_SHR: i32 = 3; - - let rounding_const = _mm256_set1_epi16(1 << (V_SHR - 1)); + const SCALE: i32 = 3; while cx + 32 < width { let y_values0 = _mm256_subs_epu8( @@ -130,31 +127,13 @@ unsafe fn avx2_yuv_to_rgba_row_impl420( _mm256_mulhrs_epi16(u_high, v_g_coeff_2), ); - let r_high0 = _mm256_srai_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_high0, _mm256_mulhrs_epi16(v_high, v_cr_coeff)), - rounding_const, - )); - let b_high0 = _mm256_srai_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_high0, _mm256_mulhrs_epi16(u_high, v_cb_coeff)), - rounding_const, - )); - let g_high0 = _mm256_srai_epi16::(_mm256_add_epi16( - _mm256_sub_epi16(y_high0, g_coeff_hi), - rounding_const, - )); + let r_high0 = _mm256_add_epi16(y_high0, _mm256_mulhrs_epi16(v_high, v_cr_coeff)); + let b_high0 = _mm256_add_epi16(y_high0, _mm256_mulhrs_epi16(u_high, v_cb_coeff)); + let g_high0 = _mm256_sub_epi16(y_high0, g_coeff_hi); - let r_high1 = _mm256_srai_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_high1, _mm256_mulhrs_epi16(v_high, v_cr_coeff)), - rounding_const, - )); - let b_high1 = _mm256_srai_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_high1, _mm256_mulhrs_epi16(u_high, v_cb_coeff)), - rounding_const, - )); - let g_high1 = _mm256_srai_epi16::(_mm256_add_epi16( - _mm256_sub_epi16(y_high1, g_coeff_hi), - rounding_const, - )); + let r_high1 = _mm256_add_epi16(y_high1, _mm256_mulhrs_epi16(v_high, v_cr_coeff)); + let b_high1 = _mm256_add_epi16(y_high1, _mm256_mulhrs_epi16(u_high, v_cb_coeff)); + let g_high1 = _mm256_sub_epi16(y_high1, g_coeff_hi); let u_low = _mm256_slli_epi16::(_mm256_sub_epi16(u_low_u16, uv_corr)); let v_low = _mm256_slli_epi16::(_mm256_sub_epi16(v_low_u16, uv_corr)); @@ -172,32 +151,13 @@ unsafe fn avx2_yuv_to_rgba_row_impl420( _mm256_mulhrs_epi16(u_low, v_g_coeff_2), ); - let r_low0 = _mm256_srai_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_low0, _mm256_mulhrs_epi16(v_low, v_cr_coeff)), - rounding_const, - )); - let b_low0 = _mm256_srai_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_low0, _mm256_mulhrs_epi16(u_low, v_cb_coeff)), - rounding_const, - )); - let g_low0 = _mm256_srai_epi16::(_mm256_add_epi16( - _mm256_sub_epi16(y_low0, g_coeff_lo), - rounding_const, - )); - - let r_low1 = _mm256_srai_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_low1, _mm256_mulhrs_epi16(v_low, v_cr_coeff)), - rounding_const, - )); - let b_low1 = _mm256_srai_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_low1, _mm256_mulhrs_epi16(u_low, v_cb_coeff)), - rounding_const, - )); - let g_low1 = _mm256_srai_epi16::(_mm256_add_epi16( - _mm256_sub_epi16(y_low1, g_coeff_lo), - rounding_const, - )); + let r_low0 = _mm256_add_epi16(y_low0, _mm256_mulhrs_epi16(v_low, v_cr_coeff)); + let b_low0 = _mm256_add_epi16(y_low0, _mm256_mulhrs_epi16(u_low, v_cb_coeff)); + let g_low0 = _mm256_sub_epi16(y_low0, g_coeff_lo); + let r_low1 = _mm256_add_epi16(y_low1, _mm256_mulhrs_epi16(v_low, v_cr_coeff)); + let b_low1 = _mm256_add_epi16(y_low1, _mm256_mulhrs_epi16(u_low, v_cb_coeff)); + let g_low1 = _mm256_sub_epi16(y_low1, g_coeff_lo); let r_values0 = avx2_pack_u16(r_low0, r_high0); let g_values0 = avx2_pack_u16(g_low0, g_high0); let b_values0 = avx2_pack_u16(b_low0, b_high0); diff --git a/src/avx2/yuv_to_rgba_alpha.rs b/src/avx2/yuv_to_rgba_alpha.rs index a33bee4..31fa219 100644 --- a/src/avx2/yuv_to_rgba_alpha.rs +++ b/src/avx2/yuv_to_rgba_alpha.rs @@ -93,8 +93,7 @@ unsafe fn avx2_yuv_to_rgba_alpha_impl(_mm256_add_epi16( - _mm256_add_epi16(y_high, _mm256_mulhrs_epi16(v_high, v_cr_coeff)), - rounding_const, - )); - let b_high = _mm256_srai_epi16::<3>(_mm256_add_epi16( - _mm256_add_epi16(y_high, _mm256_mulhrs_epi16(u_high, v_cb_coeff)), - rounding_const, - )); - let g_high = _mm256_srai_epi16::(_mm256_add_epi16( - _mm256_sub_epi16( - y_high, - _mm256_add_epi16( - _mm256_mulhrs_epi16(v_high, v_g_coeff_1), - _mm256_mulhrs_epi16(u_high, v_g_coeff_2), - ), + let r_high = _mm256_add_epi16(y_high, _mm256_mulhrs_epi16(v_high, v_cr_coeff)); + let b_high = _mm256_add_epi16(y_high, _mm256_mulhrs_epi16(u_high, v_cb_coeff)); + let g_high = _mm256_sub_epi16( + y_high, + _mm256_add_epi16( + _mm256_mulhrs_epi16(v_high, v_g_coeff_1), + _mm256_mulhrs_epi16(u_high, v_g_coeff_2), ), - rounding_const, - )); + ); let u_low = _mm256_slli_epi16::(_mm256_sub_epi16(u_low_u16, uv_corr)); let v_low = _mm256_slli_epi16::(_mm256_sub_epi16(v_low_u16, uv_corr)); @@ -169,24 +158,15 @@ unsafe fn avx2_yuv_to_rgba_alpha_impl(_mm256_add_epi16( - _mm256_add_epi16(y_low, _mm256_mulhrs_epi16(v_low, v_cr_coeff)), - rounding_const, - )); - let b_low = _mm256_srai_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_low, _mm256_mulhrs_epi16(u_low, v_cb_coeff)), - rounding_const, - )); - let g_low = _mm256_srai_epi16::(_mm256_add_epi16( - _mm256_sub_epi16( - y_low, - _mm256_add_epi16( - _mm256_mulhrs_epi16(v_low, v_g_coeff_1), - _mm256_mulhrs_epi16(u_low, v_g_coeff_2), - ), + let r_low = _mm256_add_epi16(y_low, _mm256_mulhrs_epi16(v_low, v_cr_coeff)); + let b_low = _mm256_add_epi16(y_low, _mm256_mulhrs_epi16(u_low, v_cb_coeff)); + let g_low = _mm256_sub_epi16( + y_low, + _mm256_add_epi16( + _mm256_mulhrs_epi16(v_low, v_g_coeff_1), + _mm256_mulhrs_epi16(u_low, v_g_coeff_2), ), - rounding_const, - )); + ); let (r_values, g_values, b_values); diff --git a/src/sse/mod.rs b/src/sse/mod.rs index 15be6cb..0441f8a 100644 --- a/src/sse/mod.rs +++ b/src/sse/mod.rs @@ -33,6 +33,7 @@ mod rgb_to_ycgco; mod rgb_to_ycgco_r; mod rgb_to_yuv_p16; mod rgba_to_yuv; +mod rgba_to_yuv420; pub(crate) mod sse_support; mod sse_ycbcr; mod sse_ycgco_r; @@ -50,7 +51,6 @@ mod yuv_to_rgba_alpha; mod yuv_to_yuy2; mod yuy2_to_rgb; mod yuy2_to_yuv; -mod rgba_to_yuv420; pub(crate) use rgb_to_nv::sse_rgba_to_nv_row; pub(crate) use rgb_to_y::sse_rgb_to_y; @@ -58,6 +58,7 @@ pub(crate) use rgb_to_ycgco::sse_rgb_to_ycgco_row; pub(crate) use rgb_to_ycgco_r::sse_rgb_to_ycgcor_row; pub(crate) use rgb_to_yuv_p16::{sse_rgba_to_yuv_p16, sse_rgba_to_yuv_p16_lp}; pub(crate) use rgba_to_yuv::sse_rgba_to_yuv_row; +pub(crate) use rgba_to_yuv420::sse_rgba_to_yuv_row420; pub(crate) use sse_support::*; pub(crate) use ycgco_to_rgb::sse_ycgco_to_rgb_row; pub(crate) use ycgco_to_rgb_alpha::sse_ycgco_to_rgb_alpha_row; @@ -73,4 +74,3 @@ pub(crate) use yuv_to_rgba_alpha::sse_yuv_to_rgba_alpha_row; pub(crate) use yuv_to_yuy2::yuv_to_yuy2_sse; pub(crate) use yuy2_to_rgb::yuy2_to_rgb_sse; pub(crate) use yuy2_to_yuv::yuy2_to_yuv_sse; -pub(crate) use rgba_to_yuv420::sse_rgba_to_yuv_row420; \ No newline at end of file diff --git a/src/sse/rgb_to_nv.rs b/src/sse/rgb_to_nv.rs index 2fedfae..f281a31 100644 --- a/src/sse/rgb_to_nv.rs +++ b/src/sse/rgb_to_nv.rs @@ -95,11 +95,9 @@ unsafe fn sse_rgba_to_nv_row_impl< let mut cx = start_cx; let mut uv_x = start_ux; - const V_SHR: i32 = 3; - const V_SCALE: i32 = 6; - let rounding_const_bias: i16 = 1 << (V_SHR - 1); - let bias_y = range.bias_y as i16 * (1 << V_SHR) + rounding_const_bias; - let bias_uv = range.bias_uv as i16 * (1 << V_SHR) + rounding_const_bias; + const V_SCALE: i32 = 3; + let bias_y = range.bias_y as i16; + let bias_uv = range.bias_uv as i16; let i_bias_y = _mm_set1_epi16(range.bias_y as i16); let i_cap_y = _mm_set1_epi16(range.range_y as i16 + range.bias_y as i16); @@ -171,7 +169,7 @@ unsafe fn sse_rgba_to_nv_row_impl< let y_l = _mm_max_epi16( _mm_min_epi16( - _mm_srai_epi16::(_mm_add_epi16( + (_mm_add_epi16( y_bias, _mm_add_epi16( _mm_add_epi16(_mm_mulhrs_epi16(r_low, v_yr), _mm_mulhrs_epi16(g_low, v_yg)), @@ -185,7 +183,7 @@ unsafe fn sse_rgba_to_nv_row_impl< let y_h = _mm_max_epi16( _mm_min_epi16( - _mm_srai_epi16::(_mm_add_epi16( + (_mm_add_epi16( y_bias, _mm_add_epi16( _mm_add_epi16( @@ -206,7 +204,7 @@ unsafe fn sse_rgba_to_nv_row_impl< if chroma_subsampling == YuvChromaSubsampling::Yuv444 { let cb_l = _mm_max_epi16( _mm_min_epi16( - _mm_srai_epi16::(_mm_add_epi16( + (_mm_add_epi16( uv_bias, _mm_add_epi16( _mm_add_epi16( @@ -222,7 +220,7 @@ unsafe fn sse_rgba_to_nv_row_impl< ); let cr_l = _mm_max_epi16( _mm_min_epi16( - _mm_srai_epi16::(_mm_add_epi16( + _mm_add_epi16( uv_bias, _mm_add_epi16( _mm_add_epi16( @@ -231,14 +229,14 @@ unsafe fn sse_rgba_to_nv_row_impl< ), _mm_mulhrs_epi16(b_low, v_cr_b), ), - )), + ), i_cap_uv, ), i_bias_y, ); let cb_h = _mm_max_epi16( _mm_min_epi16( - _mm_srai_epi16::(_mm_add_epi16( + (_mm_add_epi16( uv_bias, _mm_add_epi16( _mm_add_epi16( @@ -254,7 +252,7 @@ unsafe fn sse_rgba_to_nv_row_impl< ); let cr_h = _mm_max_epi16( _mm_min_epi16( - _mm_srai_epi16::(_mm_add_epi16( + (_mm_add_epi16( uv_bias, _mm_add_epi16( _mm_add_epi16( @@ -294,7 +292,7 @@ unsafe fn sse_rgba_to_nv_row_impl< let cbk = _mm_max_epi16( _mm_min_epi16( - _mm_srai_epi16::(_mm_add_epi16( + (_mm_add_epi16( uv_bias, _mm_add_epi16( _mm_add_epi16( @@ -311,7 +309,7 @@ unsafe fn sse_rgba_to_nv_row_impl< let crk = _mm_max_epi16( _mm_min_epi16( - _mm_srai_epi16::(_mm_add_epi16( + (_mm_add_epi16( uv_bias, _mm_add_epi16( _mm_add_epi16( diff --git a/src/sse/rgb_to_y.rs b/src/sse/rgb_to_y.rs index 5f0f046..5b4dc5f 100644 --- a/src/sse/rgb_to_y.rs +++ b/src/sse/rgb_to_y.rs @@ -64,10 +64,8 @@ unsafe fn sse_rgb_to_y_impl( let mut cx = start_cx; - const V_SHR: i32 = 3; - const V_SCALE: i32 = 6; - let rounding_const_bias: i16 = 1 << (V_SHR - 1); - let bias_y = range.bias_y as i16 * (1 << V_SHR) + rounding_const_bias; + const V_SCALE: i32 = 3; + let bias_y = range.bias_y as i16; let zeros = _mm_setzero_si128(); @@ -130,7 +128,7 @@ unsafe fn sse_rgb_to_y_impl( let y_l = _mm_max_epi16( _mm_min_epi16( - _mm_srai_epi16::(_mm_add_epi16( + (_mm_add_epi16( y_bias, _mm_add_epi16( _mm_add_epi16(_mm_mulhrs_epi16(r_low, v_yr), _mm_mulhrs_epi16(g_low, v_yg)), @@ -144,7 +142,7 @@ unsafe fn sse_rgb_to_y_impl( let y_h = _mm_max_epi16( _mm_min_epi16( - _mm_srai_epi16::(_mm_add_epi16( + (_mm_add_epi16( y_bias, _mm_add_epi16( _mm_add_epi16( diff --git a/src/sse/rgba_to_yuv.rs b/src/sse/rgba_to_yuv.rs index be395ac..dbd1954 100644 --- a/src/sse/rgba_to_yuv.rs +++ b/src/sse/rgba_to_yuv.rs @@ -79,11 +79,9 @@ unsafe fn sse_rgba_to_yuv_row_impl(_mm_add_epi16( + (_mm_add_epi16( y_bias, _mm_add_epi16( _mm_add_epi16(_mm_mulhrs_epi16(r_low, v_yr), _mm_mulhrs_epi16(g_low, v_yg)), @@ -169,7 +167,7 @@ unsafe fn sse_rgba_to_yuv_row_impl(_mm_add_epi16( + (_mm_add_epi16( y_bias, _mm_add_epi16( _mm_add_epi16( @@ -190,7 +188,7 @@ unsafe fn sse_rgba_to_yuv_row_impl(_mm_add_epi16( + (_mm_add_epi16( uv_bias, _mm_add_epi16( _mm_add_epi16( @@ -206,7 +204,7 @@ unsafe fn sse_rgba_to_yuv_row_impl(_mm_add_epi16( + (_mm_add_epi16( uv_bias, _mm_add_epi16( _mm_add_epi16( @@ -222,7 +220,7 @@ unsafe fn sse_rgba_to_yuv_row_impl(_mm_add_epi16( + (_mm_add_epi16( uv_bias, _mm_add_epi16( _mm_add_epi16( @@ -238,7 +236,7 @@ unsafe fn sse_rgba_to_yuv_row_impl(_mm_add_epi16( + (_mm_add_epi16( uv_bias, _mm_add_epi16( _mm_add_epi16( @@ -269,7 +267,7 @@ unsafe fn sse_rgba_to_yuv_row_impl(_mm_add_epi16( + (_mm_add_epi16( uv_bias, _mm_add_epi16( _mm_add_epi16( @@ -286,7 +284,7 @@ unsafe fn sse_rgba_to_yuv_row_impl(_mm_add_epi16( + (_mm_add_epi16( uv_bias, _mm_add_epi16( _mm_add_epi16( diff --git a/src/sse/rgba_to_yuv420.rs b/src/sse/rgba_to_yuv420.rs index b53a6c7..112fbfd 100644 --- a/src/sse/rgba_to_yuv420.rs +++ b/src/sse/rgba_to_yuv420.rs @@ -79,11 +79,9 @@ unsafe fn sse_rgba_to_yuv_row_impl420( let mut cx = start_cx; let mut uv_x = start_ux; - const V_SHR: i32 = 3; - const V_SCALE: i32 = 6; - let rounding_const_bias: i16 = 1 << (V_SHR - 1); - let bias_y = range.bias_y as i16 * (1 << V_SHR) + rounding_const_bias; - let bias_uv = range.bias_uv as i16 * (1 << V_SHR) + rounding_const_bias; + const V_SCALE: i32 = 3; + let bias_y = range.bias_y as i16; + let bias_uv = range.bias_uv as i16; let i_bias_y = _mm_set1_epi16(range.bias_y as i16); let i_cap_y = _mm_set1_epi16(range.range_y as i16 + range.bias_y as i16); @@ -189,7 +187,7 @@ unsafe fn sse_rgba_to_yuv_row_impl420( let y0_l = _mm_max_epi16( _mm_min_epi16( - _mm_srai_epi16::(_mm_add_epi16( + (_mm_add_epi16( y_bias, _mm_add_epi16( _mm_add_epi16( @@ -206,7 +204,7 @@ unsafe fn sse_rgba_to_yuv_row_impl420( let y0_h = _mm_max_epi16( _mm_min_epi16( - _mm_srai_epi16::(_mm_add_epi16( + (_mm_add_epi16( y_bias, _mm_add_epi16( _mm_add_epi16( @@ -230,7 +228,7 @@ unsafe fn sse_rgba_to_yuv_row_impl420( let y1_l = _mm_max_epi16( _mm_min_epi16( - _mm_srai_epi16::(_mm_add_epi16( + (_mm_add_epi16( y_bias, _mm_add_epi16( _mm_add_epi16( @@ -247,7 +245,7 @@ unsafe fn sse_rgba_to_yuv_row_impl420( let y1_h = _mm_max_epi16( _mm_min_epi16( - _mm_srai_epi16::(_mm_add_epi16( + (_mm_add_epi16( y_bias, _mm_add_epi16( _mm_add_epi16( @@ -280,7 +278,7 @@ unsafe fn sse_rgba_to_yuv_row_impl420( let cbk = _mm_max_epi16( _mm_min_epi16( - _mm_srai_epi16::(_mm_add_epi16( + (_mm_add_epi16( uv_bias, _mm_add_epi16( _mm_add_epi16(_mm_mulhrs_epi16(r1, v_cb_r), _mm_mulhrs_epi16(g1, v_cb_g)), @@ -294,7 +292,7 @@ unsafe fn sse_rgba_to_yuv_row_impl420( let crk = _mm_max_epi16( _mm_min_epi16( - _mm_srai_epi16::(_mm_add_epi16( + (_mm_add_epi16( uv_bias, _mm_add_epi16( _mm_add_epi16(_mm_mulhrs_epi16(r1, v_cr_r), _mm_mulhrs_epi16(g1, v_cr_g)), diff --git a/src/sse/yuv_nv_to_rgba.rs b/src/sse/yuv_nv_to_rgba.rs index eeddc51..9df60b1 100644 --- a/src/sse/yuv_nv_to_rgba.rs +++ b/src/sse/yuv_nv_to_rgba.rs @@ -86,8 +86,7 @@ unsafe fn sse_yuv_nv_to_rgba_impl< let uv_ptr = uv_plane.as_ptr(); let rgba_ptr = rgba.as_mut_ptr(); - const SCALE: i32 = 6; - const V_SHR: i32 = 3; + const SCALE: i32 = 3; let y_corr = _mm_set1_epi8(range.bias_y as i8); let uv_corr = _mm_set1_epi16(range.bias_uv as i16); @@ -97,7 +96,6 @@ unsafe fn sse_yuv_nv_to_rgba_impl< let v_g_coeff_1 = _mm_set1_epi16(transform.g_coeff_1 as i16); let v_g_coeff_2 = _mm_set1_epi16(transform.g_coeff_2 as i16); let v_alpha = _mm_set1_epi8(255u8 as i8); - let rounding_const = _mm_set1_epi16(1 << (V_SHR - 1)); let zeros = _mm_setzero_si128(); @@ -161,24 +159,15 @@ unsafe fn sse_yuv_nv_to_rgba_impl< v_luma_coeff, ); - let r_high = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_high, _mm_mulhrs_epi16(v_high, v_cr_coeff)), - rounding_const, - )); - let b_high = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_high, _mm_mulhrs_epi16(u_high, v_cb_coeff)), - rounding_const, - )); - let g_high = _mm_srai_epi16::(_mm_add_epi16( - _mm_sub_epi16( - y_high, - _mm_add_epi16( - _mm_mulhrs_epi16(v_high, v_g_coeff_1), - _mm_mulhrs_epi16(u_high, v_g_coeff_2), - ), + let r_high = _mm_add_epi16(y_high, _mm_mulhrs_epi16(v_high, v_cr_coeff)); + let b_high = _mm_add_epi16(y_high, _mm_mulhrs_epi16(u_high, v_cb_coeff)); + let g_high = _mm_sub_epi16( + y_high, + _mm_add_epi16( + _mm_mulhrs_epi16(v_high, v_g_coeff_1), + _mm_mulhrs_epi16(u_high, v_g_coeff_2), ), - rounding_const, - )); + ); let u_low = _mm_slli_epi16::(_mm_sub_epi16(u_low_u16, uv_corr)); let v_low = _mm_slli_epi16::(_mm_sub_epi16(v_low_u16, uv_corr)); @@ -187,24 +176,15 @@ unsafe fn sse_yuv_nv_to_rgba_impl< v_luma_coeff, ); - let r_low = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_low, _mm_mulhrs_epi16(v_low, v_cr_coeff)), - rounding_const, - )); - let b_low = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_low, _mm_mulhrs_epi16(u_low, v_cb_coeff)), - rounding_const, - )); - let g_low = _mm_srai_epi16::(_mm_add_epi16( - _mm_sub_epi16( - y_low, - _mm_add_epi16( - _mm_mulhrs_epi16(v_low, v_g_coeff_1), - _mm_mulhrs_epi16(u_low, v_g_coeff_2), - ), + let r_low = _mm_add_epi16(y_low, _mm_mulhrs_epi16(v_low, v_cr_coeff)); + let b_low = _mm_add_epi16(y_low, _mm_mulhrs_epi16(u_low, v_cb_coeff)); + let g_low = _mm_sub_epi16( + y_low, + _mm_add_epi16( + _mm_mulhrs_epi16(v_low, v_g_coeff_1), + _mm_mulhrs_epi16(u_low, v_g_coeff_2), ), - rounding_const, - )); + ); let r_values = _mm_packus_epi16(r_low, r_high); let g_values = _mm_packus_epi16(g_low, g_high); @@ -300,24 +280,15 @@ unsafe fn sse_yuv_nv_to_rgba_impl< v_luma_coeff, ); - let r_low = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_low, _mm_mulhrs_epi16(v_low, v_cr_coeff)), - rounding_const, - )); - let b_low = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_low, _mm_mulhrs_epi16(u_low, v_cb_coeff)), - rounding_const, - )); - let g_low = _mm_srai_epi16::(_mm_add_epi16( - _mm_sub_epi16( - y_low, - _mm_add_epi16( - _mm_mulhrs_epi16(v_low, v_g_coeff_1), - _mm_mulhrs_epi16(u_low, v_g_coeff_2), - ), + let r_low = _mm_add_epi16(y_low, _mm_mulhrs_epi16(v_low, v_cr_coeff)); + let b_low = _mm_add_epi16(y_low, _mm_mulhrs_epi16(u_low, v_cb_coeff)); + let g_low = _mm_sub_epi16( + y_low, + _mm_add_epi16( + _mm_mulhrs_epi16(v_low, v_g_coeff_1), + _mm_mulhrs_epi16(u_low, v_g_coeff_2), ), - rounding_const, - )); + ); let r_values = _mm_packus_epi16(r_low, zeros); let g_values = _mm_packus_epi16(g_low, zeros); diff --git a/src/sse/yuv_nv_to_rgba420.rs b/src/sse/yuv_nv_to_rgba420.rs index 5d3689f..03acdbe 100644 --- a/src/sse/yuv_nv_to_rgba420.rs +++ b/src/sse/yuv_nv_to_rgba420.rs @@ -77,8 +77,7 @@ unsafe fn sse_yuv_nv_to_rgba_impl420(_mm_add_epi16( - _mm_add_epi16(y_high0, _mm_mulhrs_epi16(v_high, v_cr_coeff)), - rounding_const, - )); - let b_high0 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_high0, _mm_mulhrs_epi16(u_high, v_cb_coeff)), - rounding_const, - )); - let g_high0 = _mm_srai_epi16::(_mm_add_epi16( - _mm_sub_epi16(y_high0, g_coeff_hi), - rounding_const, - )); - let r_high1 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_high1, _mm_mulhrs_epi16(v_high, v_cr_coeff)), - rounding_const, - )); - let b_high1 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_high1, _mm_mulhrs_epi16(u_high, v_cb_coeff)), - rounding_const, - )); - let g_high1 = _mm_srai_epi16::(_mm_add_epi16( - _mm_sub_epi16(y_high1, g_coeff_hi), - rounding_const, - )); + let r_high0 = _mm_add_epi16(y_high0, _mm_mulhrs_epi16(v_high, v_cr_coeff)); + let b_high0 = _mm_add_epi16(y_high0, _mm_mulhrs_epi16(u_high, v_cb_coeff)); + let g_high0 = _mm_sub_epi16(y_high0, g_coeff_hi); + let r_high1 = _mm_add_epi16(y_high1, _mm_mulhrs_epi16(v_high, v_cr_coeff)); + let b_high1 = _mm_add_epi16(y_high1, _mm_mulhrs_epi16(u_high, v_cb_coeff)); + let g_high1 = _mm_sub_epi16(y_high1, g_coeff_hi); let u_low = _mm_slli_epi16::(_mm_sub_epi16(u_low_u16, uv_corr)); let v_low = _mm_slli_epi16::(_mm_sub_epi16(v_low_u16, uv_corr)); @@ -184,30 +163,12 @@ unsafe fn sse_yuv_nv_to_rgba_impl420(_mm_add_epi16( - _mm_add_epi16(y_low0, _mm_mulhrs_epi16(v_low, v_cr_coeff)), - rounding_const, - )); - let b_low0 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_low0, _mm_mulhrs_epi16(u_low, v_cb_coeff)), - rounding_const, - )); - let g_low0 = _mm_srai_epi16::(_mm_add_epi16( - _mm_sub_epi16(y_low0, g_coeff_lo), - rounding_const, - )); - let r_low1 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_low1, _mm_mulhrs_epi16(v_low, v_cr_coeff)), - rounding_const, - )); - let b_low1 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_low1, _mm_mulhrs_epi16(u_low, v_cb_coeff)), - rounding_const, - )); - let g_low1 = _mm_srai_epi16::(_mm_add_epi16( - _mm_sub_epi16(y_low1, g_coeff_lo), - rounding_const, - )); + let r_low0 = _mm_add_epi16(y_low0, _mm_mulhrs_epi16(v_low, v_cr_coeff)); + let b_low0 = _mm_add_epi16(y_low0, _mm_mulhrs_epi16(u_low, v_cb_coeff)); + let g_low0 = _mm_sub_epi16(y_low0, g_coeff_lo); + let r_low1 = _mm_add_epi16(y_low1, _mm_mulhrs_epi16(v_low, v_cr_coeff)); + let b_low1 = _mm_add_epi16(y_low1, _mm_mulhrs_epi16(u_low, v_cb_coeff)); + let g_low1 = _mm_sub_epi16(y_low1, g_coeff_lo); let r_values0 = _mm_packus_epi16(r_low0, r_high0); let g_values0 = _mm_packus_epi16(g_low0, g_high0); @@ -331,31 +292,13 @@ unsafe fn sse_yuv_nv_to_rgba_impl420(_mm_add_epi16( - _mm_add_epi16(y_low0, _mm_mulhrs_epi16(v_low, v_cr_coeff)), - rounding_const, - )); - let b_low0 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_low0, _mm_mulhrs_epi16(u_low, v_cb_coeff)), - rounding_const, - )); - let g_low0 = _mm_srai_epi16::(_mm_add_epi16( - _mm_sub_epi16(y_low0, g_coeff_lo), - rounding_const, - )); - - let r_low1 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_low1, _mm_mulhrs_epi16(v_low, v_cr_coeff)), - rounding_const, - )); - let b_low1 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_low1, _mm_mulhrs_epi16(u_low, v_cb_coeff)), - rounding_const, - )); - let g_low1 = _mm_srai_epi16::(_mm_add_epi16( - _mm_sub_epi16(y_low1, g_coeff_lo), - rounding_const, - )); + let r_low0 = _mm_add_epi16(y_low0, _mm_mulhrs_epi16(v_low, v_cr_coeff)); + let b_low0 = _mm_add_epi16(y_low0, _mm_mulhrs_epi16(u_low, v_cb_coeff)); + let g_low0 = _mm_sub_epi16(y_low0, g_coeff_lo); + + let r_low1 = _mm_add_epi16(y_low1, _mm_mulhrs_epi16(v_low, v_cr_coeff)); + let b_low1 = _mm_add_epi16(y_low1, _mm_mulhrs_epi16(u_low, v_cb_coeff)); + let g_low1 = _mm_sub_epi16(y_low1, g_coeff_lo); let r_values0 = _mm_packus_epi16(r_low0, zeros); let g_values0 = _mm_packus_epi16(g_low0, zeros); diff --git a/src/sse/yuv_to_rgba.rs b/src/sse/yuv_to_rgba.rs index 0ba8153..edc7500 100644 --- a/src/sse/yuv_to_rgba.rs +++ b/src/sse/yuv_to_rgba.rs @@ -80,8 +80,7 @@ unsafe fn sse_yuv_to_rgba_row_impl(_mm_sub_epi16(u_high_u16, uv_corr)); let v_high = _mm_slli_epi16::(_mm_sub_epi16(v_high_u16, uv_corr)); - let y_high = _mm_mulhi_epi16( + let y_high = _mm_mulhrs_epi16( _mm_slli_epi16::(_mm_unpackhi_epi8(y_values, zeros)), v_luma_coeff, ); - let r_high = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_high, _mm_mulhi_epi16(v_high, v_cr_coeff)), - rounding_const, - )); - let b_high = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_high, _mm_mulhi_epi16(u_high, v_cb_coeff)), - rounding_const, - )); - let g_high = _mm_srai_epi16::(_mm_add_epi16( - _mm_sub_epi16( - y_high, - _mm_add_epi16( - _mm_mulhi_epi16(v_high, v_g_coeff_1), - _mm_mulhi_epi16(u_high, v_g_coeff_2), - ), + let r_high = _mm_add_epi16(y_high, _mm_mulhrs_epi16(v_high, v_cr_coeff)); + let b_high = _mm_add_epi16(y_high, _mm_mulhrs_epi16(u_high, v_cb_coeff)); + let g_high = _mm_sub_epi16( + y_high, + _mm_add_epi16( + _mm_mulhrs_epi16(v_high, v_g_coeff_1), + _mm_mulhrs_epi16(u_high, v_g_coeff_2), ), - rounding_const, - )); + ); let u_low = _mm_slli_epi16::(_mm_sub_epi16(u_low_u16, uv_corr)); let v_low = _mm_slli_epi16::(_mm_sub_epi16(v_low_u16, uv_corr)); - let y_low = _mm_mulhi_epi16( + let y_low = _mm_mulhrs_epi16( _mm_slli_epi16::(_mm_cvtepu8_epi16(y_values)), v_luma_coeff, ); - let r_low = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_low, _mm_mulhi_epi16(v_low, v_cr_coeff)), - rounding_const, - )); - let b_low = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_low, _mm_mulhi_epi16(u_low, v_cb_coeff)), - rounding_const, - )); - let g_low = _mm_srai_epi16::(_mm_add_epi16( - _mm_sub_epi16( - y_low, - _mm_add_epi16( - _mm_mulhi_epi16(v_low, v_g_coeff_1), - _mm_mulhi_epi16(u_low, v_g_coeff_2), - ), + let r_low = _mm_add_epi16(y_low, _mm_mulhrs_epi16(v_low, v_cr_coeff)); + let b_low = _mm_add_epi16(y_low, _mm_mulhrs_epi16(u_low, v_cb_coeff)); + let g_low = _mm_sub_epi16( + y_low, + _mm_add_epi16( + _mm_mulhrs_epi16(v_low, v_g_coeff_1), + _mm_mulhrs_epi16(u_low, v_g_coeff_2), ), - rounding_const, - )); + ); let r_values = _mm_packus_epi16(r_low, r_high); let g_values = _mm_packus_epi16(g_low, g_high); @@ -252,29 +232,20 @@ unsafe fn sse_yuv_to_rgba_row_impl(_mm_sub_epi16(u_low_u16, uv_corr)); let v_low = _mm_slli_epi16::(_mm_sub_epi16(v_low_u16, uv_corr)); - let y_low = _mm_mulhi_epi16( + let y_low = _mm_mulhrs_epi16( _mm_slli_epi16::(_mm_cvtepu8_epi16(y_values)), v_luma_coeff, ); - let r_low = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_low, _mm_mulhi_epi16(v_low, v_cr_coeff)), - rounding_const, - )); - let b_low = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_low, _mm_mulhi_epi16(u_low, v_cb_coeff)), - rounding_const, - )); - let g_low = _mm_srai_epi16::(_mm_add_epi16( - _mm_sub_epi16( - y_low, - _mm_add_epi16( - _mm_mulhi_epi16(v_low, v_g_coeff_1), - _mm_mulhi_epi16(u_low, v_g_coeff_2), - ), + let r_low = _mm_add_epi16(y_low, _mm_mulhrs_epi16(v_low, v_cr_coeff)); + let b_low = _mm_add_epi16(y_low, _mm_mulhrs_epi16(u_low, v_cb_coeff)); + let g_low = _mm_sub_epi16( + y_low, + _mm_add_epi16( + _mm_mulhrs_epi16(v_low, v_g_coeff_1), + _mm_mulhrs_epi16(u_low, v_g_coeff_2), ), - rounding_const, - )); + ); let r_values = _mm_packus_epi16(r_low, zeros); let g_values = _mm_packus_epi16(g_low, zeros); diff --git a/src/sse/yuv_to_rgba420.rs b/src/sse/yuv_to_rgba420.rs index bf45971..4b0f599 100644 --- a/src/sse/yuv_to_rgba420.rs +++ b/src/sse/yuv_to_rgba420.rs @@ -80,8 +80,7 @@ unsafe fn sse_yuv_to_rgba_row_impl420( let u_ptr = u_plane.as_ptr(); let v_ptr = v_plane.as_ptr(); - const SCALE: i32 = 6; - const V_SHR: i32 = 3; + const SCALE: i32 = 3; let y_corr = _mm_set1_epi8(range.bias_y as i8); let uv_corr = _mm_set1_epi16(range.bias_uv as i16); @@ -91,7 +90,6 @@ unsafe fn sse_yuv_to_rgba_row_impl420( let v_g_coeff_1 = _mm_set1_epi16(transform.g_coeff_1 as i16); let v_g_coeff_2 = _mm_set1_epi16(transform.g_coeff_2 as i16); let v_alpha = _mm_set1_epi8(255u8 as i8); - let rounding_const = _mm_set1_epi16(1 << (V_SHR - 1)); let zeros = _mm_setzero_si128(); let reshuffle = _mm_setr_epi8(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7); @@ -130,31 +128,13 @@ unsafe fn sse_yuv_to_rgba_row_impl420( _mm_mulhrs_epi16(u_high, v_g_coeff_2), ); - let r_high0 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_high0, _mm_mulhrs_epi16(v_high, v_cr_coeff)), - rounding_const, - )); - let b_high0 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_high0, _mm_mulhrs_epi16(u_high, v_cb_coeff)), - rounding_const, - )); - let g_high0 = _mm_srai_epi16::(_mm_add_epi16( - _mm_sub_epi16(y_high0, g_coeff_hi), - rounding_const, - )); - - let r_high1 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_high1, _mm_mulhrs_epi16(v_high, v_cr_coeff)), - rounding_const, - )); - let b_high1 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_high1, _mm_mulhrs_epi16(u_high, v_cb_coeff)), - rounding_const, - )); - let g_high1 = _mm_srai_epi16::(_mm_add_epi16( - _mm_sub_epi16(y_high1, g_coeff_hi), - rounding_const, - )); + let r_high0 = _mm_add_epi16(y_high0, _mm_mulhrs_epi16(v_high, v_cr_coeff)); + let b_high0 = _mm_add_epi16(y_high0, _mm_mulhrs_epi16(u_high, v_cb_coeff)); + let g_high0 = _mm_sub_epi16(y_high0, g_coeff_hi); + + let r_high1 = _mm_add_epi16(y_high1, _mm_mulhrs_epi16(v_high, v_cr_coeff)); + let b_high1 = _mm_add_epi16(y_high1, _mm_mulhrs_epi16(u_high, v_cb_coeff)); + let g_high1 = _mm_sub_epi16(y_high1, g_coeff_hi); let u_low = _mm_slli_epi16::(_mm_sub_epi16(u_low_u16, uv_corr)); let v_low = _mm_slli_epi16::(_mm_sub_epi16(v_low_u16, uv_corr)); @@ -172,31 +152,13 @@ unsafe fn sse_yuv_to_rgba_row_impl420( _mm_mulhrs_epi16(u_low, v_g_coeff_2), ); - let r_low0 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_low0, _mm_mulhrs_epi16(v_low, v_cr_coeff)), - rounding_const, - )); - let b_low0 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_low0, _mm_mulhrs_epi16(u_low, v_cb_coeff)), - rounding_const, - )); - let g_low0 = _mm_srai_epi16::(_mm_add_epi16( - _mm_sub_epi16(y_low0, g_coeff_lo), - rounding_const, - )); - - let r_low1 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_low1, _mm_mulhrs_epi16(v_low, v_cr_coeff)), - rounding_const, - )); - let b_low1 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_low1, _mm_mulhrs_epi16(u_low, v_cb_coeff)), - rounding_const, - )); - let g_low1 = _mm_srai_epi16::(_mm_add_epi16( - _mm_sub_epi16(y_low1, g_coeff_lo), - rounding_const, - )); + let r_low0 = _mm_add_epi16(y_low0, _mm_mulhrs_epi16(v_low, v_cr_coeff)); + let b_low0 = _mm_add_epi16(y_low0, _mm_mulhrs_epi16(u_low, v_cb_coeff)); + let g_low0 = _mm_sub_epi16(y_low0, g_coeff_lo); + + let r_low1 = _mm_add_epi16(y_low1, _mm_mulhrs_epi16(v_low, v_cr_coeff)); + let b_low1 = _mm_add_epi16(y_low1, _mm_mulhrs_epi16(u_low, v_cb_coeff)); + let g_low1 = _mm_sub_epi16(y_low1, g_coeff_lo); let r_values0 = _mm_packus_epi16(r_low0, r_high0); let g_values0 = _mm_packus_epi16(g_low0, g_high0); @@ -318,31 +280,13 @@ unsafe fn sse_yuv_to_rgba_row_impl420( _mm_mulhrs_epi16(u_low, v_g_coeff_2), ); - let r_low0 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_low0, _mm_mulhrs_epi16(v_low, v_cr_coeff)), - rounding_const, - )); - let b_low0 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_low0, _mm_mulhrs_epi16(u_low, v_cb_coeff)), - rounding_const, - )); - let g_low0 = _mm_srai_epi16::(_mm_add_epi16( - _mm_sub_epi16(y_low0, g_coeff), - rounding_const, - )); - - let r_low1 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_low1, _mm_mulhrs_epi16(v_low, v_cr_coeff)), - rounding_const, - )); - let b_low1 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_low1, _mm_mulhrs_epi16(u_low, v_cb_coeff)), - rounding_const, - )); - let g_low1 = _mm_srai_epi16::(_mm_add_epi16( - _mm_sub_epi16(y_low1, g_coeff), - rounding_const, - )); + let r_low0 = _mm_add_epi16(y_low0, _mm_mulhrs_epi16(v_low, v_cr_coeff)); + let b_low0 = _mm_add_epi16(y_low0, _mm_mulhrs_epi16(u_low, v_cb_coeff)); + let g_low0 = _mm_sub_epi16(y_low0, g_coeff); + + let r_low1 = _mm_add_epi16(y_low1, _mm_mulhrs_epi16(v_low, v_cr_coeff)); + let b_low1 = _mm_add_epi16(y_low1, _mm_mulhrs_epi16(u_low, v_cb_coeff)); + let g_low1 = _mm_sub_epi16(y_low1, g_coeff); let r_values0 = _mm_packus_epi16(r_low0, zeros); let g_values0 = _mm_packus_epi16(g_low0, zeros); diff --git a/src/sse/yuv_to_rgba_alpha.rs b/src/sse/yuv_to_rgba_alpha.rs index bfefd6d..76b2c90 100644 --- a/src/sse/yuv_to_rgba_alpha.rs +++ b/src/sse/yuv_to_rgba_alpha.rs @@ -94,8 +94,7 @@ unsafe fn sse_yuv_to_rgba_alpha_row_impl(_mm_add_epi16( - _mm_add_epi16(y_high, _mm_mulhrs_epi16(v_high, v_cr_coeff)), - rounding_const, - )); - let b_high = _mm_srai_epi16::(_mm_adds_epi16( - _mm_add_epi16(y_high, _mm_mulhrs_epi16(u_high, v_cb_coeff)), - rounding_const, - )); - let g_high = _mm_srai_epi16::(_mm_add_epi16( - _mm_sub_epi16( - y_high, - _mm_add_epi16( - _mm_mulhrs_epi16(v_high, v_g_coeff_1), - _mm_mulhrs_epi16(u_high, v_g_coeff_2), - ), + let r_high = _mm_add_epi16(y_high, _mm_mulhrs_epi16(v_high, v_cr_coeff)); + let b_high = _mm_add_epi16(y_high, _mm_mulhrs_epi16(u_high, v_cb_coeff)); + let g_high = _mm_sub_epi16( + y_high, + _mm_add_epi16( + _mm_mulhrs_epi16(v_high, v_g_coeff_1), + _mm_mulhrs_epi16(u_high, v_g_coeff_2), ), - rounding_const, - )); + ); let u_low = _mm_slli_epi16::(_mm_sub_epi16(u_low_u16, uv_corr)); let v_low = _mm_slli_epi16::(_mm_sub_epi16(v_low_u16, uv_corr)); @@ -171,24 +160,15 @@ unsafe fn sse_yuv_to_rgba_alpha_row_impl(_mm_add_epi16( - _mm_add_epi16(y_low, _mm_mulhrs_epi16(v_low, v_cr_coeff)), - rounding_const, - )); - let b_low = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_low, _mm_mulhrs_epi16(u_low, v_cb_coeff)), - rounding_const, - )); - let g_low = _mm_srai_epi16::(_mm_add_epi16( - _mm_sub_epi16( - y_low, - _mm_add_epi16( - _mm_mulhrs_epi16(v_low, v_g_coeff_1), - _mm_mulhrs_epi16(u_low, v_g_coeff_2), - ), + let r_low = _mm_add_epi16(y_low, _mm_mulhrs_epi16(v_low, v_cr_coeff)); + let b_low = _mm_add_epi16(y_low, _mm_mulhrs_epi16(u_low, v_cb_coeff)); + let g_low = _mm_sub_epi16( + y_low, + _mm_add_epi16( + _mm_mulhrs_epi16(v_low, v_g_coeff_1), + _mm_mulhrs_epi16(u_low, v_g_coeff_2), ), - rounding_const, - )); + ); let (r_values, g_values, b_values);