From ad64a2ad15f026bf51c3ba82640b3a31711f2cf2 Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Thu, 28 Nov 2024 18:06:55 +0000 Subject: [PATCH] Sharp yuv improvements, MSRV improvements --- Cargo.toml | 1 + src/from_identity.rs | 1 + src/from_identity_alpha.rs | 1 + src/images.rs | 1 + src/neon/mod.rs | 4 +- src/neon/y_p16_to_rgba16.rs | 128 +++++++++++++++--------------- src/sharpyuv/sharp_rgba_to_yuv.rs | 24 +++--- src/to_identity.rs | 1 + src/y_p16_to_rgb16.rs | 6 +- 9 files changed, 84 insertions(+), 83 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 94ace09..e08183a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,6 +14,7 @@ categories = ["multimedia::images", "multimedia::video"] homepage = "https://github.com/awxkee/yuvutils-rs" repository = "https://github.com/awxkee/yuvutils-rs" exclude = ["*.jpg", "assets/*", "*.png"] +rust-version = "1.73.0" [dependencies] num-traits = "0.2.19" diff --git a/src/from_identity.rs b/src/from_identity.rs index e6d80e3..43b2c63 100644 --- a/src/from_identity.rs +++ b/src/from_identity.rs @@ -37,6 +37,7 @@ use rayon::iter::{IndexedParallelIterator, ParallelIterator}; #[cfg(feature = "rayon")] use rayon::prelude::{ParallelSlice, ParallelSliceMut}; use std::fmt::Debug; +use std::mem::size_of; #[inline] fn gbr_to_rgbx_impl< diff --git a/src/from_identity_alpha.rs b/src/from_identity_alpha.rs index ff37968..0b0587a 100644 --- a/src/from_identity_alpha.rs +++ b/src/from_identity_alpha.rs @@ -37,6 +37,7 @@ use rayon::iter::{IndexedParallelIterator, ParallelIterator}; #[cfg(feature = "rayon")] use rayon::prelude::{ParallelSlice, ParallelSliceMut}; use std::fmt::Debug; +use std::mem::size_of; #[inline] fn gbr_to_rgbx_alpha_impl< diff --git a/src/images.rs b/src/images.rs index 24aa194..0ac7811 100644 --- a/src/images.rs +++ b/src/images.rs @@ -34,6 +34,7 @@ use crate::YuvError; use std::fmt::Debug; #[derive(Debug)] +/// Shared storage type pub enum BufferStoreMut<'a, T: Copy + Debug> { Borrowed(&'a mut [T]), Owned(Vec), diff --git a/src/neon/mod.rs b/src/neon/mod.rs index aeee023..5cfd22b 100644 --- a/src/neon/mod.rs +++ b/src/neon/mod.rs @@ -64,7 +64,7 @@ pub(crate) use rgb_to_yuv_p16::{neon_rgba_to_yuv_p16, neon_rgba_to_yuv_p16_rdm}; pub(crate) use rgba_to_nv::{neon_rgbx_to_nv_row, neon_rgbx_to_nv_row_rdm}; pub(crate) use rgba_to_yuv::{neon_rgba_to_yuv, neon_rgba_to_yuv_rdm}; pub(crate) use rgba_to_yuv420::{neon_rgba_to_yuv420, neon_rgba_to_yuv_rdm420}; -pub(crate) use y_p16_to_rgba16::{neon_y_p16_to_rgba16_row, neon_y_p16_to_rgba16_rdm}; +pub(crate) use y_p16_to_rgba16::{neon_y_p16_to_rgba16_rdm, neon_y_p16_to_rgba16_row}; pub(crate) use y_to_rgb::{neon_y_to_rgb_row, neon_y_to_rgb_row_rdm}; pub(crate) use ycgco_to_rgb::neon_ycgco_to_rgb_row; pub(crate) use ycgco_to_rgb_alpha::neon_ycgco_to_rgb_alpha_row; @@ -84,4 +84,4 @@ pub(crate) use yuv_to_rgba420::{neon_yuv_to_rgba_row420, neon_yuv_to_rgba_row_rd pub(crate) use yuv_to_rgba_alpha::{neon_yuv_to_rgba_alpha, neon_yuv_to_rgba_alpha_rdm}; pub(crate) use yuv_to_yuy2::yuv_to_yuy2_neon_impl; pub(crate) use yuy2_to_rgb::yuy2_to_rgb_neon; -pub(crate) use yuy2_to_yuv::yuy2_to_yuv_neon_impl; \ No newline at end of file +pub(crate) use yuy2_to_yuv::yuy2_to_yuv_neon_impl; diff --git a/src/neon/y_p16_to_rgba16.rs b/src/neon/y_p16_to_rgba16.rs index db3099c..f6d3702 100644 --- a/src/neon/y_p16_to_rgba16.rs +++ b/src/neon/y_p16_to_rgba16.rs @@ -30,9 +30,8 @@ use std::arch::aarch64::*; use crate::internals::ProcessedOffset; -use crate::yuv_support::{ - CbCrInverseTransform, YuvBytesPacking, YuvChromaRange, YuvEndianness, YuvSourceChannels, -}; +use crate::neon::neon_simd_support::vldq_s16_endian; +use crate::yuv_support::{CbCrInverseTransform, YuvChromaRange, YuvSourceChannels}; #[target_feature(enable = "rdm")] pub(crate) unsafe fn neon_y_p16_to_rgba16_rdm< @@ -40,8 +39,8 @@ pub(crate) unsafe fn neon_y_p16_to_rgba16_rdm< const ENDIANNESS: u8, const BYTES_POSITION: u8, >( - y_ld_ptr: *const u16, - rgba: *mut u16, + y_ld_ptr: &[u16], + rgba: &mut [u16], width: u32, range: &YuvChromaRange, transform: &CbCrInverseTransform, @@ -50,11 +49,9 @@ pub(crate) unsafe fn neon_y_p16_to_rgba16_rdm< ) -> ProcessedOffset { let destination_channels: YuvSourceChannels = DESTINATION_CHANNELS.into(); let channels = destination_channels.get_channels_count(); - let endianness: YuvEndianness = ENDIANNESS.into(); - let bytes_position: YuvBytesPacking = BYTES_POSITION.into(); let dst_ptr = rgba; - let y_corr = vdupq_n_s16(range.bias_y as i16); + let y_corr = vdupq_n_u16(range.bias_y as u16); let v_min_values = vdupq_n_s16(0i16); let v_alpha = vdupq_n_u16((1 << bit_depth) - 1); let v_msb_shift = vdupq_n_s16(bit_depth as i16 - 16); @@ -64,47 +61,49 @@ pub(crate) unsafe fn neon_y_p16_to_rgba16_rdm< const V_SCALE: i32 = 2; while cx + 8 < width as usize { - let y_values: int16x8_t; - - match endianness { - YuvEndianness::BigEndian => { - let mut y_u_values = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16( - vld1q_u16(y_ld_ptr.add(cx)), - ))); - if bytes_position == YuvBytesPacking::MostSignificantBytes { - y_u_values = vshlq_u16(y_u_values, v_msb_shift); - } - y_values = vsubq_s16(vreinterpretq_s16_u16(y_u_values), y_corr); - } - YuvEndianness::LittleEndian => { - let mut y_vl = vld1q_u16(y_ld_ptr.add(cx)); - if bytes_position == YuvBytesPacking::MostSignificantBytes { - y_vl = vshlq_u16(y_vl, v_msb_shift); - } - y_values = vsubq_s16(vreinterpretq_s16_u16(y_vl), y_corr); - } - } + let y_values: int16x8_t = vreinterpretq_s16_u16(vqsubq_u16( + vreinterpretq_u16_s16(vldq_s16_endian::( + y_ld_ptr.get_unchecked(cx..).as_ptr(), + v_msb_shift, + )), + y_corr, + )); let y_high = vqrdmulhq_n_s16(vshlq_n_s16::(y_values), transform.y_coef as i16); - let r_values = vreinterpretq_u16_s16(vmaxq_s16(y_high, v_min_values)); + let r_values = vminq_u16( + vreinterpretq_u16_s16(vmaxq_s16(y_high, v_min_values)), + v_alpha, + ); match destination_channels { YuvSourceChannels::Rgb => { let dst_pack = uint16x8x3_t(r_values, r_values, r_values); - vst3q_u16(dst_ptr.add(cx * channels), dst_pack); + vst3q_u16( + dst_ptr.get_unchecked_mut(cx * channels..).as_mut_ptr(), + dst_pack, + ); } YuvSourceChannels::Bgr => { let dst_pack = uint16x8x3_t(r_values, r_values, r_values); - vst3q_u16(dst_ptr.add(cx * channels), dst_pack); + vst3q_u16( + dst_ptr.get_unchecked_mut(cx * channels..).as_mut_ptr(), + dst_pack, + ); } YuvSourceChannels::Rgba => { let dst_pack = uint16x8x4_t(r_values, r_values, r_values, v_alpha); - vst4q_u16(dst_ptr.add(cx * channels), dst_pack); + vst4q_u16( + dst_ptr.get_unchecked_mut(cx * channels..).as_mut_ptr(), + dst_pack, + ); } YuvSourceChannels::Bgra => { let dst_pack = uint16x8x4_t(r_values, r_values, r_values, v_alpha); - vst4q_u16(dst_ptr.add(cx * channels), dst_pack); + vst4q_u16( + dst_ptr.get_unchecked_mut(cx * channels..).as_mut_ptr(), + dst_pack, + ); } } @@ -120,8 +119,8 @@ pub(crate) unsafe fn neon_y_p16_to_rgba16_row< const BYTES_POSITION: u8, const PRECISION: i32, >( - y_ld_ptr: *const u16, - rgba: *mut u16, + y_ld_ptr: &[u16], + rgba: &mut [u16], width: u32, range: &YuvChromaRange, transform: &CbCrInverseTransform, @@ -130,66 +129,63 @@ pub(crate) unsafe fn neon_y_p16_to_rgba16_row< ) -> ProcessedOffset { let destination_channels: YuvSourceChannels = DESTINATION_CHANNELS.into(); let channels = destination_channels.get_channels_count(); - let endianness: YuvEndianness = ENDIANNESS.into(); - let bytes_position: YuvBytesPacking = BYTES_POSITION.into(); let dst_ptr = rgba; - let y_corr = vdupq_n_s16(range.bias_y as i16); + let y_corr = vdupq_n_u16(range.bias_y as u16); let v_luma_coeff = vdupq_n_s16(transform.y_coef as i16); - let v_min_values = vdupq_n_s16(0i16); let v_alpha = vdupq_n_u16((1 << bit_depth) - 1); + let v_max_values = vdupq_n_s32((1 << bit_depth) - 1); let v_msb_shift = vdupq_n_s16(bit_depth as i16 - 16); let mut cx = start_cx; while cx + 8 < width as usize { - let y_values: int16x8_t; - - match endianness { - YuvEndianness::BigEndian => { - let mut y_u_values = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16( - vld1q_u16(y_ld_ptr.add(cx)), - ))); - if bytes_position == YuvBytesPacking::MostSignificantBytes { - y_u_values = vshlq_u16(y_u_values, v_msb_shift); - } - y_values = vsubq_s16(vreinterpretq_s16_u16(y_u_values), y_corr); - } - YuvEndianness::LittleEndian => { - let mut y_vl = vld1q_u16(y_ld_ptr.add(cx)); - if bytes_position == YuvBytesPacking::MostSignificantBytes { - y_vl = vshlq_u16(y_vl, v_msb_shift); - } - y_values = vsubq_s16(vreinterpretq_s16_u16(y_vl), y_corr); - } - } + let y_values: int16x8_t = vreinterpretq_s16_u16(vqsubq_u16( + vreinterpretq_u16_s16(vldq_s16_endian::( + y_ld_ptr.get_unchecked(cx..).as_ptr(), + v_msb_shift, + )), + y_corr, + )); let y_high = vmull_high_s16(y_values, v_luma_coeff); - let r_high = vrshrn_n_s32::(y_high); + let r_high = vqmovun_s32(vminq_s32(vrshrq_n_s32::(y_high), v_max_values)); let y_low = vmull_s16(vget_low_s16(y_values), vget_low_s16(v_luma_coeff)); - let r_low = vrshrn_n_s32::(y_low); + let r_low = vqmovun_s32(vminq_s32(vrshrq_n_s32::(y_low), v_max_values)); - let r_values = vreinterpretq_u16_s16(vmaxq_s16(vcombine_s16(r_low, r_high), v_min_values)); + let r_values = vcombine_u16(r_low, r_high); match destination_channels { YuvSourceChannels::Rgb => { let dst_pack = uint16x8x3_t(r_values, r_values, r_values); - vst3q_u16(dst_ptr.add(cx * channels), dst_pack); + vst3q_u16( + dst_ptr.get_unchecked_mut(cx * channels..).as_mut_ptr(), + dst_pack, + ); } YuvSourceChannels::Bgr => { let dst_pack = uint16x8x3_t(r_values, r_values, r_values); - vst3q_u16(dst_ptr.add(cx * channels), dst_pack); + vst3q_u16( + dst_ptr.get_unchecked_mut(cx * channels..).as_mut_ptr(), + dst_pack, + ); } YuvSourceChannels::Rgba => { let dst_pack = uint16x8x4_t(r_values, r_values, r_values, v_alpha); - vst4q_u16(dst_ptr.add(cx * channels), dst_pack); + vst4q_u16( + dst_ptr.get_unchecked_mut(cx * channels..).as_mut_ptr(), + dst_pack, + ); } YuvSourceChannels::Bgra => { let dst_pack = uint16x8x4_t(r_values, r_values, r_values, v_alpha); - vst4q_u16(dst_ptr.add(cx * channels), dst_pack); + vst4q_u16( + dst_ptr.get_unchecked_mut(cx * channels..).as_mut_ptr(), + dst_pack, + ); } } diff --git a/src/sharpyuv/sharp_rgba_to_yuv.rs b/src/sharpyuv/sharp_rgba_to_yuv.rs index 7994bd5..3409140 100644 --- a/src/sharpyuv/sharp_rgba_to_yuv.rs +++ b/src/sharpyuv/sharp_rgba_to_yuv.rs @@ -495,10 +495,10 @@ fn rgbx_to_sharp_yuv( }; sharpen_row420::( y, - rgba, - y_plane, - u_plane, - v_plane, + &rgba[0..planar_image.width as usize * src_chans.get_channels_count()], + &mut y_plane[0..planar_image.width as usize], + &mut u_plane[0..(planar_image.width as usize).div_ceil(2)], + &mut v_plane[0..(planar_image.width as usize).div_ceil(2)], rgb_layout_lane, rgb_layout_next_lane, &gamma_map_table, @@ -513,10 +513,10 @@ fn rgbx_to_sharp_yuv( let rgb_layout_lane = &rgb_layout [rgb_layout_start..((planar_image.width as usize) * 3 + rgb_layout_start)]; sharpen_row422::( - rgba, - y_plane, - u_plane, - v_plane, + &rgba[0..planar_image.width as usize * src_chans.get_channels_count()], + &mut y_plane[0..planar_image.width as usize], + &mut u_plane[0..(planar_image.width as usize).div_ceil(2)], + &mut v_plane[0..(planar_image.width as usize).div_ceil(2)], rgb_layout_lane, &gamma_map_table, &chroma_range, @@ -564,10 +564,10 @@ fn rgbx_to_sharp_yuv( let rgb_layout_next_lane: &[u16] = rgb_layout; sharpen_row420::( y, - rgba, - y_plane, - u_plane, - v_plane, + &rgba[0..planar_image.width as usize * src_chans.get_channels_count()], + &mut y_plane[0..planar_image.width as usize], + &mut u_plane[0..(planar_image.width as usize).div_ceil(2)], + &mut v_plane[0..(planar_image.width as usize).div_ceil(2)], rgb_layout_lane, rgb_layout_next_lane, &gamma_map_table, diff --git a/src/to_identity.rs b/src/to_identity.rs index e304658..d5f59b8 100644 --- a/src/to_identity.rs +++ b/src/to_identity.rs @@ -32,6 +32,7 @@ use crate::yuv_support::{get_yuv_range, YuvSourceChannels}; use crate::{YuvChromaSubsampling, YuvError, YuvPlanarImageMut, YuvRange}; use num_traits::AsPrimitive; use std::fmt::Debug; +use std::mem::size_of; #[inline] fn rgbx_to_gbr_impl< diff --git a/src/y_p16_to_rgb16.rs b/src/y_p16_to_rgb16.rs index ed0423c..9a9f692 100644 --- a/src/y_p16_to_rgb16.rs +++ b/src/y_p16_to_rgb16.rs @@ -95,7 +95,7 @@ fn yuv400_p16_to_rgbx< #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] let is_rdm_available = std::arch::is_aarch64_feature_detected!("rdm"); #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - let neon_wide_handler = if is_rdm_available { + let neon_wide_handler = if is_rdm_available && bit_depth <= 12 { neon_y_p16_to_rgba16_rdm:: } else { neon_y_p16_to_rgba16_row:: @@ -111,8 +111,8 @@ fn yuv400_p16_to_rgbx< { unsafe { let offset = neon_wide_handler( - y_plane.as_ptr(), - rgba16.as_mut_ptr(), + y_plane, + rgba16, image.width, &chroma_range, &i_transform,