diff --git a/app/src/main.rs b/app/src/main.rs index 74431be..de41e54 100644 --- a/app/src/main.rs +++ b/app/src/main.rs @@ -112,8 +112,7 @@ fn main() { // // println!("rgb_to_yuv_nv12 time: {:?}", start_time.elapsed()); // - // let end_time = Instant::now().sub(start_time); - // println!("Forward time: {:?}", end_time); + println!("Forward time: {:?}", start_time.elapsed()); // // // // let full_size = if width % 2 == 0 { // 2 * width as usize * height as usize diff --git a/src/neon/mod.rs b/src/neon/mod.rs index e8cd7c5..2bd8970 100644 --- a/src/neon/mod.rs +++ b/src/neon/mod.rs @@ -57,7 +57,7 @@ mod yuy2_to_yuv; pub use rgb_to_y::neon_rgb_to_y_row; pub use rgb_to_ycgco::neon_rgb_to_ycgco_row; pub use rgb_to_ycgco_r::neon_rgb_to_ycgcor_row; -pub use rgb_to_yuv_p16::neon_rgba_to_yuv_p16; +pub(crate) use rgb_to_yuv_p16::{neon_rgba_to_yuv_p16, neon_rgba_to_yuv_p16_rdm}; pub(crate) use rgba_to_nv::{neon_rgbx_to_nv_row, neon_rgbx_to_nv_row_rdm}; pub(crate) use rgba_to_yuv::{neon_rgba_to_yuv, neon_rgba_to_yuv_rdm}; pub use y_p16_to_rgba16::neon_y_p16_to_rgba16_row; diff --git a/src/neon/rgb_to_yuv_p16.rs b/src/neon/rgb_to_yuv_p16.rs index ffa34ac..874fc59 100644 --- a/src/neon/rgb_to_yuv_p16.rs +++ b/src/neon/rgb_to_yuv_p16.rs @@ -33,7 +33,7 @@ use crate::yuv_support::{ use crate::{YuvBytesPacking, YuvEndianness}; use std::arch::aarch64::*; -pub unsafe fn neon_rgba_to_yuv_p16< +pub(crate) unsafe fn neon_rgba_to_yuv_p16< const ORIGIN_CHANNELS: u8, const SAMPLING: u8, const ENDIANNESS: u8, @@ -83,6 +83,10 @@ pub unsafe fn neon_rgba_to_yuv_p16< let v_shift_count = vdupq_n_s16(16 - BIT_DEPTH as i16); + let i_bias_y = vdupq_n_u16(range.bias_y as u16); + let i_cap_y = vdupq_n_u16(range.range_y as u16 + range.bias_y as u16); + let i_cap_uv = vdupq_n_u16(range.bias_y as u16 + range.range_uv as u16); + while cx + 8 < width { let r_values; let g_values; @@ -137,9 +141,15 @@ pub unsafe fn neon_rgba_to_yuv_p16< vget_low_s16(v_yb), ); - let mut y_vl = vcombine_u16( - 
vqshrun_n_s32::<PRECISION>(y_l), - vqshrun_n_s32::<PRECISION>(y_h), + let mut y_vl = vminq_u16( + vmaxq_u16( + vcombine_u16( + vqshrun_n_s32::<PRECISION>(y_l), + vqshrun_n_s32::<PRECISION>(y_h), + ), + i_bias_y, + ), + i_cap_y, ); if bytes_position == YuvBytesPacking::MostSignificantBytes { @@ -173,9 +183,15 @@ pub unsafe fn neon_rgba_to_yuv_p16< vget_low_s16(v_cb_b), ); - let mut cb_vl = vcombine_u16( - vqshrun_n_s32::<PRECISION>(cb_l), - vqshrun_n_s32::<PRECISION>(cb_h), + let mut cb_vl = vminq_u16( + vmaxq_u16( + vcombine_u16( + vqshrun_n_s32::<PRECISION>(cb_l), + vqshrun_n_s32::<PRECISION>(cb_h), + ), + i_bias_y, + ), + i_cap_uv, ); let mut cr_h = vmlal_high_s16(uv_bias, vreinterpretq_s16_u16(r_values), v_cr_r); @@ -198,9 +214,15 @@ pub unsafe fn neon_rgba_to_yuv_p16< vget_low_s16(v_cr_b), ); - let mut cr_vl = vcombine_u16( - vqshrun_n_s32::<PRECISION>(cr_l), - vqshrun_n_s32::<PRECISION>(cr_h), + let mut cr_vl = vminq_u16( + vmaxq_u16( + vcombine_u16( + vqshrun_n_s32::<PRECISION>(cr_l), + vqshrun_n_s32::<PRECISION>(cr_h), + ), + i_bias_y, + ), + i_cap_uv, ); match chroma_subsampling { @@ -247,3 +269,169 @@ pub unsafe fn neon_rgba_to_yuv_p16< ProcessedOffset { ux, cx } } + +#[target_feature(enable = "rdm")] +pub(crate) unsafe fn neon_rgba_to_yuv_p16_rdm< + const ORIGIN_CHANNELS: u8, + const SAMPLING: u8, + const ENDIANNESS: u8, + const BYTES_POSITION: u8, + const PRECISION: i32, + const BIT_DEPTH: usize, +>( + transform: &CbCrForwardTransform<i32>, + range: &YuvChromaRange, + y_plane: &mut [u16], + u_plane: &mut [u16], + v_plane: &mut [u16], + rgba: &[u16], + start_cx: usize, + start_ux: usize, + width: usize, + compute_uv_row: bool, +) -> ProcessedOffset { + let chroma_subsampling: YuvChromaSubsampling = SAMPLING.into(); + let source_channels: YuvSourceChannels = ORIGIN_CHANNELS.into(); + let endianness: YuvEndianness = ENDIANNESS.into(); + let bytes_position: YuvBytesPacking = BYTES_POSITION.into(); + let channels = source_channels.get_channels_count(); + + let bias_y = range.bias_y as i16; + let bias_uv = range.bias_uv as i16; + + let y_ptr = y_plane.as_mut_ptr(); + let u_ptr = 
u_plane.as_mut_ptr(); + let v_ptr = v_plane.as_mut_ptr(); + + let y_bias = vdupq_n_s16(bias_y); + let uv_bias = vdupq_n_s16(bias_uv); + let v_yr = vdupq_n_s16(transform.yr as i16); + let v_yg = vdupq_n_s16(transform.yg as i16); + let v_yb = vdupq_n_s16(transform.yb as i16); + let v_cb_r = vdupq_n_s16(transform.cb_r as i16); + let v_cb_g = vdupq_n_s16(transform.cb_g as i16); + let v_cb_b = vdupq_n_s16(transform.cb_b as i16); + let v_cr_r = vdupq_n_s16(transform.cr_r as i16); + let v_cr_g = vdupq_n_s16(transform.cr_g as i16); + let v_cr_b = vdupq_n_s16(transform.cr_b as i16); + + let i_bias_y = vdupq_n_s16(range.bias_y as i16); + let i_cap_y = vdupq_n_s16(range.range_y as i16 + range.bias_y as i16); + let i_cap_uv = vdupq_n_s16(range.bias_y as i16 + range.range_uv as i16); + + let mut cx = start_cx; + let mut ux = start_ux; + + let v_shift_count = vdupq_n_s16(16 - BIT_DEPTH as i16); + + while cx + 8 < width { + let mut r_values; + let mut g_values; + let mut b_values; + + let src_ptr = rgba.get_unchecked(cx * channels..); + + match source_channels { + YuvSourceChannels::Rgb | YuvSourceChannels::Bgr => { + let rgb_values = vld3q_u16(src_ptr.as_ptr()); + if source_channels == YuvSourceChannels::Rgb { + r_values = rgb_values.0; + g_values = rgb_values.1; + b_values = rgb_values.2; + } else { + r_values = rgb_values.2; + g_values = rgb_values.1; + b_values = rgb_values.0; + } + } + YuvSourceChannels::Rgba => { + let rgb_values = vld4q_u16(src_ptr.as_ptr()); + r_values = rgb_values.0; + g_values = rgb_values.1; + b_values = rgb_values.2; + } + YuvSourceChannels::Bgra => { + let rgb_values = vld4q_u16(src_ptr.as_ptr()); + r_values = rgb_values.2; + g_values = rgb_values.1; + b_values = rgb_values.0; + } + } + + r_values = vshlq_n_u16::<3>(r_values); + g_values = vshlq_n_u16::<3>(g_values); + b_values = vshlq_n_u16::<3>(b_values); + + let mut y_values = vqrdmlahq_s16(y_bias, vreinterpretq_s16_u16(r_values), v_yr); + y_values = vqrdmlahq_s16(y_values, 
vreinterpretq_s16_u16(g_values), v_yg); + y_values = vqrdmlahq_s16(y_values, vreinterpretq_s16_u16(b_values), v_yb); + + let mut y_vl = vreinterpretq_u16_s16(vminq_s16(vmaxq_s16(y_values, i_bias_y), i_cap_y)); + + if bytes_position == YuvBytesPacking::MostSignificantBytes { + y_vl = vshlq_u16(y_vl, v_shift_count); + } + + if endianness == YuvEndianness::BigEndian { + y_vl = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(y_vl))); + } + + vst1q_u16(y_ptr.add(cx), y_vl); + + if compute_uv_row { + let mut cb_h = vqrdmlahq_s16(uv_bias, vreinterpretq_s16_u16(r_values), v_cb_r); + cb_h = vqrdmlahq_s16(cb_h, vreinterpretq_s16_u16(g_values), v_cb_g); + cb_h = vqrdmlahq_s16(cb_h, vreinterpretq_s16_u16(b_values), v_cb_b); + + let mut cb_vl = vreinterpretq_u16_s16(vminq_s16(vmaxq_s16(cb_h, i_bias_y), i_cap_uv)); + + let mut cr_h = vqrdmlahq_s16(uv_bias, vreinterpretq_s16_u16(r_values), v_cr_r); + cr_h = vqrdmlahq_s16(cr_h, vreinterpretq_s16_u16(g_values), v_cr_g); + cr_h = vqrdmlahq_s16(cr_h, vreinterpretq_s16_u16(b_values), v_cr_b); + + let mut cr_vl = vreinterpretq_u16_s16(vminq_s16(vmaxq_s16(cr_h, i_bias_y), i_cap_uv)); + + match chroma_subsampling { + YuvChromaSubsampling::Yuv420 | YuvChromaSubsampling::Yuv422 => { + let mut cb_s = vrshrn_n_u32::<1>(vpaddlq_u16(cb_vl)); + let mut cr_s = vrshrn_n_u32::<1>(vpaddlq_u16(cr_vl)); + + if bytes_position == YuvBytesPacking::MostSignificantBytes { + cb_s = vshl_u16(cb_s, vget_low_s16(v_shift_count)); + cr_s = vshl_u16(cr_s, vget_low_s16(v_shift_count)); + } + + if endianness == YuvEndianness::BigEndian { + cb_s = vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(cb_s))); + cr_s = vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(cr_s))); + } + + vst1_u16(u_ptr.add(ux), cb_s); + vst1_u16(v_ptr.add(ux), cr_s); + + ux += 4; + } + YuvChromaSubsampling::Yuv444 => { + if bytes_position == YuvBytesPacking::MostSignificantBytes { + cb_vl = vshlq_u16(cb_vl, v_shift_count); + cr_vl = vshlq_u16(cr_vl, v_shift_count); + } + + if 
endianness == YuvEndianness::BigEndian { + cb_vl = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(cb_vl))); + cr_vl = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(cr_vl))); + } + + vst1q_u16(u_ptr.add(ux), cb_vl); + vst1q_u16(v_ptr.add(ux), cr_vl); + + ux += 8; + } + } + } + + cx += 8; + } + + ProcessedOffset { ux, cx } +} diff --git a/src/rgb_to_yuv_p16.rs b/src/rgb_to_yuv_p16.rs index 8f88e9c..6422e6c 100644 --- a/src/rgb_to_yuv_p16.rs +++ b/src/rgb_to_yuv_p16.rs @@ -28,7 +28,7 @@ */ use crate::internals::ProcessedOffset; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -use crate::neon::neon_rgba_to_yuv_p16; +use crate::neon::{neon_rgba_to_yuv_p16, neon_rgba_to_yuv_p16_rdm}; #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] use crate::sse::sse_rgba_to_yuv_p16; use crate::yuv_error::check_rgba_destination; @@ -111,6 +111,28 @@ fn rgbx_to_yuv_ant< #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] let mut _use_sse = std::arch::is_x86_feature_detected!("sse4.1"); + #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] + let is_rdm_available = std::arch::is_aarch64_feature_detected!("rdm"); + #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] + let neon_wide_row_handler = if is_rdm_available && BIT_DEPTH <= 12 { + neon_rgba_to_yuv_p16_rdm::< + ORIGIN_CHANNELS, + SAMPLING, + ENDIANNESS, + BYTES_POSITION, + PRECISION, + BIT_DEPTH, + > + } else { + neon_rgba_to_yuv_p16::< + ORIGIN_CHANNELS, + SAMPLING, + ENDIANNESS, + BYTES_POSITION, + PRECISION, + BIT_DEPTH, + > + }; #[allow(unused_variables)] let process_wide_row = |_y_plane: &mut [u16], @@ -148,14 +170,7 @@ fn rgbx_to_yuv_ant< #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] unsafe { - _offset = neon_rgba_to_yuv_p16::< - ORIGIN_CHANNELS, - SAMPLING, - ENDIANNESS, - BYTES_POSITION, - PRECISION, - BIT_DEPTH, - >( + _offset = neon_wide_row_handler( &transform, &range, _y_plane,