Skip to content

Commit

Permalink
YUV 4:0:0 RDM NEON
Browse files Browse the repository at this point in the history
  • Loading branch information
awxkee committed Nov 28, 2024
1 parent 42c4a5e commit 1a57d75
Show file tree
Hide file tree
Showing 3 changed files with 93 additions and 9 deletions.
4 changes: 2 additions & 2 deletions src/neon/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ pub(crate) use rgb_to_yuv_p16::{neon_rgba_to_yuv_p16, neon_rgba_to_yuv_p16_rdm};
pub(crate) use rgba_to_nv::{neon_rgbx_to_nv_row, neon_rgbx_to_nv_row_rdm};
pub(crate) use rgba_to_yuv::{neon_rgba_to_yuv, neon_rgba_to_yuv_rdm};
pub(crate) use rgba_to_yuv420::{neon_rgba_to_yuv420, neon_rgba_to_yuv_rdm420};
pub(crate) use y_p16_to_rgba16::neon_y_p16_to_rgba16_row;
pub(crate) use y_p16_to_rgba16::{neon_y_p16_to_rgba16_row, neon_y_p16_to_rgba16_rdm};
pub(crate) use y_to_rgb::{neon_y_to_rgb_row, neon_y_to_rgb_row_rdm};
pub(crate) use ycgco_to_rgb::neon_ycgco_to_rgb_row;
pub(crate) use ycgco_to_rgb_alpha::neon_ycgco_to_rgb_alpha_row;
Expand All @@ -84,4 +84,4 @@ pub(crate) use yuv_to_rgba420::{neon_yuv_to_rgba_row420, neon_yuv_to_rgba_row_rd
pub(crate) use yuv_to_rgba_alpha::{neon_yuv_to_rgba_alpha, neon_yuv_to_rgba_alpha_rdm};
pub(crate) use yuv_to_yuy2::yuv_to_yuy2_neon_impl;
pub(crate) use yuy2_to_rgb::yuy2_to_rgb_neon;
pub(crate) use yuy2_to_yuv::yuy2_to_yuv_neon_impl;
pub(crate) use yuy2_to_yuv::yuy2_to_yuv_neon_impl;
80 changes: 80 additions & 0 deletions src/neon/y_p16_to_rgba16.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,86 @@ use crate::yuv_support::{
CbCrInverseTransform, YuvBytesPacking, YuvChromaRange, YuvEndianness, YuvSourceChannels,
};

#[target_feature(enable = "rdm")]
pub(crate) unsafe fn neon_y_p16_to_rgba16_rdm<
const DESTINATION_CHANNELS: u8,
const ENDIANNESS: u8,
const BYTES_POSITION: u8,
>(
y_ld_ptr: *const u16,
rgba: *mut u16,
width: u32,
range: &YuvChromaRange,
transform: &CbCrInverseTransform<i32>,
start_cx: usize,
bit_depth: usize,
) -> ProcessedOffset {
let destination_channels: YuvSourceChannels = DESTINATION_CHANNELS.into();
let channels = destination_channels.get_channels_count();
let endianness: YuvEndianness = ENDIANNESS.into();
let bytes_position: YuvBytesPacking = BYTES_POSITION.into();
let dst_ptr = rgba;

let y_corr = vdupq_n_s16(range.bias_y as i16);
let v_min_values = vdupq_n_s16(0i16);
let v_alpha = vdupq_n_u16((1 << bit_depth) - 1);
let v_msb_shift = vdupq_n_s16(bit_depth as i16 - 16);

let mut cx = start_cx;

const V_SCALE: i32 = 2;

while cx + 8 < width as usize {
let y_values: int16x8_t;

match endianness {
YuvEndianness::BigEndian => {
let mut y_u_values = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(
vld1q_u16(y_ld_ptr.add(cx)),
)));
if bytes_position == YuvBytesPacking::MostSignificantBytes {
y_u_values = vshlq_u16(y_u_values, v_msb_shift);
}
y_values = vsubq_s16(vreinterpretq_s16_u16(y_u_values), y_corr);
}
YuvEndianness::LittleEndian => {
let mut y_vl = vld1q_u16(y_ld_ptr.add(cx));
if bytes_position == YuvBytesPacking::MostSignificantBytes {
y_vl = vshlq_u16(y_vl, v_msb_shift);
}
y_values = vsubq_s16(vreinterpretq_s16_u16(y_vl), y_corr);
}
}

let y_high = vqrdmulhq_n_s16(vshlq_n_s16::<V_SCALE>(y_values), transform.y_coef as i16);

let r_values = vreinterpretq_u16_s16(vmaxq_s16(y_high, v_min_values));

match destination_channels {
YuvSourceChannels::Rgb => {
let dst_pack = uint16x8x3_t(r_values, r_values, r_values);
vst3q_u16(dst_ptr.add(cx * channels), dst_pack);
}
YuvSourceChannels::Bgr => {
let dst_pack = uint16x8x3_t(r_values, r_values, r_values);
vst3q_u16(dst_ptr.add(cx * channels), dst_pack);
}
YuvSourceChannels::Rgba => {
let dst_pack = uint16x8x4_t(r_values, r_values, r_values, v_alpha);
vst4q_u16(dst_ptr.add(cx * channels), dst_pack);
}
YuvSourceChannels::Bgra => {
let dst_pack = uint16x8x4_t(r_values, r_values, r_values, v_alpha);
vst4q_u16(dst_ptr.add(cx * channels), dst_pack);
}
}

cx += 8;
}

ProcessedOffset { cx, ux: 0 }
}

pub(crate) unsafe fn neon_y_p16_to_rgba16_row<
const DESTINATION_CHANNELS: u8,
const ENDIANNESS: u8,
Expand Down
18 changes: 11 additions & 7 deletions src/y_p16_to_rgb16.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
*/
use crate::built_coefficients::get_built_inverse_transform;
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
use crate::neon::neon_y_p16_to_rgba16_row;
use crate::neon::{neon_y_p16_to_rgba16_rdm, neon_y_p16_to_rgba16_row};
use crate::yuv_support::*;
use crate::{YuvError, YuvGrayImage};
#[cfg(feature = "rayon")]
Expand Down Expand Up @@ -92,6 +92,15 @@ fn yuv400_p16_to_rgbx<
.zip(image.y_plane.chunks_exact(image.y_stride as usize));
}

#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
let is_rdm_available = std::arch::is_aarch64_feature_detected!("rdm");
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
let neon_wide_handler = if is_rdm_available {
neon_y_p16_to_rgba16_rdm::<DESTINATION_CHANNELS, ENDIANNESS, BYTES_POSITION>
} else {
neon_y_p16_to_rgba16_row::<DESTINATION_CHANNELS, ENDIANNESS, BYTES_POSITION, PRECISION>
};

match range {
YuvRange::Limited => {
iter.for_each(|(rgba16, y_plane)| {
Expand All @@ -101,12 +110,7 @@ fn yuv400_p16_to_rgbx<
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
{
unsafe {
let offset = neon_y_p16_to_rgba16_row::<
DESTINATION_CHANNELS,
ENDIANNESS,
BYTES_POSITION,
PRECISION,
>(
let offset = neon_wide_handler(
y_plane.as_ptr(),
rgba16.as_mut_ptr(),
image.width,
Expand Down

0 comments on commit 1a57d75

Please sign in to comment.