Skip to content

Commit

Permalink
RDM Support RGB16 to YUV16
Browse files Browse the repository at this point in the history
  • Loading branch information
awxkee committed Nov 23, 2024
1 parent a46135c commit 7266a48
Show file tree
Hide file tree
Showing 4 changed files with 224 additions and 22 deletions.
3 changes: 1 addition & 2 deletions app/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -112,8 +112,7 @@ fn main() {
//
// println!("rgb_to_yuv_nv12 time: {:?}", start_time.elapsed());
//
// let end_time = Instant::now().sub(start_time);
// println!("Forward time: {:?}", end_time);
println!("Forward time: {:?}", start_time.elapsed());
// // //
// let full_size = if width % 2 == 0 {
// 2 * width as usize * height as usize
Expand Down
2 changes: 1 addition & 1 deletion src/neon/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ mod yuy2_to_yuv;
pub use rgb_to_y::neon_rgb_to_y_row;
pub use rgb_to_ycgco::neon_rgb_to_ycgco_row;
pub use rgb_to_ycgco_r::neon_rgb_to_ycgcor_row;
pub use rgb_to_yuv_p16::neon_rgba_to_yuv_p16;
pub(crate) use rgb_to_yuv_p16::{neon_rgba_to_yuv_p16, neon_rgba_to_yuv_p16_rdm};
pub(crate) use rgba_to_nv::{neon_rgbx_to_nv_row, neon_rgbx_to_nv_row_rdm};
pub(crate) use rgba_to_yuv::{neon_rgba_to_yuv, neon_rgba_to_yuv_rdm};
pub use y_p16_to_rgba16::neon_y_p16_to_rgba16_row;
Expand Down
208 changes: 198 additions & 10 deletions src/neon/rgb_to_yuv_p16.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ use crate::yuv_support::{
use crate::{YuvBytesPacking, YuvEndianness};
use std::arch::aarch64::*;

pub unsafe fn neon_rgba_to_yuv_p16<
pub(crate) unsafe fn neon_rgba_to_yuv_p16<
const ORIGIN_CHANNELS: u8,
const SAMPLING: u8,
const ENDIANNESS: u8,
Expand Down Expand Up @@ -83,6 +83,10 @@ pub unsafe fn neon_rgba_to_yuv_p16<

let v_shift_count = vdupq_n_s16(16 - BIT_DEPTH as i16);

let i_bias_y = vdupq_n_u16(range.bias_y as u16);
let i_cap_y = vdupq_n_u16(range.range_y as u16 + range.bias_y as u16);
let i_cap_uv = vdupq_n_u16(range.bias_y as u16 + range.range_uv as u16);

while cx + 8 < width {
let r_values;
let g_values;
Expand Down Expand Up @@ -137,9 +141,15 @@ pub unsafe fn neon_rgba_to_yuv_p16<
vget_low_s16(v_yb),
);

let mut y_vl = vcombine_u16(
vqshrun_n_s32::<PRECISION>(y_l),
vqshrun_n_s32::<PRECISION>(y_h),
let mut y_vl = vminq_u16(
vmaxq_u16(
vcombine_u16(
vqshrun_n_s32::<PRECISION>(y_l),
vqshrun_n_s32::<PRECISION>(y_h),
),
i_bias_y,
),
i_cap_y,
);

if bytes_position == YuvBytesPacking::MostSignificantBytes {
Expand Down Expand Up @@ -173,9 +183,15 @@ pub unsafe fn neon_rgba_to_yuv_p16<
vget_low_s16(v_cb_b),
);

let mut cb_vl = vcombine_u16(
vqshrun_n_s32::<PRECISION>(cb_l),
vqshrun_n_s32::<PRECISION>(cb_h),
let mut cb_vl = vminq_u16(
vmaxq_u16(
vcombine_u16(
vqshrun_n_s32::<PRECISION>(cb_l),
vqshrun_n_s32::<PRECISION>(cb_h),
),
i_bias_y,
),
i_cap_uv,
);

let mut cr_h = vmlal_high_s16(uv_bias, vreinterpretq_s16_u16(r_values), v_cr_r);
Expand All @@ -198,9 +214,15 @@ pub unsafe fn neon_rgba_to_yuv_p16<
vget_low_s16(v_cr_b),
);

let mut cr_vl = vcombine_u16(
vqshrun_n_s32::<PRECISION>(cr_l),
vqshrun_n_s32::<PRECISION>(cr_h),
let mut cr_vl = vminq_u16(
vmaxq_u16(
vcombine_u16(
vqshrun_n_s32::<PRECISION>(cr_l),
vqshrun_n_s32::<PRECISION>(cr_h),
),
i_bias_y,
),
i_cap_uv,
);

match chroma_subsampling {
Expand Down Expand Up @@ -247,3 +269,169 @@ pub unsafe fn neon_rgba_to_yuv_p16<

ProcessedOffset { ux, cx }
}

/// Converts part of one row of 16-bit RGB/BGR/RGBA/BGRA samples into planar Y, U and V
/// using the NEON rounding-doubling multiply-accumulate instruction (`vqrdmlahq_s16`).
///
/// Eight pixels are handled per iteration, starting at pixel `start_cx`, for as long as
/// `cx + 8 < width`. Whatever is left over is not touched here; the returned
/// [`ProcessedOffset`] reports how far the luma (`cx`) and chroma (`ux`) cursors advanced
/// so the caller can finish the row (presumably with a scalar tail — confirm at call site).
///
/// Const parameters:
/// * `ORIGIN_CHANNELS` - source pixel layout, decoded into `YuvSourceChannels`.
/// * `SAMPLING` - chroma subsampling, decoded into `YuvChromaSubsampling`.
/// * `ENDIANNESS` - byte order of the stored samples, decoded into `YuvEndianness`.
/// * `BYTES_POSITION` - whether samples sit in the most significant bits of each `u16`.
/// * `PRECISION` - not referenced by this body; kept so the signature matches the
///   non-RDM kernel and both can be selected through the same function pointer.
/// * `BIT_DEPTH` - sample bit depth; used only for the most-significant-bytes shift.
///
/// Arguments:
/// * `transform` - forward matrix coefficients; each one is truncated to `i16`.
/// * `range` - supplies `bias_y`, `bias_uv`, `range_y` and `range_uv` for biasing/clamping.
/// * `y_plane`, `u_plane`, `v_plane` - destination planes, indexed in samples.
/// * `rgba` - interleaved source samples.
/// * `start_cx`, `start_ux` - starting pixel index in luma and chroma respectively.
/// * `width` - row width in pixels.
/// * `compute_uv_row` - when `false` chroma is neither computed nor stored and `ux`
///   is returned unchanged.
///
/// # Safety
///
/// * The CPU must support the `rdm` target feature.
/// * The source is read through `get_unchecked` and the planes are written through raw
///   pointers with no bounds checks, so the caller must guarantee that `rgba` holds at
///   least `width * channels` samples and that the destination planes are large enough
///   for every 8-wide (or 4-wide subsampled chroma) store issued below.
/// * Input samples are shifted left by 3 and then reinterpreted as `i16`, so they must
///   fit in 12 bits; the dispatcher in this commit only selects this kernel when
///   `BIT_DEPTH <= 12`.
#[target_feature(enable = "rdm")]
pub(crate) unsafe fn neon_rgba_to_yuv_p16_rdm<
const ORIGIN_CHANNELS: u8,
const SAMPLING: u8,
const ENDIANNESS: u8,
const BYTES_POSITION: u8,
const PRECISION: i32,
const BIT_DEPTH: usize,
>(
transform: &CbCrForwardTransform<i32>,
range: &YuvChromaRange,
y_plane: &mut [u16],
u_plane: &mut [u16],
v_plane: &mut [u16],
rgba: &[u16],
start_cx: usize,
start_ux: usize,
width: usize,
compute_uv_row: bool,
) -> ProcessedOffset {
// Decode the const-generic selectors into their enum forms.
let chroma_subsampling: YuvChromaSubsampling = SAMPLING.into();
let source_channels: YuvSourceChannels = ORIGIN_CHANNELS.into();
let endianness: YuvEndianness = ENDIANNESS.into();
let bytes_position: YuvBytesPacking = BYTES_POSITION.into();
let channels = source_channels.get_channels_count();

let bias_y = range.bias_y as i16;
let bias_uv = range.bias_uv as i16;

let y_ptr = y_plane.as_mut_ptr();
let u_ptr = u_plane.as_mut_ptr();
let v_ptr = v_plane.as_mut_ptr();

// Broadcast biases and matrix coefficients across all eight 16-bit lanes.
// The biases double as the initial accumulator of each multiply-accumulate chain.
let y_bias = vdupq_n_s16(bias_y);
let uv_bias = vdupq_n_s16(bias_uv);
let v_yr = vdupq_n_s16(transform.yr as i16);
let v_yg = vdupq_n_s16(transform.yg as i16);
let v_yb = vdupq_n_s16(transform.yb as i16);
let v_cb_r = vdupq_n_s16(transform.cb_r as i16);
let v_cb_g = vdupq_n_s16(transform.cb_g as i16);
let v_cb_b = vdupq_n_s16(transform.cb_b as i16);
let v_cr_r = vdupq_n_s16(transform.cr_r as i16);
let v_cr_g = vdupq_n_s16(transform.cr_g as i16);
let v_cr_b = vdupq_n_s16(transform.cr_b as i16);

// Clamp bounds: luma is limited to [bias_y, bias_y + range_y]; both chroma planes are
// limited to [bias_y, bias_y + range_uv] (the lower bound is shared with luma).
let i_bias_y = vdupq_n_s16(range.bias_y as i16);
let i_cap_y = vdupq_n_s16(range.range_y as i16 + range.bias_y as i16);
let i_cap_uv = vdupq_n_s16(range.bias_y as i16 + range.range_uv as i16);

let mut cx = start_cx;
let mut ux = start_ux;

// Left shift that moves a BIT_DEPTH-wide sample into the top bits of a 16-bit lane.
let v_shift_count = vdupq_n_s16(16 - BIT_DEPTH as i16);

// Strict `<`: a final group of exactly 8 pixels is left for the caller.
while cx + 8 < width {
let mut r_values;
let mut g_values;
let mut b_values;

let src_ptr = rgba.get_unchecked(cx * channels..);

// De-interleave 8 pixels; for the 4-channel layouts the fourth (alpha) lane is
// loaded but never read.
match source_channels {
YuvSourceChannels::Rgb | YuvSourceChannels::Bgr => {
let rgb_values = vld3q_u16(src_ptr.as_ptr());
if source_channels == YuvSourceChannels::Rgb {
r_values = rgb_values.0;
g_values = rgb_values.1;
b_values = rgb_values.2;
} else {
r_values = rgb_values.2;
g_values = rgb_values.1;
b_values = rgb_values.0;
}
}
YuvSourceChannels::Rgba => {
let rgb_values = vld4q_u16(src_ptr.as_ptr());
r_values = rgb_values.0;
g_values = rgb_values.1;
b_values = rgb_values.2;
}
YuvSourceChannels::Bgra => {
let rgb_values = vld4q_u16(src_ptr.as_ptr());
r_values = rgb_values.2;
g_values = rgb_values.1;
b_values = rgb_values.0;
}
}

// Pre-scale by 8. `vqrdmlahq_s16` adds the rounded high half of the doubled product
// (2 * a * b >> 16) to the accumulator, so with this shift each term is effectively
// `sample * coeff >> 12` with rounding.
// NOTE(review): this implies the coefficients are 12-bit fixed point regardless of
// `PRECISION` — confirm against the caller.
r_values = vshlq_n_u16::<3>(r_values);
g_values = vshlq_n_u16::<3>(g_values);
b_values = vshlq_n_u16::<3>(b_values);

// Luma: bias + R*yr + G*yg + B*yb, saturating in signed 16-bit.
let mut y_values = vqrdmlahq_s16(y_bias, vreinterpretq_s16_u16(r_values), v_yr);
y_values = vqrdmlahq_s16(y_values, vreinterpretq_s16_u16(g_values), v_yg);
y_values = vqrdmlahq_s16(y_values, vreinterpretq_s16_u16(b_values), v_yb);

// Clamp while still signed, then reinterpret as unsigned for storage.
let mut y_vl = vreinterpretq_u16_s16(vminq_s16(vmaxq_s16(y_values, i_bias_y), i_cap_y));

if bytes_position == YuvBytesPacking::MostSignificantBytes {
y_vl = vshlq_u16(y_vl, v_shift_count);
}

// Big-endian output: swap the two bytes inside every 16-bit lane.
if endianness == YuvEndianness::BigEndian {
y_vl = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(y_vl)));
}

vst1q_u16(y_ptr.add(cx), y_vl);

if compute_uv_row {
// Cb and Cr use the same accumulate-then-clamp scheme as luma, seeded with the
// chroma bias.
let mut cb_h = vqrdmlahq_s16(uv_bias, vreinterpretq_s16_u16(r_values), v_cb_r);
cb_h = vqrdmlahq_s16(cb_h, vreinterpretq_s16_u16(g_values), v_cb_g);
cb_h = vqrdmlahq_s16(cb_h, vreinterpretq_s16_u16(b_values), v_cb_b);

let mut cb_vl = vreinterpretq_u16_s16(vminq_s16(vmaxq_s16(cb_h, i_bias_y), i_cap_uv));

let mut cr_h = vqrdmlahq_s16(uv_bias, vreinterpretq_s16_u16(r_values), v_cr_r);
cr_h = vqrdmlahq_s16(cr_h, vreinterpretq_s16_u16(g_values), v_cr_g);
cr_h = vqrdmlahq_s16(cr_h, vreinterpretq_s16_u16(b_values), v_cr_b);

let mut cr_vl = vreinterpretq_u16_s16(vminq_s16(vmaxq_s16(cr_h, i_bias_y), i_cap_uv));

match chroma_subsampling {
YuvChromaSubsampling::Yuv420 | YuvChromaSubsampling::Yuv422 => {
// Horizontal 2:1 reduction: sum each adjacent pair into 32 bits, then a
// rounding shift right by one narrows back to 16 bits (pair average).
let mut cb_s = vrshrn_n_u32::<1>(vpaddlq_u16(cb_vl));
let mut cr_s = vrshrn_n_u32::<1>(vpaddlq_u16(cr_vl));

if bytes_position == YuvBytesPacking::MostSignificantBytes {
cb_s = vshl_u16(cb_s, vget_low_s16(v_shift_count));
cr_s = vshl_u16(cr_s, vget_low_s16(v_shift_count));
}

if endianness == YuvEndianness::BigEndian {
cb_s = vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(cb_s)));
cr_s = vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(cr_s)));
}

vst1_u16(u_ptr.add(ux), cb_s);
vst1_u16(v_ptr.add(ux), cr_s);

// Eight luma pixels produced four chroma samples per plane.
ux += 4;
}
YuvChromaSubsampling::Yuv444 => {
// Full-resolution chroma: one sample per pixel, no averaging.
if bytes_position == YuvBytesPacking::MostSignificantBytes {
cb_vl = vshlq_u16(cb_vl, v_shift_count);
cr_vl = vshlq_u16(cr_vl, v_shift_count);
}

if endianness == YuvEndianness::BigEndian {
cb_vl = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(cb_vl)));
cr_vl = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(cr_vl)));
}

vst1q_u16(u_ptr.add(ux), cb_vl);
vst1q_u16(v_ptr.add(ux), cr_vl);

ux += 8;
}
}
}

cx += 8;
}

// Report how far both cursors advanced so the caller can continue from there.
ProcessedOffset { ux, cx }
}
33 changes: 24 additions & 9 deletions src/rgb_to_yuv_p16.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
*/
use crate::internals::ProcessedOffset;
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
use crate::neon::neon_rgba_to_yuv_p16;
use crate::neon::{neon_rgba_to_yuv_p16, neon_rgba_to_yuv_p16_rdm};
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
use crate::sse::sse_rgba_to_yuv_p16;
use crate::yuv_error::check_rgba_destination;
Expand Down Expand Up @@ -111,6 +111,28 @@ fn rgbx_to_yuv_ant<

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
let mut _use_sse = std::arch::is_x86_feature_detected!("sse4.1");
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
let is_rdm_available = std::arch::is_aarch64_feature_detected!("rdm");
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
let neon_wide_row_handler = if is_rdm_available && BIT_DEPTH <= 12 {
neon_rgba_to_yuv_p16_rdm::<
ORIGIN_CHANNELS,
SAMPLING,
ENDIANNESS,
BYTES_POSITION,
PRECISION,
BIT_DEPTH,
>
} else {
neon_rgba_to_yuv_p16::<
ORIGIN_CHANNELS,
SAMPLING,
ENDIANNESS,
BYTES_POSITION,
PRECISION,
BIT_DEPTH,
>
};

#[allow(unused_variables)]
let process_wide_row = |_y_plane: &mut [u16],
Expand Down Expand Up @@ -148,14 +170,7 @@ fn rgbx_to_yuv_ant<

#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
unsafe {
_offset = neon_rgba_to_yuv_p16::<
ORIGIN_CHANNELS,
SAMPLING,
ENDIANNESS,
BYTES_POSITION,
PRECISION,
BIT_DEPTH,
>(
_offset = neon_wide_row_handler(
&transform,
&range,
_y_plane,
Expand Down

0 comments on commit 7266a48

Please sign in to comment.