Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/dev' into dev
Browse files Browse the repository at this point in the history
# Conflicts:
#	src/from_identity.rs
  • Loading branch information
awxkee committed Nov 28, 2024
2 parents 24b97ef + ad64a2a commit cbf3888
Show file tree
Hide file tree
Showing 11 changed files with 175 additions and 242 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ categories = ["multimedia::images", "multimedia::video"]
homepage = "https://github.com/awxkee/yuvutils-rs"
repository = "https://github.com/awxkee/yuvutils-rs"
exclude = ["*.jpg", "assets/*", "*.png"]
rust-version = "1.73.0"

[dependencies]
num-traits = "0.2.19"
Expand Down
153 changes: 1 addition & 152 deletions src/from_identity.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,158 +41,7 @@ use rayon::iter::{IndexedParallelIterator, ParallelIterator};
#[cfg(feature = "rayon")]
use rayon::prelude::{ParallelSlice, ParallelSliceMut};
use std::fmt::Debug;
use std::marker::PhantomData;

/// Row handler used for the full-range GBR path (see the `FullRangeWideRow`
/// implementations for this type).
///
/// On x86/x86_64 builds it also stores two CPU-capability flags, which the
/// `Default` implementation fills in via runtime feature detection.
struct WideRowGbrProcessor<T> {
/// Zero-sized marker tying the processor to the sample type `T`; no `T` is stored.
_phantom: PhantomData<T>,
/// Whether the SSE row routine may be used (field exists only on x86/x86_64).
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
_use_sse: bool,
/// Whether the AVX row routine may be used (field exists only on x86/x86_64).
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
_use_avx: bool,
}

impl<T> Default for WideRowGbrProcessor<T> {
fn default() -> Self {
WideRowGbrProcessor {
_phantom: PhantomData,
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
_use_sse: std::arch::is_x86_feature_detected!("sse4.1"),
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
_use_avx: std::arch::is_x86_feature_detected!("avx2"),
}
}
}

/// Row handler used for the limited-range GBR path (see the
/// `LimitedRangeWideRow` implementations for this type).
///
/// On x86/x86_64 builds it also stores two CPU-capability flags, which the
/// `Default` implementation fills in via runtime feature detection.
struct WideRowGbrLimitedProcessor<T> {
/// Zero-sized marker tying the processor to the sample type `T`; no `T` is stored.
_phantom: PhantomData<T>,
/// Whether the SSE row routine may be used (field exists only on x86/x86_64).
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
_use_sse: bool,
/// Whether the AVX row routine may be used (field exists only on x86/x86_64).
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
_use_avx: bool,
}

impl<T> Default for WideRowGbrLimitedProcessor<T> {
fn default() -> Self {
WideRowGbrLimitedProcessor {
_phantom: PhantomData,
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
_use_sse: std::arch::is_x86_feature_detected!("sse4.1"),
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
_use_avx: std::arch::is_x86_feature_detected!("avx2"),
}
}
}

/// Row-level handler for the full-range GBR conversion over samples of type `V`.
trait FullRangeWideRow<V> {
/// Handles one row, reading the `g_plane`, `b_plane` and `r_plane` slices and
/// writing into `rgba`, starting at column `start_cx` with row width `width`.
///
/// `DEST` is forwarded unchanged to the SIMD routines by the `u8`
/// implementation; presumably it selects the destination channel layout
/// (TODO confirm against the SIMD routines).
///
/// Returns a column cursor. In the `u8` implementation this is the value
/// reported by the last SIMD routine that ran, or `start_cx` when none ran;
/// presumably the caller resumes scalar processing from it (verify against
/// the caller).
fn handle_row<const DEST: u8>(
&self,
g_plane: &[V],
b_plane: &[V],
r_plane: &[V],
rgba: &mut [V],
start_cx: usize,
width: usize,
) -> usize;
}

/// Row-level handler for the limited-range GBR conversion over samples of type `V`.
trait LimitedRangeWideRow<V> {
/// Handles one row, reading the `g_plane`, `b_plane` and `r_plane` slices and
/// writing into `rgba`, starting at column `start_cx` with row width `width`.
///
/// `y_bias` and `y_coeff` are passed through to the SIMD routines by the `u8`
/// implementation; presumably they are the range-expansion bias and
/// fixed-point coefficient (TODO confirm). `DEST` is forwarded to the SIMD
/// routines as well, while `BIT_DEPTH` is not used by any implementation
/// visible here.
///
/// Returns a column cursor. In the `u8` implementation this is the value
/// reported by the last SIMD routine that ran, or `start_cx` when none ran;
/// presumably the caller resumes scalar processing from it (verify against
/// the caller).
fn handle_row<const DEST: u8, const BIT_DEPTH: usize>(
&self,
g_plane: &[V],
b_plane: &[V],
r_plane: &[V],
rgba: &mut [V],
start_cx: usize,
width: usize,
y_bias: i32,
y_coeff: i32,
) -> usize;
}

impl FullRangeWideRow<u8> for WideRowGbrProcessor<u8> {
    /// Runs the available x86 SIMD row routines over an 8-bit full-range row
    /// and returns the column cursor they report.
    ///
    /// On non-x86 targets no routine is compiled in and `start_cx` is
    /// returned unchanged.
    fn handle_row<const DEST: u8>(
        &self,
        g_plane: &[u8],
        b_plane: &[u8],
        r_plane: &[u8],
        rgba: &mut [u8],
        start_cx: usize,
        width: usize,
    ) -> usize {
        let _offset = start_cx;

        // AVX goes first when detected.
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        let _offset = if self._use_avx {
            avx_yuv_to_rgba_row_full::<DEST>(g_plane, b_plane, r_plane, rgba, _offset, width)
        } else {
            _offset
        };

        // SSE then continues from whatever cursor the previous step produced.
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        let _offset = if self._use_sse {
            sse_yuv_to_rgba_row_full::<DEST>(g_plane, b_plane, r_plane, rgba, _offset, width)
        } else {
            _offset
        };

        _offset
    }
}

impl FullRangeWideRow<u16> for WideRowGbrProcessor<u16> {
    /// No accelerated path exists for 16-bit full-range rows, so nothing is
    /// read or written here.
    ///
    /// Returns `start_cx` unchanged. Returning a hard-coded `0` would rewind
    /// the caller's cursor whenever it starts past column zero, and would
    /// disagree with the `u8` implementation, which hands back `start_cx`
    /// when no SIMD routine runs.
    fn handle_row<const DEST: u8>(
        &self,
        _g_plane: &[u16],
        _b_plane: &[u16],
        _r_plane: &[u16],
        _rgba: &mut [u16],
        start_cx: usize,
        _width: usize,
    ) -> usize {
        start_cx
    }
}

impl LimitedRangeWideRow<u8> for WideRowGbrLimitedProcessor<u8> {
    /// Runs the available x86 SIMD row routines over an 8-bit limited-range
    /// row and returns the column cursor they report.
    ///
    /// `y_bias` and `y_coeff` are passed straight through to the SIMD
    /// routines; `BIT_DEPTH` is not used here. On non-x86 targets no routine
    /// is compiled in and `start_cx` is returned unchanged.
    fn handle_row<const DEST: u8, const BIT_DEPTH: usize>(
        &self,
        g_plane: &[u8],
        b_plane: &[u8],
        r_plane: &[u8],
        rgba: &mut [u8],
        start_cx: usize,
        width: usize,
        y_bias: i32,
        y_coeff: i32,
    ) -> usize {
        let _offset = start_cx;

        // AVX goes first when detected.
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        let _offset = if self._use_avx {
            avx_yuv_to_rgba_row_limited::<DEST>(
                g_plane, b_plane, r_plane, rgba, _offset, width, y_bias, y_coeff,
            )
        } else {
            _offset
        };

        // SSE then continues from whatever cursor the previous step produced.
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        let _offset = if self._use_sse {
            sse_yuv_to_rgba_row_limited::<DEST>(
                g_plane, b_plane, r_plane, rgba, _offset, width, y_bias, y_coeff,
            )
        } else {
            _offset
        };

        _offset
    }
}

impl LimitedRangeWideRow<u16> for WideRowGbrLimitedProcessor<u16> {
    /// No accelerated path exists for 16-bit limited-range rows, so nothing
    /// is read or written here.
    ///
    /// Returns `start_cx` unchanged. Returning a hard-coded `0` would rewind
    /// the caller's cursor whenever it starts past column zero, and would
    /// disagree with the `u8` implementation, which hands back `start_cx`
    /// when no SIMD routine runs.
    fn handle_row<const DEST: u8, const BIT_DEPTH: usize>(
        &self,
        _g_plane: &[u16],
        _b_plane: &[u16],
        _r_plane: &[u16],
        _rgba: &mut [u16],
        start_cx: usize,
        _width: usize,
        _y_bias: i32,
        _y_coeff: i32,
    ) -> usize {
        start_cx
    }
}
use std::mem::size_of;

#[inline]
fn gbr_to_rgbx_impl<
Expand Down
1 change: 1 addition & 0 deletions src/from_identity_alpha.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ use rayon::iter::{IndexedParallelIterator, ParallelIterator};
#[cfg(feature = "rayon")]
use rayon::prelude::{ParallelSlice, ParallelSliceMut};
use std::fmt::Debug;
use std::mem::size_of;

#[inline]
fn gbr_to_rgbx_alpha_impl<
Expand Down
1 change: 1 addition & 0 deletions src/images.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ use crate::YuvError;
use std::fmt::Debug;

#[derive(Debug)]
/// Shared storage type
pub enum BufferStoreMut<'a, T: Copy + Debug> {
Borrowed(&'a mut [T]),
Owned(Vec<T>),
Expand Down
2 changes: 1 addition & 1 deletion src/neon/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ pub(crate) use rgb_to_yuv_p16::{neon_rgba_to_yuv_p16, neon_rgba_to_yuv_p16_rdm};
pub(crate) use rgba_to_nv::{neon_rgbx_to_nv_row, neon_rgbx_to_nv_row_rdm};
pub(crate) use rgba_to_yuv::{neon_rgba_to_yuv, neon_rgba_to_yuv_rdm};
pub(crate) use rgba_to_yuv420::{neon_rgba_to_yuv420, neon_rgba_to_yuv_rdm420};
pub(crate) use y_p16_to_rgba16::neon_y_p16_to_rgba16_row;
pub(crate) use y_p16_to_rgba16::{neon_y_p16_to_rgba16_rdm, neon_y_p16_to_rgba16_row};
pub(crate) use y_to_rgb::{neon_y_to_rgb_row, neon_y_to_rgb_row_rdm};
pub(crate) use ycgco_to_rgb::neon_ycgco_to_rgb_row;
pub(crate) use ycgco_to_rgb_alpha::neon_ycgco_to_rgb_alpha_row;
Expand Down
146 changes: 111 additions & 35 deletions src/neon/y_p16_to_rgba16.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,18 +30,17 @@
use std::arch::aarch64::*;

use crate::internals::ProcessedOffset;
use crate::yuv_support::{
CbCrInverseTransform, YuvBytesPacking, YuvChromaRange, YuvEndianness, YuvSourceChannels,
};
use crate::neon::neon_simd_support::vldq_s16_endian;
use crate::yuv_support::{CbCrInverseTransform, YuvChromaRange, YuvSourceChannels};

pub(crate) unsafe fn neon_y_p16_to_rgba16_row<
#[target_feature(enable = "rdm")]
pub(crate) unsafe fn neon_y_p16_to_rgba16_rdm<
const DESTINATION_CHANNELS: u8,
const ENDIANNESS: u8,
const BYTES_POSITION: u8,
const PRECISION: i32,
>(
y_ld_ptr: *const u16,
rgba: *mut u16,
y_ld_ptr: &[u16],
rgba: &mut [u16],
width: u32,
range: &YuvChromaRange,
transform: &CbCrInverseTransform<i32>,
Expand All @@ -50,66 +49,143 @@ pub(crate) unsafe fn neon_y_p16_to_rgba16_row<
) -> ProcessedOffset {
let destination_channels: YuvSourceChannels = DESTINATION_CHANNELS.into();
let channels = destination_channels.get_channels_count();
let endianness: YuvEndianness = ENDIANNESS.into();
let bytes_position: YuvBytesPacking = BYTES_POSITION.into();
let dst_ptr = rgba;

let y_corr = vdupq_n_s16(range.bias_y as i16);
let v_luma_coeff = vdupq_n_s16(transform.y_coef as i16);
let y_corr = vdupq_n_u16(range.bias_y as u16);
let v_min_values = vdupq_n_s16(0i16);
let v_alpha = vdupq_n_u16((1 << bit_depth) - 1);
let v_msb_shift = vdupq_n_s16(bit_depth as i16 - 16);

let mut cx = start_cx;

const V_SCALE: i32 = 2;

while cx + 8 < width as usize {
let y_values: int16x8_t;

match endianness {
YuvEndianness::BigEndian => {
let mut y_u_values = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(
vld1q_u16(y_ld_ptr.add(cx)),
)));
if bytes_position == YuvBytesPacking::MostSignificantBytes {
y_u_values = vshlq_u16(y_u_values, v_msb_shift);
}
y_values = vsubq_s16(vreinterpretq_s16_u16(y_u_values), y_corr);
let y_values: int16x8_t = vreinterpretq_s16_u16(vqsubq_u16(
vreinterpretq_u16_s16(vldq_s16_endian::<ENDIANNESS, BYTES_POSITION>(
y_ld_ptr.get_unchecked(cx..).as_ptr(),
v_msb_shift,
)),
y_corr,
));

let y_high = vqrdmulhq_n_s16(vshlq_n_s16::<V_SCALE>(y_values), transform.y_coef as i16);

let r_values = vminq_u16(
vreinterpretq_u16_s16(vmaxq_s16(y_high, v_min_values)),
v_alpha,
);

match destination_channels {
YuvSourceChannels::Rgb => {
let dst_pack = uint16x8x3_t(r_values, r_values, r_values);
vst3q_u16(
dst_ptr.get_unchecked_mut(cx * channels..).as_mut_ptr(),
dst_pack,
);
}
YuvSourceChannels::Bgr => {
let dst_pack = uint16x8x3_t(r_values, r_values, r_values);
vst3q_u16(
dst_ptr.get_unchecked_mut(cx * channels..).as_mut_ptr(),
dst_pack,
);
}
YuvSourceChannels::Rgba => {
let dst_pack = uint16x8x4_t(r_values, r_values, r_values, v_alpha);
vst4q_u16(
dst_ptr.get_unchecked_mut(cx * channels..).as_mut_ptr(),
dst_pack,
);
}
YuvEndianness::LittleEndian => {
let mut y_vl = vld1q_u16(y_ld_ptr.add(cx));
if bytes_position == YuvBytesPacking::MostSignificantBytes {
y_vl = vshlq_u16(y_vl, v_msb_shift);
}
y_values = vsubq_s16(vreinterpretq_s16_u16(y_vl), y_corr);
YuvSourceChannels::Bgra => {
let dst_pack = uint16x8x4_t(r_values, r_values, r_values, v_alpha);
vst4q_u16(
dst_ptr.get_unchecked_mut(cx * channels..).as_mut_ptr(),
dst_pack,
);
}
}

cx += 8;
}

ProcessedOffset { cx, ux: 0 }
}

pub(crate) unsafe fn neon_y_p16_to_rgba16_row<
const DESTINATION_CHANNELS: u8,
const ENDIANNESS: u8,
const BYTES_POSITION: u8,
const PRECISION: i32,
>(
y_ld_ptr: &[u16],
rgba: &mut [u16],
width: u32,
range: &YuvChromaRange,
transform: &CbCrInverseTransform<i32>,
start_cx: usize,
bit_depth: usize,
) -> ProcessedOffset {
let destination_channels: YuvSourceChannels = DESTINATION_CHANNELS.into();
let channels = destination_channels.get_channels_count();
let dst_ptr = rgba;

let y_corr = vdupq_n_u16(range.bias_y as u16);
let v_luma_coeff = vdupq_n_s16(transform.y_coef as i16);
let v_alpha = vdupq_n_u16((1 << bit_depth) - 1);
let v_max_values = vdupq_n_s32((1 << bit_depth) - 1);
let v_msb_shift = vdupq_n_s16(bit_depth as i16 - 16);

let mut cx = start_cx;

while cx + 8 < width as usize {
let y_values: int16x8_t = vreinterpretq_s16_u16(vqsubq_u16(
vreinterpretq_u16_s16(vldq_s16_endian::<ENDIANNESS, BYTES_POSITION>(
y_ld_ptr.get_unchecked(cx..).as_ptr(),
v_msb_shift,
)),
y_corr,
));

let y_high = vmull_high_s16(y_values, v_luma_coeff);

let r_high = vrshrn_n_s32::<PRECISION>(y_high);
let r_high = vqmovun_s32(vminq_s32(vrshrq_n_s32::<PRECISION>(y_high), v_max_values));

let y_low = vmull_s16(vget_low_s16(y_values), vget_low_s16(v_luma_coeff));

let r_low = vrshrn_n_s32::<PRECISION>(y_low);
let r_low = vqmovun_s32(vminq_s32(vrshrq_n_s32::<PRECISION>(y_low), v_max_values));

let r_values = vreinterpretq_u16_s16(vmaxq_s16(vcombine_s16(r_low, r_high), v_min_values));
let r_values = vcombine_u16(r_low, r_high);

match destination_channels {
YuvSourceChannels::Rgb => {
let dst_pack = uint16x8x3_t(r_values, r_values, r_values);
vst3q_u16(dst_ptr.add(cx * channels), dst_pack);
vst3q_u16(
dst_ptr.get_unchecked_mut(cx * channels..).as_mut_ptr(),
dst_pack,
);
}
YuvSourceChannels::Bgr => {
let dst_pack = uint16x8x3_t(r_values, r_values, r_values);
vst3q_u16(dst_ptr.add(cx * channels), dst_pack);
vst3q_u16(
dst_ptr.get_unchecked_mut(cx * channels..).as_mut_ptr(),
dst_pack,
);
}
YuvSourceChannels::Rgba => {
let dst_pack = uint16x8x4_t(r_values, r_values, r_values, v_alpha);
vst4q_u16(dst_ptr.add(cx * channels), dst_pack);
vst4q_u16(
dst_ptr.get_unchecked_mut(cx * channels..).as_mut_ptr(),
dst_pack,
);
}
YuvSourceChannels::Bgra => {
let dst_pack = uint16x8x4_t(r_values, r_values, r_values, v_alpha);
vst4q_u16(dst_ptr.add(cx * channels), dst_pack);
vst4q_u16(
dst_ptr.get_unchecked_mut(cx * channels..).as_mut_ptr(),
dst_pack,
);
}
}

Expand Down
Loading

0 comments on commit cbf3888

Please sign in to comment.