Skip to content

Commit

Permalink
AVX + SSE improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
awxkee committed May 21, 2024
1 parent e3f6fe8 commit 8bb5d1a
Show file tree
Hide file tree
Showing 7 changed files with 173 additions and 144 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "yuvutils-rs"
version = "0.1.11"
version = "0.1.12"
edition = "2021"
description = "Rust utilities for YUV format handling and conversion."
readme = "README.md"
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

Fast and simple YUV approximation conversion in pure Rust. At most the same as libyuv does. Performance will be equal to libyuv or slightly higher on platforms where SIMD is implemented. Otherwise equal or slower.

Mostly implemented AVX2, SSE, NEON

### RGB to YCbCr

```rust
Expand Down
4 changes: 3 additions & 1 deletion src/intel_simd_support.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
use std::arch::x86_64::*;

#[cfg(target_arch = "x86_64")]
#[inline(always)]
#[allow(dead_code)]
const fn shuffle(z: u32, y: u32, x: u32, w: u32) -> i32 {
// Checked: we want to reinterpret the bits
((z << 6) | (y << 4) | (x << 2) | w) as i32
Expand Down Expand Up @@ -51,7 +53,7 @@ pub unsafe fn avx2_interleave_even(x: __m256i) -> __m256i {
#[cfg(target_arch = "x86_64")]
#[inline(always)]
pub unsafe fn avx2_interleave_even_2_epi8(a: __m256i, b: __m256i) -> __m256i {
let mask_a = _mm256_set1_epi16(0xF00);
let mask_a = _mm256_slli_epi16::<8>(_mm256_srli_epi16::<8>(a));
let masked_a = _mm256_and_si256(a, mask_a);
let b_s = _mm256_srli_epi16::<8>(b);
return _mm256_or_si256(masked_a, b_s);
Expand Down
154 changes: 90 additions & 64 deletions src/yuv_nv_to_rgba.rs
Original file line number Diff line number Diff line change
@@ -1,23 +1,25 @@
#[cfg(target_arch = "aarch64")]
#[cfg(target_feature = "neon")]
use std::arch::aarch64::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
#[cfg(target_arch = "x86_64")]
#[allow(unused_imports)]
use crate::intel_simd_support::*;
#[allow(unused_imports)]
use crate::internals::*;
use crate::yuv_support::*;
#[cfg(target_arch = "aarch64")]
#[cfg(target_feature = "neon")]
use std::arch::aarch64::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

#[cfg(target_arch = "x86_64")]
#[inline(always)]
#[allow(dead_code)]
unsafe fn avx2_process_row(
order: YuvNVOrder,
unsafe fn avx2_process_row<
const UV_ORDER: u8,
const DESTINATION_CHANNELS: u8,
const YUV_CHROMA_SAMPLING: u8,
>(
range: &YuvChromaRange,
transform: &CbCrInverseTransform<i32>,
chroma_subsampling: &YuvChromaSample,
y_plane: &[u8],
uv_plane: &[u8],
rgba: &mut [u8],
Expand All @@ -27,9 +29,11 @@ unsafe fn avx2_process_row(
uv_offset: usize,
rgba_offset: usize,
channels: usize,
destination_channels: YuvSourceChannels,
width: usize,
) -> ProcessedOffset {
let order: YuvNVOrder = UV_ORDER.into();
let destination_channels: YuvSourceChannels = DESTINATION_CHANNELS.into();
let chroma_subsampling: YuvChromaSample = YUV_CHROMA_SAMPLING.into();
let mut cx = start_cx;
let mut uv_x = start_ux;
let y_ptr = y_plane.as_ptr();
Expand Down Expand Up @@ -61,10 +65,20 @@ unsafe fn avx2_process_row(
let u_values = avx2_interleave_even(uv_values);
let v_values = avx2_interleave_odd(uv_values);

u_high_u8 = _mm256_extracti128_si256::<1>(u_values);
v_high_u8 = _mm256_extracti128_si256::<1>(v_values);
u_low_u8 = _mm256_castsi256_si128(u_values);
v_low_u8 =_mm256_castsi256_si128(v_values);
match order {
YuvNVOrder::UV => {
u_high_u8 = _mm256_extracti128_si256::<1>(u_values);
v_high_u8 = _mm256_extracti128_si256::<1>(v_values);
u_low_u8 = _mm256_castsi256_si128(u_values);
v_low_u8 = _mm256_castsi256_si128(v_values);
}
YuvNVOrder::VU => {
u_high_u8 = _mm256_extracti128_si256::<1>(v_values);
v_high_u8 = _mm256_extracti128_si256::<1>(u_values);
u_low_u8 = _mm256_castsi256_si128(v_values);
v_low_u8 = _mm256_castsi256_si128(u_values);
}
}
}
YuvChromaSample::YUV444 => {
let offset = uv_offset + uv_x;
Expand All @@ -74,10 +88,20 @@ unsafe fn avx2_process_row(
let full_u = avx2_interleave_even_2_epi8(uv_values_l, uv_values_h);
let full_v = avx2_interleave_odd_2_epi8(uv_values_l, uv_values_h);

u_high_u8 = _mm256_extracti128_si256::<1>(full_u);
v_high_u8 = _mm256_extracti128_si256::<1>(full_v);
u_low_u8 = _mm256_castsi256_si128(full_u);
v_low_u8 = _mm256_castsi256_si128(full_v);
match order {
YuvNVOrder::UV => {
u_high_u8 = _mm256_extracti128_si256::<1>(full_u);
v_high_u8 = _mm256_extracti128_si256::<1>(full_v);
u_low_u8 = _mm256_castsi256_si128(full_u);
v_low_u8 = _mm256_castsi256_si128(full_v);
}
YuvNVOrder::VU => {
u_high_u8 = _mm256_extracti128_si256::<1>(full_v);
v_high_u8 = _mm256_extracti128_si256::<1>(full_u);
u_low_u8 = _mm256_castsi256_si128(full_v);
v_low_u8 = _mm256_castsi256_si128(full_u);
}
}
}
}

Expand Down Expand Up @@ -183,11 +207,13 @@ unsafe fn avx2_process_row(
#[cfg(target_arch = "x86_64")]
#[inline(always)]
#[allow(dead_code)]
unsafe fn sse42_process_row(
order: YuvNVOrder,
unsafe fn sse42_process_row<
const UV_ORDER: u8,
const DESTINATION_CHANNELS: u8,
const YUV_CHROMA_SAMPLING: u8,
>(
range: &YuvChromaRange,
transform: &CbCrInverseTransform<i32>,
chroma_subsampling: &YuvChromaSample,
y_plane: &[u8],
uv_plane: &[u8],
rgba: &mut [u8],
Expand All @@ -197,9 +223,11 @@ unsafe fn sse42_process_row(
uv_offset: usize,
rgba_offset: usize,
channels: usize,
destination_channels: YuvSourceChannels,
width: usize,
) -> ProcessedOffset {
let order: YuvNVOrder = UV_ORDER.into();
let destination_channels: YuvSourceChannels = DESTINATION_CHANNELS.into();
let chroma_subsampling: YuvChromaSample = YUV_CHROMA_SAMPLING.into();
let mut cx = start_cx;
let mut uv_x = start_ux;

Expand Down Expand Up @@ -370,7 +398,6 @@ unsafe fn sse42_process_row(
return ProcessedOffset { cx, ux: uv_x };
}


fn yuv_nv12_to_rgbx<
const UV_ORDER: u8,
const DESTINATION_CHANNELS: u8,
Expand All @@ -388,9 +415,9 @@ fn yuv_nv12_to_rgbx<
matrix: YuvStandardMatrix,
) {
let order: YuvNVOrder = UV_ORDER.into();
let range = get_yuv_range(8, range);
let destination_channels: YuvSourceChannels = DESTINATION_CHANNELS.into();
let chroma_subsampling: YuvChromaSample = YUV_CHROMA_SAMPLING.into();
let range = get_yuv_range(8, range);
let channels = destination_channels.get_channels_count();
let kr_kb = get_kr_kb(matrix);
let transform = get_inverse_transform(255, range.range_y, range.range_uv, kr_kb.kr, kr_kb.kb);
Expand All @@ -415,33 +442,16 @@ fn yuv_nv12_to_rgbx<
};

#[cfg(target_arch = "x86_64")]
let x86_runner: Option<unsafe fn(
order: YuvNVOrder,
range: &YuvChromaRange,
transform: &CbCrInverseTransform<i32>,
chroma_subsampling: &YuvChromaSample,
y_plane: &[u8],
uv_plane: &[u8],
rgba: &mut [u8],
start_cx: usize,
start_ux: usize,
y_offset: usize,
uv_offset: usize,
rgba_offset: usize,
channels: usize,
destination_channels: YuvSourceChannels,
width: usize,
) -> ProcessedOffset>;
let mut use_avx2 = false;
#[cfg(target_arch = "x86_64")]
let mut use_sse = false;

#[cfg(target_arch = "x86_64")]
{
if std::arch::is_x86_feature_detected!("avx2") {
x86_runner = Some(avx2_process_row);
} else if std::arch::is_x86_feature_detected!("sse4.1")
{
x86_runner = Some(sse42_process_row);
} else {
x86_runner = None;
use_avx2 = true;
} else if std::arch::is_x86_feature_detected!("sse4.1") {
use_sse = true;
}
}

Expand All @@ -456,24 +466,40 @@ fn yuv_nv12_to_rgbx<

#[cfg(all(target_arch = "x86_64"))]
unsafe {
if let Some(runner) = x86_runner {
let processed = runner(
order,
&range,
&inverse_transform,
&chroma_subsampling,
y_plane,
uv_plane,
bgra,
cx,
ux,
y_offset,
uv_offset,
dst_offset,
channels,
destination_channels,
width as usize,
);
if use_avx2 {
let processed =
avx2_process_row::<UV_ORDER, DESTINATION_CHANNELS, YUV_CHROMA_SAMPLING>(
&range,
&inverse_transform,
y_plane,
uv_plane,
bgra,
cx,
ux,
y_offset,
uv_offset,
dst_offset,
channels,
width as usize,
);
cx += processed.cx;
ux += processed.ux;
} else if use_sse {
let processed =
sse42_process_row::<UV_ORDER, DESTINATION_CHANNELS, YUV_CHROMA_SAMPLING>(
&range,
&inverse_transform,
y_plane,
uv_plane,
bgra,
cx,
ux,
y_offset,
uv_offset,
dst_offset,
channels,
width as usize,
);
cx += processed.cx;
ux += processed.ux;
}
Expand Down
Loading

0 comments on commit 8bb5d1a

Please sign in to comment.