Skip to content

Commit

Permalink
YUV 4:2:2, 4:0:0 speeding up
Browse files Browse the repository at this point in the history
  • Loading branch information
awxkee committed Nov 23, 2024
1 parent 0ee8a46 commit c2fe258
Show file tree
Hide file tree
Showing 9 changed files with 291 additions and 132 deletions.
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -104,19 +104,19 @@ Tests performed on the image 5763x3842

| | Time(NEON) | Time(AVX) |
|------------------------|:----------:|:---------:|
| utils RGB->YUV 4:2:0 | 4.37ms | 9.86ms |
| utils RGB->YUV 4:2:0 | 4.37ms | 6.14ms |
| libyuv RGB->YUV 4:2:0 | 3.66ms | 33.87ms |
| utils RGBA->YUV 4:2:0 | 4.88ms | 10.63ms |
| utils RGBA->YUV 4:2:0 | 4.88ms | 7.34ms |
| libyuv RGBA->YUV 4:2:0 | 4.87ms | 23.48ms |
| utils RGBA->YUV 4:2:2 | 4.99ms | 8.24ms |
| utils RGBA->YUV 4:2:2 | 4.99ms | 7.08ms |
| libyuv RGBA->YUV 4:2:2 | 5.90ms | 35.23ms |
| utils RGBA->YUV 4:4:4 | 5.37ms | 7.97ms |

### Decoding

| | Time(NEON) | Time(AVX) |
|------------------------|:----------:|:---------:|
| utils YUV 4:2:0->RGB | 4.95ms | 5.47ms |
| utils YUV 4:2:0->RGB | 4.95ms | 5.44ms |
| libyuv YUV 4:2:0->RGB | 5.70ms | 44.95ms |
| utils YUV 4:2:0->RGBA | 5.56ms | 6.45ms |
| libyuv YUV 4:2:0->RGBA | 6.13ms | 6.88ms |
Expand Down
2 changes: 1 addition & 1 deletion app/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ version = "0.1.0"
edition = "2021"

[dependencies]
yuvutils-rs = { path = "..", features = [] }
yuvutils-rs = { path = "..", features = ["nightly_avx512"] }
image = "0.25.5"
yuv-sys = "0.3.6"

Expand Down
82 changes: 57 additions & 25 deletions src/avx2/rgb_to_nv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@

use crate::avx2::avx2_utils::{
_mm256_deinterleave_rgba_epi8, _mm256_interleave_x2_epi8, avx2_deinterleave_rgb, avx2_pack_u16,
avx2_pairwise_widen_avg,
};
use crate::internals::ProcessedOffset;
use crate::yuv_support::{
Expand Down Expand Up @@ -212,7 +211,7 @@ unsafe fn avx2_rgba_to_nv_impl<
let y_yuv = avx2_pack_u16(y_l, y_h);
_mm256_storeu_si256(y_ptr.add(cx) as *mut __m256i, y_yuv);

if compute_uv_row {
if chroma_subsampling == YuvChromaSubsampling::Yuv444 {
let cb_l = _mm256_max_epi16(
_mm256_min_epi16(
_mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
Expand Down Expand Up @@ -279,31 +278,64 @@ unsafe fn avx2_rgba_to_nv_impl<
);

let cb = avx2_pack_u16(cb_l, cb_h);

let cr = avx2_pack_u16(cr_l, cr_h);

match chroma_subsampling {
YuvChromaSubsampling::Yuv420 | YuvChromaSubsampling::Yuv422 => {
let cb_h = avx2_pairwise_widen_avg(cb);
let cr_h = avx2_pairwise_widen_avg(cr);
let (row0, _) = match order {
YuvNVOrder::UV => _mm256_interleave_x2_epi8(cb_h, cr_h),
YuvNVOrder::VU => _mm256_interleave_x2_epi8(cr_h, cb_h),
};
_mm256_storeu_si256(uv_ptr.add(uv_x) as *mut __m256i, row0);
uv_x += 32;
}
YuvChromaSubsampling::Yuv444 => {
let (row0, row1) = match order {
YuvNVOrder::UV => _mm256_interleave_x2_epi8(cb, cr),
YuvNVOrder::VU => _mm256_interleave_x2_epi8(cr, cb),
};
let dst_ptr = uv_ptr.add(uv_x);
_mm256_storeu_si256(dst_ptr as *mut __m256i, row0);
_mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, row1);
uv_x += 64;
}
}
let (row0, row1) = match order {
YuvNVOrder::UV => _mm256_interleave_x2_epi8(cb, cr),
YuvNVOrder::VU => _mm256_interleave_x2_epi8(cr, cb),
};
let dst_ptr = uv_ptr.add(uv_x);
_mm256_storeu_si256(dst_ptr as *mut __m256i, row0);
_mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, row1);
uv_x += 64;
} else if chroma_subsampling == YuvChromaSubsampling::Yuv422
|| (chroma_subsampling == YuvChromaSubsampling::Yuv420 && compute_uv_row)
{
let r1 = _mm256_avg_epu16(r_low, r_high);
let g1 = _mm256_avg_epu16(g_low, g_high);
let b1 = _mm256_avg_epu16(b_low, b_high);
let cb = _mm256_max_epi16(
_mm256_min_epi16(
_mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
uv_bias,
_mm256_add_epi16(
_mm256_add_epi16(
_mm256_mulhi_epi16(r1, v_cb_r),
_mm256_mulhi_epi16(g1, v_cb_g),
),
_mm256_mulhi_epi16(b1, v_cb_b),
),
)),
i_cap_uv,
),
i_bias_y,
);
let cr = _mm256_max_epi16(
_mm256_min_epi16(
_mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
uv_bias,
_mm256_add_epi16(
_mm256_add_epi16(
_mm256_mulhi_epi16(r1, v_cr_r),
_mm256_mulhi_epi16(g1, v_cr_g),
),
_mm256_mulhi_epi16(b1, v_cr_b),
),
)),
i_cap_uv,
),
i_bias_y,
);

let cb = avx2_pack_u16(cb, cb);
let cr = avx2_pack_u16(cr, cr);

let (row0, _) = match order {
YuvNVOrder::UV => _mm256_interleave_x2_epi8(cb, cr),
YuvNVOrder::VU => _mm256_interleave_x2_epi8(cr, cb),
};
_mm256_storeu_si256(uv_ptr.add(uv_x) as *mut __m256i, row0);
uv_x += 32;
}

cx += 32;
Expand Down
73 changes: 56 additions & 17 deletions src/avx2/rgba_to_yuv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
*/

use crate::avx2::avx2_utils::{
_mm256_deinterleave_rgba_epi8, avx2_deinterleave_rgb, avx2_pack_u16, avx2_pairwise_widen_avg,
_mm256_deinterleave_rgba_epi8, avx2_deinterleave_rgb, avx2_pack_u16,
};
use crate::internals::ProcessedOffset;
use crate::yuv_support::{
Expand Down Expand Up @@ -210,7 +210,7 @@ unsafe fn avx2_rgba_to_yuv_impl<const ORIGIN_CHANNELS: u8, const SAMPLING: u8>(
let y_yuv = avx2_pack_u16(y_l, y_h);
_mm256_storeu_si256(y_ptr.add(cx) as *mut __m256i, y_yuv);

if chroma_subsampling != YuvChromaSubsampling::Yuv420 || compute_uv_row {
if chroma_subsampling == YuvChromaSubsampling::Yuv444 {
let cb_l = _mm256_max_epi16(
_mm256_min_epi16(
_mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
Expand Down Expand Up @@ -277,23 +277,62 @@ unsafe fn avx2_rgba_to_yuv_impl<const ORIGIN_CHANNELS: u8, const SAMPLING: u8>(
);

let cb = avx2_pack_u16(cb_l, cb_h);

let cr = avx2_pack_u16(cr_l, cr_h);

match chroma_subsampling {
YuvChromaSubsampling::Yuv420 | YuvChromaSubsampling::Yuv422 => {
let cb_h = _mm256_castsi256_si128(avx2_pairwise_widen_avg(cb));
let cr_h = _mm256_castsi256_si128(avx2_pairwise_widen_avg(cr));
_mm_storeu_si128(u_ptr.add(uv_x) as *mut _ as *mut __m128i, cb_h);
_mm_storeu_si128(v_ptr.add(uv_x) as *mut _ as *mut __m128i, cr_h);
uv_x += 16;
}
YuvChromaSubsampling::Yuv444 => {
_mm256_storeu_si256(u_ptr.add(uv_x) as *mut __m256i, cb);
_mm256_storeu_si256(v_ptr.add(uv_x) as *mut __m256i, cr);
uv_x += 32;
}
}
_mm256_storeu_si256(u_ptr.add(uv_x) as *mut __m256i, cb);
_mm256_storeu_si256(v_ptr.add(uv_x) as *mut __m256i, cr);
uv_x += 32;
} else if chroma_subsampling == YuvChromaSubsampling::Yuv422
|| (chroma_subsampling == YuvChromaSubsampling::Yuv420 && compute_uv_row)
{
let r1 = _mm256_avg_epu16(r_low, r_high);
let g1 = _mm256_avg_epu16(g_low, g_high);
let b1 = _mm256_avg_epu16(b_low, b_high);
let cb = _mm256_max_epi16(
_mm256_min_epi16(
_mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
uv_bias,
_mm256_add_epi16(
_mm256_add_epi16(
_mm256_mulhi_epi16(r1, v_cb_r),
_mm256_mulhi_epi16(g1, v_cb_g),
),
_mm256_mulhi_epi16(b1, v_cb_b),
),
)),
i_cap_uv,
),
i_bias_y,
);
let cr = _mm256_max_epi16(
_mm256_min_epi16(
_mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
uv_bias,
_mm256_add_epi16(
_mm256_add_epi16(
_mm256_mulhi_epi16(r1, v_cr_r),
_mm256_mulhi_epi16(g1, v_cr_g),
),
_mm256_mulhi_epi16(b1, v_cr_b),
),
)),
i_cap_uv,
),
i_bias_y,
);

let cb = avx2_pack_u16(cb, cb);
let cr = avx2_pack_u16(cr, cr);

_mm_storeu_si128(
u_ptr.add(uv_x) as *mut _ as *mut __m128i,
_mm256_castsi256_si128(cb),
);
_mm_storeu_si128(
v_ptr.add(uv_x) as *mut _ as *mut __m128i,
_mm256_castsi256_si128(cr),
);
uv_x += 16;
}

cx += 32;
Expand Down
75 changes: 58 additions & 17 deletions src/avx512bw/rgba_to_yuv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
*/

use crate::avx512bw::avx512_utils::{
avx512_deinterleave_rgb, avx512_deinterleave_rgba, avx512_pack_u16, avx512_pairwise_widen_avg,
avx512_deinterleave_rgb, avx512_deinterleave_rgba, avx512_pack_u16,
};
use crate::internals::ProcessedOffset;
use crate::yuv_support::{
Expand Down Expand Up @@ -210,7 +210,7 @@ unsafe fn avx512_rgba_to_yuv_impl<const ORIGIN_CHANNELS: u8, const SAMPLING: u8>
let y_yuv = avx512_pack_u16(y_l, y_h);
_mm512_storeu_si512(y_ptr.add(cx) as *mut i32, y_yuv);

if chroma_subsampling != YuvChromaSubsampling::Yuv420 || compute_uv_row {
if chroma_subsampling == YuvChromaSubsampling::Yuv444 {
let cb_l = _mm512_max_epi16(
_mm512_min_epi16(
_mm512_srai_epi16::<V_SHR>(_mm512_add_epi16(
Expand Down Expand Up @@ -277,23 +277,64 @@ unsafe fn avx512_rgba_to_yuv_impl<const ORIGIN_CHANNELS: u8, const SAMPLING: u8>
);

let cb = avx512_pack_u16(cb_l, cb_h);

let cr = avx512_pack_u16(cr_l, cr_h);

match chroma_subsampling {
YuvChromaSubsampling::Yuv420 | YuvChromaSubsampling::Yuv422 => {
let cb_h = _mm512_castsi512_si256(avx512_pairwise_widen_avg(cb));
let cr_h = _mm512_castsi512_si256(avx512_pairwise_widen_avg(cr));
_mm256_storeu_si256(u_ptr.add(uv_x) as *mut _ as *mut __m256i, cb_h);
_mm256_storeu_si256(v_ptr.add(uv_x) as *mut _ as *mut __m256i, cr_h);
uv_x += 32;
}
YuvChromaSubsampling::Yuv444 => {
_mm512_storeu_si512(u_ptr.add(uv_x) as *mut i32, cb);
_mm512_storeu_si512(v_ptr.add(uv_x) as *mut i32, cr);
uv_x += 64;
}
}
_mm512_storeu_si512(u_ptr.add(uv_x) as *mut i32, cb);
_mm512_storeu_si512(v_ptr.add(uv_x) as *mut i32, cr);
uv_x += 64;
} else if chroma_subsampling == YuvChromaSubsampling::Yuv422
|| (chroma_subsampling == YuvChromaSubsampling::Yuv420 && compute_uv_row)
{
let r1 = _mm512_avg_epu16(r_low, r_high);
let g1 = _mm512_avg_epu16(g_low, g_high);
let b1 = _mm512_avg_epu16(b_low, b_high);

let cbk = _mm512_max_epi16(
_mm512_min_epi16(
_mm512_srai_epi16::<V_SHR>(_mm512_add_epi16(
uv_bias,
_mm512_add_epi16(
_mm512_add_epi16(
_mm512_mulhi_epi16(r1, v_cb_r),
_mm512_mulhi_epi16(g1, v_cb_g),
),
_mm512_mulhi_epi16(b1, v_cb_b),
),
)),
i_cap_uv,
),
i_bias_y,
);

let crk = _mm512_max_epi16(
_mm512_min_epi16(
_mm512_srai_epi16::<V_SHR>(_mm512_add_epi16(
uv_bias,
_mm512_add_epi16(
_mm512_add_epi16(
_mm512_mulhi_epi16(r1, v_cr_r),
_mm512_mulhi_epi16(g1, v_cr_g),
),
_mm512_mulhi_epi16(b1, v_cr_b),
),
)),
i_cap_uv,
),
i_bias_y,
);

let cb = avx512_pack_u16(cbk, cbk);
let cr = avx512_pack_u16(crk, crk);

_mm256_storeu_si256(
u_ptr.add(uv_x) as *mut _ as *mut __m256i,
_mm512_castsi512_si256(cb),
);
_mm256_storeu_si256(
v_ptr.add(uv_x) as *mut _ as *mut __m256i,
_mm512_castsi512_si256(cr),
);
uv_x += 32;
}

cx += 64;
Expand Down
8 changes: 4 additions & 4 deletions src/rgba_to_nv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -84,9 +84,9 @@ fn rgbx_to_nv<const ORIGIN_CHANNELS: u8, const UV_ORDER: u8, const SAMPLING: u8>
let i_cap_uv = i_bias_y + range.range_uv as i32;

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
let _use_sse = std::arch::is_x86_feature_detected!("sse4.1");
let use_sse = std::arch::is_x86_feature_detected!("sse4.1");
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
let _use_avx2 = std::arch::is_x86_feature_detected!("avx2");
let use_avx2 = std::arch::is_x86_feature_detected!("avx2");
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
let is_rdm_available = std::arch::is_aarch64_feature_detected!("rdm");
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
Expand All @@ -103,7 +103,7 @@ fn rgbx_to_nv<const ORIGIN_CHANNELS: u8, const UV_ORDER: u8, const SAMPLING: u8>
|y_plane: &mut [u8], uv_plane: &mut [u8], rgba: &[u8], compute_uv_row| {
let mut _offset: ProcessedOffset = ProcessedOffset { cx: 0, ux: 0 };
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
if _use_avx2 {
if use_avx2 {
let offset = avx2_rgba_to_nv::<ORIGIN_CHANNELS, UV_ORDER, SAMPLING>(
y_plane,
uv_plane,
Expand All @@ -118,7 +118,7 @@ fn rgbx_to_nv<const ORIGIN_CHANNELS: u8, const UV_ORDER: u8, const SAMPLING: u8>
_offset = offset;
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
if _use_sse {
if use_sse {
let offset = sse_rgba_to_nv_row::<ORIGIN_CHANNELS, UV_ORDER, SAMPLING>(
y_plane,
uv_plane,
Expand Down
Loading

0 comments on commit c2fe258

Please sign in to comment.