YUV 4:2:2, 4:2:0 speeding up

awxkee · Nov 23, 2024 · c2fe258 · c2fe258
1 parent 0ee8a46
commit c2fe258
Show file tree

Hide file tree

Showing 9 changed files with 291 additions and 132 deletions.
diff --git a/README.md b/README.md
@@ -104,19 +104,19 @@ Tests performed on the image 5763x3842
 
 |                        | time(NEON) | Time(AVX) |
 |------------------------|:----------:|:---------:|
-| utils RGB->YUV 4:2:0   |   4.37ms   |  9.86ms   |
+| utils RGB->YUV 4:2:0   |   4.37ms   |  6.14ms   |
 | libyuv RGB->YUV 4:2:0  |   3.66ms   |  33.87ms  |
-| utils RGBA->YUV 4:2:0  |   4.88ms   |  10.63ms  |
+| utils RGBA->YUV 4:2:0  |   4.88ms   |  7.34ms   |
 | libyuv RGBA->YUV 4:2:0 |   4.87ms   |  23.48ms  |
-| utils RGBA->YUV 4:2:2  |   4.99ms   |  8.24ms   |
+| utils RGBA->YUV 4:2:2  |   4.99ms   |  7.08ms   |
 | libyuv RGBA->YUV 4:2:2 |   5.90ms   |  35.23ms  |
 | utils RGBA->YUV 4:4:4  |   5.37ms   |  7.97ms   |
 
 ### Decoding
 
 |                        | time(NEON) | Time(AVX) |
 |------------------------|:----------:|:---------:|
-| utils YUV 4:2:0->RGB   |   4.95ms   |  5.47ms   |
+| utils YUV 4:2:0->RGB   |   4.95ms   |  5.44ms   |
 | libyuv YUV 4:2:0->RGB  |   5.70ms   |  44.95ms  |
 | utils YUV 4:2:0->RGBA  |   5.56ms   |  6.45ms   |
 | libyuv YUV 4:2:0->RGBA |   6.13ms   |  6.88ms   |

diff --git a/app/Cargo.toml b/app/Cargo.toml
@@ -4,7 +4,7 @@ version = "0.1.0"
 edition = "2021"
 
 [dependencies]
-yuvutils-rs = { path = "..", features = [] }
+yuvutils-rs = { path = "..", features = ["nightly_avx512"] }
 image = "0.25.5"
 yuv-sys = "0.3.6"
 

diff --git a/src/avx2/rgb_to_nv.rs b/src/avx2/rgb_to_nv.rs
@@ -29,7 +29,6 @@
 
 use crate::avx2::avx2_utils::{
     _mm256_deinterleave_rgba_epi8, _mm256_interleave_x2_epi8, avx2_deinterleave_rgb, avx2_pack_u16,
-    avx2_pairwise_widen_avg,
 };
 use crate::internals::ProcessedOffset;
 use crate::yuv_support::{
@@ -212,7 +211,7 @@ unsafe fn avx2_rgba_to_nv_impl<
         let y_yuv = avx2_pack_u16(y_l, y_h);
         _mm256_storeu_si256(y_ptr.add(cx) as *mut __m256i, y_yuv);
 
-        if compute_uv_row {
+        if chroma_subsampling == YuvChromaSubsampling::Yuv444 {
             let cb_l = _mm256_max_epi16(
                 _mm256_min_epi16(
                     _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
@@ -279,31 +278,64 @@ unsafe fn avx2_rgba_to_nv_impl<
             );
 
             let cb = avx2_pack_u16(cb_l, cb_h);
-
             let cr = avx2_pack_u16(cr_l, cr_h);
 
-            match chroma_subsampling {
-                YuvChromaSubsampling::Yuv420 | YuvChromaSubsampling::Yuv422 => {
-                    let cb_h = avx2_pairwise_widen_avg(cb);
-                    let cr_h = avx2_pairwise_widen_avg(cr);
-                    let (row0, _) = match order {
-                        YuvNVOrder::UV => _mm256_interleave_x2_epi8(cb_h, cr_h),
-                        YuvNVOrder::VU => _mm256_interleave_x2_epi8(cr_h, cb_h),
-                    };
-                    _mm256_storeu_si256(uv_ptr.add(uv_x) as *mut __m256i, row0);
-                    uv_x += 32;
-                }
-                YuvChromaSubsampling::Yuv444 => {
-                    let (row0, row1) = match order {
-                        YuvNVOrder::UV => _mm256_interleave_x2_epi8(cb, cr),
-                        YuvNVOrder::VU => _mm256_interleave_x2_epi8(cr, cb),
-                    };
-                    let dst_ptr = uv_ptr.add(uv_x);
-                    _mm256_storeu_si256(dst_ptr as *mut __m256i, row0);
-                    _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, row1);
-                    uv_x += 64;
-                }
-            }
+            let (row0, row1) = match order {
+                YuvNVOrder::UV => _mm256_interleave_x2_epi8(cb, cr),
+                YuvNVOrder::VU => _mm256_interleave_x2_epi8(cr, cb),
+            };
+            let dst_ptr = uv_ptr.add(uv_x);
+            _mm256_storeu_si256(dst_ptr as *mut __m256i, row0);
+            _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, row1);
+            uv_x += 64;
+        } else if chroma_subsampling == YuvChromaSubsampling::Yuv422
+            || (chroma_subsampling == YuvChromaSubsampling::Yuv420 && compute_uv_row)
+        {
+            let r1 = _mm256_avg_epu16(r_low, r_high);
+            let g1 = _mm256_avg_epu16(g_low, g_high);
+            let b1 = _mm256_avg_epu16(b_low, b_high);
+            let cb = _mm256_max_epi16(
+                _mm256_min_epi16(
+                    _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
+                        uv_bias,
+                        _mm256_add_epi16(
+                            _mm256_add_epi16(
+                                _mm256_mulhi_epi16(r1, v_cb_r),
+                                _mm256_mulhi_epi16(g1, v_cb_g),
+                            ),
+                            _mm256_mulhi_epi16(b1, v_cb_b),
+                        ),
+                    )),
+                    i_cap_uv,
+                ),
+                i_bias_y,
+            );
+            let cr = _mm256_max_epi16(
+                _mm256_min_epi16(
+                    _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
+                        uv_bias,
+                        _mm256_add_epi16(
+                            _mm256_add_epi16(
+                                _mm256_mulhi_epi16(r1, v_cr_r),
+                                _mm256_mulhi_epi16(g1, v_cr_g),
+                            ),
+                            _mm256_mulhi_epi16(b1, v_cr_b),
+                        ),
+                    )),
+                    i_cap_uv,
+                ),
+                i_bias_y,
+            );
+
+            let cb = avx2_pack_u16(cb, cb);
+            let cr = avx2_pack_u16(cr, cr);
+
+            let (row0, _) = match order {
+                YuvNVOrder::UV => _mm256_interleave_x2_epi8(cb, cr),
+                YuvNVOrder::VU => _mm256_interleave_x2_epi8(cr, cb),
+            };
+            _mm256_storeu_si256(uv_ptr.add(uv_x) as *mut __m256i, row0);
+            uv_x += 32;
         }
 
         cx += 32;

diff --git a/src/avx2/rgba_to_yuv.rs b/src/avx2/rgba_to_yuv.rs
@@ -28,7 +28,7 @@
  */
 
 use crate::avx2::avx2_utils::{
-    _mm256_deinterleave_rgba_epi8, avx2_deinterleave_rgb, avx2_pack_u16, avx2_pairwise_widen_avg,
+    _mm256_deinterleave_rgba_epi8, avx2_deinterleave_rgb, avx2_pack_u16,
 };
 use crate::internals::ProcessedOffset;
 use crate::yuv_support::{
@@ -210,7 +210,7 @@ unsafe fn avx2_rgba_to_yuv_impl<const ORIGIN_CHANNELS: u8, const SAMPLING: u8>(
         let y_yuv = avx2_pack_u16(y_l, y_h);
         _mm256_storeu_si256(y_ptr.add(cx) as *mut __m256i, y_yuv);
 
-        if chroma_subsampling != YuvChromaSubsampling::Yuv420 || compute_uv_row {
+        if chroma_subsampling == YuvChromaSubsampling::Yuv444 {
             let cb_l = _mm256_max_epi16(
                 _mm256_min_epi16(
                     _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
@@ -277,23 +277,62 @@ unsafe fn avx2_rgba_to_yuv_impl<const ORIGIN_CHANNELS: u8, const SAMPLING: u8>(
             );
 
             let cb = avx2_pack_u16(cb_l, cb_h);
-
             let cr = avx2_pack_u16(cr_l, cr_h);
 
-            match chroma_subsampling {
-                YuvChromaSubsampling::Yuv420 | YuvChromaSubsampling::Yuv422 => {
-                    let cb_h = _mm256_castsi256_si128(avx2_pairwise_widen_avg(cb));
-                    let cr_h = _mm256_castsi256_si128(avx2_pairwise_widen_avg(cr));
-                    _mm_storeu_si128(u_ptr.add(uv_x) as *mut _ as *mut __m128i, cb_h);
-                    _mm_storeu_si128(v_ptr.add(uv_x) as *mut _ as *mut __m128i, cr_h);
-                    uv_x += 16;
-                }
-                YuvChromaSubsampling::Yuv444 => {
-                    _mm256_storeu_si256(u_ptr.add(uv_x) as *mut __m256i, cb);
-                    _mm256_storeu_si256(v_ptr.add(uv_x) as *mut __m256i, cr);
-                    uv_x += 32;
-                }
-            }
+            _mm256_storeu_si256(u_ptr.add(uv_x) as *mut __m256i, cb);
+            _mm256_storeu_si256(v_ptr.add(uv_x) as *mut __m256i, cr);
+            uv_x += 32;
+        } else if chroma_subsampling == YuvChromaSubsampling::Yuv422
+            || (chroma_subsampling == YuvChromaSubsampling::Yuv420 && compute_uv_row)
+        {
+            let r1 = _mm256_avg_epu16(r_low, r_high);
+            let g1 = _mm256_avg_epu16(g_low, g_high);
+            let b1 = _mm256_avg_epu16(b_low, b_high);
+            let cb = _mm256_max_epi16(
+                _mm256_min_epi16(
+                    _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
+                        uv_bias,
+                        _mm256_add_epi16(
+                            _mm256_add_epi16(
+                                _mm256_mulhi_epi16(r1, v_cb_r),
+                                _mm256_mulhi_epi16(g1, v_cb_g),
+                            ),
+                            _mm256_mulhi_epi16(b1, v_cb_b),
+                        ),
+                    )),
+                    i_cap_uv,
+                ),
+                i_bias_y,
+            );
+            let cr = _mm256_max_epi16(
+                _mm256_min_epi16(
+                    _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
+                        uv_bias,
+                        _mm256_add_epi16(
+                            _mm256_add_epi16(
+                                _mm256_mulhi_epi16(r1, v_cr_r),
+                                _mm256_mulhi_epi16(g1, v_cr_g),
+                            ),
+                            _mm256_mulhi_epi16(b1, v_cr_b),
+                        ),
+                    )),
+                    i_cap_uv,
+                ),
+                i_bias_y,
+            );
+
+            let cb = avx2_pack_u16(cb, cb);
+            let cr = avx2_pack_u16(cr, cr);
+
+            _mm_storeu_si128(
+                u_ptr.add(uv_x) as *mut _ as *mut __m128i,
+                _mm256_castsi256_si128(cb),
+            );
+            _mm_storeu_si128(
+                v_ptr.add(uv_x) as *mut _ as *mut __m128i,
+                _mm256_castsi256_si128(cr),
+            );
+            uv_x += 16;
         }
 
         cx += 32;

diff --git a/src/avx512bw/rgba_to_yuv.rs b/src/avx512bw/rgba_to_yuv.rs
@@ -28,7 +28,7 @@
  */
 
 use crate::avx512bw::avx512_utils::{
-    avx512_deinterleave_rgb, avx512_deinterleave_rgba, avx512_pack_u16, avx512_pairwise_widen_avg,
+    avx512_deinterleave_rgb, avx512_deinterleave_rgba, avx512_pack_u16,
 };
 use crate::internals::ProcessedOffset;
 use crate::yuv_support::{
@@ -210,7 +210,7 @@ unsafe fn avx512_rgba_to_yuv_impl<const ORIGIN_CHANNELS: u8, const SAMPLING: u8>
         let y_yuv = avx512_pack_u16(y_l, y_h);
         _mm512_storeu_si512(y_ptr.add(cx) as *mut i32, y_yuv);
 
-        if chroma_subsampling != YuvChromaSubsampling::Yuv420 || compute_uv_row {
+        if chroma_subsampling == YuvChromaSubsampling::Yuv444 {
             let cb_l = _mm512_max_epi16(
                 _mm512_min_epi16(
                     _mm512_srai_epi16::<V_SHR>(_mm512_add_epi16(
@@ -277,23 +277,64 @@ unsafe fn avx512_rgba_to_yuv_impl<const ORIGIN_CHANNELS: u8, const SAMPLING: u8>
             );
 
             let cb = avx512_pack_u16(cb_l, cb_h);
-
             let cr = avx512_pack_u16(cr_l, cr_h);
 
-            match chroma_subsampling {
-                YuvChromaSubsampling::Yuv420 | YuvChromaSubsampling::Yuv422 => {
-                    let cb_h = _mm512_castsi512_si256(avx512_pairwise_widen_avg(cb));
-                    let cr_h = _mm512_castsi512_si256(avx512_pairwise_widen_avg(cr));
-                    _mm256_storeu_si256(u_ptr.add(uv_x) as *mut _ as *mut __m256i, cb_h);
-                    _mm256_storeu_si256(v_ptr.add(uv_x) as *mut _ as *mut __m256i, cr_h);
-                    uv_x += 32;
-                }
-                YuvChromaSubsampling::Yuv444 => {
-                    _mm512_storeu_si512(u_ptr.add(uv_x) as *mut i32, cb);
-                    _mm512_storeu_si512(v_ptr.add(uv_x) as *mut i32, cr);
-                    uv_x += 64;
-                }
-            }
+            _mm512_storeu_si512(u_ptr.add(uv_x) as *mut i32, cb);
+            _mm512_storeu_si512(v_ptr.add(uv_x) as *mut i32, cr);
+            uv_x += 64;
+        } else if chroma_subsampling == YuvChromaSubsampling::Yuv422
+            || (chroma_subsampling == YuvChromaSubsampling::Yuv420 && compute_uv_row)
+        {
+            let r1 = _mm512_avg_epu16(r_low, r_high);
+            let g1 = _mm512_avg_epu16(g_low, g_high);
+            let b1 = _mm512_avg_epu16(b_low, b_high);
+
+            let cbk = _mm512_max_epi16(
+                _mm512_min_epi16(
+                    _mm512_srai_epi16::<V_SHR>(_mm512_add_epi16(
+                        uv_bias,
+                        _mm512_add_epi16(
+                            _mm512_add_epi16(
+                                _mm512_mulhi_epi16(r1, v_cb_r),
+                                _mm512_mulhi_epi16(g1, v_cb_g),
+                            ),
+                            _mm512_mulhi_epi16(b1, v_cb_b),
+                        ),
+                    )),
+                    i_cap_uv,
+                ),
+                i_bias_y,
+            );
+
+            let crk = _mm512_max_epi16(
+                _mm512_min_epi16(
+                    _mm512_srai_epi16::<V_SHR>(_mm512_add_epi16(
+                        uv_bias,
+                        _mm512_add_epi16(
+                            _mm512_add_epi16(
+                                _mm512_mulhi_epi16(r1, v_cr_r),
+                                _mm512_mulhi_epi16(g1, v_cr_g),
+                            ),
+                            _mm512_mulhi_epi16(b1, v_cr_b),
+                        ),
+                    )),
+                    i_cap_uv,
+                ),
+                i_bias_y,
+            );
+
+            let cb = avx512_pack_u16(cbk, cbk);
+            let cr = avx512_pack_u16(crk, crk);
+
+            _mm256_storeu_si256(
+                u_ptr.add(uv_x) as *mut _ as *mut __m256i,
+                _mm512_castsi512_si256(cb),
+            );
+            _mm256_storeu_si256(
+                v_ptr.add(uv_x) as *mut _ as *mut __m256i,
+                _mm512_castsi512_si256(cr),
+            );
+            uv_x += 32;
         }
 
         cx += 64;

diff --git a/src/rgba_to_nv.rs b/src/rgba_to_nv.rs
@@ -84,9 +84,9 @@ fn rgbx_to_nv<const ORIGIN_CHANNELS: u8, const UV_ORDER: u8, const SAMPLING: u8>
     let i_cap_uv = i_bias_y + range.range_uv as i32;
 
     #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-    let _use_sse = std::arch::is_x86_feature_detected!("sse4.1");
+    let use_sse = std::arch::is_x86_feature_detected!("sse4.1");
     #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-    let _use_avx2 = std::arch::is_x86_feature_detected!("avx2");
+    let use_avx2 = std::arch::is_x86_feature_detected!("avx2");
     #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
     let is_rdm_available = std::arch::is_aarch64_feature_detected!("rdm");
     #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
@@ -103,7 +103,7 @@ fn rgbx_to_nv<const ORIGIN_CHANNELS: u8, const UV_ORDER: u8, const SAMPLING: u8>
         |y_plane: &mut [u8], uv_plane: &mut [u8], rgba: &[u8], compute_uv_row| {
             let mut _offset: ProcessedOffset = ProcessedOffset { cx: 0, ux: 0 };
             #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-            if _use_avx2 {
+            if use_avx2 {
                 let offset = avx2_rgba_to_nv::<ORIGIN_CHANNELS, UV_ORDER, SAMPLING>(
                     y_plane,
                     uv_plane,
@@ -118,7 +118,7 @@ fn rgbx_to_nv<const ORIGIN_CHANNELS: u8, const UV_ORDER: u8, const SAMPLING: u8>
                 _offset = offset;
             }
             #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-            if _use_sse {
+            if use_sse {
                 let offset = sse_rgba_to_nv_row::<ORIGIN_CHANNELS, UV_ORDER, SAMPLING>(
                     y_plane,
                     uv_plane,