From 8d360d92c780a43e72aa7e84a3daf9372cb34ff3 Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Mon, 25 Nov 2024 20:09:20 +0000 Subject: [PATCH] AVX, SSE encoding/decoding improvements --- README.md | 6 +- app/benches/yuv8/main.rs | 100 ++++++---- src/avx2/mod.rs | 2 + src/avx2/rgb_to_nv.rs | 51 ++--- src/avx2/rgb_to_y.rs | 14 +- src/avx2/rgba_to_yuv.rs | 65 +++---- src/avx2/rgba_to_yuv420.rs | 350 ++++++++++++++++++++++++++++++++++ src/avx2/yuv_nv_to_rgba.rs | 49 ++--- src/avx2/yuv_nv_to_rgba420.rs | 34 ++-- src/avx2/yuv_to_rgba.rs | 22 +-- src/avx2/yuv_to_rgba420.rs | 34 ++-- src/avx2/yuv_to_rgba_alpha.rs | 47 ++--- src/rgba_to_yuv.rs | 44 ++++- src/sse/mod.rs | 2 + src/sse/rgb_to_nv.rs | 51 +++-- src/sse/rgb_to_y.rs | 13 +- src/sse/rgba_to_yuv.rs | 66 ++++--- src/sse/rgba_to_yuv420.rs | 319 +++++++++++++++++++++++++++++++ src/sse/yuv_nv_to_rgba.rs | 71 +++---- src/sse/yuv_nv_to_rgba420.rs | 50 ++--- src/sse/yuv_to_rgba.rs | 41 ++-- src/sse/yuv_to_rgba420.rs | 50 ++--- src/sse/yuv_to_rgba_alpha.rs | 49 ++--- 23 files changed, 1137 insertions(+), 393 deletions(-) create mode 100644 src/avx2/rgba_to_yuv420.rs create mode 100644 src/sse/rgba_to_yuv420.rs diff --git a/README.md b/README.md index 3644016..72c10f5 100644 --- a/README.md +++ b/README.md @@ -76,9 +76,9 @@ Tests performed on the image 5763x3842 | | time(NEON) | Time(AVX) | |------------------------|:----------:|:---------:| -| utils RGB->YUV 4:2:0 | 3.48ms | 6.14ms | +| utils RGB->YUV 4:2:0 | 3.48ms | 3.64ms | | libyuv RGB->YUV 4:2:0 | 3.58ms | 33.87ms | -| utils RGBA->YUV 4:2:0 | 4.32ms | 7.34ms | +| utils RGBA->YUV 4:2:0 | 4.32ms | 5.74ms | | libyuv RGBA->YUV 4:2:0 | 4.87ms | 23.48ms | | utils RGBA->YUV 4:2:2 | 4.83ms | 7.08ms | | libyuv RGBA->YUV 4:2:2 | 5.90ms | 35.23ms | @@ -90,7 +90,7 @@ Tests performed on the image 5763x3842 |------------------------|:----------:|:---------:| | utils YUV NV12->RGB | 3.86ms | 6.48ms | | libyuv YUV NV12->RGB | 5.20ms | 45.28ms | -| utils YUV 4:2:0->RGB | 3.28ms | 5.44ms | +| utils YUV 4:2:0->RGB | 3.28ms | 5.34ms | | libyuv YUV 4:2:0->RGB | 5.70ms | 44.95ms | | utils YUV 4:2:0->RGBA | 3.82ms | 5.98ms | | libyuv YUV 4:2:0->RGBA | 6.13ms | 6.88ms | diff --git a/app/benches/yuv8/main.rs b/app/benches/yuv8/main.rs index 4408cf1..c510271 100644 --- a/app/benches/yuv8/main.rs +++ b/app/benches/yuv8/main.rs @@ -26,7 +26,7 @@ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ - +use std::alloc::Layout; use criterion::{criterion_group, criterion_main, Criterion}; use image::{GenericImageView, ImageReader}; use yuv_sys::{ @@ -99,25 +99,36 @@ pub fn criterion_benchmark(c: &mut Criterion) { }); c.bench_function("libyuv RGB -> YUV 4:2:0", |b| { - let mut test_planar = YuvPlanarImageMut::::alloc( - dimensions.0, - dimensions.1, - YuvChromaSubsampling::Yuv420, - ); - b.iter(|| unsafe { - rs_RGB24ToI420( - src_bytes.as_ptr(), - stride as i32, - test_planar.y_plane.borrow_mut().as_mut_ptr(), - test_planar.y_stride as i32, - test_planar.u_plane.borrow_mut().as_mut_ptr(), - test_planar.u_stride as i32, - test_planar.v_plane.borrow_mut().as_mut_ptr(), - test_planar.v_stride as i32, - test_planar.width as i32, - test_planar.height as i32, - ); - }) + unsafe { + let layout_rgb = Layout::from_size_align(dimensions.0 as usize * dimensions.1 as usize * 3, 16).unwrap(); + let layout_y = Layout::from_size_align(dimensions.0 as usize * dimensions.1 as usize, 16).unwrap(); + let layout_uv = Layout::from_size_align((dimensions.0 as usize + 1) / 2 * (dimensions.1 as usize + 1) / 2, 16).unwrap(); + let target_y = std::alloc::alloc(layout_y); + let target_u = std::alloc::alloc(layout_uv); + let target_v = std::alloc::alloc(layout_uv); + let source_rgb = std::alloc::alloc(layout_rgb); + for (x, src) in src_bytes.iter().enumerate() { + *source_rgb.add(x) = *src; + } + b.iter(|| { + rs_RGB24ToI420( + source_rgb, + stride as i32, + target_y, + dimensions.0 as i32, + target_u, + (dimensions.0 as i32 + 1) / 2, + target_v, + (dimensions.0 as i32 + 1) / 2, + dimensions.0 as i32, + dimensions.1 as i32, + ); + }); + std::alloc::dealloc(target_y, layout_y); + std::alloc::dealloc(target_u, layout_uv); + std::alloc::dealloc(target_v, layout_uv); + std::alloc::dealloc(source_rgb, layout_rgb); + } }); c.bench_function("yuvutils RGBA -> YUV 4:2:0", |b| { @@ -139,25 +150,36 @@ pub fn criterion_benchmark(c: &mut Criterion) { }); c.bench_function("libyuv RGBA -> YUV 4:2:0", |b| { - let mut test_planar = YuvPlanarImageMut::::alloc( - dimensions.0, - dimensions.1, - YuvChromaSubsampling::Yuv420, - ); - b.iter(|| unsafe { - rs_ABGRToI420( - rgba_image.as_ptr(), - dimensions.0 as i32 * 4i32, - test_planar.y_plane.borrow_mut().as_mut_ptr(), - test_planar.y_stride as i32, - test_planar.u_plane.borrow_mut().as_mut_ptr(), - test_planar.u_stride as i32, - test_planar.v_plane.borrow_mut().as_mut_ptr(), - test_planar.v_stride as i32, - test_planar.width as i32, - test_planar.height as i32, - ); - }) + unsafe { + let layout_rgba = Layout::from_size_align(dimensions.0 as usize * dimensions.1 as usize * 4, 16).unwrap(); + let layout_y = Layout::from_size_align(dimensions.0 as usize * dimensions.1 as usize, 16).unwrap(); + let layout_uv = Layout::from_size_align((dimensions.0 as usize + 1) / 2 * (dimensions.1 as usize + 1) / 2, 16).unwrap(); + let target_y = std::alloc::alloc(layout_y); + let target_u = std::alloc::alloc(layout_uv); + let target_v = std::alloc::alloc(layout_uv); + let source_rgb = std::alloc::alloc(layout_rgba); + for (x, src) in src_bytes.iter().enumerate() { + *source_rgb.add(x) = *src; + } + b.iter(|| { + rs_ABGRToI420( + source_rgb, + dimensions.0 as i32 * 4i32, + target_y, + dimensions.0 as i32, + target_u, + (dimensions.0 as i32 + 1) / 2, + target_v, + (dimensions.0 as i32 + 1) / 2, + dimensions.0 as i32, + dimensions.1 as i32, + ); + }); + std::alloc::dealloc(target_y, layout_y); + std::alloc::dealloc(target_u, layout_uv); + std::alloc::dealloc(target_v, layout_uv); + 
std::alloc::dealloc(source_rgb, layout_rgba); + } }); c.bench_function("yuvutils RGBA -> YUV 4:2:2", |b| { diff --git a/src/avx2/mod.rs b/src/avx2/mod.rs index 07bbd26..7f67999 100644 --- a/src/avx2/mod.rs +++ b/src/avx2/mod.rs @@ -45,6 +45,7 @@ mod yuv_to_rgba_alpha; mod yuv_to_yuv2; mod yuy2_to_rgb; mod yuy2_to_yuv; +mod rgba_to_yuv420; pub(crate) use rgb_to_nv::avx2_rgba_to_nv; pub(crate) use rgb_to_y::avx2_rgb_to_y_row; @@ -62,3 +63,4 @@ pub(crate) use yuv_to_rgba_alpha::avx2_yuv_to_rgba_alpha; pub(crate) use yuv_to_yuv2::yuv_to_yuy2_avx2_row; pub(crate) use yuy2_to_rgb::yuy2_to_rgb_avx; pub(crate) use yuy2_to_yuv::yuy2_to_yuv_avx; +pub(crate) use rgba_to_yuv420::avx2_rgba_to_yuv420; \ No newline at end of file diff --git a/src/avx2/rgb_to_nv.rs b/src/avx2/rgb_to_nv.rs index 051d135..efb9add 100644 --- a/src/avx2/rgb_to_nv.rs +++ b/src/avx2/rgb_to_nv.rs @@ -94,7 +94,8 @@ unsafe fn avx2_rgba_to_nv_impl< let mut uv_x = start_ux; const V_SHR: i32 = 3; - const V_SCALE: i32 = 7; + const V_SCALE: i32 = 6; + let rounding_const_bias: i16 = 1 << (V_SHR - 1); let bias_y = range.bias_y as i16 * (1 << V_SHR) + rounding_const_bias; let bias_uv = range.bias_uv as i16 * (1 << V_SHR) + rounding_const_bias; @@ -180,10 +181,10 @@ unsafe fn avx2_rgba_to_nv_impl< y_bias, _mm256_add_epi16( _mm256_add_epi16( - _mm256_mulhi_epi16(r_low, v_yr), - _mm256_mulhi_epi16(g_low, v_yg), + _mm256_mulhrs_epi16(r_low, v_yr), + _mm256_mulhrs_epi16(g_low, v_yg), ), - _mm256_mulhi_epi16(b_low, v_yb), + _mm256_mulhrs_epi16(b_low, v_yb), ), )), i_cap_y, @@ -197,10 +198,10 @@ unsafe fn avx2_rgba_to_nv_impl< y_bias, _mm256_add_epi16( _mm256_add_epi16( - _mm256_mulhi_epi16(r_high, v_yr), - _mm256_mulhi_epi16(g_high, v_yg), + _mm256_mulhrs_epi16(r_high, v_yr), + _mm256_mulhrs_epi16(g_high, v_yg), ), - _mm256_mulhi_epi16(b_high, v_yb), + _mm256_mulhrs_epi16(b_high, v_yb), ), )), i_cap_y, @@ -218,10 +219,10 @@ unsafe fn avx2_rgba_to_nv_impl< uv_bias, _mm256_add_epi16( _mm256_add_epi16( - _mm256_mulhi_epi16(r_low, v_cb_r), - _mm256_mulhi_epi16(g_low, v_cb_g), + _mm256_mulhrs_epi16(r_low, v_cb_r), + _mm256_mulhrs_epi16(g_low, v_cb_g), ), - _mm256_mulhi_epi16(b_low, v_cb_b), + _mm256_mulhrs_epi16(b_low, v_cb_b), ), )), i_cap_uv, @@ -234,10 +235,10 @@ unsafe fn avx2_rgba_to_nv_impl< uv_bias, _mm256_add_epi16( _mm256_add_epi16( - _mm256_mulhi_epi16(r_low, v_cr_r), - _mm256_mulhi_epi16(g_low, v_cr_g), + _mm256_mulhrs_epi16(r_low, v_cr_r), + _mm256_mulhrs_epi16(g_low, v_cr_g), ), - _mm256_mulhi_epi16(b_low, v_cr_b), + _mm256_mulhrs_epi16(b_low, v_cr_b), ), )), i_cap_uv, @@ -250,10 +251,10 @@ unsafe fn avx2_rgba_to_nv_impl< uv_bias, _mm256_add_epi16( _mm256_add_epi16( - _mm256_mulhi_epi16(r_high, v_cb_r), - _mm256_mulhi_epi16(g_high, v_cb_g), + _mm256_mulhrs_epi16(r_high, v_cb_r), + _mm256_mulhrs_epi16(g_high, v_cb_g), ), - _mm256_mulhi_epi16(b_high, v_cb_b), + _mm256_mulhrs_epi16(b_high, v_cb_b), ), )), i_cap_uv, @@ -266,10 +267,10 @@ unsafe fn avx2_rgba_to_nv_impl< uv_bias, _mm256_add_epi16( _mm256_add_epi16( - _mm256_mulhi_epi16(r_high, v_cr_r), - _mm256_mulhi_epi16(g_high, v_cr_g), + _mm256_mulhrs_epi16(r_high, v_cr_r), + _mm256_mulhrs_epi16(g_high, v_cr_g), ), - _mm256_mulhi_epi16(b_high, v_cr_b), + _mm256_mulhrs_epi16(b_high, v_cr_b), ), )), i_cap_uv, @@ -300,10 +301,10 @@ unsafe fn avx2_rgba_to_nv_impl< uv_bias, _mm256_add_epi16( _mm256_add_epi16( - _mm256_mulhi_epi16(r1, v_cb_r), - _mm256_mulhi_epi16(g1, v_cb_g), + _mm256_mulhrs_epi16(r1, v_cb_r), + _mm256_mulhrs_epi16(g1, v_cb_g), ), - _mm256_mulhi_epi16(b1, v_cb_b), + 
_mm256_mulhrs_epi16(b1, v_cb_b), ), )), i_cap_uv, @@ -316,10 +317,10 @@ unsafe fn avx2_rgba_to_nv_impl< uv_bias, _mm256_add_epi16( _mm256_add_epi16( - _mm256_mulhi_epi16(r1, v_cr_r), - _mm256_mulhi_epi16(g1, v_cr_g), + _mm256_mulhrs_epi16(r1, v_cr_r), + _mm256_mulhrs_epi16(g1, v_cr_g), ), - _mm256_mulhi_epi16(b1, v_cr_b), + _mm256_mulhrs_epi16(b1, v_cr_b), ), )), i_cap_uv, diff --git a/src/avx2/rgb_to_y.rs b/src/avx2/rgb_to_y.rs index 4091873..29cd446 100644 --- a/src/avx2/rgb_to_y.rs +++ b/src/avx2/rgb_to_y.rs @@ -67,7 +67,7 @@ pub(crate) unsafe fn avx2_rgb_to_y_row_impl( let mut cx = start_cx; const V_SHR: i32 = 3; - const V_SCALE: i32 = 7; + const V_SCALE: i32 = 6; let rounding_const_bias: i16 = 1 << (V_SHR - 1); let bias_y = range.bias_y as i16 * (1 << V_SHR) + rounding_const_bias; @@ -144,10 +144,10 @@ pub(crate) unsafe fn avx2_rgb_to_y_row_impl( y_bias, _mm256_add_epi16( _mm256_add_epi16( - _mm256_mulhi_epi16(r_low, v_yr), - _mm256_mulhi_epi16(g_low, v_yg), + _mm256_mulhrs_epi16(r_low, v_yr), + _mm256_mulhrs_epi16(g_low, v_yg), ), - _mm256_mulhi_epi16(b_low, v_yb), + _mm256_mulhrs_epi16(b_low, v_yb), ), )), i_cap_y, @@ -161,10 +161,10 @@ pub(crate) unsafe fn avx2_rgb_to_y_row_impl( y_bias, _mm256_add_epi16( _mm256_add_epi16( - _mm256_mulhi_epi16(r_high, v_yr), - _mm256_mulhi_epi16(g_high, v_yg), + _mm256_mulhrs_epi16(r_high, v_yr), + _mm256_mulhrs_epi16(g_high, v_yg), ), - _mm256_mulhi_epi16(b_high, v_yb), + _mm256_mulhrs_epi16(b_high, v_yb), ), )), i_cap_y, diff --git a/src/avx2/rgba_to_yuv.rs b/src/avx2/rgba_to_yuv.rs index 93f8b8c..25c46f4 100644 --- a/src/avx2/rgba_to_yuv.rs +++ b/src/avx2/rgba_to_yuv.rs @@ -49,20 +49,10 @@ pub(crate) fn avx2_rgba_to_yuv( start_cx: usize, start_ux: usize, width: usize, - compute_uv_row: bool, ) -> ProcessedOffset { unsafe { avx2_rgba_to_yuv_impl::( - transform, - range, - y_plane, - u_plane, - v_plane, - rgba, - start_cx, - start_ux, - width, - compute_uv_row, + transform, range, y_plane, u_plane, v_plane, rgba, start_cx, start_ux, width, ) } } @@ -78,7 +68,6 @@ unsafe fn avx2_rgba_to_yuv_impl( start_cx: usize, start_ux: usize, width: usize, - compute_uv_row: bool, ) -> ProcessedOffset { let chroma_subsampling: YuvChromaSubsampling = SAMPLING.into(); let source_channels: YuvSourceChannels = ORIGIN_CHANNELS.into(); @@ -93,7 +82,7 @@ unsafe fn avx2_rgba_to_yuv_impl( let mut uv_x = start_ux; const V_SHR: i32 = 3; - const V_SCALE: i32 = 7; + const V_SCALE: i32 = 6; let rounding_const_bias: i16 = 1 << (V_SHR - 1); let bias_y = range.bias_y as i16 * (1 << V_SHR) + rounding_const_bias; let bias_uv = range.bias_uv as i16 * (1 << V_SHR) + rounding_const_bias; @@ -179,10 +168,10 @@ unsafe fn avx2_rgba_to_yuv_impl( y_bias, _mm256_add_epi16( _mm256_add_epi16( - _mm256_mulhi_epi16(r_low, v_yr), - _mm256_mulhi_epi16(g_low, v_yg), + _mm256_mulhrs_epi16(r_low, v_yr), + _mm256_mulhrs_epi16(g_low, v_yg), ), - _mm256_mulhi_epi16(b_low, v_yb), + _mm256_mulhrs_epi16(b_low, v_yb), ), )), i_cap_y, @@ -196,10 +185,10 @@ unsafe fn avx2_rgba_to_yuv_impl( y_bias, _mm256_add_epi16( _mm256_add_epi16( - _mm256_mulhi_epi16(r_high, v_yr), - _mm256_mulhi_epi16(g_high, v_yg), + _mm256_mulhrs_epi16(r_high, v_yr), + _mm256_mulhrs_epi16(g_high, v_yg), ), - _mm256_mulhi_epi16(b_high, v_yb), + _mm256_mulhrs_epi16(b_high, v_yb), ), )), i_cap_y, @@ -217,10 +206,10 @@ unsafe fn avx2_rgba_to_yuv_impl( uv_bias, _mm256_add_epi16( _mm256_add_epi16( - _mm256_mulhi_epi16(r_low, v_cb_r), - _mm256_mulhi_epi16(g_low, v_cb_g), + _mm256_mulhrs_epi16(r_low, v_cb_r), + _mm256_mulhrs_epi16(g_low, 
v_cb_g), ), - _mm256_mulhi_epi16(b_low, v_cb_b), + _mm256_mulhrs_epi16(b_low, v_cb_b), ), )), i_cap_uv, @@ -233,10 +222,10 @@ unsafe fn avx2_rgba_to_yuv_impl( uv_bias, _mm256_add_epi16( _mm256_add_epi16( - _mm256_mulhi_epi16(r_low, v_cr_r), - _mm256_mulhi_epi16(g_low, v_cr_g), + _mm256_mulhrs_epi16(r_low, v_cr_r), + _mm256_mulhrs_epi16(g_low, v_cr_g), ), - _mm256_mulhi_epi16(b_low, v_cr_b), + _mm256_mulhrs_epi16(b_low, v_cr_b), ), )), i_cap_uv, @@ -249,10 +238,10 @@ unsafe fn avx2_rgba_to_yuv_impl( uv_bias, _mm256_add_epi16( _mm256_add_epi16( - _mm256_mulhi_epi16(r_high, v_cb_r), - _mm256_mulhi_epi16(g_high, v_cb_g), + _mm256_mulhrs_epi16(r_high, v_cb_r), + _mm256_mulhrs_epi16(g_high, v_cb_g), ), - _mm256_mulhi_epi16(b_high, v_cb_b), + _mm256_mulhrs_epi16(b_high, v_cb_b), ), )), i_cap_uv, @@ -265,10 +254,10 @@ unsafe fn avx2_rgba_to_yuv_impl( uv_bias, _mm256_add_epi16( _mm256_add_epi16( - _mm256_mulhi_epi16(r_high, v_cr_r), - _mm256_mulhi_epi16(g_high, v_cr_g), + _mm256_mulhrs_epi16(r_high, v_cr_r), + _mm256_mulhrs_epi16(g_high, v_cr_g), ), - _mm256_mulhi_epi16(b_high, v_cr_b), + _mm256_mulhrs_epi16(b_high, v_cr_b), ), )), i_cap_uv, @@ -283,7 +272,7 @@ unsafe fn avx2_rgba_to_yuv_impl( _mm256_storeu_si256(v_ptr.add(uv_x) as *mut __m256i, cr); uv_x += 32; } else if chroma_subsampling == YuvChromaSubsampling::Yuv422 - || (chroma_subsampling == YuvChromaSubsampling::Yuv420 && compute_uv_row) + || (chroma_subsampling == YuvChromaSubsampling::Yuv420) { let r1 = _mm256_avg_epu16(r_low, r_high); let g1 = _mm256_avg_epu16(g_low, g_high); @@ -294,10 +283,10 @@ unsafe fn avx2_rgba_to_yuv_impl( uv_bias, _mm256_add_epi16( _mm256_add_epi16( - _mm256_mulhi_epi16(r1, v_cb_r), - _mm256_mulhi_epi16(g1, v_cb_g), + _mm256_mulhrs_epi16(r1, v_cb_r), + _mm256_mulhrs_epi16(g1, v_cb_g), ), - _mm256_mulhi_epi16(b1, v_cb_b), + _mm256_mulhrs_epi16(b1, v_cb_b), ), )), i_cap_uv, @@ -310,10 +299,10 @@ unsafe fn avx2_rgba_to_yuv_impl( uv_bias, _mm256_add_epi16( _mm256_add_epi16( - _mm256_mulhi_epi16(r1, v_cr_r), - _mm256_mulhi_epi16(g1, v_cr_g), + _mm256_mulhrs_epi16(r1, v_cr_r), + _mm256_mulhrs_epi16(g1, v_cr_g), ), - _mm256_mulhi_epi16(b1, v_cr_b), + _mm256_mulhrs_epi16(b1, v_cr_b), ), )), i_cap_uv, diff --git a/src/avx2/rgba_to_yuv420.rs b/src/avx2/rgba_to_yuv420.rs new file mode 100644 index 0000000..f03618d --- /dev/null +++ b/src/avx2/rgba_to_yuv420.rs @@ -0,0 +1,350 @@ +/* + * Copyright (c) Radzivon Bartoshyk, 10/2024. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +use crate::avx2::avx2_utils::{ + _mm256_deinterleave_rgba_epi8, avx2_deinterleave_rgb, avx2_pack_u16, +}; +use crate::internals::ProcessedOffset; +use crate::yuv_support::{ + CbCrForwardTransform, YuvChromaRange, YuvSourceChannels, +}; +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +pub(crate) fn avx2_rgba_to_yuv420( + transform: &CbCrForwardTransform, + range: &YuvChromaRange, + y_plane0: &mut [u8], + y_plane1: &mut [u8], + u_plane: &mut [u8], + v_plane: &mut [u8], + rgba0: &[u8], + rgba1: &[u8], + start_cx: usize, + start_ux: usize, + width: usize, +) -> ProcessedOffset { + unsafe { + avx2_rgba_to_yuv_impl420::( + transform, range, y_plane0, y_plane1, u_plane, v_plane, rgba0, rgba1, start_cx, + start_ux, width, + ) + } +} + +#[target_feature(enable = "avx2")] +unsafe fn avx2_rgba_to_yuv_impl420( + transform: &CbCrForwardTransform, + range: &YuvChromaRange, + y_plane0: &mut [u8], + y_plane1: &mut [u8], + u_plane: &mut [u8], + v_plane: &mut [u8], + rgba0: &[u8], + rgba1: &[u8], + start_cx: usize, + start_ux: usize, + width: usize, +) -> ProcessedOffset { + let source_channels: YuvSourceChannels = ORIGIN_CHANNELS.into(); + let channels = source_channels.get_channels_count(); + + let u_ptr = u_plane.as_mut_ptr(); + let v_ptr = v_plane.as_mut_ptr(); + + let mut cx = start_cx; + let mut uv_x = start_ux; + + const V_SHR: i32 = 3; + const V_SCALE: i32 = 6; + let rounding_const_bias: i16 = 1 << (V_SHR - 1); + let bias_y = range.bias_y as i16 * (1 << V_SHR) + rounding_const_bias; + let bias_uv = range.bias_uv as i16 * (1 << V_SHR) + rounding_const_bias; + + let i_bias_y = _mm256_set1_epi16(range.bias_y as i16); + let i_cap_y = _mm256_set1_epi16(range.range_y as i16 + range.bias_y as i16); + let i_cap_uv = _mm256_set1_epi16(range.bias_y as i16 + range.range_uv as i16); + + let y_bias = _mm256_set1_epi16(bias_y); + let uv_bias = _mm256_set1_epi16(bias_uv); + let v_yr = _mm256_set1_epi16(transform.yr as i16); + let v_yg = _mm256_set1_epi16(transform.yg as i16); + let v_yb = _mm256_set1_epi16(transform.yb as i16); + let v_cb_r = _mm256_set1_epi16(transform.cb_r as i16); + let v_cb_g = _mm256_set1_epi16(transform.cb_g as i16); + let v_cb_b = _mm256_set1_epi16(transform.cb_b as i16); + let v_cr_r = _mm256_set1_epi16(transform.cr_r as i16); + let v_cr_g = _mm256_set1_epi16(transform.cr_g as i16); + let v_cr_b = _mm256_set1_epi16(transform.cr_b as i16); + + while cx + 32 < width { + let (r_values0, g_values0, b_values0); + let (r_values1, g_values1, b_values1); + + let px = cx * channels; + + match source_channels { + YuvSourceChannels::Rgb | YuvSourceChannels::Bgr => { + let source_ptr0 = rgba0.get_unchecked(px..).as_ptr(); + let row_1 = _mm256_loadu_si256(source_ptr0 as *const __m256i); + let row_2 = _mm256_loadu_si256(source_ptr0.add(32) as *const __m256i); + let row_3 = _mm256_loadu_si256(source_ptr0.add(64) as *const __m256i); + + let (it1, it2, it3) = avx2_deinterleave_rgb(row_1, row_2, row_3); + 
if source_channels == YuvSourceChannels::Rgb { + r_values0 = it1; + g_values0 = it2; + b_values0 = it3; + } else { + r_values0 = it3; + g_values0 = it2; + b_values0 = it1; + } + + let source_ptr1 = rgba1.get_unchecked(px..).as_ptr(); + let row_11 = _mm256_loadu_si256(source_ptr1 as *const __m256i); + let row_21 = _mm256_loadu_si256(source_ptr1.add(32) as *const __m256i); + let row_31 = _mm256_loadu_si256(source_ptr1.add(64) as *const __m256i); + + let (it11, it21, it31) = avx2_deinterleave_rgb(row_11, row_21, row_31); + if source_channels == YuvSourceChannels::Rgb { + r_values1 = it11; + g_values1 = it21; + b_values1 = it31; + } else { + r_values1 = it31; + g_values1 = it21; + b_values1 = it11; + } + } + YuvSourceChannels::Rgba | YuvSourceChannels::Bgra => { + let source_ptr0 = rgba0.get_unchecked(px..).as_ptr(); + let row_1 = _mm256_loadu_si256(source_ptr0 as *const __m256i); + let row_2 = _mm256_loadu_si256(source_ptr0.add(32) as *const __m256i); + let row_3 = _mm256_loadu_si256(source_ptr0.add(64) as *const __m256i); + let row_4 = _mm256_loadu_si256(source_ptr0.add(96) as *const __m256i); + + let (it1, it2, it3, _) = _mm256_deinterleave_rgba_epi8(row_1, row_2, row_3, row_4); + if source_channels == YuvSourceChannels::Rgba { + r_values0 = it1; + g_values0 = it2; + b_values0 = it3; + } else { + r_values0 = it3; + g_values0 = it2; + b_values0 = it1; + } + + let source_ptr1 = rgba1.get_unchecked(px..).as_ptr(); + let row_11 = _mm256_loadu_si256(source_ptr1 as *const __m256i); + let row_21 = _mm256_loadu_si256(source_ptr1.add(32) as *const __m256i); + let row_31 = _mm256_loadu_si256(source_ptr1.add(64) as *const __m256i); + let row_41 = _mm256_loadu_si256(source_ptr1.add(96) as *const __m256i); + + let (it1, it2, it3, _) = _mm256_deinterleave_rgba_epi8(row_11, row_21, row_31, row_41); + if source_channels == YuvSourceChannels::Rgba { + r_values1 = it1; + g_values1 = it2; + b_values1 = it3; + } else { + r_values1 = it3; + g_values1 = it2; + b_values1 = it1; + } + } + } + + let r0_low = + _mm256_slli_epi16::<V_SCALE>(_mm256_cvtepu8_epi16(_mm256_castsi256_si128(r_values0))); + let r0_high = _mm256_slli_epi16::<V_SCALE>(_mm256_cvtepu8_epi16( + _mm256_extracti128_si256::<1>(r_values0), + )); + let g0_low = + _mm256_slli_epi16::<V_SCALE>(_mm256_cvtepu8_epi16(_mm256_castsi256_si128(g_values0))); + let g0_high = _mm256_slli_epi16::<V_SCALE>(_mm256_cvtepu8_epi16( + _mm256_extracti128_si256::<1>(g_values0), + )); + let b0_low = + _mm256_slli_epi16::<V_SCALE>(_mm256_cvtepu8_epi16(_mm256_castsi256_si128(b_values0))); + let b0_high = _mm256_slli_epi16::<V_SCALE>(_mm256_cvtepu8_epi16( + _mm256_extracti128_si256::<1>(b_values0), + )); + + let y0_l = _mm256_max_epi16( + _mm256_min_epi16( + _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16( + y_bias, + _mm256_add_epi16( + _mm256_add_epi16( + _mm256_mulhrs_epi16(r0_low, v_yr), + _mm256_mulhrs_epi16(g0_low, v_yg), + ), + _mm256_mulhrs_epi16(b0_low, v_yb), + ), + )), + i_cap_y, + ), + i_bias_y, + ); + + let y0_h = _mm256_max_epi16( + _mm256_min_epi16( + _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16( + y_bias, + _mm256_add_epi16( + _mm256_add_epi16( + _mm256_mulhrs_epi16(r0_high, v_yr), + _mm256_mulhrs_epi16(g0_high, v_yg), + ), + _mm256_mulhrs_epi16(b0_high, v_yb), + ), + )), + i_cap_y, + ), + i_bias_y, + ); + + let r1_low = + _mm256_slli_epi16::<V_SCALE>(_mm256_cvtepu8_epi16(_mm256_castsi256_si128(r_values1))); + let r1_high = _mm256_slli_epi16::<V_SCALE>(_mm256_cvtepu8_epi16( + _mm256_extracti128_si256::<1>(r_values1), + )); + let g1_low = + _mm256_slli_epi16::<V_SCALE>(_mm256_cvtepu8_epi16(_mm256_castsi256_si128(g_values1))); + let g1_high =
_mm256_slli_epi16::(_mm256_cvtepu8_epi16( + _mm256_extracti128_si256::<1>(g_values1), + )); + let b1_low = + _mm256_slli_epi16::(_mm256_cvtepu8_epi16(_mm256_castsi256_si128(b_values1))); + let b1_high = _mm256_slli_epi16::(_mm256_cvtepu8_epi16( + _mm256_extracti128_si256::<1>(b_values1), + )); + + let y1_l = _mm256_max_epi16( + _mm256_min_epi16( + _mm256_srai_epi16::(_mm256_add_epi16( + y_bias, + _mm256_add_epi16( + _mm256_add_epi16( + _mm256_mulhrs_epi16(r1_low, v_yr), + _mm256_mulhrs_epi16(g1_low, v_yg), + ), + _mm256_mulhrs_epi16(b1_low, v_yb), + ), + )), + i_cap_y, + ), + i_bias_y, + ); + + let y1_h = _mm256_max_epi16( + _mm256_min_epi16( + _mm256_srai_epi16::(_mm256_add_epi16( + y_bias, + _mm256_add_epi16( + _mm256_add_epi16( + _mm256_mulhrs_epi16(r1_high, v_yr), + _mm256_mulhrs_epi16(g1_high, v_yg), + ), + _mm256_mulhrs_epi16(b1_high, v_yb), + ), + )), + i_cap_y, + ), + i_bias_y, + ); + + let y0_yuv = avx2_pack_u16(y0_l, y0_h); + let y1_yuv = avx2_pack_u16(y1_l, y1_h); + + _mm256_storeu_si256( + y_plane0.get_unchecked_mut(cx..).as_mut_ptr() as *mut __m256i, + y0_yuv, + ); + _mm256_storeu_si256( + y_plane1.get_unchecked_mut(cx..).as_mut_ptr() as *mut __m256i, + y1_yuv, + ); + + let r_uv = _mm256_avg_epu16(r0_low, r0_high); + let g_uv = _mm256_avg_epu16(g0_low, g0_high); + let b_uv = _mm256_avg_epu16(b0_low, b0_high); + let cb = _mm256_max_epi16( + _mm256_min_epi16( + _mm256_srai_epi16::(_mm256_add_epi16( + uv_bias, + _mm256_add_epi16( + _mm256_add_epi16( + _mm256_mulhrs_epi16(r_uv, v_cb_r), + _mm256_mulhrs_epi16(g_uv, v_cb_g), + ), + _mm256_mulhrs_epi16(b_uv, v_cb_b), + ), + )), + i_cap_uv, + ), + i_bias_y, + ); + let cr = _mm256_max_epi16( + _mm256_min_epi16( + _mm256_srai_epi16::(_mm256_add_epi16( + uv_bias, + _mm256_add_epi16( + _mm256_add_epi16( + _mm256_mulhrs_epi16(r_uv, v_cr_r), + _mm256_mulhrs_epi16(g_uv, v_cr_g), + ), + _mm256_mulhrs_epi16(b_uv, v_cr_b), + ), + )), + i_cap_uv, + ), + i_bias_y, + ); + + let cb = avx2_pack_u16(cb, cb); + let cr = avx2_pack_u16(cr, cr); + + _mm_storeu_si128( + u_ptr.add(uv_x) as *mut _ as *mut __m128i, + _mm256_castsi256_si128(cb), + ); + _mm_storeu_si128( + v_ptr.add(uv_x) as *mut _ as *mut __m128i, + _mm256_castsi256_si128(cr), + ); + uv_x += 16; + + cx += 32; + } + + ProcessedOffset { cx, ux: uv_x } +} diff --git a/src/avx2/yuv_nv_to_rgba.rs b/src/avx2/yuv_nv_to_rgba.rs index 723ce3e..0274e53 100644 --- a/src/avx2/yuv_nv_to_rgba.rs +++ b/src/avx2/yuv_nv_to_rgba.rs @@ -84,6 +84,9 @@ unsafe fn avx2_yuv_nv_to_rgba_row_impl< let uv_ptr = uv_plane.as_ptr(); let rgba_ptr = rgba.as_mut_ptr(); + const SCALE: i32 = 6; + const V_SHR: i32 = 3; + let y_corr = _mm256_set1_epi8(range.bias_y as i8); let uv_corr = _mm256_set1_epi16(range.bias_uv as i16); let v_luma_coeff = _mm256_set1_epi16(transform.y_coef as i16); @@ -92,7 +95,7 @@ unsafe fn avx2_yuv_nv_to_rgba_row_impl< let v_g_coeff_1 = _mm256_set1_epi16(transform.g_coeff_1 as i16); let v_g_coeff_2 = _mm256_set1_epi16(transform.g_coeff_2 as i16); let v_alpha = _mm256_set1_epi8(255u8 as i8); - let rounding_const = _mm256_set1_epi16(1 << 2); + let rounding_const = _mm256_set1_epi16(1 << (V_SHR - 1)); while cx + 32 < width { let y_values = @@ -148,58 +151,58 @@ unsafe fn avx2_yuv_nv_to_rgba_row_impl< } let u_high = - _mm256_slli_epi16::<7>(_mm256_sub_epi16(_mm256_cvtepu8_epi16(u_high_u8), uv_corr)); + _mm256_slli_epi16::(_mm256_sub_epi16(_mm256_cvtepu8_epi16(u_high_u8), uv_corr)); let v_high = - _mm256_slli_epi16::<7>(_mm256_sub_epi16(_mm256_cvtepu8_epi16(v_high_u8), uv_corr)); - let y_high = 
_mm256_mulhi_epi16( - _mm256_slli_epi16::<7>(_mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>( + _mm256_slli_epi16::(_mm256_sub_epi16(_mm256_cvtepu8_epi16(v_high_u8), uv_corr)); + let y_high = _mm256_mulhrs_epi16( + _mm256_slli_epi16::(_mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>( y_values, ))), v_luma_coeff, ); - let r_high = _mm256_srli_epi16::<3>(_mm256_add_epi16( - _mm256_add_epi16(y_high, _mm256_mulhi_epi16(v_high, v_cr_coeff)), + let r_high = _mm256_srli_epi16::(_mm256_add_epi16( + _mm256_add_epi16(y_high, _mm256_mulhrs_epi16(v_high, v_cr_coeff)), rounding_const, )); - let b_high = _mm256_srli_epi16::<3>(_mm256_add_epi16( - _mm256_add_epi16(y_high, _mm256_mulhi_epi16(u_high, v_cb_coeff)), + let b_high = _mm256_srli_epi16::(_mm256_add_epi16( + _mm256_add_epi16(y_high, _mm256_mulhrs_epi16(u_high, v_cb_coeff)), rounding_const, )); - let g_high = _mm256_srli_epi16::<3>(_mm256_add_epi16( + let g_high = _mm256_srli_epi16::(_mm256_add_epi16( _mm256_sub_epi16( y_high, _mm256_add_epi16( - _mm256_mulhi_epi16(v_high, v_g_coeff_1), - _mm256_mulhi_epi16(u_high, v_g_coeff_2), + _mm256_mulhrs_epi16(v_high, v_g_coeff_1), + _mm256_mulhrs_epi16(u_high, v_g_coeff_2), ), ), rounding_const, )); let u_low = - _mm256_slli_epi16::<7>(_mm256_sub_epi16(_mm256_cvtepu8_epi16(u_low_u8), uv_corr)); + _mm256_slli_epi16::(_mm256_sub_epi16(_mm256_cvtepu8_epi16(u_low_u8), uv_corr)); let v_low = - _mm256_slli_epi16::<7>(_mm256_sub_epi16(_mm256_cvtepu8_epi16(v_low_u8), uv_corr)); - let y_low = _mm256_mulhi_epi16( - _mm256_slli_epi16::<7>(_mm256_cvtepu8_epi16(_mm256_castsi256_si128(y_values))), + _mm256_slli_epi16::(_mm256_sub_epi16(_mm256_cvtepu8_epi16(v_low_u8), uv_corr)); + let y_low = _mm256_mulhrs_epi16( + _mm256_slli_epi16::(_mm256_cvtepu8_epi16(_mm256_castsi256_si128(y_values))), v_luma_coeff, ); - let r_low = _mm256_srli_epi16::<3>(_mm256_add_epi16( - _mm256_add_epi16(y_low, _mm256_mulhi_epi16(v_low, v_cr_coeff)), + let r_low = _mm256_srli_epi16::(_mm256_add_epi16( + _mm256_add_epi16(y_low, _mm256_mulhrs_epi16(v_low, v_cr_coeff)), rounding_const, )); - let b_low = _mm256_srli_epi16::<3>(_mm256_add_epi16( - _mm256_add_epi16(y_low, _mm256_mulhi_epi16(u_low, v_cb_coeff)), + let b_low = _mm256_srli_epi16::(_mm256_add_epi16( + _mm256_add_epi16(y_low, _mm256_mulhrs_epi16(u_low, v_cb_coeff)), rounding_const, )); - let g_low = _mm256_srli_epi16::<3>(_mm256_add_epi16( + let g_low = _mm256_srli_epi16::(_mm256_add_epi16( _mm256_sub_epi16( y_low, _mm256_add_epi16( - _mm256_mulhi_epi16(v_low, v_g_coeff_1), - _mm256_mulhi_epi16(u_low, v_g_coeff_2), + _mm256_mulhrs_epi16(v_low, v_g_coeff_1), + _mm256_mulhrs_epi16(u_low, v_g_coeff_2), ), ), rounding_const, diff --git a/src/avx2/yuv_nv_to_rgba420.rs b/src/avx2/yuv_nv_to_rgba420.rs index 2b8c4f4..82637f8 100644 --- a/src/avx2/yuv_nv_to_rgba420.rs +++ b/src/avx2/yuv_nv_to_rgba420.rs @@ -75,7 +75,7 @@ unsafe fn avx2_yuv_nv_to_rgba_row_impl420(_mm256_sub_epi16(_mm256_cvtepu8_epi16(u_high_u8), uv_corr)); let v_high = _mm256_slli_epi16::(_mm256_sub_epi16(_mm256_cvtepu8_epi16(v_high_u8), uv_corr)); - let y_high0 = _mm256_mulhi_epi16( + let y_high0 = _mm256_mulhrs_epi16( _mm256_slli_epi16::(_mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>( y_values0, ))), v_luma_coeff, ); - let y_high1 = _mm256_mulhi_epi16( + let y_high1 = _mm256_mulhrs_epi16( _mm256_slli_epi16::(_mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>( y_values1, ))), @@ -138,16 +138,16 @@ unsafe fn avx2_yuv_nv_to_rgba_row_impl420(_mm256_add_epi16( - _mm256_add_epi16(y_high0, _mm256_mulhi_epi16(v_high, 
v_cr_coeff)), + _mm256_add_epi16(y_high0, _mm256_mulhrs_epi16(v_high, v_cr_coeff)), rounding_const, )); let b_high0 = _mm256_srli_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_high0, _mm256_mulhi_epi16(u_high, v_cb_coeff)), + _mm256_add_epi16(y_high0, _mm256_mulhrs_epi16(u_high, v_cb_coeff)), rounding_const, )); let g_high0 = _mm256_srli_epi16::(_mm256_add_epi16( @@ -155,11 +155,11 @@ unsafe fn avx2_yuv_nv_to_rgba_row_impl420(_mm256_add_epi16( - _mm256_add_epi16(y_high1, _mm256_mulhi_epi16(v_high, v_cr_coeff)), + _mm256_add_epi16(y_high1, _mm256_mulhrs_epi16(v_high, v_cr_coeff)), rounding_const, )); let b_high1 = _mm256_srli_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_high1, _mm256_mulhi_epi16(u_high, v_cb_coeff)), + _mm256_add_epi16(y_high1, _mm256_mulhrs_epi16(u_high, v_cb_coeff)), rounding_const, )); let g_high1 = _mm256_srli_epi16::(_mm256_add_epi16( @@ -171,26 +171,26 @@ unsafe fn avx2_yuv_nv_to_rgba_row_impl420(_mm256_sub_epi16(_mm256_cvtepu8_epi16(u_low_u8), uv_corr)); let v_low = _mm256_slli_epi16::(_mm256_sub_epi16(_mm256_cvtepu8_epi16(v_low_u8), uv_corr)); - let y_low0 = _mm256_mulhi_epi16( + let y_low0 = _mm256_mulhrs_epi16( _mm256_slli_epi16::(_mm256_cvtepu8_epi16(_mm256_castsi256_si128(y_values0))), v_luma_coeff, ); - let y_low1 = _mm256_mulhi_epi16( + let y_low1 = _mm256_mulhrs_epi16( _mm256_slli_epi16::(_mm256_cvtepu8_epi16(_mm256_castsi256_si128(y_values1))), v_luma_coeff, ); let g_coeff_lo = _mm256_add_epi16( - _mm256_mulhi_epi16(v_low, v_g_coeff_1), - _mm256_mulhi_epi16(u_low, v_g_coeff_2), + _mm256_mulhrs_epi16(v_low, v_g_coeff_1), + _mm256_mulhrs_epi16(u_low, v_g_coeff_2), ); let r_low0 = _mm256_srli_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_low0, _mm256_mulhi_epi16(v_low, v_cr_coeff)), + _mm256_add_epi16(y_low0, _mm256_mulhrs_epi16(v_low, v_cr_coeff)), rounding_const, )); let b_low0 = _mm256_srli_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_low0, _mm256_mulhi_epi16(u_low, v_cb_coeff)), + _mm256_add_epi16(y_low0, _mm256_mulhrs_epi16(u_low, v_cb_coeff)), rounding_const, )); let g_low0 = _mm256_srli_epi16::(_mm256_add_epi16( @@ -199,11 +199,11 @@ unsafe fn avx2_yuv_nv_to_rgba_row_impl420(_mm256_add_epi16( - _mm256_add_epi16(y_low1, _mm256_mulhi_epi16(v_low, v_cr_coeff)), + _mm256_add_epi16(y_low1, _mm256_mulhrs_epi16(v_low, v_cr_coeff)), rounding_const, )); let b_low1 = _mm256_srli_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_low1, _mm256_mulhi_epi16(u_low, v_cb_coeff)), + _mm256_add_epi16(y_low1, _mm256_mulhrs_epi16(u_low, v_cb_coeff)), rounding_const, )); let g_low1 = _mm256_srli_epi16::(_mm256_add_epi16( diff --git a/src/avx2/yuv_to_rgba.rs b/src/avx2/yuv_to_rgba.rs index 1937351..36711b3 100644 --- a/src/avx2/yuv_to_rgba.rs +++ b/src/avx2/yuv_to_rgba.rs @@ -87,7 +87,7 @@ unsafe fn avx2_yuv_to_rgba_row_impl(_mm256_sub_epi16(u_high_u16, uv_corr)); let v_high = _mm256_slli_epi16::(_mm256_sub_epi16(v_high_u16, uv_corr)); - let y_high = _mm256_mulhi_epi16( + let y_high = _mm256_mulhrs_epi16( _mm256_slli_epi16::(_mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>( y_values, ))), @@ -129,19 +129,19 @@ unsafe fn avx2_yuv_to_rgba_row_impl(_mm256_add_epi16( - _mm256_add_epi16(y_high, _mm256_mulhi_epi16(v_high, v_cr_coeff)), + _mm256_add_epi16(y_high, _mm256_mulhrs_epi16(v_high, v_cr_coeff)), rounding_const, )); let b_high = _mm256_srai_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_high, _mm256_mulhi_epi16(u_high, v_cb_coeff)), + _mm256_add_epi16(y_high, _mm256_mulhrs_epi16(u_high, v_cb_coeff)), rounding_const, )); let g_high = _mm256_srai_epi16::(_mm256_add_epi16( 
_mm256_sub_epi16( y_high, _mm256_add_epi16( - _mm256_mulhi_epi16(v_high, v_g_coeff_1), - _mm256_mulhi_epi16(u_high, v_g_coeff_2), + _mm256_mulhrs_epi16(v_high, v_g_coeff_1), + _mm256_mulhrs_epi16(u_high, v_g_coeff_2), ), ), rounding_const, @@ -149,25 +149,25 @@ unsafe fn avx2_yuv_to_rgba_row_impl(_mm256_sub_epi16(u_low_u16, uv_corr)); let v_low = _mm256_slli_epi16::(_mm256_sub_epi16(v_low_u16, uv_corr)); - let y_low = _mm256_mulhi_epi16( + let y_low = _mm256_mulhrs_epi16( _mm256_slli_epi16::(_mm256_cvtepu8_epi16(_mm256_castsi256_si128(y_values))), v_luma_coeff, ); let r_low = _mm256_srai_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_low, _mm256_mulhi_epi16(v_low, v_cr_coeff)), + _mm256_add_epi16(y_low, _mm256_mulhrs_epi16(v_low, v_cr_coeff)), rounding_const, )); let b_low = _mm256_srai_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_low, _mm256_mulhi_epi16(u_low, v_cb_coeff)), + _mm256_add_epi16(y_low, _mm256_mulhrs_epi16(u_low, v_cb_coeff)), rounding_const, )); let g_low = _mm256_srai_epi16::(_mm256_add_epi16( _mm256_sub_epi16( y_low, _mm256_add_epi16( - _mm256_mulhi_epi16(v_low, v_g_coeff_1), - _mm256_mulhi_epi16(u_low, v_g_coeff_2), + _mm256_mulhrs_epi16(v_low, v_g_coeff_1), + _mm256_mulhrs_epi16(u_low, v_g_coeff_2), ), ), rounding_const, diff --git a/src/avx2/yuv_to_rgba420.rs b/src/avx2/yuv_to_rgba420.rs index aa62cf5..f85b167 100644 --- a/src/avx2/yuv_to_rgba420.rs +++ b/src/avx2/yuv_to_rgba420.rs @@ -87,7 +87,7 @@ unsafe fn avx2_yuv_to_rgba_row_impl420( let v_g_coeff_2 = _mm256_set1_epi16(transform.g_coeff_2 as i16); let v_alpha = _mm256_set1_epi8(255u8 as i8); - const SCALE: i32 = 7; + const SCALE: i32 = 6; const V_SHR: i32 = 3; let rounding_const = _mm256_set1_epi16(1 << (V_SHR - 1)); @@ -112,13 +112,13 @@ unsafe fn avx2_yuv_to_rgba_row_impl420( let u_high = _mm256_slli_epi16::(_mm256_sub_epi16(u_high_u16, uv_corr)); let v_high = _mm256_slli_epi16::(_mm256_sub_epi16(v_high_u16, uv_corr)); - let y_high0 = _mm256_mulhi_epi16( + let y_high0 = _mm256_mulhrs_epi16( _mm256_slli_epi16::(_mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>( y_values0, ))), v_luma_coeff, ); - let y_high1 = _mm256_mulhi_epi16( + let y_high1 = _mm256_mulhrs_epi16( _mm256_slli_epi16::(_mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>( y_values1, ))), @@ -126,16 +126,16 @@ unsafe fn avx2_yuv_to_rgba_row_impl420( ); let g_coeff_hi = _mm256_add_epi16( - _mm256_mulhi_epi16(v_high, v_g_coeff_1), - _mm256_mulhi_epi16(u_high, v_g_coeff_2), + _mm256_mulhrs_epi16(v_high, v_g_coeff_1), + _mm256_mulhrs_epi16(u_high, v_g_coeff_2), ); let r_high0 = _mm256_srai_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_high0, _mm256_mulhi_epi16(v_high, v_cr_coeff)), + _mm256_add_epi16(y_high0, _mm256_mulhrs_epi16(v_high, v_cr_coeff)), rounding_const, )); let b_high0 = _mm256_srai_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_high0, _mm256_mulhi_epi16(u_high, v_cb_coeff)), + _mm256_add_epi16(y_high0, _mm256_mulhrs_epi16(u_high, v_cb_coeff)), rounding_const, )); let g_high0 = _mm256_srai_epi16::(_mm256_add_epi16( @@ -144,11 +144,11 @@ unsafe fn avx2_yuv_to_rgba_row_impl420( )); let r_high1 = _mm256_srai_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_high1, _mm256_mulhi_epi16(v_high, v_cr_coeff)), + _mm256_add_epi16(y_high1, _mm256_mulhrs_epi16(v_high, v_cr_coeff)), rounding_const, )); let b_high1 = _mm256_srai_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_high1, _mm256_mulhi_epi16(u_high, v_cb_coeff)), + _mm256_add_epi16(y_high1, _mm256_mulhrs_epi16(u_high, v_cb_coeff)), rounding_const, )); let g_high1 = 
_mm256_srai_epi16::(_mm256_add_epi16( @@ -158,26 +158,26 @@ unsafe fn avx2_yuv_to_rgba_row_impl420( let u_low = _mm256_slli_epi16::(_mm256_sub_epi16(u_low_u16, uv_corr)); let v_low = _mm256_slli_epi16::(_mm256_sub_epi16(v_low_u16, uv_corr)); - let y_low0 = _mm256_mulhi_epi16( + let y_low0 = _mm256_mulhrs_epi16( _mm256_slli_epi16::(_mm256_cvtepu8_epi16(_mm256_castsi256_si128(y_values0))), v_luma_coeff, ); - let y_low1 = _mm256_mulhi_epi16( + let y_low1 = _mm256_mulhrs_epi16( _mm256_slli_epi16::(_mm256_cvtepu8_epi16(_mm256_castsi256_si128(y_values1))), v_luma_coeff, ); let g_coeff_lo = _mm256_add_epi16( - _mm256_mulhi_epi16(v_low, v_g_coeff_1), - _mm256_mulhi_epi16(u_low, v_g_coeff_2), + _mm256_mulhrs_epi16(v_low, v_g_coeff_1), + _mm256_mulhrs_epi16(u_low, v_g_coeff_2), ); let r_low0 = _mm256_srai_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_low0, _mm256_mulhi_epi16(v_low, v_cr_coeff)), + _mm256_add_epi16(y_low0, _mm256_mulhrs_epi16(v_low, v_cr_coeff)), rounding_const, )); let b_low0 = _mm256_srai_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_low0, _mm256_mulhi_epi16(u_low, v_cb_coeff)), + _mm256_add_epi16(y_low0, _mm256_mulhrs_epi16(u_low, v_cb_coeff)), rounding_const, )); let g_low0 = _mm256_srai_epi16::(_mm256_add_epi16( @@ -186,11 +186,11 @@ unsafe fn avx2_yuv_to_rgba_row_impl420( )); let r_low1 = _mm256_srai_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_low1, _mm256_mulhi_epi16(v_low, v_cr_coeff)), + _mm256_add_epi16(y_low1, _mm256_mulhrs_epi16(v_low, v_cr_coeff)), rounding_const, )); let b_low1 = _mm256_srai_epi16::(_mm256_add_epi16( - _mm256_add_epi16(y_low1, _mm256_mulhi_epi16(u_low, v_cb_coeff)), + _mm256_add_epi16(y_low1, _mm256_mulhrs_epi16(u_low, v_cb_coeff)), rounding_const, )); let g_low1 = _mm256_srai_epi16::(_mm256_add_epi16( diff --git a/src/avx2/yuv_to_rgba_alpha.rs b/src/avx2/yuv_to_rgba_alpha.rs index 141f440..a33bee4 100644 --- a/src/avx2/yuv_to_rgba_alpha.rs +++ b/src/avx2/yuv_to_rgba_alpha.rs @@ -93,6 +93,9 @@ unsafe fn avx2_yuv_to_rgba_alpha_impl(_mm256_sub_epi16(u_high_u16, uv_corr)); - let v_high = _mm256_slli_epi16::<7>(_mm256_sub_epi16(v_high_u16, uv_corr)); - let y_high = _mm256_mulhi_epi16( - _mm256_slli_epi16::<7>(_mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>( + let u_high = _mm256_slli_epi16::(_mm256_sub_epi16(u_high_u16, uv_corr)); + let v_high = _mm256_slli_epi16::(_mm256_sub_epi16(v_high_u16, uv_corr)); + let y_high = _mm256_mulhrs_epi16( + _mm256_slli_epi16::(_mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>( y_values, ))), v_luma_coeff, ); - let r_high = _mm256_srai_epi16::<3>(_mm256_add_epi16( - _mm256_add_epi16(y_high, _mm256_mulhi_epi16(v_high, v_cr_coeff)), + let r_high = _mm256_srai_epi16::(_mm256_add_epi16( + _mm256_add_epi16(y_high, _mm256_mulhrs_epi16(v_high, v_cr_coeff)), rounding_const, )); let b_high = _mm256_srai_epi16::<3>(_mm256_add_epi16( - _mm256_add_epi16(y_high, _mm256_mulhi_epi16(u_high, v_cb_coeff)), + _mm256_add_epi16(y_high, _mm256_mulhrs_epi16(u_high, v_cb_coeff)), rounding_const, )); - let g_high = _mm256_srai_epi16::<3>(_mm256_add_epi16( + let g_high = _mm256_srai_epi16::(_mm256_add_epi16( _mm256_sub_epi16( y_high, _mm256_add_epi16( - _mm256_mulhi_epi16(v_high, v_g_coeff_1), - _mm256_mulhi_epi16(u_high, v_g_coeff_2), + _mm256_mulhrs_epi16(v_high, v_g_coeff_1), + _mm256_mulhrs_epi16(u_high, v_g_coeff_2), ), ), rounding_const, )); - let u_low = _mm256_slli_epi16::<7>(_mm256_sub_epi16(u_low_u16, uv_corr)); - let v_low = _mm256_slli_epi16::<7>(_mm256_sub_epi16(v_low_u16, uv_corr)); - let y_low = _mm256_mulhi_epi16( - 
_mm256_slli_epi16::<7>(_mm256_cvtepu8_epi16(_mm256_castsi256_si128(y_values))), + let u_low = _mm256_slli_epi16::(_mm256_sub_epi16(u_low_u16, uv_corr)); + let v_low = _mm256_slli_epi16::(_mm256_sub_epi16(v_low_u16, uv_corr)); + let y_low = _mm256_mulhrs_epi16( + _mm256_slli_epi16::(_mm256_cvtepu8_epi16(_mm256_castsi256_si128(y_values))), v_luma_coeff, ); - let r_low = _mm256_srai_epi16::<3>(_mm256_add_epi16( - _mm256_add_epi16(y_low, _mm256_mulhi_epi16(v_low, v_cr_coeff)), + let r_low = _mm256_srai_epi16::(_mm256_add_epi16( + _mm256_add_epi16(y_low, _mm256_mulhrs_epi16(v_low, v_cr_coeff)), rounding_const, )); - let b_low = _mm256_srai_epi16::<3>(_mm256_add_epi16( - _mm256_add_epi16(y_low, _mm256_mulhi_epi16(u_low, v_cb_coeff)), + let b_low = _mm256_srai_epi16::(_mm256_add_epi16( + _mm256_add_epi16(y_low, _mm256_mulhrs_epi16(u_low, v_cb_coeff)), rounding_const, )); - let g_low = _mm256_srai_epi16::<3>(_mm256_add_epi16( + let g_low = _mm256_srai_epi16::(_mm256_add_epi16( _mm256_sub_epi16( y_low, _mm256_add_epi16( - _mm256_mulhi_epi16(v_low, v_g_coeff_1), - _mm256_mulhi_epi16(u_low, v_g_coeff_2), + _mm256_mulhrs_epi16(v_low, v_g_coeff_1), + _mm256_mulhrs_epi16(u_low, v_g_coeff_2), ), ), rounding_const, diff --git a/src/rgba_to_yuv.rs b/src/rgba_to_yuv.rs index 227d009..a9253b2 100644 --- a/src/rgba_to_yuv.rs +++ b/src/rgba_to_yuv.rs @@ -27,7 +27,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -use crate::avx2::avx2_rgba_to_yuv; +use crate::avx2::{avx2_rgba_to_yuv, avx2_rgba_to_yuv420}; #[cfg(all( any(target_arch = "x86", target_arch = "x86_64"), feature = "nightly_avx512" @@ -40,7 +40,7 @@ use crate::neon::{ neon_rgba_to_yuv, neon_rgba_to_yuv420, neon_rgba_to_yuv_rdm, neon_rgba_to_yuv_rdm420, }; #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -use crate::sse::sse_rgba_to_yuv_row; +use crate::sse::{sse_rgba_to_yuv_row, sse_rgba_to_yuv_row420}; use crate::yuv_error::check_rgba_destination; #[allow(unused_imports)] use crate::yuv_support::*; @@ -156,7 +156,6 @@ fn rgbx_to_yuv8( _offset.cx, _offset.ux, planar_image.width as usize, - compute_uv_row, ); _offset = processed_offset; } @@ -172,7 +171,6 @@ fn rgbx_to_yuv8( _offset.cx, _offset.ux, planar_image.width as usize, - compute_uv_row, ); _offset = processed_offset; } @@ -220,6 +218,42 @@ fn rgbx_to_yuv8( ); _offset = offset; } + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + if use_avx { + let processed_offset = avx2_rgba_to_yuv420::( + &transform, + &range, + _y_plane0, + _y_plane1, + _u_plane, + _v_plane, + _rgba0, + _rgba1, + _offset.cx, + _offset.ux, + planar_image.width as usize, + ); + _offset = processed_offset; + } + + if use_sse { + let processed_offset = sse_rgba_to_yuv_row420::( + &transform, + &range, + _y_plane0, + _y_plane1, + _u_plane, + _v_plane, + _rgba0, + _rgba1, + _offset.cx, + _offset.ux, + planar_image.width as usize, + ); + _offset = processed_offset; + } + } _offset }; @@ -343,7 +377,7 @@ fn rgbx_to_yuv8( let b11 = src11[src_chans.get_b_channel_offset()] as i32; let y_11 = (r11 * transform.yr + g11 * transform.yg + b11 * transform.yb + bias_y) >> PRECISION; - y_dst0[1] = y_11.max(i_bias_y).min(i_cap_y) as u8; + y_dst1[1] = y_11.max(i_bias_y).min(i_cap_y) as u8; let ruv = (r00 + r01 + 1) >> 1; let guv = (g00 + g01 + 1) >> 1; diff --git a/src/sse/mod.rs b/src/sse/mod.rs index 68f0981..15be6cb 100644 --- a/src/sse/mod.rs +++ b/src/sse/mod.rs @@ -50,6 +50,7 @@ mod yuv_to_rgba_alpha; mod yuv_to_yuy2; mod 
yuy2_to_rgb; mod yuy2_to_yuv; +mod rgba_to_yuv420; pub(crate) use rgb_to_nv::sse_rgba_to_nv_row; pub(crate) use rgb_to_y::sse_rgb_to_y; @@ -72,3 +73,4 @@ pub(crate) use yuv_to_rgba_alpha::sse_yuv_to_rgba_alpha_row; pub(crate) use yuv_to_yuy2::yuv_to_yuy2_sse; pub(crate) use yuy2_to_rgb::yuy2_to_rgb_sse; pub(crate) use yuy2_to_yuv::yuy2_to_yuv_sse; +pub(crate) use rgba_to_yuv420::sse_rgba_to_yuv_row420; \ No newline at end of file diff --git a/src/sse/rgb_to_nv.rs b/src/sse/rgb_to_nv.rs index a2182a3..2fedfae 100644 --- a/src/sse/rgb_to_nv.rs +++ b/src/sse/rgb_to_nv.rs @@ -96,7 +96,7 @@ unsafe fn sse_rgba_to_nv_row_impl< let mut uv_x = start_ux; const V_SHR: i32 = 3; - const V_SCALE: i32 = 7; + const V_SCALE: i32 = 6; let rounding_const_bias: i16 = 1 << (V_SHR - 1); let bias_y = range.bias_y as i16 * (1 << V_SHR) + rounding_const_bias; let bias_uv = range.bias_uv as i16 * (1 << V_SHR) + rounding_const_bias; @@ -174,8 +174,8 @@ unsafe fn sse_rgba_to_nv_row_impl< _mm_srai_epi16::(_mm_add_epi16( y_bias, _mm_add_epi16( - _mm_add_epi16(_mm_mulhi_epi16(r_low, v_yr), _mm_mulhi_epi16(g_low, v_yg)), - _mm_mulhi_epi16(b_low, v_yb), + _mm_add_epi16(_mm_mulhrs_epi16(r_low, v_yr), _mm_mulhrs_epi16(g_low, v_yg)), + _mm_mulhrs_epi16(b_low, v_yb), ), )), i_cap_y, @@ -188,8 +188,11 @@ unsafe fn sse_rgba_to_nv_row_impl< _mm_srai_epi16::(_mm_add_epi16( y_bias, _mm_add_epi16( - _mm_add_epi16(_mm_mulhi_epi16(r_high, v_yr), _mm_mulhi_epi16(g_high, v_yg)), - _mm_mulhi_epi16(b_high, v_yb), + _mm_add_epi16( + _mm_mulhrs_epi16(r_high, v_yr), + _mm_mulhrs_epi16(g_high, v_yg), + ), + _mm_mulhrs_epi16(b_high, v_yb), ), )), i_cap_y, @@ -207,10 +210,10 @@ unsafe fn sse_rgba_to_nv_row_impl< uv_bias, _mm_add_epi16( _mm_add_epi16( - _mm_mulhi_epi16(r_low, v_cb_r), - _mm_mulhi_epi16(g_low, v_cb_g), + _mm_mulhrs_epi16(r_low, v_cb_r), + _mm_mulhrs_epi16(g_low, v_cb_g), ), - _mm_mulhi_epi16(b_low, v_cb_b), + _mm_mulhrs_epi16(b_low, v_cb_b), ), )), i_cap_uv, @@ -223,10 +226,10 @@ unsafe fn sse_rgba_to_nv_row_impl< uv_bias, _mm_add_epi16( _mm_add_epi16( - _mm_mulhi_epi16(r_low, v_cr_r), - _mm_mulhi_epi16(g_low, v_cr_g), + _mm_mulhrs_epi16(r_low, v_cr_r), + _mm_mulhrs_epi16(g_low, v_cr_g), ), - _mm_mulhi_epi16(b_low, v_cr_b), + _mm_mulhrs_epi16(b_low, v_cr_b), ), )), i_cap_uv, @@ -239,10 +242,10 @@ unsafe fn sse_rgba_to_nv_row_impl< uv_bias, _mm_add_epi16( _mm_add_epi16( - _mm_mulhi_epi16(r_high, v_cb_r), - _mm_mulhi_epi16(g_high, v_cb_g), + _mm_mulhrs_epi16(r_high, v_cb_r), + _mm_mulhrs_epi16(g_high, v_cb_g), ), - _mm_mulhi_epi16(b_high, v_cb_b), + _mm_mulhrs_epi16(b_high, v_cb_b), ), )), i_cap_uv, @@ -255,10 +258,10 @@ unsafe fn sse_rgba_to_nv_row_impl< uv_bias, _mm_add_epi16( _mm_add_epi16( - _mm_mulhi_epi16(r_high, v_cr_r), - _mm_mulhi_epi16(g_high, v_cr_g), + _mm_mulhrs_epi16(r_high, v_cr_r), + _mm_mulhrs_epi16(g_high, v_cr_g), ), - _mm_mulhi_epi16(b_high, v_cr_b), + _mm_mulhrs_epi16(b_high, v_cr_b), ), )), i_cap_uv, @@ -294,8 +297,11 @@ unsafe fn sse_rgba_to_nv_row_impl< _mm_srai_epi16::(_mm_add_epi16( uv_bias, _mm_add_epi16( - _mm_add_epi16(_mm_mulhi_epi16(r1, v_cb_r), _mm_mulhi_epi16(g1, v_cb_g)), - _mm_mulhi_epi16(b1, v_cb_b), + _mm_add_epi16( + _mm_mulhrs_epi16(r1, v_cb_r), + _mm_mulhrs_epi16(g1, v_cb_g), + ), + _mm_mulhrs_epi16(b1, v_cb_b), ), )), i_cap_uv, @@ -308,8 +314,11 @@ unsafe fn sse_rgba_to_nv_row_impl< _mm_srai_epi16::(_mm_add_epi16( uv_bias, _mm_add_epi16( - _mm_add_epi16(_mm_mulhi_epi16(r1, v_cr_r), _mm_mulhi_epi16(g1, v_cr_g)), - _mm_mulhi_epi16(b1, v_cr_b), + _mm_add_epi16( + _mm_mulhrs_epi16(r1, v_cr_r), 
+ _mm_mulhrs_epi16(g1, v_cr_g), + ), + _mm_mulhrs_epi16(b1, v_cr_b), ), )), i_cap_uv, diff --git a/src/sse/rgb_to_y.rs b/src/sse/rgb_to_y.rs index f7116d2..5f0f046 100644 --- a/src/sse/rgb_to_y.rs +++ b/src/sse/rgb_to_y.rs @@ -65,7 +65,7 @@ unsafe fn sse_rgb_to_y_impl( let mut cx = start_cx; const V_SHR: i32 = 3; - const V_SCALE: i32 = 7; + const V_SCALE: i32 = 6; let rounding_const_bias: i16 = 1 << (V_SHR - 1); let bias_y = range.bias_y as i16 * (1 << V_SHR) + rounding_const_bias; @@ -133,8 +133,8 @@ unsafe fn sse_rgb_to_y_impl( _mm_srai_epi16::(_mm_add_epi16( y_bias, _mm_add_epi16( - _mm_add_epi16(_mm_mulhi_epi16(r_low, v_yr), _mm_mulhi_epi16(g_low, v_yg)), - _mm_mulhi_epi16(b_low, v_yb), + _mm_add_epi16(_mm_mulhrs_epi16(r_low, v_yr), _mm_mulhrs_epi16(g_low, v_yg)), + _mm_mulhrs_epi16(b_low, v_yb), ), )), i_cap_y, @@ -147,8 +147,11 @@ unsafe fn sse_rgb_to_y_impl( _mm_srai_epi16::(_mm_add_epi16( y_bias, _mm_add_epi16( - _mm_add_epi16(_mm_mulhi_epi16(r_high, v_yr), _mm_mulhi_epi16(g_high, v_yg)), - _mm_mulhi_epi16(b_high, v_yb), + _mm_add_epi16( + _mm_mulhrs_epi16(r_high, v_yr), + _mm_mulhrs_epi16(g_high, v_yg), + ), + _mm_mulhrs_epi16(b_high, v_yb), ), )), i_cap_y, diff --git a/src/sse/rgba_to_yuv.rs b/src/sse/rgba_to_yuv.rs index de648d5..be395ac 100644 --- a/src/sse/rgba_to_yuv.rs +++ b/src/sse/rgba_to_yuv.rs @@ -47,20 +47,10 @@ pub(crate) fn sse_rgba_to_yuv_row start_cx: usize, start_ux: usize, width: usize, - compute_uv_row: bool, ) -> ProcessedOffset { unsafe { sse_rgba_to_yuv_row_impl::( - transform, - range, - y_plane, - u_plane, - v_plane, - rgba, - start_cx, - start_ux, - width, - compute_uv_row, + transform, range, y_plane, u_plane, v_plane, rgba, start_cx, start_ux, width, ) } } @@ -76,7 +66,6 @@ unsafe fn sse_rgba_to_yuv_row_impl ProcessedOffset { let chroma_subsampling: YuvChromaSubsampling = SAMPLING.into(); let source_channels: YuvSourceChannels = ORIGIN_CHANNELS.into(); @@ -91,7 +80,7 @@ unsafe fn sse_rgba_to_yuv_row_impl(_mm_add_epi16( y_bias, _mm_add_epi16( - _mm_add_epi16(_mm_mulhi_epi16(r_low, v_yr), _mm_mulhi_epi16(g_low, v_yg)), - _mm_mulhi_epi16(b_low, v_yb), + _mm_add_epi16(_mm_mulhrs_epi16(r_low, v_yr), _mm_mulhrs_epi16(g_low, v_yg)), + _mm_mulhrs_epi16(b_low, v_yb), ), )), i_cap_y, @@ -183,8 +172,11 @@ unsafe fn sse_rgba_to_yuv_row_impl(_mm_add_epi16( y_bias, _mm_add_epi16( - _mm_add_epi16(_mm_mulhi_epi16(r_high, v_yr), _mm_mulhi_epi16(g_high, v_yg)), - _mm_mulhi_epi16(b_high, v_yb), + _mm_add_epi16( + _mm_mulhrs_epi16(r_high, v_yr), + _mm_mulhrs_epi16(g_high, v_yg), + ), + _mm_mulhrs_epi16(b_high, v_yb), ), )), i_cap_y, @@ -202,10 +194,10 @@ unsafe fn sse_rgba_to_yuv_row_impl(_mm_add_epi16( uv_bias, _mm_add_epi16( - _mm_add_epi16(_mm_mulhi_epi16(r1, v_cb_r), _mm_mulhi_epi16(g1, v_cb_g)), - _mm_mulhi_epi16(b1, v_cb_b), + _mm_add_epi16( + _mm_mulhrs_epi16(r1, v_cb_r), + _mm_mulhrs_epi16(g1, v_cb_g), + ), + _mm_mulhrs_epi16(b1, v_cb_b), ), )), i_cap_uv, @@ -294,8 +289,11 @@ unsafe fn sse_rgba_to_yuv_row_impl(_mm_add_epi16( uv_bias, _mm_add_epi16( - _mm_add_epi16(_mm_mulhi_epi16(r1, v_cr_r), _mm_mulhi_epi16(g1, v_cr_g)), - _mm_mulhi_epi16(b1, v_cr_b), + _mm_add_epi16( + _mm_mulhrs_epi16(r1, v_cr_r), + _mm_mulhrs_epi16(g1, v_cr_g), + ), + _mm_mulhrs_epi16(b1, v_cr_b), ), )), i_cap_uv, diff --git a/src/sse/rgba_to_yuv420.rs b/src/sse/rgba_to_yuv420.rs new file mode 100644 index 0000000..b53a6c7 --- /dev/null +++ b/src/sse/rgba_to_yuv420.rs @@ -0,0 +1,319 @@ +/* + * Copyright (c) Radzivon Bartoshyk, 10/2024. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +use crate::internals::ProcessedOffset; +use crate::sse::sse_support::{sse_deinterleave_rgb, sse_deinterleave_rgba}; +use crate::yuv_support::{CbCrForwardTransform, YuvChromaRange, YuvSourceChannels}; +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +pub(crate) fn sse_rgba_to_yuv_row420( + transform: &CbCrForwardTransform, + range: &YuvChromaRange, + y_plane0: &mut [u8], + y_plane1: &mut [u8], + u_plane: &mut [u8], + v_plane: &mut [u8], + rgba0: &[u8], + rgba1: &[u8], + start_cx: usize, + start_ux: usize, + width: usize, +) -> ProcessedOffset { + unsafe { + sse_rgba_to_yuv_row_impl420::( + transform, range, y_plane0, y_plane1, u_plane, v_plane, rgba0, rgba1, start_cx, + start_ux, width, + ) + } +} + +#[target_feature(enable = "sse4.1")] +unsafe fn sse_rgba_to_yuv_row_impl420( + transform: &CbCrForwardTransform, + range: &YuvChromaRange, + y_plane0: &mut [u8], + y_plane1: &mut [u8], + u_plane: &mut [u8], + v_plane: &mut [u8], + rgba0: &[u8], + rgba1: &[u8], + start_cx: usize, + start_ux: usize, + width: usize, +) -> ProcessedOffset { + let source_channels: YuvSourceChannels = ORIGIN_CHANNELS.into(); + let channels = source_channels.get_channels_count(); + + let u_ptr = u_plane.as_mut_ptr(); + let v_ptr = v_plane.as_mut_ptr(); + + let mut cx = start_cx; + let mut uv_x = start_ux; + + const V_SHR: i32 = 3; + const V_SCALE: i32 = 6; + let rounding_const_bias: i16 = 1 << (V_SHR - 1); + let bias_y = range.bias_y as i16 * (1 << V_SHR) + rounding_const_bias; + let bias_uv = range.bias_uv as i16 * (1 << V_SHR) + rounding_const_bias; + + let i_bias_y = _mm_set1_epi16(range.bias_y as i16); + let i_cap_y = _mm_set1_epi16(range.range_y as i16 + range.bias_y as i16); + let i_cap_uv = _mm_set1_epi16(range.bias_y as i16 + range.range_uv as i16); + + let zeros = _mm_setzero_si128(); + + let y_bias = _mm_set1_epi16(bias_y); + let uv_bias = _mm_set1_epi16(bias_uv); + let v_yr = _mm_set1_epi16(transform.yr as i16); + let v_yg = 
_mm_set1_epi16(transform.yg as i16); + let v_yb = _mm_set1_epi16(transform.yb as i16); + let v_cb_r = _mm_set1_epi16(transform.cb_r as i16); + let v_cb_g = _mm_set1_epi16(transform.cb_g as i16); + let v_cb_b = _mm_set1_epi16(transform.cb_b as i16); + let v_cr_r = _mm_set1_epi16(transform.cr_r as i16); + let v_cr_g = _mm_set1_epi16(transform.cr_g as i16); + let v_cr_b = _mm_set1_epi16(transform.cr_b as i16); + + while cx + 16 < width { + let (r_values0, g_values0, b_values0); + let (r_values1, g_values1, b_values1); + + let px = cx * channels; + + match source_channels { + YuvSourceChannels::Rgb | YuvSourceChannels::Bgr => { + let row_start0 = rgba0.get_unchecked(px..).as_ptr(); + let row_1 = _mm_loadu_si128(row_start0 as *const __m128i); + let row_2 = _mm_loadu_si128(row_start0.add(16) as *const __m128i); + let row_3 = _mm_loadu_si128(row_start0.add(32) as *const __m128i); + + let (it1, it2, it3) = sse_deinterleave_rgb(row_1, row_2, row_3); + if source_channels == YuvSourceChannels::Rgb { + r_values0 = it1; + g_values0 = it2; + b_values0 = it3; + } else { + r_values0 = it3; + g_values0 = it2; + b_values0 = it1; + } + + let row_start1 = rgba1.get_unchecked(px..).as_ptr(); + let row_11 = _mm_loadu_si128(row_start1 as *const __m128i); + let row_21 = _mm_loadu_si128(row_start1.add(16) as *const __m128i); + let row_31 = _mm_loadu_si128(row_start1.add(32) as *const __m128i); + + let (it11, it21, it31) = sse_deinterleave_rgb(row_11, row_21, row_31); + if source_channels == YuvSourceChannels::Rgb { + r_values1 = it11; + g_values1 = it21; + b_values1 = it31; + } else { + r_values1 = it31; + g_values1 = it21; + b_values1 = it11; + } + } + YuvSourceChannels::Rgba | YuvSourceChannels::Bgra => { + let row_start0 = rgba0.get_unchecked(px..).as_ptr(); + let row_1 = _mm_loadu_si128(row_start0 as *const __m128i); + let row_2 = _mm_loadu_si128(row_start0.add(16) as *const __m128i); + let row_3 = _mm_loadu_si128(row_start0.add(32) as *const __m128i); + let row_4 = _mm_loadu_si128(row_start0.add(48) as *const __m128i); + + let (it1, it2, it3, _) = sse_deinterleave_rgba(row_1, row_2, row_3, row_4); + if source_channels == YuvSourceChannels::Rgba { + r_values0 = it1; + g_values0 = it2; + b_values0 = it3; + } else { + r_values0 = it3; + g_values0 = it2; + b_values0 = it1; + } + + let row_start1 = rgba1.get_unchecked(px..).as_ptr(); + let row_11 = _mm_loadu_si128(row_start1 as *const __m128i); + let row_21 = _mm_loadu_si128(row_start1.add(16) as *const __m128i); + let row_31 = _mm_loadu_si128(row_start1.add(32) as *const __m128i); + let row_41 = _mm_loadu_si128(row_start1.add(48) as *const __m128i); + + let (it11, it21, it31, _) = sse_deinterleave_rgba(row_11, row_21, row_31, row_41); + if source_channels == YuvSourceChannels::Rgba { + r_values1 = it11; + g_values1 = it21; + b_values1 = it31; + } else { + r_values1 = it31; + g_values1 = it21; + b_values1 = it11; + } + } + } + + let r0_low = _mm_slli_epi16::(_mm_cvtepu8_epi16(r_values0)); + let r0_high = _mm_slli_epi16::(_mm_unpackhi_epi8(r_values0, zeros)); + let g0_low = _mm_slli_epi16::(_mm_cvtepu8_epi16(g_values0)); + let g0_high = _mm_slli_epi16::(_mm_unpackhi_epi8(g_values0, zeros)); + let b0_low = _mm_slli_epi16::(_mm_cvtepu8_epi16(b_values0)); + let b0_high = _mm_slli_epi16::(_mm_unpackhi_epi8(b_values0, zeros)); + + let y0_l = _mm_max_epi16( + _mm_min_epi16( + _mm_srai_epi16::(_mm_add_epi16( + y_bias, + _mm_add_epi16( + _mm_add_epi16( + _mm_mulhrs_epi16(r0_low, v_yr), + _mm_mulhrs_epi16(g0_low, v_yg), + ), + _mm_mulhrs_epi16(b0_low, v_yb), + ), + )), + 
i_cap_y, + ), + i_bias_y, + ); + + let y0_h = _mm_max_epi16( + _mm_min_epi16( + _mm_srai_epi16::(_mm_add_epi16( + y_bias, + _mm_add_epi16( + _mm_add_epi16( + _mm_mulhrs_epi16(r0_high, v_yr), + _mm_mulhrs_epi16(g0_high, v_yg), + ), + _mm_mulhrs_epi16(b0_high, v_yb), + ), + )), + i_cap_y, + ), + i_bias_y, + ); + + let r1_low = _mm_slli_epi16::(_mm_cvtepu8_epi16(r_values1)); + let r1_high = _mm_slli_epi16::(_mm_unpackhi_epi8(r_values1, zeros)); + let g1_low = _mm_slli_epi16::(_mm_cvtepu8_epi16(g_values1)); + let g1_high = _mm_slli_epi16::(_mm_unpackhi_epi8(g_values1, zeros)); + let b1_low = _mm_slli_epi16::(_mm_cvtepu8_epi16(b_values1)); + let b1_high = _mm_slli_epi16::(_mm_unpackhi_epi8(b_values1, zeros)); + + let y1_l = _mm_max_epi16( + _mm_min_epi16( + _mm_srai_epi16::(_mm_add_epi16( + y_bias, + _mm_add_epi16( + _mm_add_epi16( + _mm_mulhrs_epi16(r1_low, v_yr), + _mm_mulhrs_epi16(g1_low, v_yg), + ), + _mm_mulhrs_epi16(b1_low, v_yb), + ), + )), + i_cap_y, + ), + i_bias_y, + ); + + let y1_h = _mm_max_epi16( + _mm_min_epi16( + _mm_srai_epi16::(_mm_add_epi16( + y_bias, + _mm_add_epi16( + _mm_add_epi16( + _mm_mulhrs_epi16(r1_high, v_yr), + _mm_mulhrs_epi16(g1_high, v_yg), + ), + _mm_mulhrs_epi16(b1_high, v_yb), + ), + )), + i_cap_y, + ), + i_bias_y, + ); + + let y0_yuv = _mm_packus_epi16(y0_l, y0_h); + let y1_yuv = _mm_packus_epi16(y1_l, y1_h); + + _mm_storeu_si128( + y_plane0.get_unchecked_mut(cx..).as_mut_ptr() as *mut __m128i, + y0_yuv, + ); + _mm_storeu_si128( + y_plane1.get_unchecked_mut(cx..).as_mut_ptr() as *mut __m128i, + y1_yuv, + ); + + let r1 = _mm_avg_epu16(r0_low, r0_high); + let g1 = _mm_avg_epu16(g0_low, g0_high); + let b1 = _mm_avg_epu16(b0_low, b0_high); + + let cbk = _mm_max_epi16( + _mm_min_epi16( + _mm_srai_epi16::(_mm_add_epi16( + uv_bias, + _mm_add_epi16( + _mm_add_epi16(_mm_mulhrs_epi16(r1, v_cb_r), _mm_mulhrs_epi16(g1, v_cb_g)), + _mm_mulhrs_epi16(b1, v_cb_b), + ), + )), + i_cap_uv, + ), + i_bias_y, + ); + + let crk = _mm_max_epi16( + _mm_min_epi16( + _mm_srai_epi16::(_mm_add_epi16( + uv_bias, + _mm_add_epi16( + _mm_add_epi16(_mm_mulhrs_epi16(r1, v_cr_r), _mm_mulhrs_epi16(g1, v_cr_g)), + _mm_mulhrs_epi16(b1, v_cr_b), + ), + )), + i_cap_uv, + ), + i_bias_y, + ); + + let cb = _mm_packus_epi16(cbk, cbk); + let cr = _mm_packus_epi16(crk, crk); + + std::ptr::copy_nonoverlapping(&cb as *const _ as *const u8, u_ptr.add(uv_x), 8); + std::ptr::copy_nonoverlapping(&cr as *const _ as *const u8, v_ptr.add(uv_x), 8); + uv_x += 8; + cx += 16; + } + + ProcessedOffset { cx, ux: uv_x } +} diff --git a/src/sse/yuv_nv_to_rgba.rs b/src/sse/yuv_nv_to_rgba.rs index 92663c7..eeddc51 100644 --- a/src/sse/yuv_nv_to_rgba.rs +++ b/src/sse/yuv_nv_to_rgba.rs @@ -86,6 +86,9 @@ unsafe fn sse_yuv_nv_to_rgba_impl< let uv_ptr = uv_plane.as_ptr(); let rgba_ptr = rgba.as_mut_ptr(); + const SCALE: i32 = 6; + const V_SHR: i32 = 3; + let y_corr = _mm_set1_epi8(range.bias_y as i8); let uv_corr = _mm_set1_epi16(range.bias_uv as i16); let v_luma_coeff = _mm_set1_epi16(transform.y_coef as i16); @@ -94,7 +97,7 @@ unsafe fn sse_yuv_nv_to_rgba_impl< let v_g_coeff_1 = _mm_set1_epi16(transform.g_coeff_1 as i16); let v_g_coeff_2 = _mm_set1_epi16(transform.g_coeff_2 as i16); let v_alpha = _mm_set1_epi8(255u8 as i8); - let rounding_const = _mm_set1_epi16(1 << 2); + let rounding_const = _mm_set1_epi16(1 << (V_SHR - 1)); let zeros = _mm_setzero_si128(); @@ -151,53 +154,53 @@ unsafe fn sse_yuv_nv_to_rgba_impl< } } - let u_high = _mm_slli_epi16::<7>(_mm_sub_epi16(u_high_u16, uv_corr)); - let v_high = 
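Note the bookkeeping at the end of the new rgba_to_yuv420 loop above: cx advances by 16 while uv_x advances by 8, so 16 luma samples per row pair yield only 8 Cb and 8 Cr samples, and the same chroma row serves both luma rows. That is the 4:2:0 geometry the rest of the crate assumes, sketched below (helper name illustrative):

// Byte sizes of the Y plane and of each chroma plane for a 4:2:0 image;
// odd dimensions round up because chroma must still cover the last column/row.
fn yuv420_plane_sizes(width: usize, height: usize) -> (usize, usize) {
    let chroma = ((width + 1) / 2) * ((height + 1) / 2);
    (width * height, chroma)
}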
_mm_slli_epi16::<7>(_mm_sub_epi16(v_high_u16, uv_corr)); - let y_high = _mm_mulhi_epi16( - _mm_slli_epi16::<7>(_mm_unpackhi_epi8(y_values, zeros)), + let u_high = _mm_slli_epi16::(_mm_sub_epi16(u_high_u16, uv_corr)); + let v_high = _mm_slli_epi16::(_mm_sub_epi16(v_high_u16, uv_corr)); + let y_high = _mm_mulhrs_epi16( + _mm_slli_epi16::(_mm_unpackhi_epi8(y_values, zeros)), v_luma_coeff, ); - let r_high = _mm_srai_epi16::<3>(_mm_add_epi16( - _mm_add_epi16(y_high, _mm_mulhi_epi16(v_high, v_cr_coeff)), + let r_high = _mm_srai_epi16::(_mm_add_epi16( + _mm_add_epi16(y_high, _mm_mulhrs_epi16(v_high, v_cr_coeff)), rounding_const, )); - let b_high = _mm_srai_epi16::<3>(_mm_add_epi16( - _mm_add_epi16(y_high, _mm_mulhi_epi16(u_high, v_cb_coeff)), + let b_high = _mm_srai_epi16::(_mm_add_epi16( + _mm_add_epi16(y_high, _mm_mulhrs_epi16(u_high, v_cb_coeff)), rounding_const, )); - let g_high = _mm_srai_epi16::<3>(_mm_add_epi16( + let g_high = _mm_srai_epi16::(_mm_add_epi16( _mm_sub_epi16( y_high, _mm_add_epi16( - _mm_mulhi_epi16(v_high, v_g_coeff_1), - _mm_mulhi_epi16(u_high, v_g_coeff_2), + _mm_mulhrs_epi16(v_high, v_g_coeff_1), + _mm_mulhrs_epi16(u_high, v_g_coeff_2), ), ), rounding_const, )); - let u_low = _mm_slli_epi16::<7>(_mm_sub_epi16(u_low_u16, uv_corr)); - let v_low = _mm_slli_epi16::<7>(_mm_sub_epi16(v_low_u16, uv_corr)); - let y_low = _mm_mulhi_epi16( - _mm_slli_epi16::<7>(_mm_cvtepu8_epi16(y_values)), + let u_low = _mm_slli_epi16::(_mm_sub_epi16(u_low_u16, uv_corr)); + let v_low = _mm_slli_epi16::(_mm_sub_epi16(v_low_u16, uv_corr)); + let y_low = _mm_mulhrs_epi16( + _mm_slli_epi16::(_mm_cvtepu8_epi16(y_values)), v_luma_coeff, ); - let r_low = _mm_srai_epi16::<3>(_mm_add_epi16( - _mm_add_epi16(y_low, _mm_mulhi_epi16(v_low, v_cr_coeff)), + let r_low = _mm_srai_epi16::(_mm_add_epi16( + _mm_add_epi16(y_low, _mm_mulhrs_epi16(v_low, v_cr_coeff)), rounding_const, )); - let b_low = _mm_srai_epi16::<3>(_mm_add_epi16( - _mm_add_epi16(y_low, _mm_mulhi_epi16(u_low, v_cb_coeff)), + let b_low = _mm_srai_epi16::(_mm_add_epi16( + _mm_add_epi16(y_low, _mm_mulhrs_epi16(u_low, v_cb_coeff)), rounding_const, )); - let g_low = _mm_srai_epi16::<3>(_mm_add_epi16( + let g_low = _mm_srai_epi16::(_mm_add_epi16( _mm_sub_epi16( y_low, _mm_add_epi16( - _mm_mulhi_epi16(v_low, v_g_coeff_1), - _mm_mulhi_epi16(u_low, v_g_coeff_2), + _mm_mulhrs_epi16(v_low, v_g_coeff_1), + _mm_mulhrs_epi16(u_low, v_g_coeff_2), ), ), rounding_const, @@ -290,27 +293,27 @@ unsafe fn sse_yuv_nv_to_rgba_impl< } } - let u_low = _mm_slli_epi16::<7>(_mm_sub_epi16(u_low_u16, uv_corr)); - let v_low = _mm_slli_epi16::<7>(_mm_sub_epi16(v_low_u16, uv_corr)); - let y_low = _mm_mulhi_epi16( - _mm_slli_epi16::<7>(_mm_cvtepu8_epi16(y_values)), + let u_low = _mm_slli_epi16::(_mm_sub_epi16(u_low_u16, uv_corr)); + let v_low = _mm_slli_epi16::(_mm_sub_epi16(v_low_u16, uv_corr)); + let y_low = _mm_mulhrs_epi16( + _mm_slli_epi16::(_mm_cvtepu8_epi16(y_values)), v_luma_coeff, ); - let r_low = _mm_srai_epi16::<3>(_mm_add_epi16( - _mm_add_epi16(y_low, _mm_mulhi_epi16(v_low, v_cr_coeff)), + let r_low = _mm_srai_epi16::(_mm_add_epi16( + _mm_add_epi16(y_low, _mm_mulhrs_epi16(v_low, v_cr_coeff)), rounding_const, )); - let b_low = _mm_srai_epi16::<3>(_mm_add_epi16( - _mm_add_epi16(y_low, _mm_mulhi_epi16(u_low, v_cb_coeff)), + let b_low = _mm_srai_epi16::(_mm_add_epi16( + _mm_add_epi16(y_low, _mm_mulhrs_epi16(u_low, v_cb_coeff)), rounding_const, )); - let g_low = _mm_srai_epi16::<3>(_mm_add_epi16( + let g_low = _mm_srai_epi16::(_mm_add_epi16( _mm_sub_epi16( y_low, 
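The decode hunks follow the same fixed-point scheme in reverse: the Y samples (reduced by y_corr where they are loaded earlier in the function) and the re-centred chroma are pre-shifted by SCALE, multiplied with _mm_mulhrs_epi16, and the per-channel sums are shifted back by V_SHR with a 1 << (V_SHR - 1) rounding term before unsigned-saturating packing. A scalar sketch of one pixel (helper names are illustrative; the coefficient parameters correspond to the transform fields used above):

const SCALE: i32 = 6;
const V_SHR: i32 = 3;

fn mulhrs(a: i16, b: i16) -> i32 {
    ((a as i32 * b as i32) + (1 << 14)) >> 15
}

// Mirrors the r/g/b lane math above for a single sample.
fn yuv_to_rgb(
    y: u8, u: u8, v: u8,
    bias_y: i16, bias_uv: i16,
    y_coef: i16, cr_coeff: i16, cb_coeff: i16, g_coeff_1: i16, g_coeff_2: i16,
) -> (u8, u8, u8) {
    let rounding = 1 << (V_SHR - 1);
    let luma = mulhrs((y as i16 - bias_y).max(0) << SCALE, y_coef);
    let cb = (u as i16 - bias_uv) << SCALE;
    let cr = (v as i16 - bias_uv) << SCALE;
    let r = (luma + mulhrs(cr, cr_coeff) + rounding) >> V_SHR;
    let b = (luma + mulhrs(cb, cb_coeff) + rounding) >> V_SHR;
    let g = (luma - (mulhrs(cr, g_coeff_1) + mulhrs(cb, g_coeff_2)) + rounding) >> V_SHR;
    (r.clamp(0, 255) as u8, g.clamp(0, 255) as u8, b.clamp(0, 255) as u8)
}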
_mm_add_epi16( - _mm_mulhi_epi16(v_low, v_g_coeff_1), - _mm_mulhi_epi16(u_low, v_g_coeff_2), + _mm_mulhrs_epi16(v_low, v_g_coeff_1), + _mm_mulhrs_epi16(u_low, v_g_coeff_2), ), ), rounding_const, diff --git a/src/sse/yuv_nv_to_rgba420.rs b/src/sse/yuv_nv_to_rgba420.rs index 87006ff..5d3689f 100644 --- a/src/sse/yuv_nv_to_rgba420.rs +++ b/src/sse/yuv_nv_to_rgba420.rs @@ -77,7 +77,7 @@ unsafe fn sse_yuv_nv_to_rgba_impl420(_mm_sub_epi16(u_high_u16, uv_corr)); let v_high = _mm_slli_epi16::(_mm_sub_epi16(v_high_u16, uv_corr)); - let y_high0 = _mm_mulhi_epi16( + let y_high0 = _mm_mulhrs_epi16( _mm_slli_epi16::(_mm_unpackhi_epi8(y_values0, zeros)), v_luma_coeff, ); - let y_high1 = _mm_mulhi_epi16( + let y_high1 = _mm_mulhrs_epi16( _mm_slli_epi16::(_mm_unpackhi_epi8(y_values1, zeros)), v_luma_coeff, ); let g_coeff_hi = _mm_add_epi16( - _mm_mulhi_epi16(v_high, v_g_coeff_1), - _mm_mulhi_epi16(u_high, v_g_coeff_2), + _mm_mulhrs_epi16(v_high, v_g_coeff_1), + _mm_mulhrs_epi16(u_high, v_g_coeff_2), ); let r_high0 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_high0, _mm_mulhi_epi16(v_high, v_cr_coeff)), + _mm_add_epi16(y_high0, _mm_mulhrs_epi16(v_high, v_cr_coeff)), rounding_const, )); let b_high0 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_high0, _mm_mulhi_epi16(u_high, v_cb_coeff)), + _mm_add_epi16(y_high0, _mm_mulhrs_epi16(u_high, v_cb_coeff)), rounding_const, )); let g_high0 = _mm_srai_epi16::(_mm_add_epi16( @@ -156,11 +156,11 @@ unsafe fn sse_yuv_nv_to_rgba_impl420(_mm_add_epi16( - _mm_add_epi16(y_high1, _mm_mulhi_epi16(v_high, v_cr_coeff)), + _mm_add_epi16(y_high1, _mm_mulhrs_epi16(v_high, v_cr_coeff)), rounding_const, )); let b_high1 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_high1, _mm_mulhi_epi16(u_high, v_cb_coeff)), + _mm_add_epi16(y_high1, _mm_mulhrs_epi16(u_high, v_cb_coeff)), rounding_const, )); let g_high1 = _mm_srai_epi16::(_mm_add_epi16( @@ -170,26 +170,26 @@ unsafe fn sse_yuv_nv_to_rgba_impl420(_mm_sub_epi16(u_low_u16, uv_corr)); let v_low = _mm_slli_epi16::(_mm_sub_epi16(v_low_u16, uv_corr)); - let y_low0 = _mm_mulhi_epi16( + let y_low0 = _mm_mulhrs_epi16( _mm_slli_epi16::(_mm_cvtepu8_epi16(y_values0)), v_luma_coeff, ); - let y_low1 = _mm_mulhi_epi16( + let y_low1 = _mm_mulhrs_epi16( _mm_slli_epi16::(_mm_cvtepu8_epi16(y_values1)), v_luma_coeff, ); let g_coeff_lo = _mm_add_epi16( - _mm_mulhi_epi16(v_low, v_g_coeff_1), - _mm_mulhi_epi16(u_low, v_g_coeff_2), + _mm_mulhrs_epi16(v_low, v_g_coeff_1), + _mm_mulhrs_epi16(u_low, v_g_coeff_2), ); let r_low0 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_low0, _mm_mulhi_epi16(v_low, v_cr_coeff)), + _mm_add_epi16(y_low0, _mm_mulhrs_epi16(v_low, v_cr_coeff)), rounding_const, )); let b_low0 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_low0, _mm_mulhi_epi16(u_low, v_cb_coeff)), + _mm_add_epi16(y_low0, _mm_mulhrs_epi16(u_low, v_cb_coeff)), rounding_const, )); let g_low0 = _mm_srai_epi16::(_mm_add_epi16( @@ -197,11 +197,11 @@ unsafe fn sse_yuv_nv_to_rgba_impl420(_mm_add_epi16( - _mm_add_epi16(y_low1, _mm_mulhi_epi16(v_low, v_cr_coeff)), + _mm_add_epi16(y_low1, _mm_mulhrs_epi16(v_low, v_cr_coeff)), rounding_const, )); let b_low1 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_low1, _mm_mulhi_epi16(u_low, v_cb_coeff)), + _mm_add_epi16(y_low1, _mm_mulhrs_epi16(u_low, v_cb_coeff)), rounding_const, )); let g_low1 = _mm_srai_epi16::(_mm_add_epi16( @@ -317,26 +317,26 @@ unsafe fn sse_yuv_nv_to_rgba_impl420(_mm_sub_epi16(u_low_u16, uv_corr)); let v_low = _mm_slli_epi16::(_mm_sub_epi16(v_low_u16, uv_corr)); - let y_low0 = 
_mm_mulhi_epi16( + let y_low0 = _mm_mulhrs_epi16( _mm_slli_epi16::(_mm_cvtepu8_epi16(y_values0)), v_luma_coeff, ); - let y_low1 = _mm_mulhi_epi16( + let y_low1 = _mm_mulhrs_epi16( _mm_slli_epi16::(_mm_cvtepu8_epi16(y_values1)), v_luma_coeff, ); let g_coeff_lo = _mm_add_epi16( - _mm_mulhi_epi16(v_low, v_g_coeff_1), - _mm_mulhi_epi16(u_low, v_g_coeff_2), + _mm_mulhrs_epi16(v_low, v_g_coeff_1), + _mm_mulhrs_epi16(u_low, v_g_coeff_2), ); let r_low0 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_low0, _mm_mulhi_epi16(v_low, v_cr_coeff)), + _mm_add_epi16(y_low0, _mm_mulhrs_epi16(v_low, v_cr_coeff)), rounding_const, )); let b_low0 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_low0, _mm_mulhi_epi16(u_low, v_cb_coeff)), + _mm_add_epi16(y_low0, _mm_mulhrs_epi16(u_low, v_cb_coeff)), rounding_const, )); let g_low0 = _mm_srai_epi16::(_mm_add_epi16( @@ -345,11 +345,11 @@ unsafe fn sse_yuv_nv_to_rgba_impl420(_mm_add_epi16( - _mm_add_epi16(y_low1, _mm_mulhi_epi16(v_low, v_cr_coeff)), + _mm_add_epi16(y_low1, _mm_mulhrs_epi16(v_low, v_cr_coeff)), rounding_const, )); let b_low1 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_low1, _mm_mulhi_epi16(u_low, v_cb_coeff)), + _mm_add_epi16(y_low1, _mm_mulhrs_epi16(u_low, v_cb_coeff)), rounding_const, )); let g_low1 = _mm_srai_epi16::(_mm_add_epi16( diff --git a/src/sse/yuv_to_rgba.rs b/src/sse/yuv_to_rgba.rs index fb58366..0ba8153 100644 --- a/src/sse/yuv_to_rgba.rs +++ b/src/sse/yuv_to_rgba.rs @@ -80,6 +80,9 @@ unsafe fn sse_yuv_to_rgba_row_impl(_mm_sub_epi16(u_high_u16, uv_corr)); - let v_high = _mm_slli_epi16::<7>(_mm_sub_epi16(v_high_u16, uv_corr)); + let u_high = _mm_slli_epi16::(_mm_sub_epi16(u_high_u16, uv_corr)); + let v_high = _mm_slli_epi16::(_mm_sub_epi16(v_high_u16, uv_corr)); let y_high = _mm_mulhi_epi16( - _mm_slli_epi16::<7>(_mm_unpackhi_epi8(y_values, zeros)), + _mm_slli_epi16::(_mm_unpackhi_epi8(y_values, zeros)), v_luma_coeff, ); - let r_high = _mm_srai_epi16::<3>(_mm_add_epi16( + let r_high = _mm_srai_epi16::(_mm_add_epi16( _mm_add_epi16(y_high, _mm_mulhi_epi16(v_high, v_cr_coeff)), rounding_const, )); - let b_high = _mm_srai_epi16::<3>(_mm_add_epi16( + let b_high = _mm_srai_epi16::(_mm_add_epi16( _mm_add_epi16(y_high, _mm_mulhi_epi16(u_high, v_cb_coeff)), rounding_const, )); - let g_high = _mm_srai_epi16::<3>(_mm_add_epi16( + let g_high = _mm_srai_epi16::(_mm_add_epi16( _mm_sub_epi16( y_high, _mm_add_epi16( @@ -145,22 +148,22 @@ unsafe fn sse_yuv_to_rgba_row_impl(_mm_sub_epi16(u_low_u16, uv_corr)); - let v_low = _mm_slli_epi16::<7>(_mm_sub_epi16(v_low_u16, uv_corr)); + let u_low = _mm_slli_epi16::(_mm_sub_epi16(u_low_u16, uv_corr)); + let v_low = _mm_slli_epi16::(_mm_sub_epi16(v_low_u16, uv_corr)); let y_low = _mm_mulhi_epi16( - _mm_slli_epi16::<7>(_mm_cvtepu8_epi16(y_values)), + _mm_slli_epi16::(_mm_cvtepu8_epi16(y_values)), v_luma_coeff, ); - let r_low = _mm_srai_epi16::<3>(_mm_add_epi16( + let r_low = _mm_srai_epi16::(_mm_add_epi16( _mm_add_epi16(y_low, _mm_mulhi_epi16(v_low, v_cr_coeff)), rounding_const, )); - let b_low = _mm_srai_epi16::<3>(_mm_add_epi16( + let b_low = _mm_srai_epi16::(_mm_add_epi16( _mm_add_epi16(y_low, _mm_mulhi_epi16(u_low, v_cb_coeff)), rounding_const, )); - let g_low = _mm_srai_epi16::<3>(_mm_add_epi16( + let g_low = _mm_srai_epi16::(_mm_add_epi16( _mm_sub_epi16( y_low, _mm_add_epi16( @@ -247,22 +250,22 @@ unsafe fn sse_yuv_to_rgba_row_impl(_mm_sub_epi16(u_low_u16, uv_corr)); - let v_low = _mm_slli_epi16::<7>(_mm_sub_epi16(v_low_u16, uv_corr)); + let u_low = _mm_slli_epi16::(_mm_sub_epi16(u_low_u16, 
uv_corr)); + let v_low = _mm_slli_epi16::(_mm_sub_epi16(v_low_u16, uv_corr)); let y_low = _mm_mulhi_epi16( - _mm_slli_epi16::<7>(_mm_cvtepu8_epi16(y_values)), + _mm_slli_epi16::(_mm_cvtepu8_epi16(y_values)), v_luma_coeff, ); - let r_low = _mm_srai_epi16::<3>(_mm_add_epi16( + let r_low = _mm_srai_epi16::(_mm_add_epi16( _mm_add_epi16(y_low, _mm_mulhi_epi16(v_low, v_cr_coeff)), rounding_const, )); - let b_low = _mm_srai_epi16::<3>(_mm_add_epi16( + let b_low = _mm_srai_epi16::(_mm_add_epi16( _mm_add_epi16(y_low, _mm_mulhi_epi16(u_low, v_cb_coeff)), rounding_const, )); - let g_low = _mm_srai_epi16::<3>(_mm_add_epi16( + let g_low = _mm_srai_epi16::(_mm_add_epi16( _mm_sub_epi16( y_low, _mm_add_epi16( diff --git a/src/sse/yuv_to_rgba420.rs b/src/sse/yuv_to_rgba420.rs index 2bb1461..bf45971 100644 --- a/src/sse/yuv_to_rgba420.rs +++ b/src/sse/yuv_to_rgba420.rs @@ -80,7 +80,7 @@ unsafe fn sse_yuv_to_rgba_row_impl420( let u_ptr = u_plane.as_ptr(); let v_ptr = v_plane.as_ptr(); - const SCALE: i32 = 7; + const SCALE: i32 = 6; const V_SHR: i32 = 3; let y_corr = _mm_set1_epi8(range.bias_y as i8); @@ -116,26 +116,26 @@ unsafe fn sse_yuv_to_rgba_row_impl420( let u_high = _mm_slli_epi16::(_mm_sub_epi16(u_high_u16, uv_corr)); let v_high = _mm_slli_epi16::(_mm_sub_epi16(v_high_u16, uv_corr)); - let y_high0 = _mm_mulhi_epi16( + let y_high0 = _mm_mulhrs_epi16( _mm_slli_epi16::(_mm_unpackhi_epi8(y_values0, zeros)), v_luma_coeff, ); - let y_high1 = _mm_mulhi_epi16( + let y_high1 = _mm_mulhrs_epi16( _mm_slli_epi16::(_mm_unpackhi_epi8(y_values1, zeros)), v_luma_coeff, ); let g_coeff_hi = _mm_add_epi16( - _mm_mulhi_epi16(v_high, v_g_coeff_1), - _mm_mulhi_epi16(u_high, v_g_coeff_2), + _mm_mulhrs_epi16(v_high, v_g_coeff_1), + _mm_mulhrs_epi16(u_high, v_g_coeff_2), ); let r_high0 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_high0, _mm_mulhi_epi16(v_high, v_cr_coeff)), + _mm_add_epi16(y_high0, _mm_mulhrs_epi16(v_high, v_cr_coeff)), rounding_const, )); let b_high0 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_high0, _mm_mulhi_epi16(u_high, v_cb_coeff)), + _mm_add_epi16(y_high0, _mm_mulhrs_epi16(u_high, v_cb_coeff)), rounding_const, )); let g_high0 = _mm_srai_epi16::(_mm_add_epi16( @@ -144,11 +144,11 @@ unsafe fn sse_yuv_to_rgba_row_impl420( )); let r_high1 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_high1, _mm_mulhi_epi16(v_high, v_cr_coeff)), + _mm_add_epi16(y_high1, _mm_mulhrs_epi16(v_high, v_cr_coeff)), rounding_const, )); let b_high1 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_high1, _mm_mulhi_epi16(u_high, v_cb_coeff)), + _mm_add_epi16(y_high1, _mm_mulhrs_epi16(u_high, v_cb_coeff)), rounding_const, )); let g_high1 = _mm_srai_epi16::(_mm_add_epi16( @@ -158,26 +158,26 @@ unsafe fn sse_yuv_to_rgba_row_impl420( let u_low = _mm_slli_epi16::(_mm_sub_epi16(u_low_u16, uv_corr)); let v_low = _mm_slli_epi16::(_mm_sub_epi16(v_low_u16, uv_corr)); - let y_low0 = _mm_mulhi_epi16( + let y_low0 = _mm_mulhrs_epi16( _mm_slli_epi16::(_mm_cvtepu8_epi16(y_values0)), v_luma_coeff, ); - let y_low1 = _mm_mulhi_epi16( + let y_low1 = _mm_mulhrs_epi16( _mm_slli_epi16::(_mm_cvtepu8_epi16(y_values1)), v_luma_coeff, ); let g_coeff_lo = _mm_add_epi16( - _mm_mulhi_epi16(v_low, v_g_coeff_1), - _mm_mulhi_epi16(u_low, v_g_coeff_2), + _mm_mulhrs_epi16(v_low, v_g_coeff_1), + _mm_mulhrs_epi16(u_low, v_g_coeff_2), ); let r_low0 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_low0, _mm_mulhi_epi16(v_low, v_cr_coeff)), + _mm_add_epi16(y_low0, _mm_mulhrs_epi16(v_low, v_cr_coeff)), rounding_const, )); let b_low0 = 
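The reason SCALE drops from 7 to 6 wherever _mm_mulhi_epi16 is replaced by _mm_mulhrs_epi16: mulhi keeps the high word of the 32-bit product (a truncating shift right by 16), while mulhrs shifts right by 15 with rounding, so halving the pre-shift keeps the results on the same scale while gaining round-to-nearest behaviour. A small check of that equivalence (the coefficient value is illustrative):

fn mulhi(a: i16, b: i16) -> i16 {
    ((a as i32 * b as i32) >> 16) as i16
}

fn mulhrs(a: i16, b: i16) -> i16 {
    (((a as i32 * b as i32) + (1 << 14)) >> 15) as i16
}

fn main() {
    let (x, c) = (100i16, 5000i16);
    // Same scale, but the new form rounds where the old one truncated:
    assert_eq!(mulhi(x << 7, c), 976); // (12_800 * 5_000) >> 16
    assert_eq!(mulhrs(x << 6, c), 977); // (6_400 * 5_000 + 16_384) >> 15
}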
_mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_low0, _mm_mulhi_epi16(u_low, v_cb_coeff)), + _mm_add_epi16(y_low0, _mm_mulhrs_epi16(u_low, v_cb_coeff)), rounding_const, )); let g_low0 = _mm_srai_epi16::(_mm_add_epi16( @@ -186,11 +186,11 @@ unsafe fn sse_yuv_to_rgba_row_impl420( )); let r_low1 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_low1, _mm_mulhi_epi16(v_low, v_cr_coeff)), + _mm_add_epi16(y_low1, _mm_mulhrs_epi16(v_low, v_cr_coeff)), rounding_const, )); let b_low1 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_low1, _mm_mulhi_epi16(u_low, v_cb_coeff)), + _mm_add_epi16(y_low1, _mm_mulhrs_epi16(u_low, v_cb_coeff)), rounding_const, )); let g_low1 = _mm_srai_epi16::(_mm_add_epi16( @@ -304,26 +304,26 @@ unsafe fn sse_yuv_to_rgba_row_impl420( let u_low = _mm_slli_epi16::(_mm_sub_epi16(u_low_u16, uv_corr)); let v_low = _mm_slli_epi16::(_mm_sub_epi16(v_low_u16, uv_corr)); - let y_low0 = _mm_mulhi_epi16( + let y_low0 = _mm_mulhrs_epi16( _mm_slli_epi16::(_mm_cvtepu8_epi16(y_values0)), v_luma_coeff, ); - let y_low1 = _mm_mulhi_epi16( + let y_low1 = _mm_mulhrs_epi16( _mm_slli_epi16::(_mm_cvtepu8_epi16(y_values1)), v_luma_coeff, ); let g_coeff = _mm_add_epi16( - _mm_mulhi_epi16(v_low, v_g_coeff_1), - _mm_mulhi_epi16(u_low, v_g_coeff_2), + _mm_mulhrs_epi16(v_low, v_g_coeff_1), + _mm_mulhrs_epi16(u_low, v_g_coeff_2), ); let r_low0 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_low0, _mm_mulhi_epi16(v_low, v_cr_coeff)), + _mm_add_epi16(y_low0, _mm_mulhrs_epi16(v_low, v_cr_coeff)), rounding_const, )); let b_low0 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_low0, _mm_mulhi_epi16(u_low, v_cb_coeff)), + _mm_add_epi16(y_low0, _mm_mulhrs_epi16(u_low, v_cb_coeff)), rounding_const, )); let g_low0 = _mm_srai_epi16::(_mm_add_epi16( @@ -332,11 +332,11 @@ unsafe fn sse_yuv_to_rgba_row_impl420( )); let r_low1 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_low1, _mm_mulhi_epi16(v_low, v_cr_coeff)), + _mm_add_epi16(y_low1, _mm_mulhrs_epi16(v_low, v_cr_coeff)), rounding_const, )); let b_low1 = _mm_srai_epi16::(_mm_add_epi16( - _mm_add_epi16(y_low1, _mm_mulhi_epi16(u_low, v_cb_coeff)), + _mm_add_epi16(y_low1, _mm_mulhrs_epi16(u_low, v_cb_coeff)), rounding_const, )); let g_low1 = _mm_srai_epi16::(_mm_add_epi16( diff --git a/src/sse/yuv_to_rgba_alpha.rs b/src/sse/yuv_to_rgba_alpha.rs index 444e06e..bfefd6d 100644 --- a/src/sse/yuv_to_rgba_alpha.rs +++ b/src/sse/yuv_to_rgba_alpha.rs @@ -94,6 +94,9 @@ unsafe fn sse_yuv_to_rgba_alpha_row_impl(_mm_sub_epi16(u_high_u16, uv_corr)); - let v_high = _mm_slli_epi16::<7>(_mm_sub_epi16(v_high_u16, uv_corr)); - let y_high = _mm_mulhi_epi16( - _mm_slli_epi16::<7>(_mm_unpackhi_epi8(y_values, zeros)), + let u_high = _mm_slli_epi16::(_mm_sub_epi16(u_high_u16, uv_corr)); + let v_high = _mm_slli_epi16::(_mm_sub_epi16(v_high_u16, uv_corr)); + let y_high = _mm_mulhrs_epi16( + _mm_slli_epi16::(_mm_unpackhi_epi8(y_values, zeros)), v_luma_coeff, ); - let r_high = _mm_srai_epi16::<3>(_mm_add_epi16( - _mm_add_epi16(y_high, _mm_mulhi_epi16(v_high, v_cr_coeff)), + let r_high = _mm_srai_epi16::(_mm_add_epi16( + _mm_add_epi16(y_high, _mm_mulhrs_epi16(v_high, v_cr_coeff)), rounding_const, )); - let b_high = _mm_srai_epi16::<3>(_mm_adds_epi16( - _mm_add_epi16(y_high, _mm_mulhi_epi16(u_high, v_cb_coeff)), + let b_high = _mm_srai_epi16::(_mm_adds_epi16( + _mm_add_epi16(y_high, _mm_mulhrs_epi16(u_high, v_cb_coeff)), rounding_const, )); - let g_high = _mm_srai_epi16::<3>(_mm_add_epi16( + let g_high = _mm_srai_epi16::(_mm_add_epi16( _mm_sub_epi16( y_high, _mm_add_epi16( - 
_mm_mulhi_epi16(v_high, v_g_coeff_1), - _mm_mulhi_epi16(u_high, v_g_coeff_2), + _mm_mulhrs_epi16(v_high, v_g_coeff_1), + _mm_mulhrs_epi16(u_high, v_g_coeff_2), ), ), rounding_const, )); - let u_low = _mm_slli_epi16::<7>(_mm_sub_epi16(u_low_u16, uv_corr)); - let v_low = _mm_slli_epi16::<7>(_mm_sub_epi16(v_low_u16, uv_corr)); - let y_low = _mm_mulhi_epi16( - _mm_slli_epi16::<7>(_mm_cvtepu8_epi16(y_values)), + let u_low = _mm_slli_epi16::(_mm_sub_epi16(u_low_u16, uv_corr)); + let v_low = _mm_slli_epi16::(_mm_sub_epi16(v_low_u16, uv_corr)); + let y_low = _mm_mulhrs_epi16( + _mm_slli_epi16::(_mm_cvtepu8_epi16(y_values)), v_luma_coeff, ); - let r_low = _mm_srai_epi16::<3>(_mm_add_epi16( - _mm_add_epi16(y_low, _mm_mulhi_epi16(v_low, v_cr_coeff)), + let r_low = _mm_srai_epi16::(_mm_add_epi16( + _mm_add_epi16(y_low, _mm_mulhrs_epi16(v_low, v_cr_coeff)), rounding_const, )); - let b_low = _mm_srai_epi16::<3>(_mm_add_epi16( - _mm_add_epi16(y_low, _mm_mulhi_epi16(u_low, v_cb_coeff)), + let b_low = _mm_srai_epi16::(_mm_add_epi16( + _mm_add_epi16(y_low, _mm_mulhrs_epi16(u_low, v_cb_coeff)), rounding_const, )); - let g_low = _mm_srai_epi16::<3>(_mm_add_epi16( + let g_low = _mm_srai_epi16::(_mm_add_epi16( _mm_sub_epi16( y_low, _mm_add_epi16( - _mm_mulhi_epi16(v_low, v_g_coeff_1), - _mm_mulhi_epi16(u_low, v_g_coeff_2), + _mm_mulhrs_epi16(v_low, v_g_coeff_1), + _mm_mulhrs_epi16(u_low, v_g_coeff_2), ), ), rounding_const,
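A straightforward way to sanity-check the precision of the mulhrs-based path is to compare a scalar model of it against a floating-point reference. The sketch below uses BT.601 limited-range luma coefficients quantized to 1/4096 steps; both the coefficients and the one-code-value tolerance are illustrative rather than taken from the crate's tests:

fn luma_fixed(r: u8, g: u8, b: u8) -> u8 {
    // 0.299, 0.587, 0.114 scaled by 219/255 and by 4096,
    // for the << 6, mulhrs, >> 3 pipeline used in the patch.
    let (yr, yg, yb) = (1052i32, 2065i32, 401i32);
    let mulhrs = |a: i32, c: i32| ((a * c) + (1 << 14)) >> 15;
    let bias = 16 * 8 + 4; // bias_y << V_SHR plus the rounding term
    let y = (bias
        + mulhrs((r as i32) << 6, yr)
        + mulhrs((g as i32) << 6, yg)
        + mulhrs((b as i32) << 6, yb))
        >> 3;
    y.clamp(16, 235) as u8
}

fn luma_float(r: u8, g: u8, b: u8) -> u8 {
    let y = 16.0 + (219.0 / 255.0) * (0.299 * r as f64 + 0.587 * g as f64 + 0.114 * b as f64);
    y.round().clamp(16.0, 235.0) as u8
}

fn main() {
    let max_err = (0..=255u8)
        .map(|v| (luma_fixed(v, v, v) as i32 - luma_float(v, v, v) as i32).abs())
        .max()
        .unwrap();
    assert!(max_err <= 1, "fixed-point luma drifted by more than one code value");
}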