diff --git a/README.md b/README.md
index 72c10f5..e7ff754 100644
--- a/README.md
+++ b/README.md
@@ -78,7 +78,7 @@ Tests performed on the image 5763x3842
 |------------------------|:----------:|:---------:|
 | utils RGB->YUV 4:2:0   |   3.48ms   |  3.64ms   |
 | libyuv RGB->YUV 4:2:0  |   3.58ms   |  33.87ms  |
-| utils RGBA->YUV 4:2:0  |   4.32ms   |  5.74ms   |
+| utils RGBA->YUV 4:2:0  |   4.32ms   |  5.47ms   |
 | libyuv RGBA->YUV 4:2:0 |   4.87ms   |  23.48ms  |
 | utils RGBA->YUV 4:2:2  |   4.83ms   |  7.08ms   |
 | libyuv RGBA->YUV 4:2:2 |   5.90ms   |  35.23ms  |
@@ -88,9 +88,9 @@ Tests performed on the image 5763x3842
 
 |                        | time(NEON) | Time(AVX) |
 |------------------------|:----------:|:---------:|
-| utils YUV NV12->RGB    |   3.86ms   |  6.48ms   |
+| utils YUV NV12->RGB    |   3.86ms   |  6.24ms   |
 | libyuv YUV NV12->RGB   |   5.20ms   |  45.28ms  |
-| utils YUV 4:2:0->RGB   |   3.28ms   |  5.34ms   |
+| utils YUV 4:2:0->RGB   |   3.28ms   |  5.25ms   |
 | libyuv YUV 4:2:0->RGB  |   5.70ms   |  44.95ms  |
 | utils YUV 4:2:0->RGBA  |   3.82ms   |  5.98ms   |
 | libyuv YUV 4:2:0->RGBA |   6.13ms   |  6.88ms   |
diff --git a/app/benches/yuv8/main.rs b/app/benches/yuv8/main.rs
index c510271..8dd9c50 100644
--- a/app/benches/yuv8/main.rs
+++ b/app/benches/yuv8/main.rs
@@ -26,9 +26,9 @@
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
-use std::alloc::Layout;
 use criterion::{criterion_group, criterion_main, Criterion};
 use image::{GenericImageView, ImageReader};
+use std::alloc::Layout;
 use yuv_sys::{
     rs_ABGRToI420, rs_ABGRToJ422, rs_I420ToABGR, rs_I420ToRGB24, rs_I422ToABGR, rs_I444ToABGR,
     rs_NV21ToABGR, rs_RGB24ToI420,
@@ -98,37 +98,41 @@ pub fn criterion_benchmark(c: &mut Criterion) {
         })
     });
 
-    c.bench_function("libyuv RGB -> YUV 4:2:0", |b| {
-        unsafe {
-            let layout_rgb = Layout::from_size_align(dimensions.0 as usize * dimensions.1 as usize * 3, 16).unwrap();
-            let layout_y = Layout::from_size_align(dimensions.0 as usize * dimensions.1 as usize, 16).unwrap();
-            let layout_uv = Layout::from_size_align((dimensions.0 as usize + 1) / 2 * (dimensions.1 as usize + 1) / 2, 16).unwrap();
-            let target_y = std::alloc::alloc(layout_y);
-            let target_u = std::alloc::alloc(layout_uv);
-            let target_v = std::alloc::alloc(layout_uv);
-            let source_rgb = std::alloc::alloc(layout_rgb);
-            for (x, src) in src_bytes.iter().enumerate() {
-                *source_rgb.add(x) = *src;
-            }
-            b.iter(|| {
-                rs_RGB24ToI420(
-                    source_rgb,
-                    stride as i32,
-                    target_y,
-                    dimensions.0 as i32,
-                    target_u,
-                    (dimensions.0 as i32 + 1) / 2,
-                    target_v,
-                    (dimensions.0 as i32 + 1) / 2,
-                    dimensions.0 as i32,
-                    dimensions.1 as i32,
-                );
-            });
-            std::alloc::dealloc(target_y, layout_y);
-            std::alloc::dealloc(target_u, layout_uv);
-            std::alloc::dealloc(target_v, layout_uv);
-            std::alloc::dealloc(source_rgb, layout_rgb);
+    c.bench_function("libyuv RGB -> YUV 4:2:0", |b| unsafe {
+        let layout_rgb =
+            Layout::from_size_align(dimensions.0 as usize * dimensions.1 as usize * 3, 16).unwrap();
+        let layout_y =
+            Layout::from_size_align(dimensions.0 as usize * dimensions.1 as usize, 16).unwrap();
+        let layout_uv = Layout::from_size_align(
+            (dimensions.0 as usize + 1) / 2 * (dimensions.1 as usize + 1) / 2,
+            16,
+        )
+        .unwrap();
+        let target_y = std::alloc::alloc(layout_y);
+        let target_u = std::alloc::alloc(layout_uv);
+        let target_v = std::alloc::alloc(layout_uv);
+        let source_rgb = std::alloc::alloc(layout_rgb);
+        for (x, src) in src_bytes.iter().enumerate() {
+            *source_rgb.add(x) = *src;
         }
+        b.iter(|| {
+            rs_RGB24ToI420(
+                source_rgb,
+                stride as i32,
+                target_y,
+                dimensions.0 as i32,
+                target_u,
+                (dimensions.0 as i32 + 1) / 2,
+                target_v,
+                (dimensions.0 as i32 + 1) / 2,
+                dimensions.0 as i32,
+                dimensions.1 as i32,
+            );
+        });
+        std::alloc::dealloc(target_y, layout_y);
+        std::alloc::dealloc(target_u, layout_uv);
+        std::alloc::dealloc(target_v, layout_uv);
+        std::alloc::dealloc(source_rgb, layout_rgb);
     });
 
     c.bench_function("yuvutils RGBA -> YUV 4:2:0", |b| {
@@ -149,37 +153,41 @@ pub fn criterion_benchmark(c: &mut Criterion) {
         })
     });
 
-    c.bench_function("libyuv RGBA -> YUV 4:2:0", |b| {
-       unsafe {
-           let layout_rgba = Layout::from_size_align(dimensions.0 as usize * dimensions.1 as usize * 4, 16).unwrap();
-           let layout_y = Layout::from_size_align(dimensions.0 as usize * dimensions.1 as usize, 16).unwrap();
-           let layout_uv = Layout::from_size_align((dimensions.0 as usize + 1) / 2 * (dimensions.1 as usize + 1) / 2, 16).unwrap();
-           let target_y = std::alloc::alloc(layout_y);
-           let target_u = std::alloc::alloc(layout_uv);
-           let target_v = std::alloc::alloc(layout_uv);
-           let source_rgb = std::alloc::alloc(layout_rgba);
-           for (x, src) in src_bytes.iter().enumerate() {
-               *source_rgb.add(x) = *src;
-           }
-           b.iter(|| {
-               rs_ABGRToI420(
-                   source_rgb,
-                   dimensions.0 as i32 * 4i32,
-                   target_y,
-                   dimensions.0 as i32,
-                   target_u,
-                   (dimensions.0 as i32 + 1) / 2,
-                   target_v,
-                   (dimensions.0 as i32 + 1) / 2,
-                   dimensions.0 as i32,
-                   dimensions.1 as i32,
-               );
-           });
-           std::alloc::dealloc(target_y, layout_y);
-           std::alloc::dealloc(target_u, layout_uv);
-           std::alloc::dealloc(target_v, layout_uv);
-           std::alloc::dealloc(source_rgb, layout_rgba);
-       }
+    c.bench_function("libyuv RGBA -> YUV 4:2:0", |b| unsafe {
+        let layout_rgba =
+            Layout::from_size_align(dimensions.0 as usize * dimensions.1 as usize * 4, 16).unwrap();
+        let layout_y =
+            Layout::from_size_align(dimensions.0 as usize * dimensions.1 as usize, 16).unwrap();
+        let layout_uv = Layout::from_size_align(
+            (dimensions.0 as usize + 1) / 2 * (dimensions.1 as usize + 1) / 2,
+            16,
+        )
+        .unwrap();
+        let target_y = std::alloc::alloc(layout_y);
+        let target_u = std::alloc::alloc(layout_uv);
+        let target_v = std::alloc::alloc(layout_uv);
+        let source_rgb = std::alloc::alloc(layout_rgba);
+        for (x, src) in src_bytes.iter().enumerate() {
+            *source_rgb.add(x) = *src;
+        }
+        b.iter(|| {
+            rs_ABGRToI420(
+                source_rgb,
+                dimensions.0 as i32 * 4i32,
+                target_y,
+                dimensions.0 as i32,
+                target_u,
+                (dimensions.0 as i32 + 1) / 2,
+                target_v,
+                (dimensions.0 as i32 + 1) / 2,
+                dimensions.0 as i32,
+                dimensions.1 as i32,
+            );
+        });
+        std::alloc::dealloc(target_y, layout_y);
+        std::alloc::dealloc(target_u, layout_uv);
+        std::alloc::dealloc(target_v, layout_uv);
+        std::alloc::dealloc(source_rgb, layout_rgba);
     });
 
     c.bench_function("yuvutils RGBA -> YUV 4:2:2", |b| {
diff --git a/src/avx2/mod.rs b/src/avx2/mod.rs
index 7f67999..e3811da 100644
--- a/src/avx2/mod.rs
+++ b/src/avx2/mod.rs
@@ -33,6 +33,7 @@ mod rgb_to_nv;
 mod rgb_to_y;
 mod rgb_to_ycgco;
 mod rgba_to_yuv;
+mod rgba_to_yuv420;
 mod ycgco_to_rgb;
 mod ycgco_to_rgba_alpha;
 mod yuv_nv_to_rgba;
@@ -45,12 +46,12 @@ mod yuv_to_rgba_alpha;
 mod yuv_to_yuv2;
 mod yuy2_to_rgb;
 mod yuy2_to_yuv;
-mod rgba_to_yuv420;
 
 pub(crate) use rgb_to_nv::avx2_rgba_to_nv;
 pub(crate) use rgb_to_y::avx2_rgb_to_y_row;
 pub(crate) use rgb_to_ycgco::avx2_rgb_to_ycgco_row;
 pub(crate) use rgba_to_yuv::avx2_rgba_to_yuv;
+pub(crate) use rgba_to_yuv420::avx2_rgba_to_yuv420;
 pub(crate) use ycgco_to_rgb::avx2_ycgco_to_rgb_row;
 pub(crate) use ycgco_to_rgba_alpha::avx2_ycgco_to_rgba_alpha;
 pub(crate) use yuv_nv_to_rgba::avx2_yuv_nv_to_rgba_row;
@@ -63,4 +64,3 @@ pub(crate) use yuv_to_rgba_alpha::avx2_yuv_to_rgba_alpha;
 pub(crate) use yuv_to_yuv2::yuv_to_yuy2_avx2_row;
 pub(crate) use yuy2_to_rgb::yuy2_to_rgb_avx;
 pub(crate) use yuy2_to_yuv::yuy2_to_yuv_avx;
-pub(crate) use rgba_to_yuv420::avx2_rgba_to_yuv420;
\ No newline at end of file
diff --git a/src/avx2/rdp_rgba_to_yuv.rs b/src/avx2/rdp_rgba_to_yuv.rs
deleted file mode 100644
index 35621c2..0000000
--- a/src/avx2/rdp_rgba_to_yuv.rs
+++ /dev/null
@@ -1,279 +0,0 @@
-/*
- * Copyright (c) Radzivon Bartoshyk, 10/2024. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without modification,
- * are permitted provided that the following conditions are met:
- *
- * 1.  Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2.  Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3.  Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-use crate::avx2::avx2_utils::{_mm256_deinterleave_rgba_epi8, avx2_deinterleave_rgb};
-use crate::internals::ProcessedOffset;
-use crate::yuv_support::{CbCrForwardTransform, YuvSourceChannels};
-#[cfg(target_arch = "x86")]
-use std::arch::x86::*;
-#[cfg(target_arch = "x86_64")]
-use std::arch::x86_64::*;
-
-pub fn rdp_avx2_rgba_to_yuv<const ORIGIN_CHANNELS: u8>(
-    transform: &CbCrForwardTransform<i32>,
-    y_plane: &mut [u16],
-    u_plane: &mut [u16],
-    v_plane: &mut [u16],
-    rgba: &[u8],
-    start_cx: usize,
-    width: usize,
-) -> ProcessedOffset {
-    unsafe {
-        rdp_avx2_rgba_to_yuv_impl::<ORIGIN_CHANNELS>(
-            transform, y_plane, u_plane, v_plane, rgba, start_cx, width,
-        )
-    }
-}
-
-#[target_feature(enable = "avx2")]
-unsafe fn rdp_avx2_rgba_to_yuv_impl<const ORIGIN_CHANNELS: u8>(
-    transform: &CbCrForwardTransform<i32>,
-    y_plane: &mut [u16],
-    u_plane: &mut [u16],
-    v_plane: &mut [u16],
-    rgba: &[u8],
-    start_cx: usize,
-    width: usize,
-) -> ProcessedOffset {
-    let source_channels: YuvSourceChannels = ORIGIN_CHANNELS.into();
-    let channels = source_channels.get_channels_count();
-
-    let y_ptr = y_plane;
-    let u_ptr = u_plane;
-    let v_ptr = v_plane;
-    let rgba_ptr = rgba.as_ptr();
-
-    let mut cx = start_cx;
-
-    const V_SCALE: i32 = 7;
-
-    let i_bias_y = _mm256_set1_epi16(-4095);
-    let i_cap_y = _mm256_set1_epi16(4096);
-
-    let y_bias = _mm256_set1_epi16(-4096);
-    let uv_bias = _mm256_set1_epi16(0);
-    let v_yr = _mm256_set1_epi16(transform.yr as i16);
-    let v_yg = _mm256_set1_epi16(transform.yg as i16);
-    let v_yb = _mm256_set1_epi16(transform.yb as i16);
-    let v_cb_r = _mm256_set1_epi16(transform.cb_r as i16);
-    let v_cb_g = _mm256_set1_epi16(transform.cb_g as i16);
-    let v_cb_b = _mm256_set1_epi16(transform.cb_b as i16);
-    let v_cr_r = _mm256_set1_epi16(transform.cr_r as i16);
-    let v_cr_g = _mm256_set1_epi16(transform.cr_g as i16);
-    let v_cr_b = _mm256_set1_epi16(transform.cr_b as i16);
-
-    while cx + 32 < width {
-        let (r_values, g_values, b_values);
-
-        let px = cx * channels;
-
-        match source_channels {
-            YuvSourceChannels::Rgb | YuvSourceChannels::Bgr => {
-                let source_ptr = rgba_ptr.add(px);
-                let row_1 = _mm256_loadu_si256(source_ptr as *const __m256i);
-                let row_2 = _mm256_loadu_si256(source_ptr.add(32) as *const __m256i);
-                let row_3 = _mm256_loadu_si256(source_ptr.add(64) as *const __m256i);
-
-                let (it1, it2, it3) = avx2_deinterleave_rgb(row_1, row_2, row_3);
-                if source_channels == YuvSourceChannels::Rgb {
-                    r_values = it1;
-                    g_values = it2;
-                    b_values = it3;
-                } else {
-                    r_values = it3;
-                    g_values = it2;
-                    b_values = it1;
-                }
-            }
-            YuvSourceChannels::Rgba | YuvSourceChannels::Bgra => {
-                let source_ptr = rgba_ptr.add(px);
-                let row_1 = _mm256_loadu_si256(source_ptr as *const __m256i);
-                let row_2 = _mm256_loadu_si256(source_ptr.add(32) as *const __m256i);
-                let row_3 = _mm256_loadu_si256(source_ptr.add(64) as *const __m256i);
-                let row_4 = _mm256_loadu_si256(source_ptr.add(96) as *const __m256i);
-
-                let (it1, it2, it3, _) = _mm256_deinterleave_rgba_epi8(row_1, row_2, row_3, row_4);
-                if source_channels == YuvSourceChannels::Rgba {
-                    r_values = it1;
-                    g_values = it2;
-                    b_values = it3;
-                } else {
-                    r_values = it3;
-                    g_values = it2;
-                    b_values = it1;
-                }
-            }
-        }
-
-        let r_low =
-            _mm256_slli_epi16::<V_SCALE>(_mm256_cvtepu8_epi16(_mm256_castsi256_si128(r_values)));
-        let r_high = _mm256_slli_epi16::<V_SCALE>(_mm256_cvtepu8_epi16(
-            _mm256_extracti128_si256::<1>(r_values),
-        ));
-        let g_low =
-            _mm256_slli_epi16::<V_SCALE>(_mm256_cvtepu8_epi16(_mm256_castsi256_si128(g_values)));
-        let g_high = _mm256_slli_epi16::<V_SCALE>(_mm256_cvtepu8_epi16(
-            _mm256_extracti128_si256::<1>(g_values),
-        ));
-        let b_low =
-            _mm256_slli_epi16::<V_SCALE>(_mm256_cvtepu8_epi16(_mm256_castsi256_si128(b_values)));
-        let b_high = _mm256_slli_epi16::<V_SCALE>(_mm256_cvtepu8_epi16(
-            _mm256_extracti128_si256::<1>(b_values),
-        ));
-
-        let y_l = _mm256_max_epi16(
-            _mm256_min_epi16(
-                _mm256_add_epi16(
-                    y_bias,
-                    _mm256_add_epi16(
-                        _mm256_add_epi16(
-                            _mm256_mulhi_epi16(r_low, v_yr),
-                            _mm256_mulhi_epi16(g_low, v_yg),
-                        ),
-                        _mm256_mulhi_epi16(b_low, v_yb),
-                    ),
-                ),
-                i_cap_y,
-            ),
-            i_bias_y,
-        );
-
-        let y_h = _mm256_max_epi16(
-            _mm256_min_epi16(
-                _mm256_add_epi16(
-                    y_bias,
-                    _mm256_add_epi16(
-                        _mm256_add_epi16(
-                            _mm256_mulhi_epi16(r_high, v_yr),
-                            _mm256_mulhi_epi16(g_high, v_yg),
-                        ),
-                        _mm256_mulhi_epi16(b_high, v_yb),
-                    ),
-                ),
-                i_cap_y,
-            ),
-            i_bias_y,
-        );
-
-        _mm256_storeu_si256(
-            y_ptr.get_unchecked_mut(cx..).as_mut_ptr() as *mut __m256i,
-            y_l,
-        );
-        _mm256_storeu_si256(
-            y_ptr.get_unchecked_mut((cx + 16)..).as_mut_ptr() as *mut __m256i,
-            y_h,
-        );
-
-        let cb_l = _mm256_max_epi16(
-            _mm256_min_epi16(
-                _mm256_add_epi16(
-                    uv_bias,
-                    _mm256_add_epi16(
-                        _mm256_add_epi16(
-                            _mm256_mulhi_epi16(r_low, v_cb_r),
-                            _mm256_mulhi_epi16(g_low, v_cb_g),
-                        ),
-                        _mm256_mulhi_epi16(b_low, v_cb_b),
-                    ),
-                ),
-                i_cap_y,
-            ),
-            i_bias_y,
-        );
-        let cr_l = _mm256_max_epi16(
-            _mm256_min_epi16(
-                _mm256_add_epi16(
-                    uv_bias,
-                    _mm256_add_epi16(
-                        _mm256_add_epi16(
-                            _mm256_mulhi_epi16(r_low, v_cr_r),
-                            _mm256_mulhi_epi16(g_low, v_cr_g),
-                        ),
-                        _mm256_mulhi_epi16(b_low, v_cr_b),
-                    ),
-                ),
-                i_cap_y,
-            ),
-            i_bias_y,
-        );
-        let cb_h = _mm256_max_epi16(
-            _mm256_min_epi16(
-                _mm256_add_epi16(
-                    uv_bias,
-                    _mm256_add_epi16(
-                        _mm256_add_epi16(
-                            _mm256_mulhi_epi16(r_high, v_cb_r),
-                            _mm256_mulhi_epi16(g_high, v_cb_g),
-                        ),
-                        _mm256_mulhi_epi16(b_high, v_cb_b),
-                    ),
-                ),
-                i_cap_y,
-            ),
-            i_bias_y,
-        );
-        let cr_h = _mm256_max_epi16(
-            _mm256_min_epi16(
-                _mm256_add_epi16(
-                    uv_bias,
-                    _mm256_add_epi16(
-                        _mm256_add_epi16(
-                            _mm256_mulhi_epi16(r_high, v_cr_r),
-                            _mm256_mulhi_epi16(g_high, v_cr_g),
-                        ),
-                        _mm256_mulhi_epi16(b_high, v_cr_b),
-                    ),
-                ),
-                i_cap_y,
-            ),
-            i_bias_y,
-        );
-
-        _mm256_storeu_si256(
-            u_ptr.get_unchecked_mut(cx..).as_mut_ptr() as *mut __m256i,
-            cb_l,
-        );
-        _mm256_storeu_si256(
-            u_ptr.get_unchecked_mut((cx + 16)..).as_mut_ptr() as *mut __m256i,
-            cb_h,
-        );
-        _mm256_storeu_si256(
-            v_ptr.get_unchecked_mut(cx..).as_mut_ptr() as *mut __m256i,
-            cr_l,
-        );
-        _mm256_storeu_si256(
-            v_ptr.get_unchecked_mut((cx + 16)..).as_mut_ptr() as *mut __m256i,
-            cr_h,
-        );
-
-        cx += 32;
-    }
-
-    ProcessedOffset { cx, ux: cx }
-}
diff --git a/src/avx2/rgb_to_nv.rs b/src/avx2/rgb_to_nv.rs
index efb9add..bbba3d9 100644
--- a/src/avx2/rgb_to_nv.rs
+++ b/src/avx2/rgb_to_nv.rs
@@ -93,12 +93,10 @@ unsafe fn avx2_rgba_to_nv_impl<
     let mut cx = start_cx;
     let mut uv_x = start_ux;
 
-    const V_SHR: i32 = 3;
-    const V_SCALE: i32 = 6;
+    const V_SCALE: i32 = 3;
 
-    let rounding_const_bias: i16 = 1 << (V_SHR - 1);
-    let bias_y = range.bias_y as i16 * (1 << V_SHR) + rounding_const_bias;
-    let bias_uv = range.bias_uv as i16 * (1 << V_SHR) + rounding_const_bias;
+    let bias_y = range.bias_y as i16;
+    let bias_uv = range.bias_uv as i16;
 
     let i_bias_y = _mm256_set1_epi16(range.bias_y as i16);
     let i_cap_y = _mm256_set1_epi16(range.range_y as i16 + range.bias_y as i16);
@@ -177,7 +175,7 @@ unsafe fn avx2_rgba_to_nv_impl<
 
         let y_l = _mm256_max_epi16(
             _mm256_min_epi16(
-                _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
+                _mm256_add_epi16(
                     y_bias,
                     _mm256_add_epi16(
                         _mm256_add_epi16(
@@ -186,7 +184,7 @@ unsafe fn avx2_rgba_to_nv_impl<
                         ),
                         _mm256_mulhrs_epi16(b_low, v_yb),
                     ),
-                )),
+                ),
                 i_cap_y,
             ),
             i_bias_y,
@@ -194,7 +192,7 @@ unsafe fn avx2_rgba_to_nv_impl<
 
         let y_h = _mm256_max_epi16(
             _mm256_min_epi16(
-                _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
+                _mm256_add_epi16(
                     y_bias,
                     _mm256_add_epi16(
                         _mm256_add_epi16(
@@ -203,7 +201,7 @@ unsafe fn avx2_rgba_to_nv_impl<
                         ),
                         _mm256_mulhrs_epi16(b_high, v_yb),
                     ),
-                )),
+                ),
                 i_cap_y,
             ),
             i_bias_y,
@@ -215,7 +213,7 @@ unsafe fn avx2_rgba_to_nv_impl<
         if chroma_subsampling == YuvChromaSubsampling::Yuv444 {
             let cb_l = _mm256_max_epi16(
                 _mm256_min_epi16(
-                    _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
+                    (_mm256_add_epi16(
                         uv_bias,
                         _mm256_add_epi16(
                             _mm256_add_epi16(
@@ -231,7 +229,7 @@ unsafe fn avx2_rgba_to_nv_impl<
             );
             let cr_l = _mm256_max_epi16(
                 _mm256_min_epi16(
-                    _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
+                    (_mm256_add_epi16(
                         uv_bias,
                         _mm256_add_epi16(
                             _mm256_add_epi16(
@@ -247,7 +245,7 @@ unsafe fn avx2_rgba_to_nv_impl<
             );
             let cb_h = _mm256_max_epi16(
                 _mm256_min_epi16(
-                    _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
+                    (_mm256_add_epi16(
                         uv_bias,
                         _mm256_add_epi16(
                             _mm256_add_epi16(
@@ -263,7 +261,7 @@ unsafe fn avx2_rgba_to_nv_impl<
             );
             let cr_h = _mm256_max_epi16(
                 _mm256_min_epi16(
-                    _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
+                    (_mm256_add_epi16(
                         uv_bias,
                         _mm256_add_epi16(
                             _mm256_add_epi16(
@@ -297,7 +295,7 @@ unsafe fn avx2_rgba_to_nv_impl<
             let b1 = _mm256_avg_epu16(b_low, b_high);
             let cb = _mm256_max_epi16(
                 _mm256_min_epi16(
-                    _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
+                    (_mm256_add_epi16(
                         uv_bias,
                         _mm256_add_epi16(
                             _mm256_add_epi16(
@@ -313,7 +311,7 @@ unsafe fn avx2_rgba_to_nv_impl<
             );
             let cr = _mm256_max_epi16(
                 _mm256_min_epi16(
-                    _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
+                    (_mm256_add_epi16(
                         uv_bias,
                         _mm256_add_epi16(
                             _mm256_add_epi16(
diff --git a/src/avx2/rgb_to_y.rs b/src/avx2/rgb_to_y.rs
index 29cd446..2dcf7d8 100644
--- a/src/avx2/rgb_to_y.rs
+++ b/src/avx2/rgb_to_y.rs
@@ -66,10 +66,8 @@ pub(crate) unsafe fn avx2_rgb_to_y_row_impl<const ORIGIN_CHANNELS: u8>(
 
     let mut cx = start_cx;
 
-    const V_SHR: i32 = 3;
-    const V_SCALE: i32 = 6;
-    let rounding_const_bias: i16 = 1 << (V_SHR - 1);
-    let bias_y = range.bias_y as i16 * (1 << V_SHR) + rounding_const_bias;
+    const V_SCALE: i32 = 3;
+    let bias_y = range.bias_y as i16;
 
     let i_bias_y = _mm256_set1_epi16(range.bias_y as i16);
     let i_cap_y = _mm256_set1_epi16(range.range_y as i16 + range.bias_y as i16);
@@ -140,7 +138,7 @@ pub(crate) unsafe fn avx2_rgb_to_y_row_impl<const ORIGIN_CHANNELS: u8>(
 
         let y_l = _mm256_max_epi16(
             _mm256_min_epi16(
-                _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
+                _mm256_add_epi16(
                     y_bias,
                     _mm256_add_epi16(
                         _mm256_add_epi16(
@@ -149,7 +147,7 @@ pub(crate) unsafe fn avx2_rgb_to_y_row_impl<const ORIGIN_CHANNELS: u8>(
                         ),
                         _mm256_mulhrs_epi16(b_low, v_yb),
                     ),
-                )),
+                ),
                 i_cap_y,
             ),
             i_bias_y,
@@ -157,7 +155,7 @@ pub(crate) unsafe fn avx2_rgb_to_y_row_impl<const ORIGIN_CHANNELS: u8>(
 
         let y_h = _mm256_max_epi16(
             _mm256_min_epi16(
-                _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
+                _mm256_add_epi16(
                     y_bias,
                     _mm256_add_epi16(
                         _mm256_add_epi16(
@@ -166,7 +164,7 @@ pub(crate) unsafe fn avx2_rgb_to_y_row_impl<const ORIGIN_CHANNELS: u8>(
                         ),
                         _mm256_mulhrs_epi16(b_high, v_yb),
                     ),
-                )),
+                ),
                 i_cap_y,
             ),
             i_bias_y,
diff --git a/src/avx2/rgba_to_yuv.rs b/src/avx2/rgba_to_yuv.rs
index 25c46f4..658f5d2 100644
--- a/src/avx2/rgba_to_yuv.rs
+++ b/src/avx2/rgba_to_yuv.rs
@@ -81,11 +81,9 @@ unsafe fn avx2_rgba_to_yuv_impl<const ORIGIN_CHANNELS: u8, const SAMPLING: u8>(
     let mut cx = start_cx;
     let mut uv_x = start_ux;
 
-    const V_SHR: i32 = 3;
     const V_SCALE: i32 = 6;
-    let rounding_const_bias: i16 = 1 << (V_SHR - 1);
-    let bias_y = range.bias_y as i16 * (1 << V_SHR) + rounding_const_bias;
-    let bias_uv = range.bias_uv as i16 * (1 << V_SHR) + rounding_const_bias;
+    let bias_y = range.bias_y as i16;
+    let bias_uv = range.bias_uv as i16;
 
     let i_bias_y = _mm256_set1_epi16(range.bias_y as i16);
     let i_cap_y = _mm256_set1_epi16(range.range_y as i16 + range.bias_y as i16);
@@ -164,7 +162,7 @@ unsafe fn avx2_rgba_to_yuv_impl<const ORIGIN_CHANNELS: u8, const SAMPLING: u8>(
 
         let y_l = _mm256_max_epi16(
             _mm256_min_epi16(
-                _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
+                _mm256_add_epi16(
                     y_bias,
                     _mm256_add_epi16(
                         _mm256_add_epi16(
@@ -173,7 +171,7 @@ unsafe fn avx2_rgba_to_yuv_impl<const ORIGIN_CHANNELS: u8, const SAMPLING: u8>(
                         ),
                         _mm256_mulhrs_epi16(b_low, v_yb),
                     ),
-                )),
+                ),
                 i_cap_y,
             ),
             i_bias_y,
@@ -181,7 +179,7 @@ unsafe fn avx2_rgba_to_yuv_impl<const ORIGIN_CHANNELS: u8, const SAMPLING: u8>(
 
         let y_h = _mm256_max_epi16(
             _mm256_min_epi16(
-                _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
+                _mm256_add_epi16(
                     y_bias,
                     _mm256_add_epi16(
                         _mm256_add_epi16(
@@ -190,7 +188,7 @@ unsafe fn avx2_rgba_to_yuv_impl<const ORIGIN_CHANNELS: u8, const SAMPLING: u8>(
                         ),
                         _mm256_mulhrs_epi16(b_high, v_yb),
                     ),
-                )),
+                ),
                 i_cap_y,
             ),
             i_bias_y,
@@ -202,7 +200,7 @@ unsafe fn avx2_rgba_to_yuv_impl<const ORIGIN_CHANNELS: u8, const SAMPLING: u8>(
         if chroma_subsampling == YuvChromaSubsampling::Yuv444 {
             let cb_l = _mm256_max_epi16(
                 _mm256_min_epi16(
-                    _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
+                    _mm256_add_epi16(
                         uv_bias,
                         _mm256_add_epi16(
                             _mm256_add_epi16(
@@ -211,14 +209,14 @@ unsafe fn avx2_rgba_to_yuv_impl<const ORIGIN_CHANNELS: u8, const SAMPLING: u8>(
                             ),
                             _mm256_mulhrs_epi16(b_low, v_cb_b),
                         ),
-                    )),
+                    ),
                     i_cap_uv,
                 ),
                 i_bias_y,
             );
             let cr_l = _mm256_max_epi16(
                 _mm256_min_epi16(
-                    _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
+                    _mm256_add_epi16(
                         uv_bias,
                         _mm256_add_epi16(
                             _mm256_add_epi16(
@@ -227,14 +225,14 @@ unsafe fn avx2_rgba_to_yuv_impl<const ORIGIN_CHANNELS: u8, const SAMPLING: u8>(
                             ),
                             _mm256_mulhrs_epi16(b_low, v_cr_b),
                         ),
-                    )),
+                    ),
                     i_cap_uv,
                 ),
                 i_bias_y,
             );
             let cb_h = _mm256_max_epi16(
                 _mm256_min_epi16(
-                    _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
+                    _mm256_add_epi16(
                         uv_bias,
                         _mm256_add_epi16(
                             _mm256_add_epi16(
@@ -243,14 +241,14 @@ unsafe fn avx2_rgba_to_yuv_impl<const ORIGIN_CHANNELS: u8, const SAMPLING: u8>(
                             ),
                             _mm256_mulhrs_epi16(b_high, v_cb_b),
                         ),
-                    )),
+                    ),
                     i_cap_uv,
                 ),
                 i_bias_y,
             );
             let cr_h = _mm256_max_epi16(
                 _mm256_min_epi16(
-                    _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
+                    _mm256_add_epi16(
                         uv_bias,
                         _mm256_add_epi16(
                             _mm256_add_epi16(
@@ -259,7 +257,7 @@ unsafe fn avx2_rgba_to_yuv_impl<const ORIGIN_CHANNELS: u8, const SAMPLING: u8>(
                             ),
                             _mm256_mulhrs_epi16(b_high, v_cr_b),
                         ),
-                    )),
+                    ),
                     i_cap_uv,
                 ),
                 i_bias_y,
@@ -279,7 +277,7 @@ unsafe fn avx2_rgba_to_yuv_impl<const ORIGIN_CHANNELS: u8, const SAMPLING: u8>(
             let b1 = _mm256_avg_epu16(b_low, b_high);
             let cb = _mm256_max_epi16(
                 _mm256_min_epi16(
-                    _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
+                    _mm256_add_epi16(
                         uv_bias,
                         _mm256_add_epi16(
                             _mm256_add_epi16(
@@ -288,14 +286,14 @@ unsafe fn avx2_rgba_to_yuv_impl<const ORIGIN_CHANNELS: u8, const SAMPLING: u8>(
                             ),
                             _mm256_mulhrs_epi16(b1, v_cb_b),
                         ),
-                    )),
+                    ),
                     i_cap_uv,
                 ),
                 i_bias_y,
             );
             let cr = _mm256_max_epi16(
                 _mm256_min_epi16(
-                    _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
+                    _mm256_add_epi16(
                         uv_bias,
                         _mm256_add_epi16(
                             _mm256_add_epi16(
@@ -304,7 +302,7 @@ unsafe fn avx2_rgba_to_yuv_impl<const ORIGIN_CHANNELS: u8, const SAMPLING: u8>(
                             ),
                             _mm256_mulhrs_epi16(b1, v_cr_b),
                         ),
-                    )),
+                    ),
                     i_cap_uv,
                 ),
                 i_bias_y,
diff --git a/src/avx2/rgba_to_yuv420.rs b/src/avx2/rgba_to_yuv420.rs
index f03618d..deb281c 100644
--- a/src/avx2/rgba_to_yuv420.rs
+++ b/src/avx2/rgba_to_yuv420.rs
@@ -31,9 +31,7 @@ use crate::avx2::avx2_utils::{
     _mm256_deinterleave_rgba_epi8, avx2_deinterleave_rgb, avx2_pack_u16,
 };
 use crate::internals::ProcessedOffset;
-use crate::yuv_support::{
-    CbCrForwardTransform, YuvChromaRange, YuvSourceChannels,
-};
+use crate::yuv_support::{CbCrForwardTransform, YuvChromaRange, YuvSourceChannels};
 #[cfg(target_arch = "x86")]
 use std::arch::x86::*;
 #[cfg(target_arch = "x86_64")]
@@ -83,11 +81,9 @@ unsafe fn avx2_rgba_to_yuv_impl420<const ORIGIN_CHANNELS: u8>(
     let mut cx = start_cx;
     let mut uv_x = start_ux;
 
-    const V_SHR: i32 = 3;
-    const V_SCALE: i32 = 6;
-    let rounding_const_bias: i16 = 1 << (V_SHR - 1);
-    let bias_y = range.bias_y as i16 * (1 << V_SHR) + rounding_const_bias;
-    let bias_uv = range.bias_uv as i16 * (1 << V_SHR) + rounding_const_bias;
+    const V_SCALE: i32 = 3;
+    let bias_y = range.bias_y as i16;
+    let bias_uv = range.bias_uv as i16;
 
     let i_bias_y = _mm256_set1_epi16(range.bias_y as i16);
     let i_cap_y = _mm256_set1_epi16(range.range_y as i16 + range.bias_y as i16);
@@ -169,7 +165,8 @@ unsafe fn avx2_rgba_to_yuv_impl420<const ORIGIN_CHANNELS: u8>(
                 let row_31 = _mm256_loadu_si256(source_ptr1.add(64) as *const __m256i);
                 let row_41 = _mm256_loadu_si256(source_ptr1.add(96) as *const __m256i);
 
-                let (it1, it2, it3, _) = _mm256_deinterleave_rgba_epi8(row_11, row_21, row_31, row_41);
+                let (it1, it2, it3, _) =
+                    _mm256_deinterleave_rgba_epi8(row_11, row_21, row_31, row_41);
                 if source_channels == YuvSourceChannels::Rgba {
                     r_values1 = it1;
                     g_values1 = it2;
@@ -200,7 +197,7 @@ unsafe fn avx2_rgba_to_yuv_impl420<const ORIGIN_CHANNELS: u8>(
 
         let y0_l = _mm256_max_epi16(
             _mm256_min_epi16(
-                _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
+                _mm256_add_epi16(
                     y_bias,
                     _mm256_add_epi16(
                         _mm256_add_epi16(
@@ -209,7 +206,7 @@ unsafe fn avx2_rgba_to_yuv_impl420<const ORIGIN_CHANNELS: u8>(
                         ),
                         _mm256_mulhrs_epi16(b0_low, v_yb),
                     ),
-                )),
+                ),
                 i_cap_y,
             ),
             i_bias_y,
@@ -217,7 +214,7 @@ unsafe fn avx2_rgba_to_yuv_impl420<const ORIGIN_CHANNELS: u8>(
 
         let y0_h = _mm256_max_epi16(
             _mm256_min_epi16(
-                _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
+                _mm256_add_epi16(
                     y_bias,
                     _mm256_add_epi16(
                         _mm256_add_epi16(
@@ -226,7 +223,7 @@ unsafe fn avx2_rgba_to_yuv_impl420<const ORIGIN_CHANNELS: u8>(
                         ),
                         _mm256_mulhrs_epi16(b0_high, v_yb),
                     ),
-                )),
+                ),
                 i_cap_y,
             ),
             i_bias_y,
@@ -250,7 +247,7 @@ unsafe fn avx2_rgba_to_yuv_impl420<const ORIGIN_CHANNELS: u8>(
 
         let y1_l = _mm256_max_epi16(
             _mm256_min_epi16(
-                _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
+                _mm256_add_epi16(
                     y_bias,
                     _mm256_add_epi16(
                         _mm256_add_epi16(
@@ -259,7 +256,7 @@ unsafe fn avx2_rgba_to_yuv_impl420<const ORIGIN_CHANNELS: u8>(
                         ),
                         _mm256_mulhrs_epi16(b1_low, v_yb),
                     ),
-                )),
+                ),
                 i_cap_y,
             ),
             i_bias_y,
@@ -267,7 +264,7 @@ unsafe fn avx2_rgba_to_yuv_impl420<const ORIGIN_CHANNELS: u8>(
 
         let y1_h = _mm256_max_epi16(
             _mm256_min_epi16(
-                _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
+                _mm256_add_epi16(
                     y_bias,
                     _mm256_add_epi16(
                         _mm256_add_epi16(
@@ -276,7 +273,7 @@ unsafe fn avx2_rgba_to_yuv_impl420<const ORIGIN_CHANNELS: u8>(
                         ),
                         _mm256_mulhrs_epi16(b1_high, v_yb),
                     ),
-                )),
+                ),
                 i_cap_y,
             ),
             i_bias_y,
@@ -299,7 +296,7 @@ unsafe fn avx2_rgba_to_yuv_impl420<const ORIGIN_CHANNELS: u8>(
         let b_uv = _mm256_avg_epu16(b0_low, b0_high);
         let cb = _mm256_max_epi16(
             _mm256_min_epi16(
-                _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
+                _mm256_add_epi16(
                     uv_bias,
                     _mm256_add_epi16(
                         _mm256_add_epi16(
@@ -308,14 +305,14 @@ unsafe fn avx2_rgba_to_yuv_impl420<const ORIGIN_CHANNELS: u8>(
                         ),
                         _mm256_mulhrs_epi16(b_uv, v_cb_b),
                     ),
-                )),
+                ),
                 i_cap_uv,
             ),
             i_bias_y,
         );
         let cr = _mm256_max_epi16(
             _mm256_min_epi16(
-                _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
+                _mm256_add_epi16(
                     uv_bias,
                     _mm256_add_epi16(
                         _mm256_add_epi16(
@@ -324,7 +321,7 @@ unsafe fn avx2_rgba_to_yuv_impl420<const ORIGIN_CHANNELS: u8>(
                         ),
                         _mm256_mulhrs_epi16(b_uv, v_cr_b),
                     ),
-                )),
+                ),
                 i_cap_uv,
             ),
             i_bias_y,
diff --git a/src/avx2/yuv_nv_to_rgba.rs b/src/avx2/yuv_nv_to_rgba.rs
index 0274e53..1b86fd6 100644
--- a/src/avx2/yuv_nv_to_rgba.rs
+++ b/src/avx2/yuv_nv_to_rgba.rs
@@ -84,8 +84,7 @@ unsafe fn avx2_yuv_nv_to_rgba_row_impl<
     let uv_ptr = uv_plane.as_ptr();
     let rgba_ptr = rgba.as_mut_ptr();
 
-    const SCALE: i32 = 6;
-    const V_SHR: i32 = 3;
+    const SCALE: i32 = 3;
 
     let y_corr = _mm256_set1_epi8(range.bias_y as i8);
     let uv_corr = _mm256_set1_epi16(range.bias_uv as i16);
@@ -95,7 +94,6 @@ unsafe fn avx2_yuv_nv_to_rgba_row_impl<
     let v_g_coeff_1 = _mm256_set1_epi16(transform.g_coeff_1 as i16);
     let v_g_coeff_2 = _mm256_set1_epi16(transform.g_coeff_2 as i16);
     let v_alpha = _mm256_set1_epi8(255u8 as i8);
-    let rounding_const = _mm256_set1_epi16(1 << (V_SHR - 1));
 
     while cx + 32 < width {
         let y_values =
@@ -161,24 +159,15 @@ unsafe fn avx2_yuv_nv_to_rgba_row_impl<
             v_luma_coeff,
         );
 
-        let r_high = _mm256_srli_epi16::<V_SHR>(_mm256_add_epi16(
-            _mm256_add_epi16(y_high, _mm256_mulhrs_epi16(v_high, v_cr_coeff)),
-            rounding_const,
-        ));
-        let b_high = _mm256_srli_epi16::<V_SHR>(_mm256_add_epi16(
-            _mm256_add_epi16(y_high, _mm256_mulhrs_epi16(u_high, v_cb_coeff)),
-            rounding_const,
-        ));
-        let g_high = _mm256_srli_epi16::<V_SHR>(_mm256_add_epi16(
-            _mm256_sub_epi16(
-                y_high,
-                _mm256_add_epi16(
-                    _mm256_mulhrs_epi16(v_high, v_g_coeff_1),
-                    _mm256_mulhrs_epi16(u_high, v_g_coeff_2),
-                ),
+        let r_high = _mm256_add_epi16(y_high, _mm256_mulhrs_epi16(v_high, v_cr_coeff));
+        let b_high = _mm256_add_epi16(y_high, _mm256_mulhrs_epi16(u_high, v_cb_coeff));
+        let g_high = _mm256_sub_epi16(
+            y_high,
+            _mm256_add_epi16(
+                _mm256_mulhrs_epi16(v_high, v_g_coeff_1),
+                _mm256_mulhrs_epi16(u_high, v_g_coeff_2),
             ),
-            rounding_const,
-        ));
+        );
 
         let u_low =
             _mm256_slli_epi16::<SCALE>(_mm256_sub_epi16(_mm256_cvtepu8_epi16(u_low_u8), uv_corr));
@@ -189,24 +178,15 @@ unsafe fn avx2_yuv_nv_to_rgba_row_impl<
             v_luma_coeff,
         );
 
-        let r_low = _mm256_srli_epi16::<V_SHR>(_mm256_add_epi16(
-            _mm256_add_epi16(y_low, _mm256_mulhrs_epi16(v_low, v_cr_coeff)),
-            rounding_const,
-        ));
-        let b_low = _mm256_srli_epi16::<V_SHR>(_mm256_add_epi16(
-            _mm256_add_epi16(y_low, _mm256_mulhrs_epi16(u_low, v_cb_coeff)),
-            rounding_const,
-        ));
-        let g_low = _mm256_srli_epi16::<V_SHR>(_mm256_add_epi16(
-            _mm256_sub_epi16(
-                y_low,
-                _mm256_add_epi16(
-                    _mm256_mulhrs_epi16(v_low, v_g_coeff_1),
-                    _mm256_mulhrs_epi16(u_low, v_g_coeff_2),
-                ),
+        let r_low = _mm256_add_epi16(y_low, _mm256_mulhrs_epi16(v_low, v_cr_coeff));
+        let b_low = _mm256_add_epi16(y_low, _mm256_mulhrs_epi16(u_low, v_cb_coeff));
+        let g_low = _mm256_sub_epi16(
+            y_low,
+            _mm256_add_epi16(
+                _mm256_mulhrs_epi16(v_low, v_g_coeff_1),
+                _mm256_mulhrs_epi16(u_low, v_g_coeff_2),
             ),
-            rounding_const,
-        ));
+        );
 
         let r_values = avx2_pack_u16(r_low, r_high);
         let g_values = avx2_pack_u16(g_low, g_high);
diff --git a/src/avx2/yuv_nv_to_rgba420.rs b/src/avx2/yuv_nv_to_rgba420.rs
index 82637f8..7cc721a 100644
--- a/src/avx2/yuv_nv_to_rgba420.rs
+++ b/src/avx2/yuv_nv_to_rgba420.rs
@@ -75,8 +75,7 @@ unsafe fn avx2_yuv_nv_to_rgba_row_impl420<const UV_ORDER: u8, const DESTINATION_
     let mut uv_x = start_ux;
     let uv_ptr = uv_plane.as_ptr();
 
-    const SCALE: i32 = 6;
-    const V_SHR: i32 = 3;
+    const SCALE: i32 = 3;
 
     let y_corr = _mm256_set1_epi8(range.bias_y as i8);
     let uv_corr = _mm256_set1_epi16(range.bias_uv as i16);
@@ -86,7 +85,6 @@ unsafe fn avx2_yuv_nv_to_rgba_row_impl420<const UV_ORDER: u8, const DESTINATION_
     let v_g_coeff_1 = _mm256_set1_epi16(transform.g_coeff_1 as i16);
     let v_g_coeff_2 = _mm256_set1_epi16(transform.g_coeff_2 as i16);
     let v_alpha = _mm256_set1_epi8(255u8 as i8);
-    let rounding_const = _mm256_set1_epi16(1 << (V_SHR - 1));
 
     while cx + 32 < width {
         let y_values0 = _mm256_subs_epu8(
@@ -142,30 +140,12 @@ unsafe fn avx2_yuv_nv_to_rgba_row_impl420<const UV_ORDER: u8, const DESTINATION_
             _mm256_mulhrs_epi16(u_high, v_g_coeff_2),
         );
 
-        let r_high0 = _mm256_srli_epi16::<V_SHR>(_mm256_add_epi16(
-            _mm256_add_epi16(y_high0, _mm256_mulhrs_epi16(v_high, v_cr_coeff)),
-            rounding_const,
-        ));
-        let b_high0 = _mm256_srli_epi16::<V_SHR>(_mm256_add_epi16(
-            _mm256_add_epi16(y_high0, _mm256_mulhrs_epi16(u_high, v_cb_coeff)),
-            rounding_const,
-        ));
-        let g_high0 = _mm256_srli_epi16::<V_SHR>(_mm256_add_epi16(
-            _mm256_sub_epi16(y_high0, g_coeff_hi),
-            rounding_const,
-        ));
-        let r_high1 = _mm256_srli_epi16::<V_SHR>(_mm256_add_epi16(
-            _mm256_add_epi16(y_high1, _mm256_mulhrs_epi16(v_high, v_cr_coeff)),
-            rounding_const,
-        ));
-        let b_high1 = _mm256_srli_epi16::<V_SHR>(_mm256_add_epi16(
-            _mm256_add_epi16(y_high1, _mm256_mulhrs_epi16(u_high, v_cb_coeff)),
-            rounding_const,
-        ));
-        let g_high1 = _mm256_srli_epi16::<V_SHR>(_mm256_add_epi16(
-            _mm256_sub_epi16(y_high1, g_coeff_hi),
-            rounding_const,
-        ));
+        let r_high0 = _mm256_add_epi16(y_high0, _mm256_mulhrs_epi16(v_high, v_cr_coeff));
+        let b_high0 = _mm256_add_epi16(y_high0, _mm256_mulhrs_epi16(u_high, v_cb_coeff));
+        let g_high0 = _mm256_sub_epi16(y_high0, g_coeff_hi);
+        let r_high1 = _mm256_add_epi16(y_high1, _mm256_mulhrs_epi16(v_high, v_cr_coeff));
+        let b_high1 = _mm256_add_epi16(y_high1, _mm256_mulhrs_epi16(u_high, v_cb_coeff));
+        let g_high1 = _mm256_sub_epi16(y_high1, g_coeff_hi);
 
         let u_low =
             _mm256_slli_epi16::<SCALE>(_mm256_sub_epi16(_mm256_cvtepu8_epi16(u_low_u8), uv_corr));
@@ -185,31 +165,13 @@ unsafe fn avx2_yuv_nv_to_rgba_row_impl420<const UV_ORDER: u8, const DESTINATION_
             _mm256_mulhrs_epi16(u_low, v_g_coeff_2),
         );
 
-        let r_low0 = _mm256_srli_epi16::<V_SHR>(_mm256_add_epi16(
-            _mm256_add_epi16(y_low0, _mm256_mulhrs_epi16(v_low, v_cr_coeff)),
-            rounding_const,
-        ));
-        let b_low0 = _mm256_srli_epi16::<V_SHR>(_mm256_add_epi16(
-            _mm256_add_epi16(y_low0, _mm256_mulhrs_epi16(u_low, v_cb_coeff)),
-            rounding_const,
-        ));
-        let g_low0 = _mm256_srli_epi16::<V_SHR>(_mm256_add_epi16(
-            _mm256_sub_epi16(y_low0, g_coeff_lo),
-            rounding_const,
-        ));
-
-        let r_low1 = _mm256_srli_epi16::<V_SHR>(_mm256_add_epi16(
-            _mm256_add_epi16(y_low1, _mm256_mulhrs_epi16(v_low, v_cr_coeff)),
-            rounding_const,
-        ));
-        let b_low1 = _mm256_srli_epi16::<V_SHR>(_mm256_add_epi16(
-            _mm256_add_epi16(y_low1, _mm256_mulhrs_epi16(u_low, v_cb_coeff)),
-            rounding_const,
-        ));
-        let g_low1 = _mm256_srli_epi16::<V_SHR>(_mm256_add_epi16(
-            _mm256_sub_epi16(y_low1, g_coeff_lo),
-            rounding_const,
-        ));
+        let r_low0 = _mm256_add_epi16(y_low0, _mm256_mulhrs_epi16(v_low, v_cr_coeff));
+        let b_low0 = _mm256_add_epi16(y_low0, _mm256_mulhrs_epi16(u_low, v_cb_coeff));
+        let g_low0 = _mm256_sub_epi16(y_low0, g_coeff_lo);
+
+        let r_low1 = _mm256_add_epi16(y_low1, _mm256_mulhrs_epi16(v_low, v_cr_coeff));
+        let b_low1 = _mm256_add_epi16(y_low1, _mm256_mulhrs_epi16(u_low, v_cb_coeff));
+        let g_low1 = _mm256_sub_epi16(y_low1, g_coeff_lo);
 
         let r_values0 = avx2_pack_u16(r_low0, r_high0);
         let g_values0 = avx2_pack_u16(g_low0, g_high0);
diff --git a/src/avx2/yuv_to_rgba.rs b/src/avx2/yuv_to_rgba.rs
index 36711b3..27f54c7 100644
--- a/src/avx2/yuv_to_rgba.rs
+++ b/src/avx2/yuv_to_rgba.rs
@@ -87,10 +87,7 @@ unsafe fn avx2_yuv_to_rgba_row_impl<const DESTINATION_CHANNELS: u8, const SAMPLI
     let v_g_coeff_2 = _mm256_set1_epi16(transform.g_coeff_2 as i16);
     let v_alpha = _mm256_set1_epi8(255u8 as i8);
 
-    const SCALE: i32 = 6;
-    const V_SHR: i32 = 3;
-
-    let rounding_const = _mm256_set1_epi16(1 << (V_SHR - 1));
+    const SCALE: i32 = 3;
 
     while cx + 32 < width {
         let y_values =
@@ -128,24 +125,15 @@ unsafe fn avx2_yuv_to_rgba_row_impl<const DESTINATION_CHANNELS: u8, const SAMPLI
             v_luma_coeff,
         );
 
-        let r_high = _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
-            _mm256_add_epi16(y_high, _mm256_mulhrs_epi16(v_high, v_cr_coeff)),
-            rounding_const,
-        ));
-        let b_high = _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
-            _mm256_add_epi16(y_high, _mm256_mulhrs_epi16(u_high, v_cb_coeff)),
-            rounding_const,
-        ));
-        let g_high = _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
-            _mm256_sub_epi16(
-                y_high,
-                _mm256_add_epi16(
-                    _mm256_mulhrs_epi16(v_high, v_g_coeff_1),
-                    _mm256_mulhrs_epi16(u_high, v_g_coeff_2),
-                ),
+        let r_high = _mm256_add_epi16(y_high, _mm256_mulhrs_epi16(v_high, v_cr_coeff));
+        let b_high = _mm256_add_epi16(y_high, _mm256_mulhrs_epi16(u_high, v_cb_coeff));
+        let g_high = _mm256_sub_epi16(
+            y_high,
+            _mm256_add_epi16(
+                _mm256_mulhrs_epi16(v_high, v_g_coeff_1),
+                _mm256_mulhrs_epi16(u_high, v_g_coeff_2),
             ),
-            rounding_const,
-        ));
+        );
 
         let u_low = _mm256_slli_epi16::<SCALE>(_mm256_sub_epi16(u_low_u16, uv_corr));
         let v_low = _mm256_slli_epi16::<SCALE>(_mm256_sub_epi16(v_low_u16, uv_corr));
@@ -154,24 +142,15 @@ unsafe fn avx2_yuv_to_rgba_row_impl<const DESTINATION_CHANNELS: u8, const SAMPLI
             v_luma_coeff,
         );
 
-        let r_low = _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
-            _mm256_add_epi16(y_low, _mm256_mulhrs_epi16(v_low, v_cr_coeff)),
-            rounding_const,
-        ));
-        let b_low = _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
-            _mm256_add_epi16(y_low, _mm256_mulhrs_epi16(u_low, v_cb_coeff)),
-            rounding_const,
-        ));
-        let g_low = _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
-            _mm256_sub_epi16(
-                y_low,
-                _mm256_add_epi16(
-                    _mm256_mulhrs_epi16(v_low, v_g_coeff_1),
-                    _mm256_mulhrs_epi16(u_low, v_g_coeff_2),
-                ),
+        let r_low = _mm256_add_epi16(y_low, _mm256_mulhrs_epi16(v_low, v_cr_coeff));
+        let b_low = _mm256_add_epi16(y_low, _mm256_mulhrs_epi16(u_low, v_cb_coeff));
+        let g_low = _mm256_sub_epi16(
+            y_low,
+            _mm256_add_epi16(
+                _mm256_mulhrs_epi16(v_low, v_g_coeff_1),
+                _mm256_mulhrs_epi16(u_low, v_g_coeff_2),
             ),
-            rounding_const,
-        ));
+        );
 
         let r_values = avx2_pack_u16(r_low, r_high);
         let g_values = avx2_pack_u16(g_low, g_high);
diff --git a/src/avx2/yuv_to_rgba420.rs b/src/avx2/yuv_to_rgba420.rs
index f85b167..686afa2 100644
--- a/src/avx2/yuv_to_rgba420.rs
+++ b/src/avx2/yuv_to_rgba420.rs
@@ -87,10 +87,7 @@ unsafe fn avx2_yuv_to_rgba_row_impl420<const DESTINATION_CHANNELS: u8>(
     let v_g_coeff_2 = _mm256_set1_epi16(transform.g_coeff_2 as i16);
     let v_alpha = _mm256_set1_epi8(255u8 as i8);
 
-    const SCALE: i32 = 6;
-    const V_SHR: i32 = 3;
-
-    let rounding_const = _mm256_set1_epi16(1 << (V_SHR - 1));
+    const SCALE: i32 = 3;
 
     while cx + 32 < width {
         let y_values0 = _mm256_subs_epu8(
@@ -130,31 +127,13 @@ unsafe fn avx2_yuv_to_rgba_row_impl420<const DESTINATION_CHANNELS: u8>(
             _mm256_mulhrs_epi16(u_high, v_g_coeff_2),
         );
 
-        let r_high0 = _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
-            _mm256_add_epi16(y_high0, _mm256_mulhrs_epi16(v_high, v_cr_coeff)),
-            rounding_const,
-        ));
-        let b_high0 = _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
-            _mm256_add_epi16(y_high0, _mm256_mulhrs_epi16(u_high, v_cb_coeff)),
-            rounding_const,
-        ));
-        let g_high0 = _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
-            _mm256_sub_epi16(y_high0, g_coeff_hi),
-            rounding_const,
-        ));
+        let r_high0 = _mm256_add_epi16(y_high0, _mm256_mulhrs_epi16(v_high, v_cr_coeff));
+        let b_high0 = _mm256_add_epi16(y_high0, _mm256_mulhrs_epi16(u_high, v_cb_coeff));
+        let g_high0 = _mm256_sub_epi16(y_high0, g_coeff_hi);
 
-        let r_high1 = _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
-            _mm256_add_epi16(y_high1, _mm256_mulhrs_epi16(v_high, v_cr_coeff)),
-            rounding_const,
-        ));
-        let b_high1 = _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
-            _mm256_add_epi16(y_high1, _mm256_mulhrs_epi16(u_high, v_cb_coeff)),
-            rounding_const,
-        ));
-        let g_high1 = _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
-            _mm256_sub_epi16(y_high1, g_coeff_hi),
-            rounding_const,
-        ));
+        let r_high1 = _mm256_add_epi16(y_high1, _mm256_mulhrs_epi16(v_high, v_cr_coeff));
+        let b_high1 = _mm256_add_epi16(y_high1, _mm256_mulhrs_epi16(u_high, v_cb_coeff));
+        let g_high1 = _mm256_sub_epi16(y_high1, g_coeff_hi);
 
         let u_low = _mm256_slli_epi16::<SCALE>(_mm256_sub_epi16(u_low_u16, uv_corr));
         let v_low = _mm256_slli_epi16::<SCALE>(_mm256_sub_epi16(v_low_u16, uv_corr));
@@ -172,32 +151,13 @@ unsafe fn avx2_yuv_to_rgba_row_impl420<const DESTINATION_CHANNELS: u8>(
             _mm256_mulhrs_epi16(u_low, v_g_coeff_2),
         );
 
-        let r_low0 = _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
-            _mm256_add_epi16(y_low0, _mm256_mulhrs_epi16(v_low, v_cr_coeff)),
-            rounding_const,
-        ));
-        let b_low0 = _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
-            _mm256_add_epi16(y_low0, _mm256_mulhrs_epi16(u_low, v_cb_coeff)),
-            rounding_const,
-        ));
-        let g_low0 = _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
-            _mm256_sub_epi16(y_low0, g_coeff_lo),
-            rounding_const,
-        ));
-
-        let r_low1 = _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
-            _mm256_add_epi16(y_low1, _mm256_mulhrs_epi16(v_low, v_cr_coeff)),
-            rounding_const,
-        ));
-        let b_low1 = _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
-            _mm256_add_epi16(y_low1, _mm256_mulhrs_epi16(u_low, v_cb_coeff)),
-            rounding_const,
-        ));
-        let g_low1 = _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
-            _mm256_sub_epi16(y_low1, g_coeff_lo),
-            rounding_const,
-        ));
+        let r_low0 = _mm256_add_epi16(y_low0, _mm256_mulhrs_epi16(v_low, v_cr_coeff));
+        let b_low0 = _mm256_add_epi16(y_low0, _mm256_mulhrs_epi16(u_low, v_cb_coeff));
+        let g_low0 = _mm256_sub_epi16(y_low0, g_coeff_lo);
 
+        let r_low1 = _mm256_add_epi16(y_low1, _mm256_mulhrs_epi16(v_low, v_cr_coeff));
+        let b_low1 = _mm256_add_epi16(y_low1, _mm256_mulhrs_epi16(u_low, v_cb_coeff));
+        let g_low1 = _mm256_sub_epi16(y_low1, g_coeff_lo);
         let r_values0 = avx2_pack_u16(r_low0, r_high0);
         let g_values0 = avx2_pack_u16(g_low0, g_high0);
         let b_values0 = avx2_pack_u16(b_low0, b_high0);
diff --git a/src/avx2/yuv_to_rgba_alpha.rs b/src/avx2/yuv_to_rgba_alpha.rs
index a33bee4..31fa219 100644
--- a/src/avx2/yuv_to_rgba_alpha.rs
+++ b/src/avx2/yuv_to_rgba_alpha.rs
@@ -93,8 +93,7 @@ unsafe fn avx2_yuv_to_rgba_alpha_impl<const DESTINATION_CHANNELS: u8, const SAMP
     let a_ptr = a_plane.as_ptr();
     let rgba_ptr = rgba.as_mut_ptr();
 
-    const SCALE: i32 = 6;
-    const V_SHR: i32 = 3;
+    const SCALE: i32 = 3;
 
     let y_corr = _mm256_set1_epi8(range.bias_y as i8);
     let uv_corr = _mm256_set1_epi16(range.bias_uv as i16);
@@ -103,7 +102,6 @@ unsafe fn avx2_yuv_to_rgba_alpha_impl<const DESTINATION_CHANNELS: u8, const SAMP
     let v_cb_coeff = _mm256_set1_epi16(transform.cb_coef as i16);
     let v_g_coeff_1 = _mm256_set1_epi16(transform.g_coeff_1 as i16);
     let v_g_coeff_2 = _mm256_set1_epi16(transform.g_coeff_2 as i16);
-    let rounding_const = _mm256_set1_epi16(1 << (V_SHR - 1));
 
     while cx + 32 < width {
         let y_values =
@@ -143,24 +141,15 @@ unsafe fn avx2_yuv_to_rgba_alpha_impl<const DESTINATION_CHANNELS: u8, const SAMP
             v_luma_coeff,
         );
 
-        let r_high = _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
-            _mm256_add_epi16(y_high, _mm256_mulhrs_epi16(v_high, v_cr_coeff)),
-            rounding_const,
-        ));
-        let b_high = _mm256_srai_epi16::<3>(_mm256_add_epi16(
-            _mm256_add_epi16(y_high, _mm256_mulhrs_epi16(u_high, v_cb_coeff)),
-            rounding_const,
-        ));
-        let g_high = _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
-            _mm256_sub_epi16(
-                y_high,
-                _mm256_add_epi16(
-                    _mm256_mulhrs_epi16(v_high, v_g_coeff_1),
-                    _mm256_mulhrs_epi16(u_high, v_g_coeff_2),
-                ),
+        let r_high = _mm256_add_epi16(y_high, _mm256_mulhrs_epi16(v_high, v_cr_coeff));
+        let b_high = _mm256_add_epi16(y_high, _mm256_mulhrs_epi16(u_high, v_cb_coeff));
+        let g_high = _mm256_sub_epi16(
+            y_high,
+            _mm256_add_epi16(
+                _mm256_mulhrs_epi16(v_high, v_g_coeff_1),
+                _mm256_mulhrs_epi16(u_high, v_g_coeff_2),
             ),
-            rounding_const,
-        ));
+        );
 
         let u_low = _mm256_slli_epi16::<SCALE>(_mm256_sub_epi16(u_low_u16, uv_corr));
         let v_low = _mm256_slli_epi16::<SCALE>(_mm256_sub_epi16(v_low_u16, uv_corr));
@@ -169,24 +158,15 @@ unsafe fn avx2_yuv_to_rgba_alpha_impl<const DESTINATION_CHANNELS: u8, const SAMP
             v_luma_coeff,
         );
 
-        let r_low = _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
-            _mm256_add_epi16(y_low, _mm256_mulhrs_epi16(v_low, v_cr_coeff)),
-            rounding_const,
-        ));
-        let b_low = _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
-            _mm256_add_epi16(y_low, _mm256_mulhrs_epi16(u_low, v_cb_coeff)),
-            rounding_const,
-        ));
-        let g_low = _mm256_srai_epi16::<V_SHR>(_mm256_add_epi16(
-            _mm256_sub_epi16(
-                y_low,
-                _mm256_add_epi16(
-                    _mm256_mulhrs_epi16(v_low, v_g_coeff_1),
-                    _mm256_mulhrs_epi16(u_low, v_g_coeff_2),
-                ),
+        let r_low = _mm256_add_epi16(y_low, _mm256_mulhrs_epi16(v_low, v_cr_coeff));
+        let b_low = _mm256_add_epi16(y_low, _mm256_mulhrs_epi16(u_low, v_cb_coeff));
+        let g_low = _mm256_sub_epi16(
+            y_low,
+            _mm256_add_epi16(
+                _mm256_mulhrs_epi16(v_low, v_g_coeff_1),
+                _mm256_mulhrs_epi16(u_low, v_g_coeff_2),
             ),
-            rounding_const,
-        ));
+        );
 
         let (r_values, g_values, b_values);
 
diff --git a/src/sse/mod.rs b/src/sse/mod.rs
index 15be6cb..0441f8a 100644
--- a/src/sse/mod.rs
+++ b/src/sse/mod.rs
@@ -33,6 +33,7 @@ mod rgb_to_ycgco;
 mod rgb_to_ycgco_r;
 mod rgb_to_yuv_p16;
 mod rgba_to_yuv;
+mod rgba_to_yuv420;
 pub(crate) mod sse_support;
 mod sse_ycbcr;
 mod sse_ycgco_r;
@@ -50,7 +51,6 @@ mod yuv_to_rgba_alpha;
 mod yuv_to_yuy2;
 mod yuy2_to_rgb;
 mod yuy2_to_yuv;
-mod rgba_to_yuv420;
 
 pub(crate) use rgb_to_nv::sse_rgba_to_nv_row;
 pub(crate) use rgb_to_y::sse_rgb_to_y;
@@ -58,6 +58,7 @@ pub(crate) use rgb_to_ycgco::sse_rgb_to_ycgco_row;
 pub(crate) use rgb_to_ycgco_r::sse_rgb_to_ycgcor_row;
 pub(crate) use rgb_to_yuv_p16::{sse_rgba_to_yuv_p16, sse_rgba_to_yuv_p16_lp};
 pub(crate) use rgba_to_yuv::sse_rgba_to_yuv_row;
+pub(crate) use rgba_to_yuv420::sse_rgba_to_yuv_row420;
 pub(crate) use sse_support::*;
 pub(crate) use ycgco_to_rgb::sse_ycgco_to_rgb_row;
 pub(crate) use ycgco_to_rgb_alpha::sse_ycgco_to_rgb_alpha_row;
@@ -73,4 +74,3 @@ pub(crate) use yuv_to_rgba_alpha::sse_yuv_to_rgba_alpha_row;
 pub(crate) use yuv_to_yuy2::yuv_to_yuy2_sse;
 pub(crate) use yuy2_to_rgb::yuy2_to_rgb_sse;
 pub(crate) use yuy2_to_yuv::yuy2_to_yuv_sse;
-pub(crate) use rgba_to_yuv420::sse_rgba_to_yuv_row420;
\ No newline at end of file
diff --git a/src/sse/rgb_to_nv.rs b/src/sse/rgb_to_nv.rs
index 2fedfae..f281a31 100644
--- a/src/sse/rgb_to_nv.rs
+++ b/src/sse/rgb_to_nv.rs
@@ -95,11 +95,9 @@ unsafe fn sse_rgba_to_nv_row_impl<
     let mut cx = start_cx;
     let mut uv_x = start_ux;
 
-    const V_SHR: i32 = 3;
-    const V_SCALE: i32 = 6;
-    let rounding_const_bias: i16 = 1 << (V_SHR - 1);
-    let bias_y = range.bias_y as i16 * (1 << V_SHR) + rounding_const_bias;
-    let bias_uv = range.bias_uv as i16 * (1 << V_SHR) + rounding_const_bias;
+    const V_SCALE: i32 = 3;
+    let bias_y = range.bias_y as i16;
+    let bias_uv = range.bias_uv as i16;
 
     let i_bias_y = _mm_set1_epi16(range.bias_y as i16);
     let i_cap_y = _mm_set1_epi16(range.range_y as i16 + range.bias_y as i16);
@@ -171,7 +169,7 @@ unsafe fn sse_rgba_to_nv_row_impl<
 
         let y_l = _mm_max_epi16(
             _mm_min_epi16(
-                _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
+                (_mm_add_epi16(
                     y_bias,
                     _mm_add_epi16(
                         _mm_add_epi16(_mm_mulhrs_epi16(r_low, v_yr), _mm_mulhrs_epi16(g_low, v_yg)),
@@ -185,7 +183,7 @@ unsafe fn sse_rgba_to_nv_row_impl<
 
         let y_h = _mm_max_epi16(
             _mm_min_epi16(
-                _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
+                (_mm_add_epi16(
                     y_bias,
                     _mm_add_epi16(
                         _mm_add_epi16(
@@ -206,7 +204,7 @@ unsafe fn sse_rgba_to_nv_row_impl<
         if chroma_subsampling == YuvChromaSubsampling::Yuv444 {
             let cb_l = _mm_max_epi16(
                 _mm_min_epi16(
-                    _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
+                    (_mm_add_epi16(
                         uv_bias,
                         _mm_add_epi16(
                             _mm_add_epi16(
@@ -222,7 +220,7 @@ unsafe fn sse_rgba_to_nv_row_impl<
             );
             let cr_l = _mm_max_epi16(
                 _mm_min_epi16(
-                    _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
+                    _mm_add_epi16(
                         uv_bias,
                         _mm_add_epi16(
                             _mm_add_epi16(
@@ -231,14 +229,14 @@ unsafe fn sse_rgba_to_nv_row_impl<
                             ),
                             _mm_mulhrs_epi16(b_low, v_cr_b),
                         ),
-                    )),
+                    ),
                     i_cap_uv,
                 ),
                 i_bias_y,
             );
             let cb_h = _mm_max_epi16(
                 _mm_min_epi16(
-                    _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
+                    (_mm_add_epi16(
                         uv_bias,
                         _mm_add_epi16(
                             _mm_add_epi16(
@@ -254,7 +252,7 @@ unsafe fn sse_rgba_to_nv_row_impl<
             );
             let cr_h = _mm_max_epi16(
                 _mm_min_epi16(
-                    _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
+                    (_mm_add_epi16(
                         uv_bias,
                         _mm_add_epi16(
                             _mm_add_epi16(
@@ -294,7 +292,7 @@ unsafe fn sse_rgba_to_nv_row_impl<
 
             let cbk = _mm_max_epi16(
                 _mm_min_epi16(
-                    _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
+                    (_mm_add_epi16(
                         uv_bias,
                         _mm_add_epi16(
                             _mm_add_epi16(
@@ -311,7 +309,7 @@ unsafe fn sse_rgba_to_nv_row_impl<
 
             let crk = _mm_max_epi16(
                 _mm_min_epi16(
-                    _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
+                    (_mm_add_epi16(
                         uv_bias,
                         _mm_add_epi16(
                             _mm_add_epi16(
diff --git a/src/sse/rgb_to_y.rs b/src/sse/rgb_to_y.rs
index 5f0f046..5b4dc5f 100644
--- a/src/sse/rgb_to_y.rs
+++ b/src/sse/rgb_to_y.rs
@@ -64,10 +64,8 @@ unsafe fn sse_rgb_to_y_impl<const ORIGIN_CHANNELS: u8>(
 
     let mut cx = start_cx;
 
-    const V_SHR: i32 = 3;
-    const V_SCALE: i32 = 6;
-    let rounding_const_bias: i16 = 1 << (V_SHR - 1);
-    let bias_y = range.bias_y as i16 * (1 << V_SHR) + rounding_const_bias;
+    const V_SCALE: i32 = 3;
+    let bias_y = range.bias_y as i16;
 
     let zeros = _mm_setzero_si128();
 
@@ -130,7 +128,7 @@ unsafe fn sse_rgb_to_y_impl<const ORIGIN_CHANNELS: u8>(
 
         let y_l = _mm_max_epi16(
             _mm_min_epi16(
-                _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
+                (_mm_add_epi16(
                     y_bias,
                     _mm_add_epi16(
                         _mm_add_epi16(_mm_mulhrs_epi16(r_low, v_yr), _mm_mulhrs_epi16(g_low, v_yg)),
@@ -144,7 +142,7 @@ unsafe fn sse_rgb_to_y_impl<const ORIGIN_CHANNELS: u8>(
 
         let y_h = _mm_max_epi16(
             _mm_min_epi16(
-                _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
+                (_mm_add_epi16(
                     y_bias,
                     _mm_add_epi16(
                         _mm_add_epi16(
diff --git a/src/sse/rgba_to_yuv.rs b/src/sse/rgba_to_yuv.rs
index be395ac..dbd1954 100644
--- a/src/sse/rgba_to_yuv.rs
+++ b/src/sse/rgba_to_yuv.rs
@@ -79,11 +79,9 @@ unsafe fn sse_rgba_to_yuv_row_impl<const ORIGIN_CHANNELS: u8, const SAMPLING: u8
     let mut cx = start_cx;
     let mut uv_x = start_ux;
 
-    const V_SHR: i32 = 3;
-    const V_SCALE: i32 = 6;
-    let rounding_const_bias: i16 = 1 << (V_SHR - 1);
-    let bias_y = range.bias_y as i16 * (1 << V_SHR) + rounding_const_bias;
-    let bias_uv = range.bias_uv as i16 * (1 << V_SHR) + rounding_const_bias;
+    const V_SCALE: i32 = 3;
+    let bias_y = range.bias_y as i16;
+    let bias_uv = range.bias_uv as i16;
 
     let i_bias_y = _mm_set1_epi16(range.bias_y as i16);
     let i_cap_y = _mm_set1_epi16(range.range_y as i16 + range.bias_y as i16);
@@ -155,7 +153,7 @@ unsafe fn sse_rgba_to_yuv_row_impl<const ORIGIN_CHANNELS: u8, const SAMPLING: u8
 
         let y_l = _mm_max_epi16(
             _mm_min_epi16(
-                _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
+                (_mm_add_epi16(
                     y_bias,
                     _mm_add_epi16(
                         _mm_add_epi16(_mm_mulhrs_epi16(r_low, v_yr), _mm_mulhrs_epi16(g_low, v_yg)),
@@ -169,7 +167,7 @@ unsafe fn sse_rgba_to_yuv_row_impl<const ORIGIN_CHANNELS: u8, const SAMPLING: u8
 
         let y_h = _mm_max_epi16(
             _mm_min_epi16(
-                _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
+                (_mm_add_epi16(
                     y_bias,
                     _mm_add_epi16(
                         _mm_add_epi16(
@@ -190,7 +188,7 @@ unsafe fn sse_rgba_to_yuv_row_impl<const ORIGIN_CHANNELS: u8, const SAMPLING: u8
         if chroma_subsampling == YuvChromaSubsampling::Yuv444 {
             let cb_l = _mm_max_epi16(
                 _mm_min_epi16(
-                    _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
+                    (_mm_add_epi16(
                         uv_bias,
                         _mm_add_epi16(
                             _mm_add_epi16(
@@ -206,7 +204,7 @@ unsafe fn sse_rgba_to_yuv_row_impl<const ORIGIN_CHANNELS: u8, const SAMPLING: u8
             );
             let cr_l = _mm_max_epi16(
                 _mm_min_epi16(
-                    _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
+                    (_mm_add_epi16(
                         uv_bias,
                         _mm_add_epi16(
                             _mm_add_epi16(
@@ -222,7 +220,7 @@ unsafe fn sse_rgba_to_yuv_row_impl<const ORIGIN_CHANNELS: u8, const SAMPLING: u8
             );
             let cb_h = _mm_max_epi16(
                 _mm_min_epi16(
-                    _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
+                    (_mm_add_epi16(
                         uv_bias,
                         _mm_add_epi16(
                             _mm_add_epi16(
@@ -238,7 +236,7 @@ unsafe fn sse_rgba_to_yuv_row_impl<const ORIGIN_CHANNELS: u8, const SAMPLING: u8
             );
             let cr_h = _mm_max_epi16(
                 _mm_min_epi16(
-                    _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
+                    (_mm_add_epi16(
                         uv_bias,
                         _mm_add_epi16(
                             _mm_add_epi16(
@@ -269,7 +267,7 @@ unsafe fn sse_rgba_to_yuv_row_impl<const ORIGIN_CHANNELS: u8, const SAMPLING: u8
 
             let cbk = _mm_max_epi16(
                 _mm_min_epi16(
-                    _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
+                    (_mm_add_epi16(
                         uv_bias,
                         _mm_add_epi16(
                             _mm_add_epi16(
@@ -286,7 +284,7 @@ unsafe fn sse_rgba_to_yuv_row_impl<const ORIGIN_CHANNELS: u8, const SAMPLING: u8
 
             let crk = _mm_max_epi16(
                 _mm_min_epi16(
-                    _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
+                    (_mm_add_epi16(
                         uv_bias,
                         _mm_add_epi16(
                             _mm_add_epi16(
diff --git a/src/sse/rgba_to_yuv420.rs b/src/sse/rgba_to_yuv420.rs
index b53a6c7..112fbfd 100644
--- a/src/sse/rgba_to_yuv420.rs
+++ b/src/sse/rgba_to_yuv420.rs
@@ -79,11 +79,9 @@ unsafe fn sse_rgba_to_yuv_row_impl420<const ORIGIN_CHANNELS: u8>(
     let mut cx = start_cx;
     let mut uv_x = start_ux;
 
-    const V_SHR: i32 = 3;
-    const V_SCALE: i32 = 6;
-    let rounding_const_bias: i16 = 1 << (V_SHR - 1);
-    let bias_y = range.bias_y as i16 * (1 << V_SHR) + rounding_const_bias;
-    let bias_uv = range.bias_uv as i16 * (1 << V_SHR) + rounding_const_bias;
+    const V_SCALE: i32 = 3;
+    let bias_y = range.bias_y as i16;
+    let bias_uv = range.bias_uv as i16;
 
     let i_bias_y = _mm_set1_epi16(range.bias_y as i16);
     let i_cap_y = _mm_set1_epi16(range.range_y as i16 + range.bias_y as i16);
@@ -189,7 +187,7 @@ unsafe fn sse_rgba_to_yuv_row_impl420<const ORIGIN_CHANNELS: u8>(
 
         let y0_l = _mm_max_epi16(
             _mm_min_epi16(
-                _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
+                (_mm_add_epi16(
                     y_bias,
                     _mm_add_epi16(
                         _mm_add_epi16(
@@ -206,7 +204,7 @@ unsafe fn sse_rgba_to_yuv_row_impl420<const ORIGIN_CHANNELS: u8>(
 
         let y0_h = _mm_max_epi16(
             _mm_min_epi16(
-                _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
+                (_mm_add_epi16(
                     y_bias,
                     _mm_add_epi16(
                         _mm_add_epi16(
@@ -230,7 +228,7 @@ unsafe fn sse_rgba_to_yuv_row_impl420<const ORIGIN_CHANNELS: u8>(
 
         let y1_l = _mm_max_epi16(
             _mm_min_epi16(
-                _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
+                (_mm_add_epi16(
                     y_bias,
                     _mm_add_epi16(
                         _mm_add_epi16(
@@ -247,7 +245,7 @@ unsafe fn sse_rgba_to_yuv_row_impl420<const ORIGIN_CHANNELS: u8>(
 
         let y1_h = _mm_max_epi16(
             _mm_min_epi16(
-                _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
+                (_mm_add_epi16(
                     y_bias,
                     _mm_add_epi16(
                         _mm_add_epi16(
@@ -280,7 +278,7 @@ unsafe fn sse_rgba_to_yuv_row_impl420<const ORIGIN_CHANNELS: u8>(
 
         let cbk = _mm_max_epi16(
             _mm_min_epi16(
-                _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
+                (_mm_add_epi16(
                     uv_bias,
                     _mm_add_epi16(
                         _mm_add_epi16(_mm_mulhrs_epi16(r1, v_cb_r), _mm_mulhrs_epi16(g1, v_cb_g)),
@@ -294,7 +292,7 @@ unsafe fn sse_rgba_to_yuv_row_impl420<const ORIGIN_CHANNELS: u8>(
 
         let crk = _mm_max_epi16(
             _mm_min_epi16(
-                _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
+                (_mm_add_epi16(
                     uv_bias,
                     _mm_add_epi16(
                         _mm_add_epi16(_mm_mulhrs_epi16(r1, v_cr_r), _mm_mulhrs_epi16(g1, v_cr_g)),
diff --git a/src/sse/yuv_nv_to_rgba.rs b/src/sse/yuv_nv_to_rgba.rs
index eeddc51..9df60b1 100644
--- a/src/sse/yuv_nv_to_rgba.rs
+++ b/src/sse/yuv_nv_to_rgba.rs
@@ -86,8 +86,7 @@ unsafe fn sse_yuv_nv_to_rgba_impl<
     let uv_ptr = uv_plane.as_ptr();
     let rgba_ptr = rgba.as_mut_ptr();
 
-    const SCALE: i32 = 6;
-    const V_SHR: i32 = 3;
+    const SCALE: i32 = 3;
 
     let y_corr = _mm_set1_epi8(range.bias_y as i8);
     let uv_corr = _mm_set1_epi16(range.bias_uv as i16);
@@ -97,7 +96,6 @@ unsafe fn sse_yuv_nv_to_rgba_impl<
     let v_g_coeff_1 = _mm_set1_epi16(transform.g_coeff_1 as i16);
     let v_g_coeff_2 = _mm_set1_epi16(transform.g_coeff_2 as i16);
     let v_alpha = _mm_set1_epi8(255u8 as i8);
-    let rounding_const = _mm_set1_epi16(1 << (V_SHR - 1));
 
     let zeros = _mm_setzero_si128();
 
@@ -161,24 +159,15 @@ unsafe fn sse_yuv_nv_to_rgba_impl<
             v_luma_coeff,
         );
 
-        let r_high = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_add_epi16(y_high, _mm_mulhrs_epi16(v_high, v_cr_coeff)),
-            rounding_const,
-        ));
-        let b_high = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_add_epi16(y_high, _mm_mulhrs_epi16(u_high, v_cb_coeff)),
-            rounding_const,
-        ));
-        let g_high = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_sub_epi16(
-                y_high,
-                _mm_add_epi16(
-                    _mm_mulhrs_epi16(v_high, v_g_coeff_1),
-                    _mm_mulhrs_epi16(u_high, v_g_coeff_2),
-                ),
+        let r_high = _mm_add_epi16(y_high, _mm_mulhrs_epi16(v_high, v_cr_coeff));
+        let b_high = _mm_add_epi16(y_high, _mm_mulhrs_epi16(u_high, v_cb_coeff));
+        let g_high = _mm_sub_epi16(
+            y_high,
+            _mm_add_epi16(
+                _mm_mulhrs_epi16(v_high, v_g_coeff_1),
+                _mm_mulhrs_epi16(u_high, v_g_coeff_2),
             ),
-            rounding_const,
-        ));
+        );
 
         let u_low = _mm_slli_epi16::<SCALE>(_mm_sub_epi16(u_low_u16, uv_corr));
         let v_low = _mm_slli_epi16::<SCALE>(_mm_sub_epi16(v_low_u16, uv_corr));
@@ -187,24 +176,15 @@ unsafe fn sse_yuv_nv_to_rgba_impl<
             v_luma_coeff,
         );
 
-        let r_low = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_add_epi16(y_low, _mm_mulhrs_epi16(v_low, v_cr_coeff)),
-            rounding_const,
-        ));
-        let b_low = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_add_epi16(y_low, _mm_mulhrs_epi16(u_low, v_cb_coeff)),
-            rounding_const,
-        ));
-        let g_low = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_sub_epi16(
-                y_low,
-                _mm_add_epi16(
-                    _mm_mulhrs_epi16(v_low, v_g_coeff_1),
-                    _mm_mulhrs_epi16(u_low, v_g_coeff_2),
-                ),
+        let r_low = _mm_add_epi16(y_low, _mm_mulhrs_epi16(v_low, v_cr_coeff));
+        let b_low = _mm_add_epi16(y_low, _mm_mulhrs_epi16(u_low, v_cb_coeff));
+        let g_low = _mm_sub_epi16(
+            y_low,
+            _mm_add_epi16(
+                _mm_mulhrs_epi16(v_low, v_g_coeff_1),
+                _mm_mulhrs_epi16(u_low, v_g_coeff_2),
             ),
-            rounding_const,
-        ));
+        );
 
         let r_values = _mm_packus_epi16(r_low, r_high);
         let g_values = _mm_packus_epi16(g_low, g_high);
@@ -300,24 +280,15 @@ unsafe fn sse_yuv_nv_to_rgba_impl<
             v_luma_coeff,
         );
 
-        let r_low = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_add_epi16(y_low, _mm_mulhrs_epi16(v_low, v_cr_coeff)),
-            rounding_const,
-        ));
-        let b_low = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_add_epi16(y_low, _mm_mulhrs_epi16(u_low, v_cb_coeff)),
-            rounding_const,
-        ));
-        let g_low = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_sub_epi16(
-                y_low,
-                _mm_add_epi16(
-                    _mm_mulhrs_epi16(v_low, v_g_coeff_1),
-                    _mm_mulhrs_epi16(u_low, v_g_coeff_2),
-                ),
+        let r_low = _mm_add_epi16(y_low, _mm_mulhrs_epi16(v_low, v_cr_coeff));
+        let b_low = _mm_add_epi16(y_low, _mm_mulhrs_epi16(u_low, v_cb_coeff));
+        let g_low = _mm_sub_epi16(
+            y_low,
+            _mm_add_epi16(
+                _mm_mulhrs_epi16(v_low, v_g_coeff_1),
+                _mm_mulhrs_epi16(u_low, v_g_coeff_2),
             ),
-            rounding_const,
-        ));
+        );
 
         let r_values = _mm_packus_epi16(r_low, zeros);
         let g_values = _mm_packus_epi16(g_low, zeros);
diff --git a/src/sse/yuv_nv_to_rgba420.rs b/src/sse/yuv_nv_to_rgba420.rs
index 5d3689f..03acdbe 100644
--- a/src/sse/yuv_nv_to_rgba420.rs
+++ b/src/sse/yuv_nv_to_rgba420.rs
@@ -77,8 +77,7 @@ unsafe fn sse_yuv_nv_to_rgba_impl420<const UV_ORDER: u8, const DESTINATION_CHANN
 
     let uv_ptr = uv_plane.as_ptr();
 
-    const SCALE: i32 = 6;
-    const V_SHR: i32 = 3;
+    const SCALE: i32 = 3;
 
     let y_corr = _mm_set1_epi8(range.bias_y as i8);
     let uv_corr = _mm_set1_epi16(range.bias_uv as i16);
@@ -88,8 +87,6 @@ unsafe fn sse_yuv_nv_to_rgba_impl420<const UV_ORDER: u8, const DESTINATION_CHANN
     let v_g_coeff_1 = _mm_set1_epi16(transform.g_coeff_1 as i16);
     let v_g_coeff_2 = _mm_set1_epi16(transform.g_coeff_2 as i16);
     let v_alpha = _mm_set1_epi8(255u8 as i8);
-    let rounding_const = _mm_set1_epi16(1 << (V_SHR - 1));
-
     let zeros = _mm_setzero_si128();
 
     let distribute_shuffle = _mm_setr_epi8(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7);
@@ -143,30 +140,12 @@ unsafe fn sse_yuv_nv_to_rgba_impl420<const UV_ORDER: u8, const DESTINATION_CHANN
             _mm_mulhrs_epi16(u_high, v_g_coeff_2),
         );
 
-        let r_high0 = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_add_epi16(y_high0, _mm_mulhrs_epi16(v_high, v_cr_coeff)),
-            rounding_const,
-        ));
-        let b_high0 = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_add_epi16(y_high0, _mm_mulhrs_epi16(u_high, v_cb_coeff)),
-            rounding_const,
-        ));
-        let g_high0 = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_sub_epi16(y_high0, g_coeff_hi),
-            rounding_const,
-        ));
-        let r_high1 = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_add_epi16(y_high1, _mm_mulhrs_epi16(v_high, v_cr_coeff)),
-            rounding_const,
-        ));
-        let b_high1 = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_add_epi16(y_high1, _mm_mulhrs_epi16(u_high, v_cb_coeff)),
-            rounding_const,
-        ));
-        let g_high1 = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_sub_epi16(y_high1, g_coeff_hi),
-            rounding_const,
-        ));
+        let r_high0 = _mm_add_epi16(y_high0, _mm_mulhrs_epi16(v_high, v_cr_coeff));
+        let b_high0 = _mm_add_epi16(y_high0, _mm_mulhrs_epi16(u_high, v_cb_coeff));
+        let g_high0 = _mm_sub_epi16(y_high0, g_coeff_hi);
+        let r_high1 = _mm_add_epi16(y_high1, _mm_mulhrs_epi16(v_high, v_cr_coeff));
+        let b_high1 = _mm_add_epi16(y_high1, _mm_mulhrs_epi16(u_high, v_cb_coeff));
+        let g_high1 = _mm_sub_epi16(y_high1, g_coeff_hi);
 
         let u_low = _mm_slli_epi16::<SCALE>(_mm_sub_epi16(u_low_u16, uv_corr));
         let v_low = _mm_slli_epi16::<SCALE>(_mm_sub_epi16(v_low_u16, uv_corr));
@@ -184,30 +163,12 @@ unsafe fn sse_yuv_nv_to_rgba_impl420<const UV_ORDER: u8, const DESTINATION_CHANN
             _mm_mulhrs_epi16(u_low, v_g_coeff_2),
         );
 
-        let r_low0 = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_add_epi16(y_low0, _mm_mulhrs_epi16(v_low, v_cr_coeff)),
-            rounding_const,
-        ));
-        let b_low0 = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_add_epi16(y_low0, _mm_mulhrs_epi16(u_low, v_cb_coeff)),
-            rounding_const,
-        ));
-        let g_low0 = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_sub_epi16(y_low0, g_coeff_lo),
-            rounding_const,
-        ));
-        let r_low1 = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_add_epi16(y_low1, _mm_mulhrs_epi16(v_low, v_cr_coeff)),
-            rounding_const,
-        ));
-        let b_low1 = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_add_epi16(y_low1, _mm_mulhrs_epi16(u_low, v_cb_coeff)),
-            rounding_const,
-        ));
-        let g_low1 = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_sub_epi16(y_low1, g_coeff_lo),
-            rounding_const,
-        ));
+        let r_low0 = _mm_add_epi16(y_low0, _mm_mulhrs_epi16(v_low, v_cr_coeff));
+        let b_low0 = _mm_add_epi16(y_low0, _mm_mulhrs_epi16(u_low, v_cb_coeff));
+        let g_low0 = _mm_sub_epi16(y_low0, g_coeff_lo);
+        let r_low1 = _mm_add_epi16(y_low1, _mm_mulhrs_epi16(v_low, v_cr_coeff));
+        let b_low1 = _mm_add_epi16(y_low1, _mm_mulhrs_epi16(u_low, v_cb_coeff));
+        let g_low1 = _mm_sub_epi16(y_low1, g_coeff_lo);
 
         let r_values0 = _mm_packus_epi16(r_low0, r_high0);
         let g_values0 = _mm_packus_epi16(g_low0, g_high0);
@@ -331,31 +292,13 @@ unsafe fn sse_yuv_nv_to_rgba_impl420<const UV_ORDER: u8, const DESTINATION_CHANN
             _mm_mulhrs_epi16(u_low, v_g_coeff_2),
         );
 
-        let r_low0 = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_add_epi16(y_low0, _mm_mulhrs_epi16(v_low, v_cr_coeff)),
-            rounding_const,
-        ));
-        let b_low0 = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_add_epi16(y_low0, _mm_mulhrs_epi16(u_low, v_cb_coeff)),
-            rounding_const,
-        ));
-        let g_low0 = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_sub_epi16(y_low0, g_coeff_lo),
-            rounding_const,
-        ));
-
-        let r_low1 = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_add_epi16(y_low1, _mm_mulhrs_epi16(v_low, v_cr_coeff)),
-            rounding_const,
-        ));
-        let b_low1 = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_add_epi16(y_low1, _mm_mulhrs_epi16(u_low, v_cb_coeff)),
-            rounding_const,
-        ));
-        let g_low1 = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_sub_epi16(y_low1, g_coeff_lo),
-            rounding_const,
-        ));
+        let r_low0 = _mm_add_epi16(y_low0, _mm_mulhrs_epi16(v_low, v_cr_coeff));
+        let b_low0 = _mm_add_epi16(y_low0, _mm_mulhrs_epi16(u_low, v_cb_coeff));
+        let g_low0 = _mm_sub_epi16(y_low0, g_coeff_lo);
+
+        let r_low1 = _mm_add_epi16(y_low1, _mm_mulhrs_epi16(v_low, v_cr_coeff));
+        let b_low1 = _mm_add_epi16(y_low1, _mm_mulhrs_epi16(u_low, v_cb_coeff));
+        let g_low1 = _mm_sub_epi16(y_low1, g_coeff_lo);
 
         let r_values0 = _mm_packus_epi16(r_low0, zeros);
         let g_values0 = _mm_packus_epi16(g_low0, zeros);
diff --git a/src/sse/yuv_to_rgba.rs b/src/sse/yuv_to_rgba.rs
index 0ba8153..edc7500 100644
--- a/src/sse/yuv_to_rgba.rs
+++ b/src/sse/yuv_to_rgba.rs
@@ -80,8 +80,7 @@ unsafe fn sse_yuv_to_rgba_row_impl<const DESTINATION_CHANNELS: u8, const SAMPLIN
     let v_ptr = v_plane.as_ptr();
     let rgba_ptr = rgba.as_mut_ptr();
 
-    const SCALE: i32 = 6;
-    const V_SHR: i32 = 3;
+    const SCALE: i32 = 3;
 
     let y_corr = _mm_set1_epi8(range.bias_y as i8);
     let uv_corr = _mm_set1_epi16(range.bias_uv as i16);
@@ -91,7 +90,6 @@ unsafe fn sse_yuv_to_rgba_row_impl<const DESTINATION_CHANNELS: u8, const SAMPLIN
     let v_g_coeff_1 = _mm_set1_epi16(transform.g_coeff_1 as i16);
     let v_g_coeff_2 = _mm_set1_epi16(transform.g_coeff_2 as i16);
     let v_alpha = _mm_set1_epi8(255u8 as i8);
-    let rounding_const = _mm_set1_epi16(1 << (V_SHR - 1));
 
     let zeros = _mm_setzero_si128();
 
@@ -124,55 +122,37 @@ unsafe fn sse_yuv_to_rgba_row_impl<const DESTINATION_CHANNELS: u8, const SAMPLIN
 
         let u_high = _mm_slli_epi16::<SCALE>(_mm_sub_epi16(u_high_u16, uv_corr));
         let v_high = _mm_slli_epi16::<SCALE>(_mm_sub_epi16(v_high_u16, uv_corr));
-        let y_high = _mm_mulhi_epi16(
+        let y_high = _mm_mulhrs_epi16(
             _mm_slli_epi16::<SCALE>(_mm_unpackhi_epi8(y_values, zeros)),
             v_luma_coeff,
         );
 
-        let r_high = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_add_epi16(y_high, _mm_mulhi_epi16(v_high, v_cr_coeff)),
-            rounding_const,
-        ));
-        let b_high = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_add_epi16(y_high, _mm_mulhi_epi16(u_high, v_cb_coeff)),
-            rounding_const,
-        ));
-        let g_high = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_sub_epi16(
-                y_high,
-                _mm_add_epi16(
-                    _mm_mulhi_epi16(v_high, v_g_coeff_1),
-                    _mm_mulhi_epi16(u_high, v_g_coeff_2),
-                ),
+        let r_high = _mm_add_epi16(y_high, _mm_mulhrs_epi16(v_high, v_cr_coeff));
+        let b_high = _mm_add_epi16(y_high, _mm_mulhrs_epi16(u_high, v_cb_coeff));
+        let g_high = _mm_sub_epi16(
+            y_high,
+            _mm_add_epi16(
+                _mm_mulhrs_epi16(v_high, v_g_coeff_1),
+                _mm_mulhrs_epi16(u_high, v_g_coeff_2),
             ),
-            rounding_const,
-        ));
+        );
 
         let u_low = _mm_slli_epi16::<SCALE>(_mm_sub_epi16(u_low_u16, uv_corr));
         let v_low = _mm_slli_epi16::<SCALE>(_mm_sub_epi16(v_low_u16, uv_corr));
-        let y_low = _mm_mulhi_epi16(
+        let y_low = _mm_mulhrs_epi16(
             _mm_slli_epi16::<SCALE>(_mm_cvtepu8_epi16(y_values)),
             v_luma_coeff,
         );
 
-        let r_low = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_add_epi16(y_low, _mm_mulhi_epi16(v_low, v_cr_coeff)),
-            rounding_const,
-        ));
-        let b_low = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_add_epi16(y_low, _mm_mulhi_epi16(u_low, v_cb_coeff)),
-            rounding_const,
-        ));
-        let g_low = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_sub_epi16(
-                y_low,
-                _mm_add_epi16(
-                    _mm_mulhi_epi16(v_low, v_g_coeff_1),
-                    _mm_mulhi_epi16(u_low, v_g_coeff_2),
-                ),
+        let r_low = _mm_add_epi16(y_low, _mm_mulhrs_epi16(v_low, v_cr_coeff));
+        let b_low = _mm_add_epi16(y_low, _mm_mulhrs_epi16(u_low, v_cb_coeff));
+        let g_low = _mm_sub_epi16(
+            y_low,
+            _mm_add_epi16(
+                _mm_mulhrs_epi16(v_low, v_g_coeff_1),
+                _mm_mulhrs_epi16(u_low, v_g_coeff_2),
             ),
-            rounding_const,
-        ));
+        );
 
         let r_values = _mm_packus_epi16(r_low, r_high);
         let g_values = _mm_packus_epi16(g_low, g_high);
@@ -252,29 +232,20 @@ unsafe fn sse_yuv_to_rgba_row_impl<const DESTINATION_CHANNELS: u8, const SAMPLIN
 
         let u_low = _mm_slli_epi16::<SCALE>(_mm_sub_epi16(u_low_u16, uv_corr));
         let v_low = _mm_slli_epi16::<SCALE>(_mm_sub_epi16(v_low_u16, uv_corr));
-        let y_low = _mm_mulhi_epi16(
+        let y_low = _mm_mulhrs_epi16(
             _mm_slli_epi16::<SCALE>(_mm_cvtepu8_epi16(y_values)),
             v_luma_coeff,
         );
 
-        let r_low = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_add_epi16(y_low, _mm_mulhi_epi16(v_low, v_cr_coeff)),
-            rounding_const,
-        ));
-        let b_low = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_add_epi16(y_low, _mm_mulhi_epi16(u_low, v_cb_coeff)),
-            rounding_const,
-        ));
-        let g_low = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_sub_epi16(
-                y_low,
-                _mm_add_epi16(
-                    _mm_mulhi_epi16(v_low, v_g_coeff_1),
-                    _mm_mulhi_epi16(u_low, v_g_coeff_2),
-                ),
+        let r_low = _mm_add_epi16(y_low, _mm_mulhrs_epi16(v_low, v_cr_coeff));
+        let b_low = _mm_add_epi16(y_low, _mm_mulhrs_epi16(u_low, v_cb_coeff));
+        let g_low = _mm_sub_epi16(
+            y_low,
+            _mm_add_epi16(
+                _mm_mulhrs_epi16(v_low, v_g_coeff_1),
+                _mm_mulhrs_epi16(u_low, v_g_coeff_2),
             ),
-            rounding_const,
-        ));
+        );
 
         let r_values = _mm_packus_epi16(r_low, zeros);
         let g_values = _mm_packus_epi16(g_low, zeros);
diff --git a/src/sse/yuv_to_rgba420.rs b/src/sse/yuv_to_rgba420.rs
index bf45971..4b0f599 100644
--- a/src/sse/yuv_to_rgba420.rs
+++ b/src/sse/yuv_to_rgba420.rs
@@ -80,8 +80,7 @@ unsafe fn sse_yuv_to_rgba_row_impl420<const DESTINATION_CHANNELS: u8>(
     let u_ptr = u_plane.as_ptr();
     let v_ptr = v_plane.as_ptr();
 
-    const SCALE: i32 = 6;
-    const V_SHR: i32 = 3;
+    const SCALE: i32 = 3;
 
     let y_corr = _mm_set1_epi8(range.bias_y as i8);
     let uv_corr = _mm_set1_epi16(range.bias_uv as i16);
@@ -91,7 +90,6 @@ unsafe fn sse_yuv_to_rgba_row_impl420<const DESTINATION_CHANNELS: u8>(
     let v_g_coeff_1 = _mm_set1_epi16(transform.g_coeff_1 as i16);
     let v_g_coeff_2 = _mm_set1_epi16(transform.g_coeff_2 as i16);
     let v_alpha = _mm_set1_epi8(255u8 as i8);
-    let rounding_const = _mm_set1_epi16(1 << (V_SHR - 1));
 
     let zeros = _mm_setzero_si128();
     let reshuffle = _mm_setr_epi8(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7);
@@ -130,31 +128,13 @@ unsafe fn sse_yuv_to_rgba_row_impl420<const DESTINATION_CHANNELS: u8>(
             _mm_mulhrs_epi16(u_high, v_g_coeff_2),
         );
 
-        let r_high0 = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_add_epi16(y_high0, _mm_mulhrs_epi16(v_high, v_cr_coeff)),
-            rounding_const,
-        ));
-        let b_high0 = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_add_epi16(y_high0, _mm_mulhrs_epi16(u_high, v_cb_coeff)),
-            rounding_const,
-        ));
-        let g_high0 = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_sub_epi16(y_high0, g_coeff_hi),
-            rounding_const,
-        ));
-
-        let r_high1 = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_add_epi16(y_high1, _mm_mulhrs_epi16(v_high, v_cr_coeff)),
-            rounding_const,
-        ));
-        let b_high1 = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_add_epi16(y_high1, _mm_mulhrs_epi16(u_high, v_cb_coeff)),
-            rounding_const,
-        ));
-        let g_high1 = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_sub_epi16(y_high1, g_coeff_hi),
-            rounding_const,
-        ));
+        let r_high0 = _mm_add_epi16(y_high0, _mm_mulhrs_epi16(v_high, v_cr_coeff));
+        let b_high0 = _mm_add_epi16(y_high0, _mm_mulhrs_epi16(u_high, v_cb_coeff));
+        let g_high0 = _mm_sub_epi16(y_high0, g_coeff_hi);
+
+        let r_high1 = _mm_add_epi16(y_high1, _mm_mulhrs_epi16(v_high, v_cr_coeff));
+        let b_high1 = _mm_add_epi16(y_high1, _mm_mulhrs_epi16(u_high, v_cb_coeff));
+        let g_high1 = _mm_sub_epi16(y_high1, g_coeff_hi);
 
         let u_low = _mm_slli_epi16::<SCALE>(_mm_sub_epi16(u_low_u16, uv_corr));
         let v_low = _mm_slli_epi16::<SCALE>(_mm_sub_epi16(v_low_u16, uv_corr));
@@ -172,31 +152,13 @@ unsafe fn sse_yuv_to_rgba_row_impl420<const DESTINATION_CHANNELS: u8>(
             _mm_mulhrs_epi16(u_low, v_g_coeff_2),
         );
 
-        let r_low0 = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_add_epi16(y_low0, _mm_mulhrs_epi16(v_low, v_cr_coeff)),
-            rounding_const,
-        ));
-        let b_low0 = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_add_epi16(y_low0, _mm_mulhrs_epi16(u_low, v_cb_coeff)),
-            rounding_const,
-        ));
-        let g_low0 = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_sub_epi16(y_low0, g_coeff_lo),
-            rounding_const,
-        ));
-
-        let r_low1 = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_add_epi16(y_low1, _mm_mulhrs_epi16(v_low, v_cr_coeff)),
-            rounding_const,
-        ));
-        let b_low1 = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_add_epi16(y_low1, _mm_mulhrs_epi16(u_low, v_cb_coeff)),
-            rounding_const,
-        ));
-        let g_low1 = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_sub_epi16(y_low1, g_coeff_lo),
-            rounding_const,
-        ));
+        let r_low0 = _mm_add_epi16(y_low0, _mm_mulhrs_epi16(v_low, v_cr_coeff));
+        let b_low0 = _mm_add_epi16(y_low0, _mm_mulhrs_epi16(u_low, v_cb_coeff));
+        let g_low0 = _mm_sub_epi16(y_low0, g_coeff_lo);
+
+        let r_low1 = _mm_add_epi16(y_low1, _mm_mulhrs_epi16(v_low, v_cr_coeff));
+        let b_low1 = _mm_add_epi16(y_low1, _mm_mulhrs_epi16(u_low, v_cb_coeff));
+        let g_low1 = _mm_sub_epi16(y_low1, g_coeff_lo);
 
         let r_values0 = _mm_packus_epi16(r_low0, r_high0);
         let g_values0 = _mm_packus_epi16(g_low0, g_high0);
@@ -318,31 +280,13 @@ unsafe fn sse_yuv_to_rgba_row_impl420<const DESTINATION_CHANNELS: u8>(
             _mm_mulhrs_epi16(u_low, v_g_coeff_2),
         );
 
-        let r_low0 = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_add_epi16(y_low0, _mm_mulhrs_epi16(v_low, v_cr_coeff)),
-            rounding_const,
-        ));
-        let b_low0 = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_add_epi16(y_low0, _mm_mulhrs_epi16(u_low, v_cb_coeff)),
-            rounding_const,
-        ));
-        let g_low0 = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_sub_epi16(y_low0, g_coeff),
-            rounding_const,
-        ));
-
-        let r_low1 = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_add_epi16(y_low1, _mm_mulhrs_epi16(v_low, v_cr_coeff)),
-            rounding_const,
-        ));
-        let b_low1 = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_add_epi16(y_low1, _mm_mulhrs_epi16(u_low, v_cb_coeff)),
-            rounding_const,
-        ));
-        let g_low1 = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_sub_epi16(y_low1, g_coeff),
-            rounding_const,
-        ));
+        let r_low0 = _mm_add_epi16(y_low0, _mm_mulhrs_epi16(v_low, v_cr_coeff));
+        let b_low0 = _mm_add_epi16(y_low0, _mm_mulhrs_epi16(u_low, v_cb_coeff));
+        let g_low0 = _mm_sub_epi16(y_low0, g_coeff);
+
+        let r_low1 = _mm_add_epi16(y_low1, _mm_mulhrs_epi16(v_low, v_cr_coeff));
+        let b_low1 = _mm_add_epi16(y_low1, _mm_mulhrs_epi16(u_low, v_cb_coeff));
+        let g_low1 = _mm_sub_epi16(y_low1, g_coeff);
 
         let r_values0 = _mm_packus_epi16(r_low0, zeros);
         let g_values0 = _mm_packus_epi16(g_low0, zeros);
diff --git a/src/sse/yuv_to_rgba_alpha.rs b/src/sse/yuv_to_rgba_alpha.rs
index bfefd6d..76b2c90 100644
--- a/src/sse/yuv_to_rgba_alpha.rs
+++ b/src/sse/yuv_to_rgba_alpha.rs
@@ -94,8 +94,7 @@ unsafe fn sse_yuv_to_rgba_alpha_row_impl<const DESTINATION_CHANNELS: u8, const S
     let a_ptr = a_plane.as_ptr();
     let rgba_ptr = rgba.as_mut_ptr();
 
-    const SCALE: i32 = 6;
-    const V_SHR: i32 = 3;
+    const SCALE: i32 = 3;
 
     let y_corr = _mm_set1_epi8(range.bias_y as i8);
     let uv_corr = _mm_set1_epi16(range.bias_uv as i16);
@@ -104,7 +103,6 @@ unsafe fn sse_yuv_to_rgba_alpha_row_impl<const DESTINATION_CHANNELS: u8, const S
     let v_cb_coeff = _mm_set1_epi16(transform.cb_coef as i16);
     let v_g_coeff_1 = _mm_set1_epi16(transform.g_coeff_1 as i16);
     let v_g_coeff_2 = _mm_set1_epi16(transform.g_coeff_2 as i16);
-    let rounding_const = _mm_set1_epi16(1 << (V_SHR - 1));
 
     let zeros = _mm_setzero_si128();
 
@@ -145,24 +143,15 @@ unsafe fn sse_yuv_to_rgba_alpha_row_impl<const DESTINATION_CHANNELS: u8, const S
             v_luma_coeff,
         );
 
-        let r_high = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_add_epi16(y_high, _mm_mulhrs_epi16(v_high, v_cr_coeff)),
-            rounding_const,
-        ));
-        let b_high = _mm_srai_epi16::<V_SHR>(_mm_adds_epi16(
-            _mm_add_epi16(y_high, _mm_mulhrs_epi16(u_high, v_cb_coeff)),
-            rounding_const,
-        ));
-        let g_high = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_sub_epi16(
-                y_high,
-                _mm_add_epi16(
-                    _mm_mulhrs_epi16(v_high, v_g_coeff_1),
-                    _mm_mulhrs_epi16(u_high, v_g_coeff_2),
-                ),
+        let r_high = _mm_add_epi16(y_high, _mm_mulhrs_epi16(v_high, v_cr_coeff));
+        let b_high = _mm_add_epi16(y_high, _mm_mulhrs_epi16(u_high, v_cb_coeff));
+        let g_high = _mm_sub_epi16(
+            y_high,
+            _mm_add_epi16(
+                _mm_mulhrs_epi16(v_high, v_g_coeff_1),
+                _mm_mulhrs_epi16(u_high, v_g_coeff_2),
             ),
-            rounding_const,
-        ));
+        );
 
         let u_low = _mm_slli_epi16::<SCALE>(_mm_sub_epi16(u_low_u16, uv_corr));
         let v_low = _mm_slli_epi16::<SCALE>(_mm_sub_epi16(v_low_u16, uv_corr));
@@ -171,24 +160,15 @@ unsafe fn sse_yuv_to_rgba_alpha_row_impl<const DESTINATION_CHANNELS: u8, const S
             v_luma_coeff,
         );
 
-        let r_low = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_add_epi16(y_low, _mm_mulhrs_epi16(v_low, v_cr_coeff)),
-            rounding_const,
-        ));
-        let b_low = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_add_epi16(y_low, _mm_mulhrs_epi16(u_low, v_cb_coeff)),
-            rounding_const,
-        ));
-        let g_low = _mm_srai_epi16::<V_SHR>(_mm_add_epi16(
-            _mm_sub_epi16(
-                y_low,
-                _mm_add_epi16(
-                    _mm_mulhrs_epi16(v_low, v_g_coeff_1),
-                    _mm_mulhrs_epi16(u_low, v_g_coeff_2),
-                ),
+        let r_low = _mm_add_epi16(y_low, _mm_mulhrs_epi16(v_low, v_cr_coeff));
+        let b_low = _mm_add_epi16(y_low, _mm_mulhrs_epi16(u_low, v_cb_coeff));
+        let g_low = _mm_sub_epi16(
+            y_low,
+            _mm_add_epi16(
+                _mm_mulhrs_epi16(v_low, v_g_coeff_1),
+                _mm_mulhrs_epi16(u_low, v_g_coeff_2),
             ),
-            rounding_const,
-        ));
+        );
 
         let (r_values, g_values, b_values);