Improvements

awxkee · Nov 25, 2024 · b666b69
1 parent 454265a
commit b666b69
Show file tree

Hide file tree

Showing 11 changed files with 110 additions and 130 deletions.
diff --git a/README.md b/README.md
@@ -76,13 +76,13 @@ Tests performed on the image 5763x3842
 
 |                        | time(NEON) | Time(AVX) |
 |------------------------|:----------:|:---------:|
-| utils RGB->YUV 4:2:0   |   3.48ms   |  3.53ms   |
+| utils RGB->YUV 4:2:0   |   3.23ms   |  3.53ms   |
 | libyuv RGB->YUV 4:2:0  |   3.58ms   |  33.87ms  |
-| utils RGBA->YUV 4:2:0  |   4.32ms   |  5.47ms   |
+| utils RGBA->YUV 4:2:0  |   4.10ms   |  5.47ms   |
 | libyuv RGBA->YUV 4:2:0 |   4.87ms   |  23.48ms  |
-| utils RGBA->YUV 4:2:2  |   4.83ms   |  7.08ms   |
+| utils RGBA->YUV 4:2:2  |   4.50ms   |  7.08ms   |
 | libyuv RGBA->YUV 4:2:2 |   5.90ms   |  35.23ms  |
-| utils RGBA->YUV 4:4:4  |   5.34ms   |  7.97ms   |
+| utils RGBA->YUV 4:4:4  |   4.77ms   |  7.97ms   |
 
 ### Decoding
 
@@ -92,7 +92,7 @@ Tests performed on the image 5763x3842
 | libyuv YUV NV12->RGB   |   5.20ms   |  45.28ms  |
 | utils YUV 4:2:0->RGB   |   3.28ms   |  5.25ms   |
 | libyuv YUV 4:2:0->RGB  |   5.70ms   |  44.95ms  |
-| utils YUV 4:2:0->RGBA  |   3.82ms   |  5.98ms   |
+| utils YUV 4:2:0->RGBA  |   3.77ms   |  5.98ms   |
 | libyuv YUV 4:2:0->RGBA |   6.13ms   |  6.88ms   |
 | utils YUV 4:2:2->RGBA  |   4.88ms   |  6.91ms   |
 | libyuv YUV 4:2:2->RGBA |   5.91ms   |  6.91ms   |

diff --git a/src/neon/rgb_to_y.rs b/src/neon/rgb_to_y.rs
@@ -41,10 +41,8 @@ pub(crate) unsafe fn neon_rgb_to_y_row<const ORIGIN_CHANNELS: u8>(
 ) -> usize {
     let source_channels: YuvSourceChannels = ORIGIN_CHANNELS.into();
     let channels = source_channels.get_channels_count();
-    const V_SHR: i32 = 4;
-    const V_SCALE: i32 = 7;
-    let rounding_const_bias: i16 = 1 << (V_SHR - 1);
-    let bias_y = range.bias_y as i16 * (1 << V_SHR) + rounding_const_bias;
+    const V_SCALE: i32 = 3;
+    let bias_y = range.bias_y as i16;
 
     let y_ptr = y_plane;
     let rgba_ptr = rgba.as_ptr();
@@ -101,7 +99,7 @@ pub(crate) unsafe fn neon_rgb_to_y_row<const ORIGIN_CHANNELS: u8>(
         y_high = vmaxq_s16(y_high, v_zeros);
 
         let y_high = vminq_u16(
-            vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(y_high), i_bias_y)),
+            vreinterpretq_u16_s16(vmaxq_s16((y_high), i_bias_y)),
             i_cap_y,
         );
 
@@ -114,10 +112,7 @@ pub(crate) unsafe fn neon_rgb_to_y_row<const ORIGIN_CHANNELS: u8>(
         y_low = vqrdmlahq_s16(y_low, b_low, v_yb);
         y_low = vmaxq_s16(y_low, v_zeros);
 
-        let y_low = vminq_u16(
-            vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(y_low), i_bias_y)),
-            i_cap_y,
-        );
+        let y_low = vminq_u16(vreinterpretq_u16_s16(vmaxq_s16((y_low), i_bias_y)), i_cap_y);
 
         let y = vcombine_u8(vqmovn_u16(y_low), vqmovn_u16(y_high));
         vst1q_u8(y_ptr.add(cx), y);

diff --git a/src/neon/rgba_to_nv.rs b/src/neon/rgba_to_nv.rs
@@ -55,11 +55,9 @@ pub(crate) unsafe fn neon_rgbx_to_nv_row_rdm<
     let source_channels: YuvSourceChannels = ORIGIN_CHANNELS.into();
     let channels = source_channels.get_channels_count();
 
-    const V_SHR: i32 = 4;
-    const V_SCALE: i32 = 7;
-    let rounding_const_bias: i16 = 1 << (V_SHR - 1);
-    let bias_y = range.bias_y as i16 * (1 << V_SHR) + rounding_const_bias;
-    let bias_uv = range.bias_uv as i16 * (1 << V_SHR) + rounding_const_bias;
+    const V_SCALE: i32 = 3;
+    let bias_y = range.bias_y as i16;
+    let bias_uv = range.bias_uv as i16;
 
     let y_ptr = y_plane.as_mut_ptr();
     let uv_ptr = uv_plane.as_mut_ptr();
@@ -129,7 +127,7 @@ pub(crate) unsafe fn neon_rgbx_to_nv_row_rdm<
         y_high = vqrdmlahq_laneq_s16::<2>(y_high, b_high, v_weights);
 
         let y_high = vminq_u16(
-            vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(y_high), i_bias_y)),
+            vreinterpretq_u16_s16(vmaxq_s16((y_high), i_bias_y)),
             i_cap_y,
         );
 
@@ -141,10 +139,7 @@ pub(crate) unsafe fn neon_rgbx_to_nv_row_rdm<
         y_low = vqrdmlahq_laneq_s16::<1>(y_low, g_low, v_weights);
         y_low = vqrdmlahq_laneq_s16::<2>(y_low, b_low, v_weights);
 
-        let y_low = vminq_u16(
-            vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(y_low), i_bias_y)),
-            i_cap_y,
-        );
+        let y_low = vminq_u16(vreinterpretq_u16_s16(vmaxq_s16((y_low), i_bias_y)), i_cap_y);
 
         let y = vcombine_u8(vqmovn_u16(y_low), vqmovn_u16(y_high));
         vst1q_u8(y_ptr.add(cx), y);
@@ -155,7 +150,7 @@ pub(crate) unsafe fn neon_rgbx_to_nv_row_rdm<
             cb_high = vqrdmlahq_laneq_s16::<5>(cb_high, b_high, v_weights);
 
             let cb_high = vminq_u16(
-                vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(cb_high), i_bias_y)),
+                vreinterpretq_u16_s16(vmaxq_s16((cb_high), i_bias_y)),
                 i_cap_uv,
             );
 
@@ -164,7 +159,7 @@ pub(crate) unsafe fn neon_rgbx_to_nv_row_rdm<
             cr_high = vqrdmlahq_laneq_s16::<0>(cr_high, b_high, v_cr_b);
 
             let cr_high = vminq_u16(
-                vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(cr_high), i_bias_y)),
+                vreinterpretq_u16_s16(vmaxq_s16((cr_high), i_bias_y)),
                 i_cap_uv,
             );
 
@@ -173,7 +168,7 @@ pub(crate) unsafe fn neon_rgbx_to_nv_row_rdm<
             cb_low = vqrdmlahq_laneq_s16::<5>(cb_low, b_low, v_weights);
 
             let cb_low = vminq_u16(
-                vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(cb_low), i_bias_y)),
+                vreinterpretq_u16_s16(vmaxq_s16((cb_low), i_bias_y)),
                 i_cap_uv,
             );
 
@@ -182,7 +177,7 @@ pub(crate) unsafe fn neon_rgbx_to_nv_row_rdm<
             cr_low = vqrdmlahq_laneq_s16::<0>(cr_low, b_low, v_cr_b);
 
             let cr_low = vminq_u16(
-                vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(cr_low), i_bias_y)),
+                vreinterpretq_u16_s16(vmaxq_s16((cr_low), i_bias_y)),
                 i_cap_uv,
             );
             let cb = vcombine_u8(vqmovn_u16(cb_low), vqmovn_u16(cb_high));
@@ -218,7 +213,7 @@ pub(crate) unsafe fn neon_rgbx_to_nv_row_rdm<
             cbl = vqrdmlahq_laneq_s16::<5>(cbl, b1, v_weights);
 
             let cb = vqmovn_u16(vminq_u16(
-                vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(cbl), i_bias_y)),
+                vreinterpretq_u16_s16(vmaxq_s16((cbl), i_bias_y)),
                 i_cap_uv,
             ));
 
@@ -227,7 +222,7 @@ pub(crate) unsafe fn neon_rgbx_to_nv_row_rdm<
             crl = vqrdmlahq_laneq_s16::<0>(crl, b1, v_cr_b);
 
             let cr = vqmovn_u16(vminq_u16(
-                vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(crl), i_bias_y)),
+                vreinterpretq_u16_s16(vmaxq_s16((crl), i_bias_y)),
                 i_cap_uv,
             ));
 

diff --git a/src/neon/rgba_to_yuv.rs b/src/neon/rgba_to_yuv.rs
@@ -53,11 +53,9 @@ pub(crate) unsafe fn neon_rgba_to_yuv_rdm<
     let source_channels: YuvSourceChannels = ORIGIN_CHANNELS.into();
     let channels = source_channels.get_channels_count();
 
-    const V_SHR: i32 = 4;
-    const V_SCALE: i32 = 7;
-    let rounding_const_bias: i16 = 1 << (V_SHR - 1);
-    let bias_y = range.bias_y as i16 * (1 << V_SHR) + rounding_const_bias;
-    let bias_uv = range.bias_uv as i16 * (1 << V_SHR) + rounding_const_bias;
+    const V_SCALE: i32 = 3;
+    let bias_y = range.bias_y as i16;
+    let bias_uv = range.bias_uv as i16;
 
     let y_ptr = y_plane;
     let u_ptr = u_plane;
@@ -128,7 +126,7 @@ pub(crate) unsafe fn neon_rgba_to_yuv_rdm<
         y_high = vqrdmlahq_laneq_s16::<2>(y_high, b0hi, v_weights);
 
         let y_high = vminq_u16(
-            vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(y_high), i_bias_y)),
+            vreinterpretq_u16_s16(vmaxq_s16((y_high), i_bias_y)),
             i_cap_y,
         );
 
@@ -140,10 +138,7 @@ pub(crate) unsafe fn neon_rgba_to_yuv_rdm<
         y_low = vqrdmlahq_laneq_s16::<1>(y_low, g_low, v_weights);
         y_low = vqrdmlahq_laneq_s16::<2>(y_low, b_low, v_weights);
 
-        let y_low = vminq_u16(
-            vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(y_low), i_bias_y)),
-            i_cap_y,
-        );
+        let y_low = vminq_u16(vreinterpretq_u16_s16(vmaxq_s16((y_low), i_bias_y)), i_cap_y);
 
         let y = vcombine_u8(vqmovn_u16(y_low), vqmovn_u16(y_high));
         vst1q_u8(y_ptr.get_unchecked_mut(cx..).as_mut_ptr(), y);
@@ -154,7 +149,7 @@ pub(crate) unsafe fn neon_rgba_to_yuv_rdm<
             cb_high = vqrdmlahq_laneq_s16::<5>(cb_high, b0hi, v_weights);
 
             let cb_high = vminq_u16(
-                vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(cb_high), i_bias_y)),
+                vreinterpretq_u16_s16(vmaxq_s16((cb_high), i_bias_y)),
                 i_cap_uv,
             );
 
@@ -163,7 +158,7 @@ pub(crate) unsafe fn neon_rgba_to_yuv_rdm<
             cr_high = vqrdmlahq_laneq_s16::<0>(cr_high, b0hi, v_cr_b);
 
             let cr_high = vminq_u16(
-                vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(cr_high), i_bias_y)),
+                vreinterpretq_u16_s16(vmaxq_s16((cr_high), i_bias_y)),
                 i_cap_uv,
             );
 
@@ -172,7 +167,7 @@ pub(crate) unsafe fn neon_rgba_to_yuv_rdm<
             cb_low = vqrdmlahq_laneq_s16::<5>(cb_low, b_low, v_weights);
 
             let cb_low = vminq_u16(
-                vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(cb_low), i_bias_y)),
+                vreinterpretq_u16_s16(vmaxq_s16((cb_low), i_bias_y)),
                 i_cap_uv,
             );
 
@@ -181,7 +176,7 @@ pub(crate) unsafe fn neon_rgba_to_yuv_rdm<
             cr_low = vqrdmlahq_laneq_s16::<0>(cr_low, b_low, v_cr_b);
 
             let cr_low = vminq_u16(
-                vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(cr_low), i_bias_y)),
+                vreinterpretq_u16_s16(vmaxq_s16((cr_low), i_bias_y)),
                 i_cap_uv,
             );
             let cb = vcombine_u8(vqmovn_u16(cb_low), vqmovn_u16(cb_high));
@@ -209,7 +204,7 @@ pub(crate) unsafe fn neon_rgba_to_yuv_rdm<
             cbl = vqrdmlahq_laneq_s16::<5>(cbl, b1, v_weights);
 
             let cb = vqmovn_u16(vminq_u16(
-                vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(cbl), i_bias_y)),
+                vreinterpretq_u16_s16(vmaxq_s16((cbl), i_bias_y)),
                 i_cap_uv,
             ));
 
@@ -218,7 +213,7 @@ pub(crate) unsafe fn neon_rgba_to_yuv_rdm<
             crl = vqrdmlahq_laneq_s16::<0>(crl, b1, v_cr_b);
 
             let cr = vqmovn_u16(vminq_u16(
-                vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(crl), i_bias_y)),
+                vreinterpretq_u16_s16(vmaxq_s16((crl), i_bias_y)),
                 i_cap_uv,
             ));
 

diff --git a/src/neon/rgba_to_yuv420.rs b/src/neon/rgba_to_yuv420.rs
@@ -48,11 +48,9 @@ pub(crate) unsafe fn neon_rgba_to_yuv_rdm420<const ORIGIN_CHANNELS: u8, const PR
     let source_channels: YuvSourceChannels = ORIGIN_CHANNELS.into();
     let channels = source_channels.get_channels_count();
 
-    const V_SHR: i32 = 4;
-    const V_SCALE: i32 = 7;
-    let rounding_const_bias: i16 = 1 << (V_SHR - 1);
-    let bias_y = range.bias_y as i16 * (1 << V_SHR) + rounding_const_bias;
-    let bias_uv = range.bias_uv as i16 * (1 << V_SHR) + rounding_const_bias;
+    const V_SCALE: i32 = 3;
+    let bias_y = range.bias_y as i16;
+    let bias_uv = range.bias_uv as i16;
 
     let u_ptr = u_plane;
     let v_ptr = v_plane;
@@ -150,7 +148,7 @@ pub(crate) unsafe fn neon_rgba_to_yuv_rdm420<const ORIGIN_CHANNELS: u8, const PR
         y0_high = vqrdmlahq_laneq_s16::<2>(y0_high, b0hi, v_weights);
 
         let y0_high = vminq_u16(
-            vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(y0_high), i_bias_y)),
+            vreinterpretq_u16_s16(vmaxq_s16((y0_high), i_bias_y)),
             i_cap_y,
         );
 
@@ -159,7 +157,7 @@ pub(crate) unsafe fn neon_rgba_to_yuv_rdm420<const ORIGIN_CHANNELS: u8, const PR
         y1_high = vqrdmlahq_laneq_s16::<2>(y1_high, b1hi, v_weights);
 
         let y1_high = vminq_u16(
-            vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(y1_high), i_bias_y)),
+            vreinterpretq_u16_s16(vmaxq_s16((y1_high), i_bias_y)),
             i_cap_y,
         );
 
@@ -176,7 +174,7 @@ pub(crate) unsafe fn neon_rgba_to_yuv_rdm420<const ORIGIN_CHANNELS: u8, const PR
         y0_low = vqrdmlahq_laneq_s16::<2>(y0_low, b0_low, v_weights);
 
         let y0_low = vminq_u16(
-            vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(y0_low), i_bias_y)),
+            vreinterpretq_u16_s16(vmaxq_s16((y0_low), i_bias_y)),
             i_cap_y,
         );
 
@@ -185,7 +183,7 @@ pub(crate) unsafe fn neon_rgba_to_yuv_rdm420<const ORIGIN_CHANNELS: u8, const PR
         y1_low = vqrdmlahq_laneq_s16::<2>(y1_low, b1_low, v_weights);
 
         let y1_low = vminq_u16(
-            vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(y1_low), i_bias_y)),
+            vreinterpretq_u16_s16(vmaxq_s16((y1_low), i_bias_y)),
             i_cap_y,
         );
 
@@ -207,7 +205,7 @@ pub(crate) unsafe fn neon_rgba_to_yuv_rdm420<const ORIGIN_CHANNELS: u8, const PR
         cbl = vqrdmlahq_laneq_s16::<5>(cbl, b1, v_weights);
 
         let cb = vqmovn_u16(vminq_u16(
-            vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(cbl), i_bias_y)),
+            vreinterpretq_u16_s16(vmaxq_s16((cbl), i_bias_y)),
             i_cap_uv,
         ));
 
@@ -216,7 +214,7 @@ pub(crate) unsafe fn neon_rgba_to_yuv_rdm420<const ORIGIN_CHANNELS: u8, const PR
         crl = vqrdmlahq_laneq_s16::<0>(crl, b1, v_cr_b);
 
         let cr = vqmovn_u16(vminq_u16(
-            vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(crl), i_bias_y)),
+            vreinterpretq_u16_s16(vmaxq_s16((crl), i_bias_y)),
             i_cap_uv,
         ));
 

diff --git a/src/neon/y_to_rgb.rs b/src/neon/y_to_rgb.rs
@@ -51,22 +51,24 @@ pub(crate) unsafe fn neon_y_to_rgb_row_rdm<const DESTINATION_CHANNELS: u8>(
 
     let mut cx = start_cx;
 
+    const V_SCALE: i32 = 3;
+
     while cx + 16 < width {
         let y_values = vsubq_u8(vld1q_u8(y_ptr.add(cx)), y_corr);
 
         let y_high = vqrdmulhq_n_s16(
-            vreinterpretq_s16_u16(vshll_high_n_u8::<7>(y_values)),
+            vreinterpretq_s16_u16(vshll_high_n_u8::<V_SCALE>(y_values)),
             transform.y_coef as i16,
         );
 
-        let r_high = vqrshrun_n_s16::<4>(y_high);
+        let r_high = vqmovun_s16(y_high);
 
         let y_low = vqrdmulhq_n_s16(
-            vreinterpretq_s16_u16(vshll_n_u8::<7>(vget_low_u8(y_values))),
+            vreinterpretq_s16_u16(vshll_n_u8::<V_SCALE>(vget_low_u8(y_values))),
             transform.y_coef as i16,
         );
 
-        let r_low = vqrshrun_n_s16::<4>(y_low);
+        let r_low = vqmovun_s16(y_low);
 
         let r_values = vcombine_u8(r_low, r_high);
 

diff --git a/src/neon/yuv_nv_to_rgba.rs b/src/neon/yuv_nv_to_rgba.rs
@@ -78,8 +78,7 @@ pub(crate) unsafe fn neon_yuv_nv_to_rgba_row_rdm<
 
     let v_weights = vld1q_s16(weights_arr.as_ptr());
 
-    const SCALE: i32 = 7;
-    const V_SHR: i32 = 4;
+    const SCALE: i32 = 3;
 
     while cx + 16 < width {
         let y_values = vqsubq_u8(vld1q_u8(y_ptr.add(cx)), y_corr);
@@ -126,9 +125,9 @@ pub(crate) unsafe fn neon_yuv_nv_to_rgba_row_rdm<
             v_weights,
         );
 
-        let r_high = vqrshrun_n_s16::<V_SHR>(vqrdmlahq_laneq_s16::<1>(y_high, v_high, v_weights));
-        let b_high = vqrshrun_n_s16::<V_SHR>(vqrdmlahq_laneq_s16::<2>(y_high, u_high, v_weights));
-        let g_high = vqrshrun_n_s16::<V_SHR>(vqrdmlahq_laneq_s16::<4>(
+        let r_high = vqmovun_s16(vqrdmlahq_laneq_s16::<1>(y_high, v_high, v_weights));
+        let b_high = vqmovun_s16(vqrdmlahq_laneq_s16::<2>(y_high, u_high, v_weights));
+        let g_high = vqmovun_s16(vqrdmlahq_laneq_s16::<4>(
             vqrdmlahq_laneq_s16::<3>(y_high, v_high, v_weights),
             u_high,
             v_weights,
@@ -144,9 +143,9 @@ pub(crate) unsafe fn neon_yuv_nv_to_rgba_row_rdm<
         let y_v_shl = vshll_n_u8::<SCALE>(vget_low_u8(y_values));
         let y_low = vqrdmulhq_laneq_s16::<0>(vreinterpretq_s16_u16(y_v_shl), v_weights);
 
-        let r_low = vqrshrun_n_s16::<V_SHR>(vqrdmlahq_laneq_s16::<1>(y_low, v_low, v_weights));
-        let b_low = vqrshrun_n_s16::<V_SHR>(vqrdmlahq_laneq_s16::<2>(y_low, u_low, v_weights));
-        let g_low = vqrshrun_n_s16::<V_SHR>(vqrdmlahq_laneq_s16::<4>(
+        let r_low = vqmovun_s16(vqrdmlahq_laneq_s16::<1>(y_low, v_low, v_weights));
+        let b_low = vqmovun_s16(vqrdmlahq_laneq_s16::<2>(y_low, u_low, v_weights));
+        let g_low = vqmovun_s16(vqrdmlahq_laneq_s16::<4>(
             vqrdmlahq_laneq_s16::<3>(y_low, v_low, v_weights),
             u_low,
             v_weights,
@@ -235,9 +234,9 @@ pub(crate) unsafe fn neon_yuv_nv_to_rgba_row_rdm<
             v_weights,
         );
 
-        let r_low = vqrshrun_n_s16::<V_SHR>(vqrdmlahq_laneq_s16::<1>(y_low, v_low, v_weights));
-        let b_low = vqrshrun_n_s16::<V_SHR>(vqrdmlahq_laneq_s16::<2>(y_low, u_low, v_weights));
-        let g_low = vqrshrun_n_s16::<V_SHR>(vqrdmlahq_laneq_s16::<4>(
+        let r_low = vqmovun_s16(vqrdmlahq_laneq_s16::<1>(y_low, v_low, v_weights));
+        let b_low = vqmovun_s16(vqrdmlahq_laneq_s16::<2>(y_low, u_low, v_weights));
+        let g_low = vqmovun_s16(vqrdmlahq_laneq_s16::<4>(
             vqrdmlahq_laneq_s16::<3>(y_low, v_low, v_weights),
             u_low,
             v_weights,