From 79008a5c34b949f4c2561f890bc4e569657e98ac Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Mon, 25 Nov 2024 01:08:47 +0000 Subject: [PATCH] NEON improvements --- README.md | 12 ++-- app/benches/yuv8/main.rs | 62 ++++++++-------- src/neon/rgba_to_nv.rs | 68 +++++++++--------- src/neon/rgba_to_yuv.rs | 74 ++++++++++--------- src/neon/yuv_nv_to_rgba.rs | 39 +++++----- src/neon/yuv_nv_to_rgba420.rs | 112 +++++++++++------------------ src/neon/yuv_to_rgba.rs | 97 ++++++++++++------------- src/neon/yuv_to_rgba420.rs | 131 +++++++++++++--------------------- src/neon/yuv_to_rgba_alpha.rs | 72 +++++++++---------- 9 files changed, 304 insertions(+), 363 deletions(-) diff --git a/README.md b/README.md index 3d9a94c..5112d8a 100644 --- a/README.md +++ b/README.md @@ -76,7 +76,7 @@ Tests performed on the image 5763x3842 | | time(NEON) | Time(AVX) | |------------------------|:----------:|:---------:| -| utils RGB->YUV 4:2:0 | 4.37ms | 6.14ms | +| utils RGB->YUV 4:2:0 | 4.09ms | 6.14ms | | libyuv RGB->YUV 4:2:0 | 3.66ms | 33.87ms | | utils RGBA->YUV 4:2:0 | 4.88ms | 7.34ms | | libyuv RGBA->YUV 4:2:0 | 4.87ms | 23.48ms | @@ -88,15 +88,15 @@ Tests performed on the image 5763x3842 | | time(NEON) | Time(AVX) | |------------------------|:----------:|:---------:| -| utils YUV NV12->RGB | 4.08ms | 6.48ms | +| utils YUV NV12->RGB | 3.92ms | 6.48ms | | libyuv YUV NV12->RGB | 5.20ms | 45.28ms | -| utils YUV 4:2:0->RGB | 3.49ms | 5.44ms | +| utils YUV 4:2:0->RGB | 3.28ms | 5.44ms | | libyuv YUV 4:2:0->RGB | 5.70ms | 44.95ms | -| utils YUV 4:2:0->RGBA | 4.02ms | 5.98ms | +| utils YUV 4:2:0->RGBA | 3.85ms | 5.98ms | | libyuv YUV 4:2:0->RGBA | 6.13ms | 6.88ms | -| utils YUV 4:2:2->RGBA | 5.39ms | 6.91ms | +| utils YUV 4:2:2->RGBA | 4.94ms | 6.91ms | | libyuv YUV 4:2:2->RGBA | 5.91ms | 6.91ms | -| utils YUV 4:4:4->RGBA | 5.04ms | 7.20ms | +| utils YUV 4:4:4->RGBA | 4.83ms | 7.20ms | | libyuv YUV 4:4:4->RGBA | 4.82ms | 7.30ms | This project is licensed under either of diff --git a/app/benches/yuv8/main.rs b/app/benches/yuv8/main.rs index de8c94d..bcda09d 100644 --- a/app/benches/yuv8/main.rs +++ b/app/benches/yuv8/main.rs @@ -79,7 +79,7 @@ pub fn criterion_benchmark(c: &mut Criterion) { let fixed_planar = planar_image.to_fixed(); // let rgba_image = img.to_rgba8(); - // + // c.bench_function("yuvutils RGB -> YUV 4:2:0", |b| { // let mut test_planar = YuvPlanarImageMut::::alloc( // dimensions.0, @@ -217,36 +217,36 @@ pub fn criterion_benchmark(c: &mut Criterion) { // .unwrap(); // }) // }); - - c.bench_function("yuvutils YUV NV12 -> RGB", |b| { - let mut rgb_bytes = vec![0u8; dimensions.0 as usize * 4 * dimensions.1 as usize]; - b.iter(|| { - yuv_nv12_to_rgba( - &fixed_bi_planar, - &mut rgb_bytes, - dimensions.0 * 4u32, - YuvRange::Limited, - YuvStandardMatrix::Bt601, - ) - .unwrap(); - }) - }); - - c.bench_function("livyuv YUV NV12 -> RGB", |b| { - let mut rgb_bytes = vec![0u8; dimensions.0 as usize * 4 * dimensions.1 as usize]; - b.iter(|| unsafe { - rs_NV21ToABGR( - fixed_bi_planar.y_plane.as_ptr(), - fixed_bi_planar.y_stride as i32, - fixed_bi_planar.uv_plane.as_ptr(), - fixed_bi_planar.uv_stride as i32, - rgb_bytes.as_mut_ptr(), - dimensions.0 as i32 * 4, - fixed_bi_planar.width as i32, - fixed_bi_planar.height as i32, - ); - }) - }); + // + // c.bench_function("yuvutils YUV NV12 -> RGB", |b| { + // let mut rgb_bytes = vec![0u8; dimensions.0 as usize * 4 * dimensions.1 as usize]; + // b.iter(|| { + // yuv_nv12_to_rgba( + // &fixed_bi_planar, + // &mut rgb_bytes, + // dimensions.0 * 4u32, + // YuvRange::Limited, + // YuvStandardMatrix::Bt601, + // ) + // .unwrap(); + // }) + // }); + // + // c.bench_function("livyuv YUV NV12 -> RGB", |b| { + // let mut rgb_bytes = vec![0u8; dimensions.0 as usize * 4 * dimensions.1 as usize]; + // b.iter(|| unsafe { + // rs_NV21ToABGR( + // fixed_bi_planar.y_plane.as_ptr(), + // fixed_bi_planar.y_stride as i32, + // fixed_bi_planar.uv_plane.as_ptr(), + // fixed_bi_planar.uv_stride as i32, + // rgb_bytes.as_mut_ptr(), + // dimensions.0 as i32 * 4, + // fixed_bi_planar.width as i32, + // fixed_bi_planar.height as i32, + // ); + // }) + // }); c.bench_function("yuvutils YUV 4:2:0 -> RGB", |b| { let mut rgb_bytes = vec![0u8; dimensions.0 as usize * 3 * dimensions.1 as usize]; diff --git a/src/neon/rgba_to_nv.rs b/src/neon/rgba_to_nv.rs index bee5e49..52fd434 100644 --- a/src/neon/rgba_to_nv.rs +++ b/src/neon/rgba_to_nv.rs @@ -71,16 +71,20 @@ pub(crate) unsafe fn neon_rgbx_to_nv_row_rdm< let y_bias = vdupq_n_s16(bias_y); let uv_bias = vdupq_n_s16(bias_uv); - let v_yr = vdupq_n_s16(transform.yr as i16); - let v_yg = vdupq_n_s16(transform.yg as i16); - let v_yb = vdupq_n_s16(transform.yb as i16); - let v_cb_r = vdupq_n_s16(transform.cb_r as i16); - let v_cb_g = vdupq_n_s16(transform.cb_g as i16); - let v_cb_b = vdupq_n_s16(transform.cb_b as i16); - let v_cr_r = vdupq_n_s16(transform.cr_r as i16); - let v_cr_g = vdupq_n_s16(transform.cr_g as i16); let v_cr_b = vdupq_n_s16(transform.cr_b as i16); + let weights_arr: [i16; 8] = [ + transform.yr as i16, + transform.yg as i16, + transform.yb as i16, + transform.cb_r as i16, + transform.cb_g as i16, + transform.cb_b as i16, + transform.cr_r as i16, + transform.cr_g as i16, + ]; + let v_weights = vld1q_s16(weights_arr.as_ptr()); + let mut cx = start_cx; let mut ux = start_ux; @@ -120,9 +124,9 @@ pub(crate) unsafe fn neon_rgbx_to_nv_row_rdm< let g_high = vreinterpretq_s16_u16(vshll_high_n_u8::(g_values_u8)); let b_high = vreinterpretq_s16_u16(vshll_high_n_u8::(b_values_u8)); - let mut y_high = vqrdmlahq_s16(y_bias, r_high, v_yr); - y_high = vqrdmlahq_s16(y_high, g_high, v_yg); - y_high = vqrdmlahq_s16(y_high, b_high, v_yb); + let mut y_high = vqrdmlahq_laneq_s16::<0>(y_bias, r_high, v_weights); + y_high = vqrdmlahq_laneq_s16::<1>(y_high, g_high, v_weights); + y_high = vqrdmlahq_laneq_s16::<2>(y_high, b_high, v_weights); let y_high = vminq_u16( vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::(y_high), i_bias_y)), @@ -133,9 +137,9 @@ pub(crate) unsafe fn neon_rgbx_to_nv_row_rdm< let g_low = vreinterpretq_s16_u16(vshll_n_u8::(vget_low_u8(g_values_u8))); let b_low = vreinterpretq_s16_u16(vshll_n_u8::(vget_low_u8(b_values_u8))); - let mut y_low = vqrdmlahq_s16(y_bias, r_low, v_yr); - y_low = vqrdmlahq_s16(y_low, g_low, v_yg); - y_low = vqrdmlahq_s16(y_low, b_low, v_yb); + let mut y_low = vqrdmlahq_laneq_s16::<0>(y_bias, r_low, v_weights); + y_low = vqrdmlahq_laneq_s16::<1>(y_low, g_low, v_weights); + y_low = vqrdmlahq_laneq_s16::<2>(y_low, b_low, v_weights); let y_low = vminq_u16( vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::(y_low), i_bias_y)), @@ -146,36 +150,36 @@ pub(crate) unsafe fn neon_rgbx_to_nv_row_rdm< vst1q_u8(y_ptr.add(cx), y); if chroma_subsampling == YuvChromaSubsampling::Yuv444 { - let mut cb_high = vqrdmlahq_s16(uv_bias, r_high, v_cb_r); - cb_high = vqrdmlahq_s16(cb_high, g_high, v_cb_g); - cb_high = vqrdmlahq_s16(cb_high, b_high, v_cb_b); + let mut cb_high = vqrdmlahq_laneq_s16::<3>(uv_bias, r_high, v_weights); + cb_high = vqrdmlahq_laneq_s16::<4>(cb_high, g_high, v_weights); + cb_high = vqrdmlahq_laneq_s16::<5>(cb_high, b_high, v_weights); let cb_high = vminq_u16( vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::(cb_high), i_bias_y)), i_cap_uv, ); - let mut cr_high = vqrdmlahq_s16(uv_bias, r_high, v_cr_r); - cr_high = vqrdmlahq_s16(cr_high, g_high, v_cr_g); - cr_high = vqrdmlahq_s16(cr_high, b_high, v_cr_b); + let mut cr_high = vqrdmlahq_laneq_s16::<6>(uv_bias, r_high, v_weights); + cr_high = vqrdmlahq_laneq_s16::<7>(cr_high, g_high, v_weights); + cr_high = vqrdmlahq_laneq_s16::<0>(cr_high, b_high, v_cr_b); let cr_high = vminq_u16( vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::(cr_high), i_bias_y)), i_cap_uv, ); - let mut cb_low = vqrdmlahq_s16(uv_bias, r_low, v_cb_r); - cb_low = vqrdmlahq_s16(cb_low, g_low, v_cb_g); - cb_low = vqrdmlahq_s16(cb_low, b_low, v_cb_b); + let mut cb_low = vqrdmlahq_laneq_s16::<3>(uv_bias, r_low, v_weights); + cb_low = vqrdmlahq_laneq_s16::<4>(cb_low, g_low, v_weights); + cb_low = vqrdmlahq_laneq_s16::<5>(cb_low, b_low, v_weights); let cb_low = vminq_u16( vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::(cb_low), i_bias_y)), i_cap_uv, ); - let mut cr_low = vqrdmlahq_s16(uv_bias, r_low, v_cr_r); - cr_low = vqrdmlahq_s16(cr_low, g_low, v_cr_g); - cr_low = vqrdmlahq_s16(cr_low, b_low, v_cr_b); + let mut cr_low = vqrdmlahq_laneq_s16::<6>(uv_bias, r_low, v_weights); + cr_low = vqrdmlahq_laneq_s16::<7>(cr_low, g_low, v_weights); + cr_low = vqrdmlahq_laneq_s16::<0>(cr_low, b_low, v_cr_b); let cr_low = vminq_u16( vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::(cr_low), i_bias_y)), @@ -209,18 +213,18 @@ pub(crate) unsafe fn neon_rgbx_to_nv_row_rdm< b_values_u8, )))); - let mut cbl = vqrdmlahq_s16(uv_bias, r1, v_cb_r); - cbl = vqrdmlahq_s16(cbl, g1, v_cb_g); - cbl = vqrdmlahq_s16(cbl, b1, v_cb_b); + let mut cbl = vqrdmlahq_laneq_s16::<3>(uv_bias, r1, v_weights); + cbl = vqrdmlahq_laneq_s16::<4>(cbl, g1, v_weights); + cbl = vqrdmlahq_laneq_s16::<5>(cbl, b1, v_weights); let cb = vqmovn_u16(vminq_u16( vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::(cbl), i_bias_y)), i_cap_uv, )); - let mut crl = vqrdmlahq_s16(uv_bias, r1, v_cr_r); - crl = vqrdmlahq_s16(crl, g1, v_cr_g); - crl = vqrdmlahq_s16(crl, b1, v_cr_b); + let mut crl = vqrdmlahq_laneq_s16::<6>(uv_bias, r1, v_weights); + crl = vqrdmlahq_laneq_s16::<7>(crl, g1, v_weights); + crl = vqrdmlahq_laneq_s16::<0>(crl, b1, v_cr_b); let cr = vqmovn_u16(vminq_u16( vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::(crl), i_bias_y)), diff --git a/src/neon/rgba_to_yuv.rs b/src/neon/rgba_to_yuv.rs index b10ff2c..52faea6 100644 --- a/src/neon/rgba_to_yuv.rs +++ b/src/neon/rgba_to_yuv.rs @@ -71,16 +71,20 @@ pub(crate) unsafe fn neon_rgba_to_yuv_rdm< let y_bias = vdupq_n_s16(bias_y); let uv_bias = vdupq_n_s16(bias_uv); - let v_yr = vdupq_n_s16(transform.yr as i16); - let v_yg = vdupq_n_s16(transform.yg as i16); - let v_yb = vdupq_n_s16(transform.yb as i16); - let v_cb_r = vdupq_n_s16(transform.cb_r as i16); - let v_cb_g = vdupq_n_s16(transform.cb_g as i16); - let v_cb_b = vdupq_n_s16(transform.cb_b as i16); - let v_cr_r = vdupq_n_s16(transform.cr_r as i16); - let v_cr_g = vdupq_n_s16(transform.cr_g as i16); let v_cr_b = vdupq_n_s16(transform.cr_b as i16); + let weights_arr: [i16; 8] = [ + transform.yr as i16, + transform.yg as i16, + transform.yb as i16, + transform.cb_r as i16, + transform.cb_g as i16, + transform.cb_b as i16, + transform.cr_r as i16, + transform.cr_g as i16, + ]; + let v_weights = vld1q_s16(weights_arr.as_ptr()); + let mut cx = start_cx; let mut ux = start_ux; @@ -120,22 +124,22 @@ pub(crate) unsafe fn neon_rgba_to_yuv_rdm< let g0hi = vreinterpretq_s16_u16(vshll_high_n_u8::(g_values_u8)); let b0hi = vreinterpretq_s16_u16(vshll_high_n_u8::(b_values_u8)); - let mut y_high = vqrdmlahq_s16(y_bias, r0hi, v_yr); - y_high = vqrdmlahq_s16(y_high, g0hi, v_yg); - y_high = vqrdmlahq_s16(y_high, b0hi, v_yb); + let mut y_high = vqrdmlahq_laneq_s16::<0>(y_bias, r0hi, v_weights); + y_high = vqrdmlahq_laneq_s16::<1>(y_high, g0hi, v_weights); + y_high = vqrdmlahq_laneq_s16::<2>(y_high, b0hi, v_weights); let y_high = vminq_u16( vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::(y_high), i_bias_y)), i_cap_y, ); - let r0lo = vreinterpretq_s16_u16(vshll_n_u8::(vget_low_u8(r_values_u8))); - let g0lo = vreinterpretq_s16_u16(vshll_n_u8::(vget_low_u8(g_values_u8))); - let b0lo = vreinterpretq_s16_u16(vshll_n_u8::(vget_low_u8(b_values_u8))); + let r_low = vreinterpretq_s16_u16(vshll_n_u8::(vget_low_u8(r_values_u8))); + let g_low = vreinterpretq_s16_u16(vshll_n_u8::(vget_low_u8(g_values_u8))); + let b_low = vreinterpretq_s16_u16(vshll_n_u8::(vget_low_u8(b_values_u8))); - let mut y_low = vqrdmlahq_s16(y_bias, r0lo, v_yr); - y_low = vqrdmlahq_s16(y_low, g0lo, v_yg); - y_low = vqrdmlahq_s16(y_low, b0lo, v_yb); + let mut y_low = vqrdmlahq_laneq_s16::<0>(y_bias, r_low, v_weights); + y_low = vqrdmlahq_laneq_s16::<1>(y_low, g_low, v_weights); + y_low = vqrdmlahq_laneq_s16::<2>(y_low, b_low, v_weights); let y_low = vminq_u16( vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::(y_low), i_bias_y)), @@ -146,36 +150,36 @@ pub(crate) unsafe fn neon_rgba_to_yuv_rdm< vst1q_u8(y_ptr.get_unchecked_mut(cx..).as_mut_ptr(), y); if chroma_subsampling == YuvChromaSubsampling::Yuv444 { - let mut cb_high = vqrdmlahq_s16(uv_bias, r0hi, v_cb_r); - cb_high = vqrdmlahq_s16(cb_high, g0hi, v_cb_g); - cb_high = vqrdmlahq_s16(cb_high, b0hi, v_cb_b); + let mut cb_high = vqrdmlahq_laneq_s16::<3>(uv_bias, r0hi, v_weights); + cb_high = vqrdmlahq_laneq_s16::<4>(cb_high, g0hi, v_weights); + cb_high = vqrdmlahq_laneq_s16::<5>(cb_high, b0hi, v_weights); let cb_high = vminq_u16( vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::(cb_high), i_bias_y)), i_cap_uv, ); - let mut cr_high = vqrdmlahq_s16(uv_bias, r0hi, v_cr_r); - cr_high = vqrdmlahq_s16(cr_high, g0hi, v_cr_g); - cr_high = vqrdmlahq_s16(cr_high, b0hi, v_cr_b); + let mut cr_high = vqrdmlahq_laneq_s16::<6>(uv_bias, r0hi, v_weights); + cr_high = vqrdmlahq_laneq_s16::<7>(cr_high, g0hi, v_weights); + cr_high = vqrdmlahq_laneq_s16::<0>(cr_high, b0hi, v_cr_b); let cr_high = vminq_u16( vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::(cr_high), i_bias_y)), i_cap_uv, ); - let mut cb_low = vqrdmlahq_s16(uv_bias, r0lo, v_cb_r); - cb_low = vqrdmlahq_s16(cb_low, g0lo, v_cb_g); - cb_low = vqrdmlahq_s16(cb_low, b0lo, v_cb_b); + let mut cb_low = vqrdmlahq_laneq_s16::<3>(uv_bias, r_low, v_weights); + cb_low = vqrdmlahq_laneq_s16::<4>(cb_low, g_low, v_weights); + cb_low = vqrdmlahq_laneq_s16::<5>(cb_low, b_low, v_weights); let cb_low = vminq_u16( vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::(cb_low), i_bias_y)), i_cap_uv, ); - let mut cr_low = vqrdmlahq_s16(uv_bias, r0lo, v_cr_r); - cr_low = vqrdmlahq_s16(cr_low, g0lo, v_cr_g); - cr_low = vqrdmlahq_s16(cr_low, b0lo, v_cr_b); + let mut cr_low = vqrdmlahq_laneq_s16::<6>(uv_bias, r_low, v_weights); + cr_low = vqrdmlahq_laneq_s16::<7>(cr_low, g_low, v_weights); + cr_low = vqrdmlahq_laneq_s16::<0>(cr_low, b_low, v_cr_b); let cr_low = vminq_u16( vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::(cr_low), i_bias_y)), @@ -201,18 +205,18 @@ pub(crate) unsafe fn neon_rgba_to_yuv_rdm< b_values_u8, )))); - let mut cbl = vqrdmlahq_s16(uv_bias, r1, v_cb_r); - cbl = vqrdmlahq_s16(cbl, g1, v_cb_g); - cbl = vqrdmlahq_s16(cbl, b1, v_cb_b); + let mut cbl = vqrdmlahq_laneq_s16::<3>(uv_bias, r1, v_weights); + cbl = vqrdmlahq_laneq_s16::<4>(cbl, g1, v_weights); + cbl = vqrdmlahq_laneq_s16::<5>(cbl, b1, v_weights); let cb = vqmovn_u16(vminq_u16( vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::(cbl), i_bias_y)), i_cap_uv, )); - let mut crl = vqrdmlahq_s16(uv_bias, r1, v_cr_r); - crl = vqrdmlahq_s16(crl, g1, v_cr_g); - crl = vqrdmlahq_s16(crl, b1, v_cr_b); + let mut crl = vqrdmlahq_laneq_s16::<6>(uv_bias, r1, v_weights); + crl = vqrdmlahq_laneq_s16::<7>(crl, g1, v_weights); + crl = vqrdmlahq_laneq_s16::<0>(crl, b1, v_cr_b); let cr = vqmovn_u16(vminq_u16( vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::(crl), i_bias_y)), diff --git a/src/neon/yuv_nv_to_rgba.rs b/src/neon/yuv_nv_to_rgba.rs index 06be690..13d227b 100644 --- a/src/neon/yuv_nv_to_rgba.rs +++ b/src/neon/yuv_nv_to_rgba.rs @@ -65,6 +65,9 @@ pub(crate) unsafe fn neon_yuv_nv_to_rgba_row_rdm< let mut cx = start_cx; let mut ux = start_ux; + const SCALE: i32 = 7; + const V_SHR: i32 = 4; + while cx + 16 < width { let y_values = vqsubq_u8(vld1q_u8(y_ptr.add(cx)), y_corr); @@ -97,26 +100,26 @@ pub(crate) unsafe fn neon_yuv_nv_to_rgba_row_rdm< } } - let u_high = vshlq_n_s16::<7>(vsubq_s16( + let u_high = vshlq_n_s16::(vsubq_s16( vreinterpretq_s16_u16(vmovl_u8(u_high_u8)), uv_corr, )); - let v_high = vshlq_n_s16::<7>(vsubq_s16( + let v_high = vshlq_n_s16::(vsubq_s16( vreinterpretq_s16_u16(vmovl_u8(v_high_u8)), uv_corr, )); - let y_v_shl = vshll_high_n_u8::<7>(y_values); + let y_v_shl = vshll_high_n_u8::(y_values); let y_high = vqrdmulhq_n_s16(vreinterpretq_s16_u16(y_v_shl), transform.y_coef as i16); - let r_high = vqrshrun_n_s16::<4>(vaddq_s16( + let r_high = vqrshrun_n_s16::(vaddq_s16( y_high, vqrdmulhq_n_s16(v_high, transform.cr_coef as i16), )); - let b_high = vqrshrun_n_s16::<4>(vaddq_s16( + let b_high = vqrshrun_n_s16::(vaddq_s16( y_high, vqrdmulhq_n_s16(u_high, transform.cb_coef as i16), )); - let g_high = vqrshrun_n_s16::<4>(vsubq_s16( + let g_high = vqrshrun_n_s16::(vsubq_s16( y_high, vaddq_s16( vqrdmulhq_n_s16(v_high, transform.g_coeff_1 as i16), @@ -124,26 +127,26 @@ pub(crate) unsafe fn neon_yuv_nv_to_rgba_row_rdm< ), )); - let u_low = vshlq_n_s16::<7>(vsubq_s16( + let u_low = vshlq_n_s16::(vsubq_s16( vreinterpretq_s16_u16(vmovl_u8(u_low_u8)), uv_corr, )); - let v_low = vshlq_n_s16::<7>(vsubq_s16( + let v_low = vshlq_n_s16::(vsubq_s16( vreinterpretq_s16_u16(vmovl_u8(v_low_u8)), uv_corr, )); - let y_v_shl = vshll_n_u8::<7>(vget_low_u8(y_values)); + let y_v_shl = vshll_n_u8::(vget_low_u8(y_values)); let y_low = vqrdmulhq_n_s16(vreinterpretq_s16_u16(y_v_shl), transform.y_coef as i16); - let r_low = vqrshrun_n_s16::<4>(vaddq_s16( + let r_low = vqrshrun_n_s16::(vaddq_s16( y_low, vqrdmulhq_n_s16(v_low, transform.cr_coef as i16), )); - let b_low = vqrshrun_n_s16::<4>(vaddq_s16( + let b_low = vqrshrun_n_s16::(vaddq_s16( y_low, vqrdmulhq_n_s16(u_low, transform.cb_coef as i16), )); - let g_low = vqrshrun_n_s16::<4>(vsubq_s16( + let g_low = vqrshrun_n_s16::(vsubq_s16( y_low, vaddq_s16( vqrdmulhq_n_s16(v_low, transform.g_coeff_1 as i16), @@ -221,28 +224,28 @@ pub(crate) unsafe fn neon_yuv_nv_to_rgba_row_rdm< } } - let u_low = vshlq_n_s16::<7>(vsubq_s16( + let u_low = vshlq_n_s16::(vsubq_s16( vreinterpretq_s16_u16(vmovl_u8(u_low_u8)), uv_corr, )); - let v_low = vshlq_n_s16::<7>(vsubq_s16( + let v_low = vshlq_n_s16::(vsubq_s16( vreinterpretq_s16_u16(vmovl_u8(v_low_u8)), uv_corr, )); let y_low = vqrdmulhq_n_s16( - vreinterpretq_s16_u16(vshll_n_u8::<7>(y_values)), + vreinterpretq_s16_u16(vshll_n_u8::(y_values)), transform.y_coef as i16, ); - let r_low = vqrshrun_n_s16::<4>(vaddq_s16( + let r_low = vqrshrun_n_s16::(vaddq_s16( y_low, vqrdmulhq_n_s16(v_low, transform.cr_coef as i16), )); - let b_low = vqrshrun_n_s16::<4>(vaddq_s16( + let b_low = vqrshrun_n_s16::(vaddq_s16( y_low, vqrdmulhq_n_s16(u_low, transform.cb_coef as i16), )); - let g_low = vqrshrun_n_s16::<4>(vsubq_s16( + let g_low = vqrshrun_n_s16::(vsubq_s16( y_low, vaddq_s16( vqrdmulhq_n_s16(v_low, transform.g_coeff_1 as i16), diff --git a/src/neon/yuv_nv_to_rgba420.rs b/src/neon/yuv_nv_to_rgba420.rs index 543c145..bba0ef6 100644 --- a/src/neon/yuv_nv_to_rgba420.rs +++ b/src/neon/yuv_nv_to_rgba420.rs @@ -64,6 +64,19 @@ pub(crate) unsafe fn neon_yuv_nv_to_rgba_row_rdm420< const SCALE: i32 = 7; const V_SHR: i32 = 4; + let weights_arr: [i16; 8] = [ + transform.y_coef as i16, + transform.cr_coef as i16, + transform.cb_coef as i16, + transform.g_coeff_1 as i16, + transform.g_coeff_2 as i16, + 0, + 0, + 0, + ]; + + let v_weights = vld1q_s16(weights_arr.as_ptr()); + while cx + 16 < width { let y_values0 = vqsubq_u8(vld1q_u8(y_plane0.get_unchecked(cx..).as_ptr()), y_corr); let y_values1 = vqsubq_u8(vld1q_u8(y_plane1.get_unchecked(cx..).as_ptr()), y_corr); @@ -88,32 +101,21 @@ pub(crate) unsafe fn neon_yuv_nv_to_rgba_row_rdm420< )); let y_v_shl0 = vshll_high_n_u8::(y_values0); let y_v_shl1 = vshll_high_n_u8::(y_values1); - let y_high0 = vqrdmulhq_n_s16(vreinterpretq_s16_u16(y_v_shl0), transform.y_coef as i16); - let y_high1 = vqrdmulhq_n_s16(vreinterpretq_s16_u16(y_v_shl1), transform.y_coef as i16); + let y_high0 = vqrdmulhq_laneq_s16::<0>(vreinterpretq_s16_u16(y_v_shl0), v_weights); + let y_high1 = vqrdmulhq_laneq_s16::<0>(vreinterpretq_s16_u16(y_v_shl1), v_weights); - let g_coeff_hi = vaddq_s16( - vqrdmulhq_n_s16(v_high, transform.g_coeff_1 as i16), - vqrdmulhq_n_s16(u_high, transform.g_coeff_2 as i16), + let g_coeff_hi = vqrdmlahq_laneq_s16::<4>( + vqrdmulhq_laneq_s16::<3>(v_high, v_weights), + u_high, + v_weights, ); - let r_high0 = vqrshrun_n_s16::(vaddq_s16( - y_high0, - vqrdmulhq_n_s16(v_high, transform.cr_coef as i16), - )); - let b_high0 = vqrshrun_n_s16::(vaddq_s16( - y_high0, - vqrdmulhq_n_s16(u_high, transform.cb_coef as i16), - )); + let r_high0 = vqrshrun_n_s16::(vqrdmlahq_laneq_s16::<1>(y_high0, v_high, v_weights)); + let b_high0 = vqrshrun_n_s16::(vqrdmlahq_laneq_s16::<2>(y_high0, u_high, v_weights)); let g_high0 = vqrshrun_n_s16::(vsubq_s16(y_high0, g_coeff_hi)); - let r_high1 = vqrshrun_n_s16::(vaddq_s16( - y_high1, - vqrdmulhq_n_s16(v_high, transform.cr_coef as i16), - )); - let b_high1 = vqrshrun_n_s16::(vaddq_s16( - y_high1, - vqrdmulhq_n_s16(u_high, transform.cb_coef as i16), - )); + let r_high1 = vqrshrun_n_s16::(vqrdmlahq_laneq_s16::<1>(y_high1, v_high, v_weights)); + let b_high1 = vqrshrun_n_s16::(vqrdmlahq_laneq_s16::<2>(y_high1, u_high, v_weights)); let g_high1 = vqrshrun_n_s16::(vsubq_s16(y_high1, g_coeff_hi)); let u_low = vshlq_n_s16::(vsubq_s16( @@ -126,32 +128,18 @@ pub(crate) unsafe fn neon_yuv_nv_to_rgba_row_rdm420< )); let y_v_shl0 = vshll_n_u8::(vget_low_u8(y_values0)); let y_v_shl1 = vshll_n_u8::(vget_low_u8(y_values1)); - let y_low0 = vqrdmulhq_n_s16(vreinterpretq_s16_u16(y_v_shl0), transform.y_coef as i16); - let y_low1 = vqrdmulhq_n_s16(vreinterpretq_s16_u16(y_v_shl1), transform.y_coef as i16); + let y_low0 = vqrdmulhq_laneq_s16::<0>(vreinterpretq_s16_u16(y_v_shl0), v_weights); + let y_low1 = vqrdmulhq_laneq_s16::<0>(vreinterpretq_s16_u16(y_v_shl1), v_weights); - let g_coeff_lo = vaddq_s16( - vqrdmulhq_n_s16(v_low, transform.g_coeff_1 as i16), - vqrdmulhq_n_s16(u_low, transform.g_coeff_2 as i16), - ); + let g_coeff_lo = + vqrdmlahq_laneq_s16::<4>(vqrdmulhq_laneq_s16::<3>(v_low, v_weights), u_low, v_weights); - let r_low0 = vqrshrun_n_s16::(vaddq_s16( - y_low0, - vqrdmulhq_n_s16(v_low, transform.cr_coef as i16), - )); - let b_low0 = vqrshrun_n_s16::(vaddq_s16( - y_low0, - vqrdmulhq_n_s16(u_low, transform.cb_coef as i16), - )); + let r_low0 = vqrshrun_n_s16::(vqrdmlahq_laneq_s16::<1>(y_low0, v_low, v_weights)); + let b_low0 = vqrshrun_n_s16::(vqrdmlahq_laneq_s16::<2>(y_low0, u_low, v_weights)); let g_low0 = vqrshrun_n_s16::(vsubq_s16(y_low0, g_coeff_lo)); - let r_low1 = vqrshrun_n_s16::(vaddq_s16( - y_low1, - vqrdmulhq_n_s16(v_low, transform.cr_coef as i16), - )); - let b_low1 = vqrshrun_n_s16::(vaddq_s16( - y_low1, - vqrdmulhq_n_s16(u_low, transform.cb_coef as i16), - )); + let r_low1 = vqrshrun_n_s16::(vqrdmlahq_laneq_s16::<1>(y_low1, v_low, v_weights)); + let b_low1 = vqrshrun_n_s16::(vqrdmlahq_laneq_s16::<2>(y_low1, u_low, v_weights)); let g_low1 = vqrshrun_n_s16::(vsubq_s16(y_low1, g_coeff_lo)); let r_values0 = vcombine_u8(r_low0, r_high0); @@ -235,38 +223,20 @@ pub(crate) unsafe fn neon_yuv_nv_to_rgba_row_rdm420< vreinterpretq_s16_u16(vmovl_u8(v_low_u8)), uv_corr, )); - let y_low0 = vqrdmulhq_n_s16( - vreinterpretq_s16_u16(vshll_n_u8::(y_values0)), - transform.y_coef as i16, - ); - let y_low1 = vqrdmulhq_n_s16( - vreinterpretq_s16_u16(vshll_n_u8::(y_values1)), - transform.y_coef as i16, - ); + let y_v_shl0 = vshll_n_u8::(y_values0); + let y_v_shl1 = vshll_n_u8::(y_values1); + let y_low0 = vqrdmulhq_laneq_s16::<0>(vreinterpretq_s16_u16(y_v_shl0), v_weights); + let y_low1 = vqrdmulhq_laneq_s16::<0>(vreinterpretq_s16_u16(y_v_shl1), v_weights); - let g_coeff_lo = vaddq_s16( - vqrdmulhq_n_s16(v_low, transform.g_coeff_1 as i16), - vqrdmulhq_n_s16(u_low, transform.g_coeff_2 as i16), - ); + let g_coeff_lo = + vqrdmlahq_laneq_s16::<4>(vqrdmulhq_laneq_s16::<3>(v_low, v_weights), u_low, v_weights); - let r_low0 = vqrshrun_n_s16::(vaddq_s16( - y_low0, - vqrdmulhq_n_s16(v_low, transform.cr_coef as i16), - )); - let b_low0 = vqrshrun_n_s16::(vaddq_s16( - y_low0, - vqrdmulhq_n_s16(u_low, transform.cb_coef as i16), - )); + let r_low0 = vqrshrun_n_s16::(vqrdmlahq_laneq_s16::<1>(y_low0, v_low, v_weights)); + let b_low0 = vqrshrun_n_s16::(vqrdmlahq_laneq_s16::<2>(y_low0, u_low, v_weights)); let g_low0 = vqrshrun_n_s16::(vsubq_s16(y_low0, g_coeff_lo)); - let r_low1 = vqrshrun_n_s16::(vaddq_s16( - y_low1, - vqrdmulhq_n_s16(v_low, transform.cr_coef as i16), - )); - let b_low1 = vqrshrun_n_s16::(vaddq_s16( - y_low1, - vqrdmulhq_n_s16(u_low, transform.cb_coef as i16), - )); + let r_low1 = vqrshrun_n_s16::(vqrdmlahq_laneq_s16::<1>(y_low1, v_low, v_weights)); + let b_low1 = vqrshrun_n_s16::(vqrdmlahq_laneq_s16::<2>(y_low1, u_low, v_weights)); let g_low1 = vqrshrun_n_s16::(vsubq_s16(y_low1, g_coeff_lo)); let r_values0 = r_low0; diff --git a/src/neon/yuv_to_rgba.rs b/src/neon/yuv_to_rgba.rs index 3360e4e..67c6e8d 100644 --- a/src/neon/yuv_to_rgba.rs +++ b/src/neon/yuv_to_rgba.rs @@ -65,6 +65,22 @@ pub(crate) unsafe fn neon_yuv_to_rgba_row_rdm< let uv_corr = vdupq_n_s16(range.bias_uv as i16); let v_alpha = vdupq_n_u8(255u8); + let weights_arr: [i16; 8] = [ + transform.y_coef as i16, + transform.cr_coef as i16, + transform.cb_coef as i16, + transform.g_coeff_1 as i16, + transform.g_coeff_2 as i16, + 0, + 0, + 0, + ]; + + let v_weights = vld1q_s16(weights_arr.as_ptr()); + + const SCALE: i32 = 7; + const V_SHR: i32 = 4; + while cx + 16 < width { let y_values = vqsubq_u8(vld1q_u8(y_ptr.add(cx)), y_corr); @@ -94,60 +110,46 @@ pub(crate) unsafe fn neon_yuv_to_rgba_row_rdm< } } - let u_high = vshlq_n_s16::<7>(vsubq_s16( + let u_high = vshlq_n_s16::(vsubq_s16( vreinterpretq_s16_u16(vmovl_u8(u_high_u8)), uv_corr, )); - let v_high = vshlq_n_s16::<7>(vsubq_s16( + let v_high = vshlq_n_s16::(vsubq_s16( vreinterpretq_s16_u16(vmovl_u8(v_high_u8)), uv_corr, )); - let y_high = vqrdmulhq_n_s16( - vreinterpretq_s16_u16(vshll_high_n_u8::<7>(y_values)), - transform.y_coef as i16, + let y_high = vqrdmulhq_laneq_s16::<0>( + vreinterpretq_s16_u16(vshll_high_n_u8::(y_values)), + v_weights, ); - let r_high = vqrshrun_n_s16::<4>(vaddq_s16( - y_high, - vqrdmulhq_n_s16(v_high, transform.cr_coef as i16), - )); - let b_high = vqrshrun_n_s16::<4>(vaddq_s16( - y_high, - vqrdmulhq_n_s16(u_high, transform.cb_coef as i16), - )); - let g_high = vqrshrun_n_s16::<4>(vsubq_s16( + let r_high = vqrshrun_n_s16::(vqrdmlahq_laneq_s16::<1>(y_high, v_high, v_weights)); + let b_high = vqrshrun_n_s16::(vqrdmlahq_laneq_s16::<2>(y_high, u_high, v_weights)); + let g_high = vqrshrun_n_s16::(vsubq_s16( y_high, - vaddq_s16( - vqrdmulhq_n_s16(v_high, transform.g_coeff_1 as i16), - vqrdmulhq_n_s16(u_high, transform.g_coeff_2 as i16), + vqrdmlahq_laneq_s16::<4>( + vqrdmulhq_laneq_s16::<3>(v_high, v_weights), + u_high, + v_weights, ), )); - let u_low = vshlq_n_s16::<7>(vsubq_s16( + let u_low = vshlq_n_s16::(vsubq_s16( vreinterpretq_s16_u16(vmovl_u8(u_low_u8)), uv_corr, )); - let v_low = vshlq_n_s16::<7>(vsubq_s16( + let v_low = vshlq_n_s16::(vsubq_s16( vreinterpretq_s16_u16(vmovl_u8(v_low_u8)), uv_corr, )); - let y_v_shl = vshll_n_u8::<7>(vget_low_u8(y_values)); - let y_low = vqrdmulhq_n_s16(vreinterpretq_s16_u16(y_v_shl), transform.y_coef as i16); + let y_v_shl = vshll_n_u8::(vget_low_u8(y_values)); + let y_low = vqrdmulhq_laneq_s16::<0>(vreinterpretq_s16_u16(y_v_shl), v_weights); - let r_low = vqrshrun_n_s16::<4>(vaddq_s16( - y_low, - vqrdmulhq_n_s16(v_low, transform.cr_coef as i16), - )); - let b_low = vqrshrun_n_s16::<4>(vaddq_s16( + let r_low = vqrshrun_n_s16::(vqrdmlahq_laneq_s16::<1>(y_low, v_low, v_weights)); + let b_low = vqrshrun_n_s16::(vqrdmlahq_laneq_s16::<2>(y_low, u_low, v_weights)); + let g_low = vqrshrun_n_s16::(vsubq_s16( y_low, - vqrdmulhq_n_s16(u_low, transform.cb_coef as i16), - )); - let g_low = vqrshrun_n_s16::<4>(vsubq_s16( - y_low, - vaddq_s16( - vqrdmulhq_n_s16(v_low, transform.g_coeff_1 as i16), - vqrdmulhq_n_s16(u_low, transform.g_coeff_2 as i16), - ), + vqrdmlahq_laneq_s16::<4>(vqrdmulhq_laneq_s16::<3>(v_low, v_weights), u_low, v_weights), )); let r_values = vcombine_u8(r_low, r_high); @@ -210,33 +212,24 @@ pub(crate) unsafe fn neon_yuv_to_rgba_row_rdm< } } - let u_low = vshlq_n_s16::<7>(vsubq_s16( + let u_low = vshlq_n_s16::(vsubq_s16( vreinterpretq_s16_u16(vmovl_u8(u_low_u8)), uv_corr, )); - let v_low = vshlq_n_s16::<7>(vsubq_s16( + let v_low = vshlq_n_s16::(vsubq_s16( vreinterpretq_s16_u16(vmovl_u8(v_low_u8)), uv_corr, )); - let y_low = vqrdmulhq_n_s16( - vreinterpretq_s16_u16(vshll_n_u8::<7>(y_values)), - transform.y_coef as i16, + let y_low = vqrdmulhq_laneq_s16::<0>( + vreinterpretq_s16_u16(vshll_n_u8::(y_values)), + v_weights, ); - let r_low = vqrshrun_n_s16::<4>(vaddq_s16( - y_low, - vqrdmulhq_n_s16(v_low, transform.cr_coef as i16), - )); - let b_low = vqrshrun_n_s16::<4>(vaddq_s16( + let r_low = vqrshrun_n_s16::(vqrdmlahq_laneq_s16::<1>(y_low, v_low, v_weights)); + let b_low = vqrshrun_n_s16::(vqrdmlahq_laneq_s16::<2>(y_low, u_low, v_weights)); + let g_low = vqrshrun_n_s16::(vsubq_s16( y_low, - vqrdmulhq_n_s16(u_low, transform.cb_coef as i16), - )); - let g_low = vqrshrun_n_s16::<4>(vsubq_s16( - y_low, - vaddq_s16( - vqrdmulhq_n_s16(v_low, transform.g_coeff_1 as i16), - vqrdmulhq_n_s16(u_low, transform.g_coeff_2 as i16), - ), + vqrdmlahq_laneq_s16::<4>(vqrdmulhq_laneq_s16::<3>(v_low, v_weights), u_low, v_weights), )); let r_values = r_low; diff --git a/src/neon/yuv_to_rgba420.rs b/src/neon/yuv_to_rgba420.rs index c44679f..146e0c1 100644 --- a/src/neon/yuv_to_rgba420.rs +++ b/src/neon/yuv_to_rgba420.rs @@ -62,6 +62,19 @@ pub(crate) unsafe fn neon_yuv_to_rgba_row_rdm420 const SCALE: i32 = 7; const V_SHR: i32 = 4; + let weights_arr: [i16; 8] = [ + transform.y_coef as i16, + transform.cr_coef as i16, + transform.cb_coef as i16, + transform.g_coeff_1 as i16, + transform.g_coeff_2 as i16, + 0, + 0, + 0, + ]; + + let v_weights = vld1q_s16(weights_arr.as_ptr()); + while cx + 16 < width { let y_values0 = vqsubq_u8(vld1q_u8(y_plane0.get_unchecked(cx..).as_ptr()), y_corr); let y_values1 = vqsubq_u8(vld1q_u8(y_plane1.get_unchecked(cx..).as_ptr()), y_corr); @@ -82,39 +95,24 @@ pub(crate) unsafe fn neon_yuv_to_rgba_row_rdm420 vreinterpretq_s16_u16(vmovl_u8(v_high_u8)), uv_corr, )); - let y_high0 = vqrdmulhq_n_s16( - vreinterpretq_s16_u16(vshll_high_n_u8::(y_values0)), - transform.y_coef as i16, - ); - let y_high1 = vqrdmulhq_n_s16( - vreinterpretq_s16_u16(vshll_high_n_u8::(y_values1)), - transform.y_coef as i16, + let y_v_shl0 = vshll_high_n_u8::(y_values0); + let y_v_shl1 = vshll_high_n_u8::(y_values1); + let y_high0 = vqrdmulhq_laneq_s16::<0>(vreinterpretq_s16_u16(y_v_shl0), v_weights); + let y_high1 = vqrdmulhq_laneq_s16::<0>(vreinterpretq_s16_u16(y_v_shl1), v_weights); + + let g_coeff_hi = vqrdmlahq_laneq_s16::<4>( + vqrdmulhq_laneq_s16::<3>(v_high, v_weights), + u_high, + v_weights, ); - let g_coeff_hi0 = vaddq_s16( - vqrdmulhq_n_s16(v_high, transform.g_coeff_1 as i16), - vqrdmulhq_n_s16(u_high, transform.g_coeff_2 as i16), - ); + let r_high0 = vqrshrun_n_s16::(vqrdmlahq_laneq_s16::<1>(y_high0, v_high, v_weights)); + let b_high0 = vqrshrun_n_s16::(vqrdmlahq_laneq_s16::<2>(y_high0, u_high, v_weights)); + let g_high0 = vqrshrun_n_s16::(vsubq_s16(y_high0, g_coeff_hi)); - let r_high0 = vqrshrun_n_s16::(vaddq_s16( - y_high0, - vqrdmulhq_n_s16(v_high, transform.cr_coef as i16), - )); - let b_high0 = vqrshrun_n_s16::(vaddq_s16( - y_high0, - vqrdmulhq_n_s16(u_high, transform.cb_coef as i16), - )); - let g_high0 = vqrshrun_n_s16::(vsubq_s16(y_high0, g_coeff_hi0)); - - let r_high1 = vqrshrun_n_s16::(vaddq_s16( - y_high1, - vqrdmulhq_n_s16(v_high, transform.cr_coef as i16), - )); - let b_high1 = vqrshrun_n_s16::(vaddq_s16( - y_high1, - vqrdmulhq_n_s16(u_high, transform.cb_coef as i16), - )); - let g_high1 = vqrshrun_n_s16::(vsubq_s16(y_high1, g_coeff_hi0)); + let r_high1 = vqrshrun_n_s16::(vqrdmlahq_laneq_s16::<1>(y_high1, v_high, v_weights)); + let b_high1 = vqrshrun_n_s16::(vqrdmlahq_laneq_s16::<2>(y_high1, u_high, v_weights)); + let g_high1 = vqrshrun_n_s16::(vsubq_s16(y_high1, g_coeff_hi)); let u_low = vshlq_n_s16::(vsubq_s16( vreinterpretq_s16_u16(vmovl_u8(u_low_u8)), @@ -126,32 +124,18 @@ pub(crate) unsafe fn neon_yuv_to_rgba_row_rdm420 )); let y_v_shl0 = vshll_n_u8::(vget_low_u8(y_values0)); let y_v_shl1 = vshll_n_u8::(vget_low_u8(y_values1)); - let y_low0 = vqrdmulhq_n_s16(vreinterpretq_s16_u16(y_v_shl0), transform.y_coef as i16); - let y_low1 = vqrdmulhq_n_s16(vreinterpretq_s16_u16(y_v_shl1), transform.y_coef as i16); + let y_low0 = vqrdmulhq_laneq_s16::<0>(vreinterpretq_s16_u16(y_v_shl0), v_weights); + let y_low1 = vqrdmulhq_laneq_s16::<0>(vreinterpretq_s16_u16(y_v_shl1), v_weights); - let g_coeff_lo = vaddq_s16( - vqrdmulhq_n_s16(v_low, transform.g_coeff_1 as i16), - vqrdmulhq_n_s16(u_low, transform.g_coeff_2 as i16), - ); + let g_coeff_lo = + vqrdmlahq_laneq_s16::<4>(vqrdmulhq_laneq_s16::<3>(v_low, v_weights), u_low, v_weights); - let r_low0 = vqrshrun_n_s16::(vaddq_s16( - y_low0, - vqrdmulhq_n_s16(v_low, transform.cr_coef as i16), - )); - let b_low0 = vqrshrun_n_s16::(vaddq_s16( - y_low0, - vqrdmulhq_n_s16(u_low, transform.cb_coef as i16), - )); + let r_low0 = vqrshrun_n_s16::(vqrdmlahq_laneq_s16::<1>(y_low0, v_low, v_weights)); + let b_low0 = vqrshrun_n_s16::(vqrdmlahq_laneq_s16::<2>(y_low0, u_low, v_weights)); let g_low0 = vqrshrun_n_s16::(vsubq_s16(y_low0, g_coeff_lo)); - let r_low1 = vqrshrun_n_s16::(vaddq_s16( - y_low1, - vqrdmulhq_n_s16(v_low, transform.cr_coef as i16), - )); - let b_low1 = vqrshrun_n_s16::(vaddq_s16( - y_low1, - vqrdmulhq_n_s16(u_low, transform.cb_coef as i16), - )); + let r_low1 = vqrshrun_n_s16::(vqrdmlahq_laneq_s16::<1>(y_low1, v_low, v_weights)); + let b_low1 = vqrshrun_n_s16::(vqrdmlahq_laneq_s16::<2>(y_low1, u_low, v_weights)); let g_low1 = vqrshrun_n_s16::(vsubq_s16(y_low1, g_coeff_lo)); let r_values0 = vcombine_u8(r_low0, r_high0); @@ -223,38 +207,21 @@ pub(crate) unsafe fn neon_yuv_to_rgba_row_rdm420 vreinterpretq_s16_u16(vmovl_u8(v_low_u8)), uv_corr, )); - let y0 = vqrdmulhq_n_s16( - vreinterpretq_s16_u16(vshll_n_u8::(y_values0)), - transform.y_coef as i16, - ); - let y1 = vqrdmulhq_n_s16( - vreinterpretq_s16_u16(vshll_n_u8::(y_values1)), - transform.y_coef as i16, - ); + let y_v_shl0 = vshll_n_u8::(y_values0); + let y_v_shl1 = vshll_n_u8::(y_values1); + let y_low0 = vqrdmulhq_laneq_s16::<0>(vreinterpretq_s16_u16(y_v_shl0), v_weights); + let y_low1 = vqrdmulhq_laneq_s16::<0>(vreinterpretq_s16_u16(y_v_shl1), v_weights); - let g_coeff = vaddq_s16( - vqrdmulhq_n_s16(v_low, transform.g_coeff_1 as i16), - vqrdmulhq_n_s16(u_low, transform.g_coeff_2 as i16), - ); + let g_coeff_lo = + vqrdmlahq_laneq_s16::<4>(vqrdmulhq_laneq_s16::<3>(v_low, v_weights), u_low, v_weights); - let r0 = vqrshrun_n_s16::(vaddq_s16( - y0, - vqrdmulhq_n_s16(v_low, transform.cr_coef as i16), - )); - let b0 = vqrshrun_n_s16::(vaddq_s16( - y0, - vqrdmulhq_n_s16(u_low, transform.cb_coef as i16), - )); - let g0 = vqrshrun_n_s16::(vsubq_s16(y0, g_coeff)); - let r1 = vqrshrun_n_s16::(vaddq_s16( - y1, - vqrdmulhq_n_s16(v_low, transform.cr_coef as i16), - )); - let b1 = vqrshrun_n_s16::(vaddq_s16( - y1, - vqrdmulhq_n_s16(u_low, transform.cb_coef as i16), - )); - let g1 = vqrshrun_n_s16::(vsubq_s16(y1, g_coeff)); + let r0 = vqrshrun_n_s16::(vqrdmlahq_laneq_s16::<1>(y_low0, v_low, v_weights)); + let b0 = vqrshrun_n_s16::(vqrdmlahq_laneq_s16::<2>(y_low0, u_low, v_weights)); + let g0 = vqrshrun_n_s16::(vsubq_s16(y_low0, g_coeff_lo)); + + let r1 = vqrshrun_n_s16::(vqrdmlahq_laneq_s16::<1>(y_low1, v_low, v_weights)); + let b1 = vqrshrun_n_s16::(vqrdmlahq_laneq_s16::<2>(y_low1, u_low, v_weights)); + let g1 = vqrshrun_n_s16::(vsubq_s16(y_low1, g_coeff_lo)); let dst_shift = cx * channels; diff --git a/src/neon/yuv_to_rgba_alpha.rs b/src/neon/yuv_to_rgba_alpha.rs index d18fb23..7cdbea1 100644 --- a/src/neon/yuv_to_rgba_alpha.rs +++ b/src/neon/yuv_to_rgba_alpha.rs @@ -66,6 +66,22 @@ pub(crate) unsafe fn neon_yuv_to_rgba_alpha_rdm< let y_corr = vdupq_n_u8(range.bias_y as u8); let uv_corr = vdupq_n_s16(range.bias_uv as i16); + let weights_arr: [i16; 8] = [ + transform.y_coef as i16, + transform.cr_coef as i16, + transform.cb_coef as i16, + transform.g_coeff_1 as i16, + transform.g_coeff_2 as i16, + 0, + 0, + 0, + ]; + + let v_weights = vld1q_s16(weights_arr.as_ptr()); + + const SCALE: i32 = 7; + const V_SHR: i32 = 4; + while cx + 16 < width { let y_values = vqsubq_u8(vld1q_u8(y_ptr.add(cx)), y_corr); let a_values = vld1q_u8(a_ptr.add(cx)); @@ -96,62 +112,46 @@ pub(crate) unsafe fn neon_yuv_to_rgba_alpha_rdm< } } - let u_high = vshlq_n_s16::<7>(vsubq_s16( + let u_high = vshlq_n_s16::(vsubq_s16( vreinterpretq_s16_u16(vmovl_u8(u_high_u8)), uv_corr, )); - let v_high = vshlq_n_s16::<7>(vsubq_s16( + let v_high = vshlq_n_s16::(vsubq_s16( vreinterpretq_s16_u16(vmovl_u8(v_high_u8)), uv_corr, )); - let y_high = vqrdmulhq_n_s16( - vreinterpretq_s16_u16(vshll_high_n_u8::<7>(y_values)), - transform.y_coef as i16, + let y_high = vqrdmulhq_laneq_s16::<0>( + vreinterpretq_s16_u16(vshll_high_n_u8::(y_values)), + v_weights, ); - let r_high = vqrshrun_n_s16::<4>(vaddq_s16( - y_high, - vqrdmulhq_n_s16(v_high, transform.cr_coef as i16), - )); - let b_high = vqrshrun_n_s16::<4>(vaddq_s16( + let r_high = vqrshrun_n_s16::(vqrdmlahq_laneq_s16::<1>(y_high, v_high, v_weights)); + let b_high = vqrshrun_n_s16::(vqrdmlahq_laneq_s16::<2>(y_high, u_high, v_weights)); + let g_high = vqrshrun_n_s16::(vsubq_s16( y_high, - vqrdmulhq_n_s16(u_high, transform.cb_coef as i16), - )); - let g_high = vqrshrun_n_s16::<4>(vsubq_s16( - y_high, - vaddq_s16( - vqrdmulhq_n_s16(v_high, transform.g_coeff_1 as i16), - vqrdmulhq_n_s16(u_high, transform.g_coeff_2 as i16), + vqrdmlahq_laneq_s16::<4>( + vqrdmulhq_laneq_s16::<3>(v_high, v_weights), + u_high, + v_weights, ), )); - let u_low = vshlq_n_s16::<7>(vsubq_s16( + let u_low = vshlq_n_s16::(vsubq_s16( vreinterpretq_s16_u16(vmovl_u8(u_low_u8)), uv_corr, )); - let v_low = vshlq_n_s16::<7>(vsubq_s16( + let v_low = vshlq_n_s16::(vsubq_s16( vreinterpretq_s16_u16(vmovl_u8(v_low_u8)), uv_corr, )); - let y_low = vqrdmulhq_n_s16( - vreinterpretq_s16_u16(vshll_n_u8::<7>(vget_low_u8(y_values))), - transform.y_coef as i16, - ); + let y_v_shl = vshll_n_u8::(vget_low_u8(y_values)); + let y_low = vqrdmulhq_laneq_s16::<0>(vreinterpretq_s16_u16(y_v_shl), v_weights); - let r_low = vqrshrun_n_s16::<4>(vaddq_s16( + let r_low = vqrshrun_n_s16::(vqrdmlahq_laneq_s16::<1>(y_low, v_low, v_weights)); + let b_low = vqrshrun_n_s16::(vqrdmlahq_laneq_s16::<2>(y_low, u_low, v_weights)); + let g_low = vqrshrun_n_s16::(vsubq_s16( y_low, - vqrdmulhq_n_s16(v_low, transform.cr_coef as i16), - )); - let b_low = vqrshrun_n_s16::<4>(vaddq_s16( - y_low, - vqrdmulhq_n_s16(u_low, transform.cb_coef as i16), - )); - let g_low = vqrshrun_n_s16::<4>(vsubq_s16( - y_low, - vaddq_s16( - vqrdmulhq_n_s16(v_low, transform.g_coeff_1 as i16), - vqrdmulhq_n_s16(u_low, transform.g_coeff_2 as i16), - ), + vqrdmlahq_laneq_s16::<4>(vqrdmulhq_laneq_s16::<3>(v_low, v_weights), u_low, v_weights), )); let mut r_values = vcombine_u8(r_low, r_high);