Skip to content

Commit

Permalink
NEON improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
awxkee committed Nov 25, 2024
1 parent 16c23de commit 79008a5
Show file tree
Hide file tree
Showing 9 changed files with 304 additions and 363 deletions.
12 changes: 6 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ Tests performed on the image 5763x3842

| | time(NEON) | Time(AVX) |
|------------------------|:----------:|:---------:|
| utils RGB->YUV 4:2:0 | 4.37ms | 6.14ms |
| utils RGB->YUV 4:2:0 | 4.09ms | 6.14ms |
| libyuv RGB->YUV 4:2:0 | 3.66ms | 33.87ms |
| utils RGBA->YUV 4:2:0 | 4.88ms | 7.34ms |
| libyuv RGBA->YUV 4:2:0 | 4.87ms | 23.48ms |
Expand All @@ -88,15 +88,15 @@ Tests performed on the image 5763x3842

| | time(NEON) | Time(AVX) |
|------------------------|:----------:|:---------:|
| utils YUV NV12->RGB | 4.08ms | 6.48ms |
| utils YUV NV12->RGB | 3.92ms | 6.48ms |
| libyuv YUV NV12->RGB | 5.20ms | 45.28ms |
| utils YUV 4:2:0->RGB | 3.49ms | 5.44ms |
| utils YUV 4:2:0->RGB | 3.28ms | 5.44ms |
| libyuv YUV 4:2:0->RGB | 5.70ms | 44.95ms |
| utils YUV 4:2:0->RGBA | 4.02ms | 5.98ms |
| utils YUV 4:2:0->RGBA | 3.85ms | 5.98ms |
| libyuv YUV 4:2:0->RGBA | 6.13ms | 6.88ms |
| utils YUV 4:2:2->RGBA | 5.39ms | 6.91ms |
| utils YUV 4:2:2->RGBA | 4.94ms | 6.91ms |
| libyuv YUV 4:2:2->RGBA | 5.91ms | 6.91ms |
| utils YUV 4:4:4->RGBA | 5.04ms | 7.20ms |
| utils YUV 4:4:4->RGBA | 4.83ms | 7.20ms |
| libyuv YUV 4:4:4->RGBA | 4.82ms | 7.30ms |

This project is licensed under either of
Expand Down
62 changes: 31 additions & 31 deletions app/benches/yuv8/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ pub fn criterion_benchmark(c: &mut Criterion) {
let fixed_planar = planar_image.to_fixed();

// let rgba_image = img.to_rgba8();
//

// c.bench_function("yuvutils RGB -> YUV 4:2:0", |b| {
// let mut test_planar = YuvPlanarImageMut::<u8>::alloc(
// dimensions.0,
Expand Down Expand Up @@ -217,36 +217,36 @@ pub fn criterion_benchmark(c: &mut Criterion) {
// .unwrap();
// })
// });

c.bench_function("yuvutils YUV NV12 -> RGB", |b| {
let mut rgb_bytes = vec![0u8; dimensions.0 as usize * 4 * dimensions.1 as usize];
b.iter(|| {
yuv_nv12_to_rgba(
&fixed_bi_planar,
&mut rgb_bytes,
dimensions.0 * 4u32,
YuvRange::Limited,
YuvStandardMatrix::Bt601,
)
.unwrap();
})
});

c.bench_function("livyuv YUV NV12 -> RGB", |b| {
let mut rgb_bytes = vec![0u8; dimensions.0 as usize * 4 * dimensions.1 as usize];
b.iter(|| unsafe {
rs_NV21ToABGR(
fixed_bi_planar.y_plane.as_ptr(),
fixed_bi_planar.y_stride as i32,
fixed_bi_planar.uv_plane.as_ptr(),
fixed_bi_planar.uv_stride as i32,
rgb_bytes.as_mut_ptr(),
dimensions.0 as i32 * 4,
fixed_bi_planar.width as i32,
fixed_bi_planar.height as i32,
);
})
});
//
// c.bench_function("yuvutils YUV NV12 -> RGB", |b| {
// let mut rgb_bytes = vec![0u8; dimensions.0 as usize * 4 * dimensions.1 as usize];
// b.iter(|| {
// yuv_nv12_to_rgba(
// &fixed_bi_planar,
// &mut rgb_bytes,
// dimensions.0 * 4u32,
// YuvRange::Limited,
// YuvStandardMatrix::Bt601,
// )
// .unwrap();
// })
// });
//
// c.bench_function("livyuv YUV NV12 -> RGB", |b| {
// let mut rgb_bytes = vec![0u8; dimensions.0 as usize * 4 * dimensions.1 as usize];
// b.iter(|| unsafe {
// rs_NV21ToABGR(
// fixed_bi_planar.y_plane.as_ptr(),
// fixed_bi_planar.y_stride as i32,
// fixed_bi_planar.uv_plane.as_ptr(),
// fixed_bi_planar.uv_stride as i32,
// rgb_bytes.as_mut_ptr(),
// dimensions.0 as i32 * 4,
// fixed_bi_planar.width as i32,
// fixed_bi_planar.height as i32,
// );
// })
// });

c.bench_function("yuvutils YUV 4:2:0 -> RGB", |b| {
let mut rgb_bytes = vec![0u8; dimensions.0 as usize * 3 * dimensions.1 as usize];
Expand Down
68 changes: 36 additions & 32 deletions src/neon/rgba_to_nv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -71,16 +71,20 @@ pub(crate) unsafe fn neon_rgbx_to_nv_row_rdm<

let y_bias = vdupq_n_s16(bias_y);
let uv_bias = vdupq_n_s16(bias_uv);
let v_yr = vdupq_n_s16(transform.yr as i16);
let v_yg = vdupq_n_s16(transform.yg as i16);
let v_yb = vdupq_n_s16(transform.yb as i16);
let v_cb_r = vdupq_n_s16(transform.cb_r as i16);
let v_cb_g = vdupq_n_s16(transform.cb_g as i16);
let v_cb_b = vdupq_n_s16(transform.cb_b as i16);
let v_cr_r = vdupq_n_s16(transform.cr_r as i16);
let v_cr_g = vdupq_n_s16(transform.cr_g as i16);
let v_cr_b = vdupq_n_s16(transform.cr_b as i16);

let weights_arr: [i16; 8] = [
transform.yr as i16,
transform.yg as i16,
transform.yb as i16,
transform.cb_r as i16,
transform.cb_g as i16,
transform.cb_b as i16,
transform.cr_r as i16,
transform.cr_g as i16,
];
let v_weights = vld1q_s16(weights_arr.as_ptr());

let mut cx = start_cx;
let mut ux = start_ux;

Expand Down Expand Up @@ -120,9 +124,9 @@ pub(crate) unsafe fn neon_rgbx_to_nv_row_rdm<
let g_high = vreinterpretq_s16_u16(vshll_high_n_u8::<V_SCALE>(g_values_u8));
let b_high = vreinterpretq_s16_u16(vshll_high_n_u8::<V_SCALE>(b_values_u8));

let mut y_high = vqrdmlahq_s16(y_bias, r_high, v_yr);
y_high = vqrdmlahq_s16(y_high, g_high, v_yg);
y_high = vqrdmlahq_s16(y_high, b_high, v_yb);
let mut y_high = vqrdmlahq_laneq_s16::<0>(y_bias, r_high, v_weights);
y_high = vqrdmlahq_laneq_s16::<1>(y_high, g_high, v_weights);
y_high = vqrdmlahq_laneq_s16::<2>(y_high, b_high, v_weights);

let y_high = vminq_u16(
vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(y_high), i_bias_y)),
Expand All @@ -133,9 +137,9 @@ pub(crate) unsafe fn neon_rgbx_to_nv_row_rdm<
let g_low = vreinterpretq_s16_u16(vshll_n_u8::<V_SCALE>(vget_low_u8(g_values_u8)));
let b_low = vreinterpretq_s16_u16(vshll_n_u8::<V_SCALE>(vget_low_u8(b_values_u8)));

let mut y_low = vqrdmlahq_s16(y_bias, r_low, v_yr);
y_low = vqrdmlahq_s16(y_low, g_low, v_yg);
y_low = vqrdmlahq_s16(y_low, b_low, v_yb);
let mut y_low = vqrdmlahq_laneq_s16::<0>(y_bias, r_low, v_weights);
y_low = vqrdmlahq_laneq_s16::<1>(y_low, g_low, v_weights);
y_low = vqrdmlahq_laneq_s16::<2>(y_low, b_low, v_weights);

let y_low = vminq_u16(
vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(y_low), i_bias_y)),
Expand All @@ -146,36 +150,36 @@ pub(crate) unsafe fn neon_rgbx_to_nv_row_rdm<
vst1q_u8(y_ptr.add(cx), y);

if chroma_subsampling == YuvChromaSubsampling::Yuv444 {
let mut cb_high = vqrdmlahq_s16(uv_bias, r_high, v_cb_r);
cb_high = vqrdmlahq_s16(cb_high, g_high, v_cb_g);
cb_high = vqrdmlahq_s16(cb_high, b_high, v_cb_b);
let mut cb_high = vqrdmlahq_laneq_s16::<3>(uv_bias, r_high, v_weights);
cb_high = vqrdmlahq_laneq_s16::<4>(cb_high, g_high, v_weights);
cb_high = vqrdmlahq_laneq_s16::<5>(cb_high, b_high, v_weights);

let cb_high = vminq_u16(
vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(cb_high), i_bias_y)),
i_cap_uv,
);

let mut cr_high = vqrdmlahq_s16(uv_bias, r_high, v_cr_r);
cr_high = vqrdmlahq_s16(cr_high, g_high, v_cr_g);
cr_high = vqrdmlahq_s16(cr_high, b_high, v_cr_b);
let mut cr_high = vqrdmlahq_laneq_s16::<6>(uv_bias, r_high, v_weights);
cr_high = vqrdmlahq_laneq_s16::<7>(cr_high, g_high, v_weights);
cr_high = vqrdmlahq_laneq_s16::<0>(cr_high, b_high, v_cr_b);

let cr_high = vminq_u16(
vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(cr_high), i_bias_y)),
i_cap_uv,
);

let mut cb_low = vqrdmlahq_s16(uv_bias, r_low, v_cb_r);
cb_low = vqrdmlahq_s16(cb_low, g_low, v_cb_g);
cb_low = vqrdmlahq_s16(cb_low, b_low, v_cb_b);
let mut cb_low = vqrdmlahq_laneq_s16::<3>(uv_bias, r_low, v_weights);
cb_low = vqrdmlahq_laneq_s16::<4>(cb_low, g_low, v_weights);
cb_low = vqrdmlahq_laneq_s16::<5>(cb_low, b_low, v_weights);

let cb_low = vminq_u16(
vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(cb_low), i_bias_y)),
i_cap_uv,
);

let mut cr_low = vqrdmlahq_s16(uv_bias, r_low, v_cr_r);
cr_low = vqrdmlahq_s16(cr_low, g_low, v_cr_g);
cr_low = vqrdmlahq_s16(cr_low, b_low, v_cr_b);
let mut cr_low = vqrdmlahq_laneq_s16::<6>(uv_bias, r_low, v_weights);
cr_low = vqrdmlahq_laneq_s16::<7>(cr_low, g_low, v_weights);
cr_low = vqrdmlahq_laneq_s16::<0>(cr_low, b_low, v_cr_b);

let cr_low = vminq_u16(
vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(cr_low), i_bias_y)),
Expand Down Expand Up @@ -209,18 +213,18 @@ pub(crate) unsafe fn neon_rgbx_to_nv_row_rdm<
b_values_u8,
))));

let mut cbl = vqrdmlahq_s16(uv_bias, r1, v_cb_r);
cbl = vqrdmlahq_s16(cbl, g1, v_cb_g);
cbl = vqrdmlahq_s16(cbl, b1, v_cb_b);
let mut cbl = vqrdmlahq_laneq_s16::<3>(uv_bias, r1, v_weights);
cbl = vqrdmlahq_laneq_s16::<4>(cbl, g1, v_weights);
cbl = vqrdmlahq_laneq_s16::<5>(cbl, b1, v_weights);

let cb = vqmovn_u16(vminq_u16(
vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(cbl), i_bias_y)),
i_cap_uv,
));

let mut crl = vqrdmlahq_s16(uv_bias, r1, v_cr_r);
crl = vqrdmlahq_s16(crl, g1, v_cr_g);
crl = vqrdmlahq_s16(crl, b1, v_cr_b);
let mut crl = vqrdmlahq_laneq_s16::<6>(uv_bias, r1, v_weights);
crl = vqrdmlahq_laneq_s16::<7>(crl, g1, v_weights);
crl = vqrdmlahq_laneq_s16::<0>(crl, b1, v_cr_b);

let cr = vqmovn_u16(vminq_u16(
vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(crl), i_bias_y)),
Expand Down
74 changes: 39 additions & 35 deletions src/neon/rgba_to_yuv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -71,16 +71,20 @@ pub(crate) unsafe fn neon_rgba_to_yuv_rdm<

let y_bias = vdupq_n_s16(bias_y);
let uv_bias = vdupq_n_s16(bias_uv);
let v_yr = vdupq_n_s16(transform.yr as i16);
let v_yg = vdupq_n_s16(transform.yg as i16);
let v_yb = vdupq_n_s16(transform.yb as i16);
let v_cb_r = vdupq_n_s16(transform.cb_r as i16);
let v_cb_g = vdupq_n_s16(transform.cb_g as i16);
let v_cb_b = vdupq_n_s16(transform.cb_b as i16);
let v_cr_r = vdupq_n_s16(transform.cr_r as i16);
let v_cr_g = vdupq_n_s16(transform.cr_g as i16);
let v_cr_b = vdupq_n_s16(transform.cr_b as i16);

let weights_arr: [i16; 8] = [
transform.yr as i16,
transform.yg as i16,
transform.yb as i16,
transform.cb_r as i16,
transform.cb_g as i16,
transform.cb_b as i16,
transform.cr_r as i16,
transform.cr_g as i16,
];
let v_weights = vld1q_s16(weights_arr.as_ptr());

let mut cx = start_cx;
let mut ux = start_ux;

Expand Down Expand Up @@ -120,22 +124,22 @@ pub(crate) unsafe fn neon_rgba_to_yuv_rdm<
let g0hi = vreinterpretq_s16_u16(vshll_high_n_u8::<V_SCALE>(g_values_u8));
let b0hi = vreinterpretq_s16_u16(vshll_high_n_u8::<V_SCALE>(b_values_u8));

let mut y_high = vqrdmlahq_s16(y_bias, r0hi, v_yr);
y_high = vqrdmlahq_s16(y_high, g0hi, v_yg);
y_high = vqrdmlahq_s16(y_high, b0hi, v_yb);
let mut y_high = vqrdmlahq_laneq_s16::<0>(y_bias, r0hi, v_weights);
y_high = vqrdmlahq_laneq_s16::<1>(y_high, g0hi, v_weights);
y_high = vqrdmlahq_laneq_s16::<2>(y_high, b0hi, v_weights);

let y_high = vminq_u16(
vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(y_high), i_bias_y)),
i_cap_y,
);

let r0lo = vreinterpretq_s16_u16(vshll_n_u8::<V_SCALE>(vget_low_u8(r_values_u8)));
let g0lo = vreinterpretq_s16_u16(vshll_n_u8::<V_SCALE>(vget_low_u8(g_values_u8)));
let b0lo = vreinterpretq_s16_u16(vshll_n_u8::<V_SCALE>(vget_low_u8(b_values_u8)));
let r_low = vreinterpretq_s16_u16(vshll_n_u8::<V_SCALE>(vget_low_u8(r_values_u8)));
let g_low = vreinterpretq_s16_u16(vshll_n_u8::<V_SCALE>(vget_low_u8(g_values_u8)));
let b_low = vreinterpretq_s16_u16(vshll_n_u8::<V_SCALE>(vget_low_u8(b_values_u8)));

let mut y_low = vqrdmlahq_s16(y_bias, r0lo, v_yr);
y_low = vqrdmlahq_s16(y_low, g0lo, v_yg);
y_low = vqrdmlahq_s16(y_low, b0lo, v_yb);
let mut y_low = vqrdmlahq_laneq_s16::<0>(y_bias, r_low, v_weights);
y_low = vqrdmlahq_laneq_s16::<1>(y_low, g_low, v_weights);
y_low = vqrdmlahq_laneq_s16::<2>(y_low, b_low, v_weights);

let y_low = vminq_u16(
vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(y_low), i_bias_y)),
Expand All @@ -146,36 +150,36 @@ pub(crate) unsafe fn neon_rgba_to_yuv_rdm<
vst1q_u8(y_ptr.get_unchecked_mut(cx..).as_mut_ptr(), y);

if chroma_subsampling == YuvChromaSubsampling::Yuv444 {
let mut cb_high = vqrdmlahq_s16(uv_bias, r0hi, v_cb_r);
cb_high = vqrdmlahq_s16(cb_high, g0hi, v_cb_g);
cb_high = vqrdmlahq_s16(cb_high, b0hi, v_cb_b);
let mut cb_high = vqrdmlahq_laneq_s16::<3>(uv_bias, r0hi, v_weights);
cb_high = vqrdmlahq_laneq_s16::<4>(cb_high, g0hi, v_weights);
cb_high = vqrdmlahq_laneq_s16::<5>(cb_high, b0hi, v_weights);

let cb_high = vminq_u16(
vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(cb_high), i_bias_y)),
i_cap_uv,
);

let mut cr_high = vqrdmlahq_s16(uv_bias, r0hi, v_cr_r);
cr_high = vqrdmlahq_s16(cr_high, g0hi, v_cr_g);
cr_high = vqrdmlahq_s16(cr_high, b0hi, v_cr_b);
let mut cr_high = vqrdmlahq_laneq_s16::<6>(uv_bias, r0hi, v_weights);
cr_high = vqrdmlahq_laneq_s16::<7>(cr_high, g0hi, v_weights);
cr_high = vqrdmlahq_laneq_s16::<0>(cr_high, b0hi, v_cr_b);

let cr_high = vminq_u16(
vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(cr_high), i_bias_y)),
i_cap_uv,
);

let mut cb_low = vqrdmlahq_s16(uv_bias, r0lo, v_cb_r);
cb_low = vqrdmlahq_s16(cb_low, g0lo, v_cb_g);
cb_low = vqrdmlahq_s16(cb_low, b0lo, v_cb_b);
let mut cb_low = vqrdmlahq_laneq_s16::<3>(uv_bias, r_low, v_weights);
cb_low = vqrdmlahq_laneq_s16::<4>(cb_low, g_low, v_weights);
cb_low = vqrdmlahq_laneq_s16::<5>(cb_low, b_low, v_weights);

let cb_low = vminq_u16(
vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(cb_low), i_bias_y)),
i_cap_uv,
);

let mut cr_low = vqrdmlahq_s16(uv_bias, r0lo, v_cr_r);
cr_low = vqrdmlahq_s16(cr_low, g0lo, v_cr_g);
cr_low = vqrdmlahq_s16(cr_low, b0lo, v_cr_b);
let mut cr_low = vqrdmlahq_laneq_s16::<6>(uv_bias, r_low, v_weights);
cr_low = vqrdmlahq_laneq_s16::<7>(cr_low, g_low, v_weights);
cr_low = vqrdmlahq_laneq_s16::<0>(cr_low, b_low, v_cr_b);

let cr_low = vminq_u16(
vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(cr_low), i_bias_y)),
Expand All @@ -201,18 +205,18 @@ pub(crate) unsafe fn neon_rgba_to_yuv_rdm<
b_values_u8,
))));

let mut cbl = vqrdmlahq_s16(uv_bias, r1, v_cb_r);
cbl = vqrdmlahq_s16(cbl, g1, v_cb_g);
cbl = vqrdmlahq_s16(cbl, b1, v_cb_b);
let mut cbl = vqrdmlahq_laneq_s16::<3>(uv_bias, r1, v_weights);
cbl = vqrdmlahq_laneq_s16::<4>(cbl, g1, v_weights);
cbl = vqrdmlahq_laneq_s16::<5>(cbl, b1, v_weights);

let cb = vqmovn_u16(vminq_u16(
vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(cbl), i_bias_y)),
i_cap_uv,
));

let mut crl = vqrdmlahq_s16(uv_bias, r1, v_cr_r);
crl = vqrdmlahq_s16(crl, g1, v_cr_g);
crl = vqrdmlahq_s16(crl, b1, v_cr_b);
let mut crl = vqrdmlahq_laneq_s16::<6>(uv_bias, r1, v_weights);
crl = vqrdmlahq_laneq_s16::<7>(crl, g1, v_weights);
crl = vqrdmlahq_laneq_s16::<0>(crl, b1, v_cr_b);

let cr = vqmovn_u16(vminq_u16(
vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(crl), i_bias_y)),
Expand Down
Loading

0 comments on commit 79008a5

Please sign in to comment.