Skip to content

Commit

Permalink
Improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
awxkee committed Nov 25, 2024
1 parent 454265a commit b666b69
Show file tree
Hide file tree
Showing 11 changed files with 110 additions and 130 deletions.
10 changes: 5 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,13 +76,13 @@ Tests performed on the image 5763x3842

| | Time(NEON) | Time(AVX) |
|------------------------|:----------:|:---------:|
| utils RGB->YUV 4:2:0 | 3.48ms | 3.53ms |
| utils RGB->YUV 4:2:0 | 3.23ms | 3.53ms |
| libyuv RGB->YUV 4:2:0 | 3.58ms | 33.87ms |
| utils RGBA->YUV 4:2:0 | 4.32ms | 5.47ms |
| utils RGBA->YUV 4:2:0 | 4.10ms | 5.47ms |
| libyuv RGBA->YUV 4:2:0 | 4.87ms | 23.48ms |
| utils RGBA->YUV 4:2:2 | 4.83ms | 7.08ms |
| utils RGBA->YUV 4:2:2 | 4.50ms | 7.08ms |
| libyuv RGBA->YUV 4:2:2 | 5.90ms | 35.23ms |
| utils RGBA->YUV 4:4:4 | 5.34ms | 7.97ms |
| utils RGBA->YUV 4:4:4 | 4.77ms | 7.97ms |

### Decoding

Expand All @@ -92,7 +92,7 @@ Tests performed on the image 5763x3842
| libyuv YUV NV12->RGB | 5.20ms | 45.28ms |
| utils YUV 4:2:0->RGB | 3.28ms | 5.25ms |
| libyuv YUV 4:2:0->RGB | 5.70ms | 44.95ms |
| utils YUV 4:2:0->RGBA | 3.82ms | 5.98ms |
| utils YUV 4:2:0->RGBA | 3.77ms | 5.98ms |
| libyuv YUV 4:2:0->RGBA | 6.13ms | 6.88ms |
| utils YUV 4:2:2->RGBA | 4.88ms | 6.91ms |
| libyuv YUV 4:2:2->RGBA | 5.91ms | 6.91ms |
Expand Down
13 changes: 4 additions & 9 deletions src/neon/rgb_to_y.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,8 @@ pub(crate) unsafe fn neon_rgb_to_y_row<const ORIGIN_CHANNELS: u8>(
) -> usize {
let source_channels: YuvSourceChannels = ORIGIN_CHANNELS.into();
let channels = source_channels.get_channels_count();
const V_SHR: i32 = 4;
const V_SCALE: i32 = 7;
let rounding_const_bias: i16 = 1 << (V_SHR - 1);
let bias_y = range.bias_y as i16 * (1 << V_SHR) + rounding_const_bias;
const V_SCALE: i32 = 3;
let bias_y = range.bias_y as i16;

let y_ptr = y_plane;
let rgba_ptr = rgba.as_ptr();
Expand Down Expand Up @@ -101,7 +99,7 @@ pub(crate) unsafe fn neon_rgb_to_y_row<const ORIGIN_CHANNELS: u8>(
y_high = vmaxq_s16(y_high, v_zeros);

let y_high = vminq_u16(
vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(y_high), i_bias_y)),
vreinterpretq_u16_s16(vmaxq_s16((y_high), i_bias_y)),

Check warning on line 102 in src/neon/rgb_to_y.rs

View workflow job for this annotation

GitHub Actions / Build

unnecessary parentheses around function argument
i_cap_y,
);

Expand All @@ -114,10 +112,7 @@ pub(crate) unsafe fn neon_rgb_to_y_row<const ORIGIN_CHANNELS: u8>(
y_low = vqrdmlahq_s16(y_low, b_low, v_yb);
y_low = vmaxq_s16(y_low, v_zeros);

let y_low = vminq_u16(
vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(y_low), i_bias_y)),
i_cap_y,
);
let y_low = vminq_u16(vreinterpretq_u16_s16(vmaxq_s16((y_low), i_bias_y)), i_cap_y);

Check warning on line 115 in src/neon/rgb_to_y.rs

View workflow job for this annotation

GitHub Actions / Build

unnecessary parentheses around function argument

let y = vcombine_u8(vqmovn_u16(y_low), vqmovn_u16(y_high));
vst1q_u8(y_ptr.add(cx), y);
Expand Down
27 changes: 11 additions & 16 deletions src/neon/rgba_to_nv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,11 +55,9 @@ pub(crate) unsafe fn neon_rgbx_to_nv_row_rdm<
let source_channels: YuvSourceChannels = ORIGIN_CHANNELS.into();
let channels = source_channels.get_channels_count();

const V_SHR: i32 = 4;
const V_SCALE: i32 = 7;
let rounding_const_bias: i16 = 1 << (V_SHR - 1);
let bias_y = range.bias_y as i16 * (1 << V_SHR) + rounding_const_bias;
let bias_uv = range.bias_uv as i16 * (1 << V_SHR) + rounding_const_bias;
const V_SCALE: i32 = 3;
let bias_y = range.bias_y as i16;
let bias_uv = range.bias_uv as i16;

let y_ptr = y_plane.as_mut_ptr();
let uv_ptr = uv_plane.as_mut_ptr();
Expand Down Expand Up @@ -129,7 +127,7 @@ pub(crate) unsafe fn neon_rgbx_to_nv_row_rdm<
y_high = vqrdmlahq_laneq_s16::<2>(y_high, b_high, v_weights);

let y_high = vminq_u16(
vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(y_high), i_bias_y)),
vreinterpretq_u16_s16(vmaxq_s16((y_high), i_bias_y)),

Check warning on line 130 in src/neon/rgba_to_nv.rs

View workflow job for this annotation

GitHub Actions / Build

unnecessary parentheses around function argument
i_cap_y,
);

Expand All @@ -141,10 +139,7 @@ pub(crate) unsafe fn neon_rgbx_to_nv_row_rdm<
y_low = vqrdmlahq_laneq_s16::<1>(y_low, g_low, v_weights);
y_low = vqrdmlahq_laneq_s16::<2>(y_low, b_low, v_weights);

let y_low = vminq_u16(
vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(y_low), i_bias_y)),
i_cap_y,
);
let y_low = vminq_u16(vreinterpretq_u16_s16(vmaxq_s16((y_low), i_bias_y)), i_cap_y);

Check warning on line 142 in src/neon/rgba_to_nv.rs

View workflow job for this annotation

GitHub Actions / Build

unnecessary parentheses around function argument

let y = vcombine_u8(vqmovn_u16(y_low), vqmovn_u16(y_high));
vst1q_u8(y_ptr.add(cx), y);
Expand All @@ -155,7 +150,7 @@ pub(crate) unsafe fn neon_rgbx_to_nv_row_rdm<
cb_high = vqrdmlahq_laneq_s16::<5>(cb_high, b_high, v_weights);

let cb_high = vminq_u16(
vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(cb_high), i_bias_y)),
vreinterpretq_u16_s16(vmaxq_s16((cb_high), i_bias_y)),

Check warning on line 153 in src/neon/rgba_to_nv.rs

View workflow job for this annotation

GitHub Actions / Build

unnecessary parentheses around function argument
i_cap_uv,
);

Expand All @@ -164,7 +159,7 @@ pub(crate) unsafe fn neon_rgbx_to_nv_row_rdm<
cr_high = vqrdmlahq_laneq_s16::<0>(cr_high, b_high, v_cr_b);

let cr_high = vminq_u16(
vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(cr_high), i_bias_y)),
vreinterpretq_u16_s16(vmaxq_s16((cr_high), i_bias_y)),

Check warning on line 162 in src/neon/rgba_to_nv.rs

View workflow job for this annotation

GitHub Actions / Build

unnecessary parentheses around function argument
i_cap_uv,
);

Expand All @@ -173,7 +168,7 @@ pub(crate) unsafe fn neon_rgbx_to_nv_row_rdm<
cb_low = vqrdmlahq_laneq_s16::<5>(cb_low, b_low, v_weights);

let cb_low = vminq_u16(
vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(cb_low), i_bias_y)),
vreinterpretq_u16_s16(vmaxq_s16((cb_low), i_bias_y)),

Check warning on line 171 in src/neon/rgba_to_nv.rs

View workflow job for this annotation

GitHub Actions / Build

unnecessary parentheses around function argument
i_cap_uv,
);

Expand All @@ -182,7 +177,7 @@ pub(crate) unsafe fn neon_rgbx_to_nv_row_rdm<
cr_low = vqrdmlahq_laneq_s16::<0>(cr_low, b_low, v_cr_b);

let cr_low = vminq_u16(
vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(cr_low), i_bias_y)),
vreinterpretq_u16_s16(vmaxq_s16((cr_low), i_bias_y)),

Check warning on line 180 in src/neon/rgba_to_nv.rs

View workflow job for this annotation

GitHub Actions / Build

unnecessary parentheses around function argument
i_cap_uv,
);
let cb = vcombine_u8(vqmovn_u16(cb_low), vqmovn_u16(cb_high));
Expand Down Expand Up @@ -218,7 +213,7 @@ pub(crate) unsafe fn neon_rgbx_to_nv_row_rdm<
cbl = vqrdmlahq_laneq_s16::<5>(cbl, b1, v_weights);

let cb = vqmovn_u16(vminq_u16(
vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(cbl), i_bias_y)),
vreinterpretq_u16_s16(vmaxq_s16((cbl), i_bias_y)),

Check warning on line 216 in src/neon/rgba_to_nv.rs

View workflow job for this annotation

GitHub Actions / Build

unnecessary parentheses around function argument
i_cap_uv,
));

Expand All @@ -227,7 +222,7 @@ pub(crate) unsafe fn neon_rgbx_to_nv_row_rdm<
crl = vqrdmlahq_laneq_s16::<0>(crl, b1, v_cr_b);

let cr = vqmovn_u16(vminq_u16(
vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(crl), i_bias_y)),
vreinterpretq_u16_s16(vmaxq_s16((crl), i_bias_y)),

Check warning on line 225 in src/neon/rgba_to_nv.rs

View workflow job for this annotation

GitHub Actions / Build

unnecessary parentheses around function argument
i_cap_uv,
));

Expand Down
27 changes: 11 additions & 16 deletions src/neon/rgba_to_yuv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,11 +53,9 @@ pub(crate) unsafe fn neon_rgba_to_yuv_rdm<
let source_channels: YuvSourceChannels = ORIGIN_CHANNELS.into();
let channels = source_channels.get_channels_count();

const V_SHR: i32 = 4;
const V_SCALE: i32 = 7;
let rounding_const_bias: i16 = 1 << (V_SHR - 1);
let bias_y = range.bias_y as i16 * (1 << V_SHR) + rounding_const_bias;
let bias_uv = range.bias_uv as i16 * (1 << V_SHR) + rounding_const_bias;
const V_SCALE: i32 = 3;
let bias_y = range.bias_y as i16;
let bias_uv = range.bias_uv as i16;

let y_ptr = y_plane;
let u_ptr = u_plane;
Expand Down Expand Up @@ -128,7 +126,7 @@ pub(crate) unsafe fn neon_rgba_to_yuv_rdm<
y_high = vqrdmlahq_laneq_s16::<2>(y_high, b0hi, v_weights);

let y_high = vminq_u16(
vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(y_high), i_bias_y)),
vreinterpretq_u16_s16(vmaxq_s16((y_high), i_bias_y)),
i_cap_y,
);

Expand All @@ -140,10 +138,7 @@ pub(crate) unsafe fn neon_rgba_to_yuv_rdm<
y_low = vqrdmlahq_laneq_s16::<1>(y_low, g_low, v_weights);
y_low = vqrdmlahq_laneq_s16::<2>(y_low, b_low, v_weights);

let y_low = vminq_u16(
vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(y_low), i_bias_y)),
i_cap_y,
);
let y_low = vminq_u16(vreinterpretq_u16_s16(vmaxq_s16((y_low), i_bias_y)), i_cap_y);

let y = vcombine_u8(vqmovn_u16(y_low), vqmovn_u16(y_high));
vst1q_u8(y_ptr.get_unchecked_mut(cx..).as_mut_ptr(), y);
Expand All @@ -154,7 +149,7 @@ pub(crate) unsafe fn neon_rgba_to_yuv_rdm<
cb_high = vqrdmlahq_laneq_s16::<5>(cb_high, b0hi, v_weights);

let cb_high = vminq_u16(
vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(cb_high), i_bias_y)),
vreinterpretq_u16_s16(vmaxq_s16((cb_high), i_bias_y)),
i_cap_uv,
);

Expand All @@ -163,7 +158,7 @@ pub(crate) unsafe fn neon_rgba_to_yuv_rdm<
cr_high = vqrdmlahq_laneq_s16::<0>(cr_high, b0hi, v_cr_b);

let cr_high = vminq_u16(
vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(cr_high), i_bias_y)),
vreinterpretq_u16_s16(vmaxq_s16((cr_high), i_bias_y)),
i_cap_uv,
);

Expand All @@ -172,7 +167,7 @@ pub(crate) unsafe fn neon_rgba_to_yuv_rdm<
cb_low = vqrdmlahq_laneq_s16::<5>(cb_low, b_low, v_weights);

let cb_low = vminq_u16(
vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(cb_low), i_bias_y)),
vreinterpretq_u16_s16(vmaxq_s16((cb_low), i_bias_y)),
i_cap_uv,
);

Expand All @@ -181,7 +176,7 @@ pub(crate) unsafe fn neon_rgba_to_yuv_rdm<
cr_low = vqrdmlahq_laneq_s16::<0>(cr_low, b_low, v_cr_b);

let cr_low = vminq_u16(
vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(cr_low), i_bias_y)),
vreinterpretq_u16_s16(vmaxq_s16((cr_low), i_bias_y)),
i_cap_uv,
);
let cb = vcombine_u8(vqmovn_u16(cb_low), vqmovn_u16(cb_high));
Expand Down Expand Up @@ -209,7 +204,7 @@ pub(crate) unsafe fn neon_rgba_to_yuv_rdm<
cbl = vqrdmlahq_laneq_s16::<5>(cbl, b1, v_weights);

let cb = vqmovn_u16(vminq_u16(
vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(cbl), i_bias_y)),
vreinterpretq_u16_s16(vmaxq_s16((cbl), i_bias_y)),
i_cap_uv,
));

Expand All @@ -218,7 +213,7 @@ pub(crate) unsafe fn neon_rgba_to_yuv_rdm<
crl = vqrdmlahq_laneq_s16::<0>(crl, b1, v_cr_b);

let cr = vqmovn_u16(vminq_u16(
vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(crl), i_bias_y)),
vreinterpretq_u16_s16(vmaxq_s16((crl), i_bias_y)),
i_cap_uv,
));

Expand Down
20 changes: 9 additions & 11 deletions src/neon/rgba_to_yuv420.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,11 +48,9 @@ pub(crate) unsafe fn neon_rgba_to_yuv_rdm420<const ORIGIN_CHANNELS: u8, const PR
let source_channels: YuvSourceChannels = ORIGIN_CHANNELS.into();
let channels = source_channels.get_channels_count();

const V_SHR: i32 = 4;
const V_SCALE: i32 = 7;
let rounding_const_bias: i16 = 1 << (V_SHR - 1);
let bias_y = range.bias_y as i16 * (1 << V_SHR) + rounding_const_bias;
let bias_uv = range.bias_uv as i16 * (1 << V_SHR) + rounding_const_bias;
const V_SCALE: i32 = 3;
let bias_y = range.bias_y as i16;
let bias_uv = range.bias_uv as i16;

let u_ptr = u_plane;
let v_ptr = v_plane;
Expand Down Expand Up @@ -150,7 +148,7 @@ pub(crate) unsafe fn neon_rgba_to_yuv_rdm420<const ORIGIN_CHANNELS: u8, const PR
y0_high = vqrdmlahq_laneq_s16::<2>(y0_high, b0hi, v_weights);

let y0_high = vminq_u16(
vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(y0_high), i_bias_y)),
vreinterpretq_u16_s16(vmaxq_s16((y0_high), i_bias_y)),
i_cap_y,
);

Expand All @@ -159,7 +157,7 @@ pub(crate) unsafe fn neon_rgba_to_yuv_rdm420<const ORIGIN_CHANNELS: u8, const PR
y1_high = vqrdmlahq_laneq_s16::<2>(y1_high, b1hi, v_weights);

let y1_high = vminq_u16(
vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(y1_high), i_bias_y)),
vreinterpretq_u16_s16(vmaxq_s16((y1_high), i_bias_y)),
i_cap_y,
);

Expand All @@ -176,7 +174,7 @@ pub(crate) unsafe fn neon_rgba_to_yuv_rdm420<const ORIGIN_CHANNELS: u8, const PR
y0_low = vqrdmlahq_laneq_s16::<2>(y0_low, b0_low, v_weights);

let y0_low = vminq_u16(
vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(y0_low), i_bias_y)),
vreinterpretq_u16_s16(vmaxq_s16((y0_low), i_bias_y)),
i_cap_y,
);

Expand All @@ -185,7 +183,7 @@ pub(crate) unsafe fn neon_rgba_to_yuv_rdm420<const ORIGIN_CHANNELS: u8, const PR
y1_low = vqrdmlahq_laneq_s16::<2>(y1_low, b1_low, v_weights);

let y1_low = vminq_u16(
vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(y1_low), i_bias_y)),
vreinterpretq_u16_s16(vmaxq_s16((y1_low), i_bias_y)),
i_cap_y,
);

Expand All @@ -207,7 +205,7 @@ pub(crate) unsafe fn neon_rgba_to_yuv_rdm420<const ORIGIN_CHANNELS: u8, const PR
cbl = vqrdmlahq_laneq_s16::<5>(cbl, b1, v_weights);

let cb = vqmovn_u16(vminq_u16(
vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(cbl), i_bias_y)),
vreinterpretq_u16_s16(vmaxq_s16((cbl), i_bias_y)),
i_cap_uv,
));

Expand All @@ -216,7 +214,7 @@ pub(crate) unsafe fn neon_rgba_to_yuv_rdm420<const ORIGIN_CHANNELS: u8, const PR
crl = vqrdmlahq_laneq_s16::<0>(crl, b1, v_cr_b);

let cr = vqmovn_u16(vminq_u16(
vreinterpretq_u16_s16(vmaxq_s16(vshrq_n_s16::<V_SHR>(crl), i_bias_y)),
vreinterpretq_u16_s16(vmaxq_s16((crl), i_bias_y)),
i_cap_uv,
));

Expand Down
10 changes: 6 additions & 4 deletions src/neon/y_to_rgb.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,22 +51,24 @@ pub(crate) unsafe fn neon_y_to_rgb_row_rdm<const DESTINATION_CHANNELS: u8>(

let mut cx = start_cx;

const V_SCALE: i32 = 3;

while cx + 16 < width {
let y_values = vsubq_u8(vld1q_u8(y_ptr.add(cx)), y_corr);

let y_high = vqrdmulhq_n_s16(
vreinterpretq_s16_u16(vshll_high_n_u8::<7>(y_values)),
vreinterpretq_s16_u16(vshll_high_n_u8::<V_SCALE>(y_values)),
transform.y_coef as i16,
);

let r_high = vqrshrun_n_s16::<4>(y_high);
let r_high = vqmovun_s16(y_high);

let y_low = vqrdmulhq_n_s16(
vreinterpretq_s16_u16(vshll_n_u8::<7>(vget_low_u8(y_values))),
vreinterpretq_s16_u16(vshll_n_u8::<V_SCALE>(vget_low_u8(y_values))),
transform.y_coef as i16,
);

let r_low = vqrshrun_n_s16::<4>(y_low);
let r_low = vqmovun_s16(y_low);

let r_values = vcombine_u8(r_low, r_high);

Expand Down
21 changes: 10 additions & 11 deletions src/neon/yuv_nv_to_rgba.rs
Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,7 @@ pub(crate) unsafe fn neon_yuv_nv_to_rgba_row_rdm<

let v_weights = vld1q_s16(weights_arr.as_ptr());

const SCALE: i32 = 7;
const V_SHR: i32 = 4;
const SCALE: i32 = 3;

while cx + 16 < width {
let y_values = vqsubq_u8(vld1q_u8(y_ptr.add(cx)), y_corr);
Expand Down Expand Up @@ -126,9 +125,9 @@ pub(crate) unsafe fn neon_yuv_nv_to_rgba_row_rdm<
v_weights,
);

let r_high = vqrshrun_n_s16::<V_SHR>(vqrdmlahq_laneq_s16::<1>(y_high, v_high, v_weights));
let b_high = vqrshrun_n_s16::<V_SHR>(vqrdmlahq_laneq_s16::<2>(y_high, u_high, v_weights));
let g_high = vqrshrun_n_s16::<V_SHR>(vqrdmlahq_laneq_s16::<4>(
let r_high = vqmovun_s16(vqrdmlahq_laneq_s16::<1>(y_high, v_high, v_weights));
let b_high = vqmovun_s16(vqrdmlahq_laneq_s16::<2>(y_high, u_high, v_weights));
let g_high = vqmovun_s16(vqrdmlahq_laneq_s16::<4>(
vqrdmlahq_laneq_s16::<3>(y_high, v_high, v_weights),
u_high,
v_weights,
Expand All @@ -144,9 +143,9 @@ pub(crate) unsafe fn neon_yuv_nv_to_rgba_row_rdm<
let y_v_shl = vshll_n_u8::<SCALE>(vget_low_u8(y_values));
let y_low = vqrdmulhq_laneq_s16::<0>(vreinterpretq_s16_u16(y_v_shl), v_weights);

let r_low = vqrshrun_n_s16::<V_SHR>(vqrdmlahq_laneq_s16::<1>(y_low, v_low, v_weights));
let b_low = vqrshrun_n_s16::<V_SHR>(vqrdmlahq_laneq_s16::<2>(y_low, u_low, v_weights));
let g_low = vqrshrun_n_s16::<V_SHR>(vqrdmlahq_laneq_s16::<4>(
let r_low = vqmovun_s16(vqrdmlahq_laneq_s16::<1>(y_low, v_low, v_weights));
let b_low = vqmovun_s16(vqrdmlahq_laneq_s16::<2>(y_low, u_low, v_weights));
let g_low = vqmovun_s16(vqrdmlahq_laneq_s16::<4>(
vqrdmlahq_laneq_s16::<3>(y_low, v_low, v_weights),
u_low,
v_weights,
Expand Down Expand Up @@ -235,9 +234,9 @@ pub(crate) unsafe fn neon_yuv_nv_to_rgba_row_rdm<
v_weights,
);

let r_low = vqrshrun_n_s16::<V_SHR>(vqrdmlahq_laneq_s16::<1>(y_low, v_low, v_weights));
let b_low = vqrshrun_n_s16::<V_SHR>(vqrdmlahq_laneq_s16::<2>(y_low, u_low, v_weights));
let g_low = vqrshrun_n_s16::<V_SHR>(vqrdmlahq_laneq_s16::<4>(
let r_low = vqmovun_s16(vqrdmlahq_laneq_s16::<1>(y_low, v_low, v_weights));
let b_low = vqmovun_s16(vqrdmlahq_laneq_s16::<2>(y_low, u_low, v_weights));
let g_low = vqmovun_s16(vqrdmlahq_laneq_s16::<4>(
vqrdmlahq_laneq_s16::<3>(y_low, v_low, v_weights),
u_low,
v_weights,
Expand Down
Loading

0 comments on commit b666b69

Please sign in to comment.