diff --git a/Cargo.lock b/Cargo.lock index 9aefdf6..9e3e36c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4,4 +4,4 @@ version = 3 [[package]] name = "yuvutils-rs" -version = "0.1.6" +version = "0.1.7" diff --git a/Cargo.toml b/Cargo.toml index 62c09bf..c45544f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "yuvutils-rs" -version = "0.1.6" +version = "0.1.7" edition = "2021" description = "Rust utilities for YUV format handling and conversion." readme = "README.md" diff --git a/src/yuv_to_rgba_alpha.rs b/src/yuv_to_rgba_alpha.rs index 0ac406a..a0315c1 100644 --- a/src/yuv_to_rgba_alpha.rs +++ b/src/yuv_to_rgba_alpha.rs @@ -2,8 +2,8 @@ #[cfg(target_feature = "neon")] use std::arch::aarch64::{ uint8x16_t, uint8x16x4_t, uint8x8_t, vcombine_u8, vdup_n_u8, vdupq_n_s16, vdupq_n_u16, - vdupq_n_u8, vget_high_u8, vget_low_u8, vld1_u8, vld1q_u8, vmaxq_s16, vmlal_high_u8, vmlal_u8, - vmovl_u8, vmull_high_u8, vmull_u8, vmulq_s16, vqaddq_s16, vqshrn_n_u16, vqshrun_n_s16, + vdupq_n_u8, vget_high_u8, vget_low_u8, vld1_u8, vld1q_u8, vmaxq_s16, vmovl_u8, vmull_high_u8, + vmull_u8, vmulq_s16, vqaddq_s16, vqaddq_u16, vqshrn_n_u16, vqshrun_n_s16, vreinterpretq_s16_u16, vst4q_u8, vsubq_s16, vsubq_u8, vzip1_u8, vzip2_u8, }; @@ -15,10 +15,9 @@ use crate::{YuvRange, YuvStandardMatrix}; #[cfg(any(target_arch = "arm", target_arch = "aarch64"))] #[cfg(target_feature = "neon")] unsafe fn premutiply_vector(v: uint8x16_t, a_values: uint8x16_t) -> uint8x16_t { - let mut acc_hi = vdupq_n_u16(127); - let mut acc_lo = vdupq_n_u16(127); - acc_hi = vmlal_high_u8(acc_hi, v, a_values); - acc_lo = vmlal_u8(acc_lo, vget_low_u8(v), vget_low_u8(a_values)); + let initial = vdupq_n_u16(127); + let acc_hi = vqaddq_u16(initial, vmull_high_u8(v, a_values)); + let acc_lo = vqaddq_u16(initial, vmull_u8(vget_low_u8(v), vget_low_u8(a_values))); let hi = vqshrn_n_u16::<8>(acc_hi); let lo = vqshrn_n_u16::<8>(acc_lo); vcombine_u8(lo, hi)