Skip to content

Commit

Permalink
AVX, SSE encoding/decoding improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
awxkee committed Nov 25, 2024
1 parent 6f10a64 commit 8d360d9
Show file tree
Hide file tree
Showing 23 changed files with 1,137 additions and 393 deletions.
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,9 +76,9 @@ Tests performed on the image 5763x3842

| | Time(NEON) | Time(AVX) |
|------------------------|:----------:|:---------:|
| utils RGB->YUV 4:2:0 | 3.48ms | 6.14ms |
| utils RGB->YUV 4:2:0 | 3.48ms | 3.64ms |
| libyuv RGB->YUV 4:2:0 | 3.58ms | 33.87ms |
| utils RGBA->YUV 4:2:0 | 4.32ms | 7.34ms |
| utils RGBA->YUV 4:2:0 | 4.32ms | 5.74ms |
| libyuv RGBA->YUV 4:2:0 | 4.87ms | 23.48ms |
| utils RGBA->YUV 4:2:2 | 4.83ms | 7.08ms |
| libyuv RGBA->YUV 4:2:2 | 5.90ms | 35.23ms |
Expand All @@ -90,7 +90,7 @@ Tests performed on the image 5763x3842
|------------------------|:----------:|:---------:|
| utils YUV NV12->RGB | 3.86ms | 6.48ms |
| libyuv YUV NV12->RGB | 5.20ms | 45.28ms |
| utils YUV 4:2:0->RGB | 3.28ms | 5.44ms |
| utils YUV 4:2:0->RGB | 3.28ms | 5.34ms |
| libyuv YUV 4:2:0->RGB | 5.70ms | 44.95ms |
| utils YUV 4:2:0->RGBA | 3.82ms | 5.98ms |
| libyuv YUV 4:2:0->RGBA | 6.13ms | 6.88ms |
Expand Down
100 changes: 61 additions & 39 deletions app/benches/yuv8/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

use std::alloc::Layout;
use criterion::{criterion_group, criterion_main, Criterion};
use image::{GenericImageView, ImageReader};
use yuv_sys::{
Expand Down Expand Up @@ -99,25 +99,36 @@ pub fn criterion_benchmark(c: &mut Criterion) {
});

c.bench_function("libyuv RGB -> YUV 4:2:0", |b| {
let mut test_planar = YuvPlanarImageMut::<u8>::alloc(
dimensions.0,
dimensions.1,
YuvChromaSubsampling::Yuv420,
);
b.iter(|| unsafe {
rs_RGB24ToI420(
src_bytes.as_ptr(),
stride as i32,
test_planar.y_plane.borrow_mut().as_mut_ptr(),
test_planar.y_stride as i32,
test_planar.u_plane.borrow_mut().as_mut_ptr(),
test_planar.u_stride as i32,
test_planar.v_plane.borrow_mut().as_mut_ptr(),
test_planar.v_stride as i32,
test_planar.width as i32,
test_planar.height as i32,
);
})
unsafe {
let layout_rgb = Layout::from_size_align(dimensions.0 as usize * dimensions.1 as usize * 3, 16).unwrap();
let layout_y = Layout::from_size_align(dimensions.0 as usize * dimensions.1 as usize, 16).unwrap();
let layout_uv = Layout::from_size_align((dimensions.0 as usize + 1) / 2 * (dimensions.1 as usize + 1) / 2, 16).unwrap();
let target_y = std::alloc::alloc(layout_y);
let target_u = std::alloc::alloc(layout_uv);
let target_v = std::alloc::alloc(layout_uv);
let source_rgb = std::alloc::alloc(layout_rgb);
for (x, src) in src_bytes.iter().enumerate() {
*source_rgb.add(x) = *src;
}
b.iter(|| {
rs_RGB24ToI420(
source_rgb,
stride as i32,
target_y,
dimensions.0 as i32,
target_u,
(dimensions.0 as i32 + 1) / 2,
target_v,
(dimensions.0 as i32 + 1) / 2,
dimensions.0 as i32,
dimensions.1 as i32,
);
});
std::alloc::dealloc(target_y, layout_y);
std::alloc::dealloc(target_u, layout_uv);
std::alloc::dealloc(target_v, layout_uv);
std::alloc::dealloc(source_rgb, layout_rgb);
}
});

c.bench_function("yuvutils RGBA -> YUV 4:2:0", |b| {
Expand All @@ -139,25 +150,36 @@ pub fn criterion_benchmark(c: &mut Criterion) {
});

c.bench_function("libyuv RGBA -> YUV 4:2:0", |b| {
let mut test_planar = YuvPlanarImageMut::<u8>::alloc(
dimensions.0,
dimensions.1,
YuvChromaSubsampling::Yuv420,
);
b.iter(|| unsafe {
rs_ABGRToI420(
rgba_image.as_ptr(),
dimensions.0 as i32 * 4i32,
test_planar.y_plane.borrow_mut().as_mut_ptr(),
test_planar.y_stride as i32,
test_planar.u_plane.borrow_mut().as_mut_ptr(),
test_planar.u_stride as i32,
test_planar.v_plane.borrow_mut().as_mut_ptr(),
test_planar.v_stride as i32,
test_planar.width as i32,
test_planar.height as i32,
);
})
unsafe {
let layout_rgba = Layout::from_size_align(dimensions.0 as usize * dimensions.1 as usize * 4, 16).unwrap();
let layout_y = Layout::from_size_align(dimensions.0 as usize * dimensions.1 as usize, 16).unwrap();
let layout_uv = Layout::from_size_align((dimensions.0 as usize + 1) / 2 * (dimensions.1 as usize + 1) / 2, 16).unwrap();
let target_y = std::alloc::alloc(layout_y);
let target_u = std::alloc::alloc(layout_uv);
let target_v = std::alloc::alloc(layout_uv);
let source_rgb = std::alloc::alloc(layout_rgba);
for (x, src) in src_bytes.iter().enumerate() {
*source_rgb.add(x) = *src;
}
b.iter(|| {
rs_ABGRToI420(
source_rgb,
dimensions.0 as i32 * 4i32,
target_y,
dimensions.0 as i32,
target_u,
(dimensions.0 as i32 + 1) / 2,
target_v,
(dimensions.0 as i32 + 1) / 2,
dimensions.0 as i32,
dimensions.1 as i32,
);
});
std::alloc::dealloc(target_y, layout_y);
std::alloc::dealloc(target_u, layout_uv);
std::alloc::dealloc(target_v, layout_uv);
std::alloc::dealloc(source_rgb, layout_rgba);
}
});

c.bench_function("yuvutils RGBA -> YUV 4:2:2", |b| {
Expand Down
2 changes: 2 additions & 0 deletions src/avx2/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ mod yuv_to_rgba_alpha;
mod yuv_to_yuv2;
mod yuy2_to_rgb;
mod yuy2_to_yuv;
mod rgba_to_yuv420;

pub(crate) use rgb_to_nv::avx2_rgba_to_nv;
pub(crate) use rgb_to_y::avx2_rgb_to_y_row;
Expand All @@ -62,3 +63,4 @@ pub(crate) use yuv_to_rgba_alpha::avx2_yuv_to_rgba_alpha;
pub(crate) use yuv_to_yuv2::yuv_to_yuy2_avx2_row;
pub(crate) use yuy2_to_rgb::yuy2_to_rgb_avx;
pub(crate) use yuy2_to_yuv::yuy2_to_yuv_avx;
pub(crate) use rgba_to_yuv420::avx2_rgba_to_yuv420;
51 changes: 26 additions & 25 deletions src/avx2/rgb_to_nv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,8 @@ unsafe fn avx2_rgba_to_nv_impl<
let mut uv_x = start_ux;

const V_SHR: i32 = 3;
const V_SCALE: i32 = 7;
const V_SCALE: i32 = 6;

let rounding_const_bias: i16 = 1 << (V_SHR - 1);
let bias_y = range.bias_y as i16 * (1 << V_SHR) + rounding_const_bias;
let bias_uv = range.bias_uv as i16 * (1 << V_SHR) + rounding_const_bias;
Expand Down Expand Up @@ -180,10 +181,10 @@ unsafe fn avx2_rgba_to_nv_impl<
y_bias,
_mm256_add_epi16(
_mm256_add_epi16(
_mm256_mulhi_epi16(r_low, v_yr),
_mm256_mulhi_epi16(g_low, v_yg),
_mm256_mulhrs_epi16(r_low, v_yr),
_mm256_mulhrs_epi16(g_low, v_yg),
),
_mm256_mulhi_epi16(b_low, v_yb),
_mm256_mulhrs_epi16(b_low, v_yb),
),
)),
i_cap_y,
Expand All @@ -197,10 +198,10 @@ unsafe fn avx2_rgba_to_nv_impl<
y_bias,
_mm256_add_epi16(
_mm256_add_epi16(
_mm256_mulhi_epi16(r_high, v_yr),
_mm256_mulhi_epi16(g_high, v_yg),
_mm256_mulhrs_epi16(r_high, v_yr),
_mm256_mulhrs_epi16(g_high, v_yg),
),
_mm256_mulhi_epi16(b_high, v_yb),
_mm256_mulhrs_epi16(b_high, v_yb),
),
)),
i_cap_y,
Expand All @@ -218,10 +219,10 @@ unsafe fn avx2_rgba_to_nv_impl<
uv_bias,
_mm256_add_epi16(
_mm256_add_epi16(
_mm256_mulhi_epi16(r_low, v_cb_r),
_mm256_mulhi_epi16(g_low, v_cb_g),
_mm256_mulhrs_epi16(r_low, v_cb_r),
_mm256_mulhrs_epi16(g_low, v_cb_g),
),
_mm256_mulhi_epi16(b_low, v_cb_b),
_mm256_mulhrs_epi16(b_low, v_cb_b),
),
)),
i_cap_uv,
Expand All @@ -234,10 +235,10 @@ unsafe fn avx2_rgba_to_nv_impl<
uv_bias,
_mm256_add_epi16(
_mm256_add_epi16(
_mm256_mulhi_epi16(r_low, v_cr_r),
_mm256_mulhi_epi16(g_low, v_cr_g),
_mm256_mulhrs_epi16(r_low, v_cr_r),
_mm256_mulhrs_epi16(g_low, v_cr_g),
),
_mm256_mulhi_epi16(b_low, v_cr_b),
_mm256_mulhrs_epi16(b_low, v_cr_b),
),
)),
i_cap_uv,
Expand All @@ -250,10 +251,10 @@ unsafe fn avx2_rgba_to_nv_impl<
uv_bias,
_mm256_add_epi16(
_mm256_add_epi16(
_mm256_mulhi_epi16(r_high, v_cb_r),
_mm256_mulhi_epi16(g_high, v_cb_g),
_mm256_mulhrs_epi16(r_high, v_cb_r),
_mm256_mulhrs_epi16(g_high, v_cb_g),
),
_mm256_mulhi_epi16(b_high, v_cb_b),
_mm256_mulhrs_epi16(b_high, v_cb_b),
),
)),
i_cap_uv,
Expand All @@ -266,10 +267,10 @@ unsafe fn avx2_rgba_to_nv_impl<
uv_bias,
_mm256_add_epi16(
_mm256_add_epi16(
_mm256_mulhi_epi16(r_high, v_cr_r),
_mm256_mulhi_epi16(g_high, v_cr_g),
_mm256_mulhrs_epi16(r_high, v_cr_r),
_mm256_mulhrs_epi16(g_high, v_cr_g),
),
_mm256_mulhi_epi16(b_high, v_cr_b),
_mm256_mulhrs_epi16(b_high, v_cr_b),
),
)),
i_cap_uv,
Expand Down Expand Up @@ -300,10 +301,10 @@ unsafe fn avx2_rgba_to_nv_impl<
uv_bias,
_mm256_add_epi16(
_mm256_add_epi16(
_mm256_mulhi_epi16(r1, v_cb_r),
_mm256_mulhi_epi16(g1, v_cb_g),
_mm256_mulhrs_epi16(r1, v_cb_r),
_mm256_mulhrs_epi16(g1, v_cb_g),
),
_mm256_mulhi_epi16(b1, v_cb_b),
_mm256_mulhrs_epi16(b1, v_cb_b),
),
)),
i_cap_uv,
Expand All @@ -316,10 +317,10 @@ unsafe fn avx2_rgba_to_nv_impl<
uv_bias,
_mm256_add_epi16(
_mm256_add_epi16(
_mm256_mulhi_epi16(r1, v_cr_r),
_mm256_mulhi_epi16(g1, v_cr_g),
_mm256_mulhrs_epi16(r1, v_cr_r),
_mm256_mulhrs_epi16(g1, v_cr_g),
),
_mm256_mulhi_epi16(b1, v_cr_b),
_mm256_mulhrs_epi16(b1, v_cr_b),
),
)),
i_cap_uv,
Expand Down
14 changes: 7 additions & 7 deletions src/avx2/rgb_to_y.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ pub(crate) unsafe fn avx2_rgb_to_y_row_impl<const ORIGIN_CHANNELS: u8>(
let mut cx = start_cx;

const V_SHR: i32 = 3;
const V_SCALE: i32 = 7;
const V_SCALE: i32 = 6;
let rounding_const_bias: i16 = 1 << (V_SHR - 1);
let bias_y = range.bias_y as i16 * (1 << V_SHR) + rounding_const_bias;

Expand Down Expand Up @@ -144,10 +144,10 @@ pub(crate) unsafe fn avx2_rgb_to_y_row_impl<const ORIGIN_CHANNELS: u8>(
y_bias,
_mm256_add_epi16(
_mm256_add_epi16(
_mm256_mulhi_epi16(r_low, v_yr),
_mm256_mulhi_epi16(g_low, v_yg),
_mm256_mulhrs_epi16(r_low, v_yr),
_mm256_mulhrs_epi16(g_low, v_yg),
),
_mm256_mulhi_epi16(b_low, v_yb),
_mm256_mulhrs_epi16(b_low, v_yb),
),
)),
i_cap_y,
Expand All @@ -161,10 +161,10 @@ pub(crate) unsafe fn avx2_rgb_to_y_row_impl<const ORIGIN_CHANNELS: u8>(
y_bias,
_mm256_add_epi16(
_mm256_add_epi16(
_mm256_mulhi_epi16(r_high, v_yr),
_mm256_mulhi_epi16(g_high, v_yg),
_mm256_mulhrs_epi16(r_high, v_yr),
_mm256_mulhrs_epi16(g_high, v_yg),
),
_mm256_mulhi_epi16(b_high, v_yb),
_mm256_mulhrs_epi16(b_high, v_yb),
),
)),
i_cap_y,
Expand Down
Loading

0 comments on commit 8d360d9

Please sign in to comment.