From e455cd6f4453cc6c0e26b1a3c0c422949f15eba2 Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Wed, 6 Nov 2024 00:44:42 +0000 Subject: [PATCH] Improvements --- src/avx2/yuy2_to_rgb.rs | 17 +-- src/avx2/yuy2_to_yuv.rs | 12 +- src/neon/yuy2_to_rgb.rs | 10 +- src/neon/yuy2_to_yuv.rs | 20 +-- src/sse/yuy2_to_rgb.rs | 21 +-- src/sse/yuy2_to_yuv.rs | 20 +-- src/yuy2_to_rgb.rs | 9 -- src/yuy2_to_rgb_p16.rs | 2 +- src/yuy2_to_yuv.rs | 318 ++++++++++++++++++++++++---------------- src/yuy2_to_yuv_p16.rs | 240 ++++++++++++++++++------------ 10 files changed, 376 insertions(+), 293 deletions(-) diff --git a/src/avx2/yuy2_to_rgb.rs b/src/avx2/yuy2_to_rgb.rs index 76de0a3..eb62235 100644 --- a/src/avx2/yuy2_to_rgb.rs +++ b/src/avx2/yuy2_to_rgb.rs @@ -43,22 +43,13 @@ pub fn yuy2_to_rgb_avx( range: &YuvChromaRange, transform: &CbCrInverseTransform, yuy2_store: &[u8], - yuy2_offset: usize, rgb: &mut [u8], - rgb_offset: usize, width: u32, nav: YuvToYuy2Navigation, ) -> YuvToYuy2Navigation { unsafe { yuy2_to_rgb_avx_impl::( - range, - transform, - yuy2_store, - yuy2_offset, - rgb, - rgb_offset, - width, - nav, + range, transform, yuy2_store, rgb, width, nav, ) } } @@ -68,9 +59,7 @@ unsafe fn yuy2_to_rgb_avx_impl range: &YuvChromaRange, transform: &CbCrInverseTransform, yuy2_store: &[u8], - yuy2_offset: usize, rgb: &mut [u8], - rgb_offset: usize, width: u32, nav: YuvToYuy2Navigation, ) -> YuvToYuy2Navigation { @@ -95,8 +84,8 @@ unsafe fn yuy2_to_rgb_avx_impl let rounding_const = _mm256_set1_epi16(1 << 5); for x in (_yuy2_x..max_x_32).step_by(32) { - let yuy2_offset = yuy2_offset + x * 4; - let dst_pos = rgb_offset + _cx * dst_chans.get_channels_count(); + let yuy2_offset = x * 4; + let dst_pos = _cx * dst_chans.get_channels_count(); let dst_ptr = rgb.as_mut_ptr().add(dst_pos); let yuy2_ptr = yuy2_store.as_ptr().add(yuy2_offset); diff --git a/src/avx2/yuy2_to_yuv.rs b/src/avx2/yuy2_to_yuv.rs index e88b3c6..e8a42fc 100644 --- a/src/avx2/yuy2_to_yuv.rs +++ b/src/avx2/yuy2_to_yuv.rs @@ -39,13 +39,9 @@ use std::arch::x86_64::*; #[target_feature(enable = "avx2")] pub unsafe fn yuy2_to_yuv_avx( y_plane: &mut [u8], - y_offset: usize, u_plane: &mut [u8], - u_offset: usize, v_plane: &mut [u8], - v_offset: usize, yuy2_store: &[u8], - yuy2_offset: usize, width: u32, nav: YuvToYuy2Navigation, ) -> YuvToYuy2Navigation { @@ -59,10 +55,10 @@ pub unsafe fn yuy2_to_yuv_avx( let max_x_32 = (width as usize / 2).saturating_sub(32); for x in (_yuy2_x..max_x_32).step_by(32) { - let dst_offset = yuy2_offset + x * 4; - let u_pos = u_offset + _uv_x; - let v_pos = v_offset + _uv_x; - let y_pos = y_offset + _cx; + let dst_offset = x * 4; + let u_pos = _uv_x; + let v_pos = _uv_x; + let y_pos = _cx; let yuy2_ptr = yuy2_store.as_ptr().add(dst_offset); diff --git a/src/neon/yuy2_to_rgb.rs b/src/neon/yuy2_to_rgb.rs index f054205..66c6cfa 100644 --- a/src/neon/yuy2_to_rgb.rs +++ b/src/neon/yuy2_to_rgb.rs @@ -36,9 +36,7 @@ pub fn yuy2_to_rgb_neon( range: &YuvChromaRange, transform: &CbCrInverseTransform, yuy2_store: &[u8], - yuy2_offset: usize, rgb: &mut [u8], - rgb_offset: usize, width: u32, nav: YuvToYuy2Navigation, ) -> YuvToYuy2Navigation { @@ -63,8 +61,8 @@ pub fn yuy2_to_rgb_neon( let v_alpha = vdupq_n_u8(255u8); for x in (_yuy2_x..max_x_16).step_by(16) { - let dst_offset = yuy2_offset + x * 4; - let dst_pos = rgb_offset + _cx * dst_chans.get_channels_count(); + let dst_offset = x * 4; + let dst_pos = _cx * dst_chans.get_channels_count(); let dst_ptr = rgb.as_mut_ptr().add(dst_pos); let pixel_set = vld4q_u8(yuy2_store.as_ptr().add(dst_offset)); @@ -250,8 +248,8 @@ pub fn yuy2_to_rgb_neon( } for x in (_yuy2_x..max_x_8).step_by(8) { - let dst_offset = yuy2_offset + x * 4; - let dst_pos = rgb_offset + _cx * dst_chans.get_channels_count(); + let dst_offset = x * 4; + let dst_pos = _cx * dst_chans.get_channels_count(); let dst_ptr = rgb.as_mut_ptr().add(dst_pos); let pixel_set = vld4_u8(yuy2_store.as_ptr().add(dst_offset)); diff --git a/src/neon/yuy2_to_yuv.rs b/src/neon/yuy2_to_yuv.rs index 54a31be..62db95c 100644 --- a/src/neon/yuy2_to_yuv.rs +++ b/src/neon/yuy2_to_yuv.rs @@ -32,13 +32,9 @@ use std::arch::aarch64::*; pub fn yuy2_to_yuv_neon_impl( y_plane: &mut [u8], - y_offset: usize, u_plane: &mut [u8], - u_offset: usize, v_plane: &mut [u8], - v_offset: usize, yuy2_store: &[u8], - yuy2_offset: usize, width: u32, nav: YuvToYuy2Navigation, ) -> YuvToYuy2Navigation { @@ -54,10 +50,10 @@ pub fn yuy2_to_yuv_neon_impl( let max_x_8 = (width as usize / 2).saturating_sub(8); for x in (_yuy2_x..max_x_16).step_by(16) { - let dst_offset = yuy2_offset + x * 4; - let u_pos = u_offset + _uv_x; - let v_pos = v_offset + _uv_x; - let y_pos = y_offset + _cx; + let dst_offset = x * 4; + let u_pos = _uv_x; + let v_pos = _uv_x; + let y_pos = _cx; let pixel_set = vld4q_u8(yuy2_store.as_ptr().add(dst_offset)); let mut y_first = match yuy2_source { @@ -121,10 +117,10 @@ pub fn yuy2_to_yuv_neon_impl( } for x in (_yuy2_x..max_x_8).step_by(8) { - let dst_offset = yuy2_offset + x * 4; - let u_pos = u_offset + _uv_x; - let v_pos = v_offset + _uv_x; - let y_pos = y_offset + _cx; + let dst_offset = x * 4; + let u_pos = _uv_x; + let v_pos = _uv_x; + let y_pos = _cx; let pixel_set = vld4_u8(yuy2_store.as_ptr().add(dst_offset)); let mut y_first = match yuy2_source { diff --git a/src/sse/yuy2_to_rgb.rs b/src/sse/yuy2_to_rgb.rs index 0c37ad8..9a6f1ba 100644 --- a/src/sse/yuy2_to_rgb.rs +++ b/src/sse/yuy2_to_rgb.rs @@ -40,22 +40,13 @@ pub fn yuy2_to_rgb_sse( range: &YuvChromaRange, transform: &CbCrInverseTransform, yuy2_store: &[u8], - yuy2_offset: usize, rgb: &mut [u8], - rgb_offset: usize, width: u32, nav: YuvToYuy2Navigation, ) -> YuvToYuy2Navigation { unsafe { yuy2_to_rgb_sse_impl::( - range, - transform, - yuy2_store, - yuy2_offset, - rgb, - rgb_offset, - width, - nav, + range, transform, yuy2_store, rgb, width, nav, ) } } @@ -65,9 +56,7 @@ unsafe fn yuy2_to_rgb_sse_impl range: &YuvChromaRange, transform: &CbCrInverseTransform, yuy2_store: &[u8], - yuy2_offset: usize, rgb: &mut [u8], - rgb_offset: usize, width: u32, nav: YuvToYuy2Navigation, ) -> YuvToYuy2Navigation { @@ -94,8 +83,8 @@ unsafe fn yuy2_to_rgb_sse_impl let zeros = _mm_setzero_si128(); for x in (_yuy2_x..max_x_16).step_by(16) { - let yuy2_offset = yuy2_offset + x * 4; - let dst_pos = rgb_offset + _cx * dst_chans.get_channels_count(); + let yuy2_offset = x * 4; + let dst_pos = _cx * dst_chans.get_channels_count(); let dst_ptr = rgb.as_mut_ptr().add(dst_pos); let yuy2_ptr = yuy2_store.as_ptr().add(yuy2_offset); @@ -346,8 +335,8 @@ unsafe fn yuy2_to_rgb_sse_impl } for x in (_yuy2_x..max_x_8).step_by(8) { - let yuy2_offset = yuy2_offset + x * 4; - let dst_pos = rgb_offset + _cx * dst_chans.get_channels_count(); + let yuy2_offset = x * 4; + let dst_pos = _cx * dst_chans.get_channels_count(); let dst_ptr = rgb.as_mut_ptr().add(dst_pos); let yuy2_ptr = yuy2_store.as_ptr().add(yuy2_offset); diff --git a/src/sse/yuy2_to_yuv.rs b/src/sse/yuy2_to_yuv.rs index cfaf6ac..2f9b097 100644 --- a/src/sse/yuy2_to_yuv.rs +++ b/src/sse/yuy2_to_yuv.rs @@ -37,13 +37,9 @@ use std::arch::x86_64::*; #[target_feature(enable = "sse4.1")] pub unsafe fn yuy2_to_yuv_sse_impl( y_plane: &mut [u8], - y_offset: usize, u_plane: &mut [u8], - u_offset: usize, v_plane: &mut [u8], - v_offset: usize, yuy2_store: &[u8], - yuy2_offset: usize, width: u32, nav: YuvToYuy2Navigation, ) -> YuvToYuy2Navigation { @@ -59,10 +55,10 @@ pub unsafe fn yuy2_to_yuv_sse_impl let max_x_8 = (width as usize / 2).saturating_sub(8); for x in (_yuy2_x..max_x_16).step_by(16) { - let yuy2_offset = yuy2_offset + x * 4; - let u_pos = u_offset + _uv_x; - let v_pos = v_offset + _uv_x; - let y_pos = y_offset + _cx; + let yuy2_offset = x * 4; + let u_pos = _uv_x; + let v_pos = _uv_x; + let y_pos = _cx; let yuy2_ptr = yuy2_store.as_ptr().add(yuy2_offset); @@ -133,10 +129,10 @@ pub unsafe fn yuy2_to_yuv_sse_impl } for x in (_yuy2_x..max_x_8).step_by(8) { - let yuy2_offset = yuy2_offset + x * 4; - let u_pos = u_offset + _uv_x; - let v_pos = v_offset + _uv_x; - let y_pos = y_offset + _cx; + let yuy2_offset = x * 4; + let u_pos = _uv_x; + let v_pos = _uv_x; + let y_pos = _cx; let yuy2_ptr = yuy2_store.as_ptr().add(yuy2_offset); diff --git a/src/yuy2_to_rgb.rs b/src/yuy2_to_rgb.rs index e1e1139..ec09746 100644 --- a/src/yuy2_to_rgb.rs +++ b/src/yuy2_to_rgb.rs @@ -91,9 +91,6 @@ fn yuy2_to_rgb_impl( } rgb_iter.zip(yuy2_iter).for_each(|(rgb_store, yuy2_store)| { - let rgb_offset = 0usize; - let yuy_offset = 0usize; - let mut _cx = 0usize; let mut _yuy2_x = 0usize; @@ -104,9 +101,7 @@ fn yuy2_to_rgb_impl( &range, &inverse_transform, yuy2_store, - yuy_offset, rgb_store, - rgb_offset, width, YuvToYuy2Navigation::new(_cx, 0, _yuy2_x), ); @@ -118,9 +113,7 @@ fn yuy2_to_rgb_impl( &range, &inverse_transform, yuy2_store, - yuy_offset, rgb_store, - rgb_offset, width, YuvToYuy2Navigation::new(_cx, 0, _yuy2_x), ); @@ -135,9 +128,7 @@ fn yuy2_to_rgb_impl( &range, &inverse_transform, yuy2_store, - yuy_offset, rgb_store, - rgb_offset, width, YuvToYuy2Navigation::new(_cx, 0, _yuy2_x), ); diff --git a/src/yuy2_to_rgb_p16.rs b/src/yuy2_to_rgb_p16.rs index 15e824d..7ec5ad9 100644 --- a/src/yuy2_to_rgb_p16.rs +++ b/src/yuy2_to_rgb_p16.rs @@ -43,7 +43,7 @@ fn yuy2_to_rgb_impl_p16( planar_image: &mut YuvPlanarImageMut, @@ -47,158 +51,222 @@ fn yuy2_to_yuv_impl( planar_image.check_constraints(chroma_subsampling)?; - let mut y_offset = 0usize; - let mut u_offset = 0usize; - let mut v_offset = 0usize; - let mut yuy_offset = 0usize; - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] let mut _use_sse = std::arch::is_x86_feature_detected!("sse4.1"); #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] let mut _use_avx2 = std::arch::is_x86_feature_detected!("avx2"); + let width = planar_image.width; + + let process_wide_row = + |_y_plane: &mut [u8], _u_plane: &mut [u8], _v_plane: &mut [u8], _yuy2_store: &[u8]| { + let mut _yuy2_nav = YuvToYuy2Navigation::new(0, 0, 0); + #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] + { + _yuy2_nav = yuy2_to_yuv_neon_impl::( + _y_plane, + _u_plane, + _v_plane, + _yuy2_store, + width, + _yuy2_nav, + ); + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + unsafe { + if _use_avx2 { + _yuy2_nav = yuy2_to_yuv_avx::( + _y_plane, + _u_plane, + _v_plane, + _yuy2_store, + width, + _yuy2_nav, + ); + } + if _use_sse { + _yuy2_nav = yuy2_to_yuv_sse_impl::( + _y_plane, + _u_plane, + _v_plane, + _yuy2_store, + width, + _yuy2_nav, + ); + } + } + _yuy2_nav + }; + let y_plane = planar_image.y_plane.borrow_mut(); let u_plane = planar_image.u_plane.borrow_mut(); let v_plane = planar_image.v_plane.borrow_mut(); let y_stride = planar_image.y_stride; let u_stride = planar_image.u_stride; let v_stride = planar_image.v_stride; - let width = planar_image.width; - let height = planar_image.height; - - for y in 0..height as usize { - let mut _cx = 0usize; - let mut _uv_x = 0usize; - let mut _yuy2_x = 0usize; - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] + if chroma_subsampling == YuvChromaSubsample::Yuv444 { + let iter; + #[cfg(feature = "rayon")] + { + iter = y_plane + .par_chunks_exact_mut(y_stride as usize) + .zip(u_plane.par_chunks_exact_mut(u_stride as usize)) + .zip(v_plane.par_chunks_exact_mut(v_stride as usize)) + .zip(yuy2_store.par_chunks_exact(yuy2_stride as usize)); + } + #[cfg(not(feature = "rayon"))] { - let processed = yuy2_to_yuv_neon_impl::( - y_plane, - y_offset, - u_plane, - u_offset, - v_plane, - v_offset, - yuy2_store, - yuy_offset, - width, - YuvToYuy2Navigation::new(_cx, _uv_x, _yuy2_x), - ); - _cx = processed.cx; - _uv_x = processed.uv_x; - _yuy2_x = processed.x; + iter = y_plane + .chunks_exact_mut(y_stride as usize) + .zip(u_plane.chunks_exact_mut(u_stride as usize)) + .zip(v_plane.chunks_exact_mut(v_stride as usize)) + .zip(yuy2_store.chunks_exact(yuy2_stride as usize)); } + iter.for_each(|(((y_dst, u_dst), v_dst), yuy2_src)| { + let p_offset = process_wide_row(y_dst, u_dst, v_dst, yuy2_src); - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - unsafe { - if _use_avx2 { - let processed = yuy2_to_yuv_avx::( - y_plane, - y_offset, - u_plane, - u_offset, - v_plane, - v_offset, - yuy2_store, - yuy_offset, - width, - YuvToYuy2Navigation::new(_cx, _uv_x, _yuy2_x), - ); - _cx = processed.cx; - _uv_x = processed.uv_x; - _yuy2_x = processed.x; + for (((y_dst, u_dst), v_dst), yuy2) in y_dst + .chunks_exact_mut(2) + .zip(u_dst.chunks_exact_mut(2)) + .zip(v_dst.chunks_exact_mut(2)) + .zip(yuy2_src.chunks_exact(4)) + .skip(p_offset.cx) + { + let first_y_position = yuy2[yuy2_target.get_first_y_position()]; + let second_y_position = yuy2[yuy2_target.get_second_y_position()]; + let u_value = yuy2[yuy2_target.get_u_position()]; + let v_value = yuy2[yuy2_target.get_v_position()]; + y_dst[0] = first_y_position; + y_dst[1] = second_y_position; + u_dst[0] = u_value; + u_dst[1] = u_value; + v_dst[0] = v_value; + v_dst[1] = v_value; } - if _use_sse { - let processed = yuy2_to_yuv_sse_impl::( - y_plane, - y_offset, - u_plane, - u_offset, - v_plane, - v_offset, - yuy2_store, - yuy_offset, - width, - YuvToYuy2Navigation::new(_cx, _uv_x, _yuy2_x), - ); - _cx = processed.cx; - _uv_x = processed.uv_x; - _yuy2_x = processed.x; + + if width & 1 != 0 { + let y_dst = y_dst.last_mut().unwrap(); + let u_dst = u_dst.last_mut().unwrap(); + let v_dst = v_dst.last_mut().unwrap(); + let yuy2 = yuy2_src.chunks_exact(4).remainder(); + let yuy2 = &yuy2[0..4]; + *y_dst = yuy2[yuy2_target.get_first_y_position()]; + *u_dst = yuy2[yuy2_target.get_u_position()]; + *v_dst = yuy2[yuy2_target.get_v_position()]; } + }); + } else if chroma_subsampling == YuvChromaSubsample::Yuv422 { + let iter; + #[cfg(feature = "rayon")] + { + iter = y_plane + .par_chunks_exact_mut(y_stride as usize) + .zip(u_plane.par_chunks_exact_mut(u_stride as usize)) + .zip(v_plane.par_chunks_exact_mut(v_stride as usize)) + .zip(yuy2_store.par_chunks_exact(yuy2_stride as usize)); } + #[cfg(not(feature = "rayon"))] + { + iter = y_plane + .chunks_exact_mut(y_stride as usize) + .zip(u_plane.chunks_exact_mut(u_stride as usize)) + .zip(v_plane.chunks_exact_mut(v_stride as usize)) + .zip(yuy2_store.chunks_exact(yuy2_stride as usize)); + } + iter.for_each(|(((y_dst, u_dst), v_dst), yuy2_src)| { + let p_offset = process_wide_row(y_dst, u_dst, v_dst, yuy2_src); - for x in _yuy2_x..width as usize / 2 { - let u_pos = u_offset + _uv_x; - let v_pos = v_offset + _uv_x; - let y_pos = y_offset + _cx; - let yuy2_offset = yuy_offset + x * 4; - - let yuy2_plane_shifted = unsafe { yuy2_store.get_unchecked(yuy2_offset..) }; - - let first_y_position = - unsafe { *yuy2_plane_shifted.get_unchecked(yuy2_target.get_first_y_position()) }; - let second_y_position = - unsafe { *yuy2_plane_shifted.get_unchecked(yuy2_target.get_second_y_position()) }; - let u_value = - unsafe { *yuy2_plane_shifted.get_unchecked(yuy2_target.get_u_position()) }; - let v_value = - unsafe { *yuy2_plane_shifted.get_unchecked(yuy2_target.get_v_position()) }; - - unsafe { - *y_plane.get_unchecked_mut(y_pos) = first_y_position; - *y_plane.get_unchecked_mut(y_pos + 1) = second_y_position; - *u_plane.get_unchecked_mut(u_pos) = u_value; - *v_plane.get_unchecked_mut(v_pos) = v_value; - if chroma_subsampling == YuvChromaSubsample::Yuv444 { - *u_plane.get_unchecked_mut(u_pos + 1) = u_value; - *v_plane.get_unchecked_mut(v_pos + 1) = v_value; - } + for (((y_dst, u_dst), v_dst), yuy2) in y_dst + .chunks_exact_mut(2) + .zip(u_dst.iter_mut()) + .zip(v_dst.iter_mut()) + .zip(yuy2_src.chunks_exact(4)) + .skip(p_offset.cx) + { + let first_y_position = yuy2[yuy2_target.get_first_y_position()]; + let second_y_position = yuy2[yuy2_target.get_second_y_position()]; + let u_value = yuy2[yuy2_target.get_u_position()]; + let v_value = yuy2[yuy2_target.get_v_position()]; + y_dst[0] = first_y_position; + y_dst[1] = second_y_position; + *u_dst = u_value; + *v_dst = v_value; } - _uv_x += match chroma_subsampling { - YuvChromaSubsample::Yuv420 | YuvChromaSubsample::Yuv422 => 1, - YuvChromaSubsample::Yuv444 => 2, - }; - _cx += 2; + if width & 1 != 0 { + let y_dst = y_dst.last_mut().unwrap(); + let u_dst = u_dst.last_mut().unwrap(); + let v_dst = v_dst.last_mut().unwrap(); + let yuy2 = yuy2_src.chunks_exact(4).remainder(); + let yuy2 = &yuy2[0..4]; + *y_dst = yuy2[yuy2_target.get_first_y_position()]; + *u_dst = yuy2[yuy2_target.get_u_position()]; + *v_dst = yuy2[yuy2_target.get_v_position()]; + } + }); + } else if chroma_subsampling == YuvChromaSubsample::Yuv420 { + let iter; + #[cfg(feature = "rayon")] + { + iter = y_plane + .par_chunks_exact_mut(y_stride as usize * 2) + .zip(u_plane.par_chunks_exact_mut(u_stride as usize)) + .zip(v_plane.par_chunks_exact_mut(v_stride as usize)) + .zip(yuy2_store.par_chunks_exact(yuy2_stride as usize * 2)); } + #[cfg(not(feature = "rayon"))] + { + iter = y_plane + .chunks_exact_mut(y_stride as usize * 2) + .zip(u_plane.chunks_exact_mut(u_stride as usize)) + .zip(v_plane.chunks_exact_mut(v_stride as usize)) + .zip(yuy2_store.chunks_exact(yuy2_stride as usize * 2)); + } + iter.for_each(|(((y_dst, u_dst), v_dst), yuy2_src)| { + for (y, (y_dst, yuy2)) in y_dst + .chunks_exact_mut(y_stride as usize) + .zip(yuy2_src.chunks_exact(yuy2_stride as usize)) + .enumerate() + { + let p_offset = process_wide_row(y_dst, u_dst, v_dst, yuy2); - if width & 1 == 1 { - let u_pos = u_offset + _uv_x; - let v_pos = v_offset + _uv_x; - let y_pos = y_offset + _cx; - let yuy2_offset = yuy_offset + ((width as usize - 1) / 2) * 4; - - let yuy2_plane_shifted = unsafe { yuy2_store.get_unchecked(yuy2_offset..) }; - - let first_y_position = - unsafe { *yuy2_plane_shifted.get_unchecked(yuy2_target.get_first_y_position()) }; - let u_value = - unsafe { *yuy2_plane_shifted.get_unchecked(yuy2_target.get_u_position()) }; - let v_value = - unsafe { *yuy2_plane_shifted.get_unchecked(yuy2_target.get_v_position()) }; + let process_chroma = y & 1 == 0; - unsafe { - *y_plane.get_unchecked_mut(y_pos) = first_y_position; - *u_plane.get_unchecked_mut(u_pos) = u_value; - *v_plane.get_unchecked_mut(v_pos) = v_value; - } - } + for (((y_dst, u_dst), v_dst), yuy2) in y_dst + .chunks_exact_mut(2) + .zip(u_dst.iter_mut()) + .zip(v_dst.iter_mut()) + .zip(yuy2.chunks_exact(4)) + .skip(p_offset.cx) + { + let first_y_position = yuy2[yuy2_target.get_first_y_position()]; + let second_y_position = yuy2[yuy2_target.get_second_y_position()]; + y_dst[0] = first_y_position; + y_dst[1] = second_y_position; + if process_chroma { + let u_value = yuy2[yuy2_target.get_u_position()]; + let v_value = yuy2[yuy2_target.get_v_position()]; + *u_dst = u_value; + *v_dst = v_value; + } + } - y_offset += y_stride as usize; - yuy_offset += yuy2_stride as usize; - match chroma_subsampling { - YuvChromaSubsample::Yuv420 => { - if y & 1 == 1 { - u_offset += u_stride as usize; - v_offset += v_stride as usize; + if width & 1 != 0 { + let y_dst = y_dst.last_mut().unwrap(); + let yuy2 = yuy2_src.chunks_exact(4).remainder(); + let yuy2 = &yuy2[0..4]; + *y_dst = yuy2[yuy2_target.get_first_y_position()]; + if process_chroma { + let u_dst = u_dst.last_mut().unwrap(); + let v_dst = v_dst.last_mut().unwrap(); + *u_dst = yuy2[yuy2_target.get_u_position()]; + *v_dst = yuy2[yuy2_target.get_v_position()]; + } } } - YuvChromaSubsample::Yuv444 | YuvChromaSubsample::Yuv422 => { - u_offset += u_stride as usize; - v_offset += v_stride as usize; - } - } + }); } Ok(()) diff --git a/src/yuy2_to_yuv_p16.rs b/src/yuy2_to_yuv_p16.rs index 70f2415..c19b845 100644 --- a/src/yuy2_to_yuv_p16.rs +++ b/src/yuy2_to_yuv_p16.rs @@ -28,6 +28,10 @@ */ use crate::yuv_support::{YuvChromaSubsample, Yuy2Description}; use crate::{YuvError, YuvPlanarImageMut}; +#[cfg(feature = "rayon")] +use rayon::iter::{IndexedParallelIterator, ParallelIterator}; +#[cfg(feature = "rayon")] +use rayon::prelude::{ParallelSlice, ParallelSliceMut}; fn yuy2_to_yuv_impl( planar_image: &mut YuvPlanarImageMut, @@ -39,110 +43,166 @@ fn yuy2_to_yuv_impl( planar_image.check_constraints(chroma_subsampling)?; - let mut y_offset = 0usize; - let mut u_offset = 0usize; - let mut v_offset = 0usize; - let mut yuy_offset = 0usize; - let width = planar_image.width; - let height = planar_image.height; let y_plane = planar_image.y_plane.borrow_mut(); - let y_stride = planar_image.y_stride * 2; + let y_stride = planar_image.y_stride; let u_plane = planar_image.u_plane.borrow_mut(); - let u_stride = planar_image.u_stride * 2; + let u_stride = planar_image.u_stride; let v_plane = planar_image.v_plane.borrow_mut(); - let v_stride = planar_image.v_stride * 2; - - for y in 0..height as usize { - let mut _cx = 0usize; - let mut _uv_x = 0usize; - let mut _yuy2_x = 0usize; - - for x in _yuy2_x..width as usize / 2 { - unsafe { - let u_pos = _uv_x; - let v_pos = _uv_x; - let y_pos = _cx; - - let mut y_dst_ptr = (y_plane.as_mut_ptr() as *mut u8).add(y_offset) as *mut u16; - y_dst_ptr = y_dst_ptr.add(y_pos); - let mut u_dst_ptr = (u_plane.as_mut_ptr() as *mut u8).add(u_offset) as *mut u16; - u_dst_ptr = u_dst_ptr.add(u_pos); - let mut v_dst_ptr = (v_plane.as_mut_ptr() as *mut u8).add(v_offset) as *mut u16; - v_dst_ptr = v_dst_ptr.add(v_pos); - - let mut yuy2_ptr = (yuy2_store.as_ptr() as *const u8).add(yuy_offset) as *const u16; - yuy2_ptr = yuy2_ptr.add(x * 4); - - let first_y_position = yuy2_ptr - .add(yuy2_target.get_first_y_position()) - .read_unaligned(); - let second_y_position = yuy2_ptr - .add(yuy2_target.get_second_y_position()) - .read_unaligned(); - let u_value = u_dst_ptr.add(yuy2_target.get_u_position()).read_unaligned(); - let v_value = v_dst_ptr.add(yuy2_target.get_v_position()).read_unaligned(); + let v_stride = planar_image.v_stride; - y_dst_ptr.write_unaligned(first_y_position); - y_dst_ptr.add(1).write_unaligned(second_y_position); - u_dst_ptr.write_unaligned(u_value); - v_dst_ptr.write_unaligned(v_value); - if chroma_subsampling == YuvChromaSubsample::Yuv444 { - u_dst_ptr.add(1).write_unaligned(u_value); - v_dst_ptr.add(1).write_unaligned(v_value); - } + if chroma_subsampling == YuvChromaSubsample::Yuv444 { + let iter; + #[cfg(feature = "rayon")] + { + iter = y_plane + .par_chunks_exact_mut(y_stride as usize) + .zip(u_plane.par_chunks_exact_mut(u_stride as usize)) + .zip(v_plane.par_chunks_exact_mut(v_stride as usize)) + .zip(yuy2_store.par_chunks_exact(yuy2_stride as usize)); + } + #[cfg(not(feature = "rayon"))] + { + iter = y_plane + .chunks_exact_mut(y_stride as usize) + .zip(u_plane.chunks_exact_mut(u_stride as usize)) + .zip(v_plane.chunks_exact_mut(v_stride as usize)) + .zip(yuy2_store.chunks_exact(yuy2_stride as usize)); + } + iter.for_each(|(((y_dst, u_dst), v_dst), yuy2_src)| { + for (((y_dst, u_dst), v_dst), yuy2) in y_dst + .chunks_exact_mut(2) + .zip(u_dst.chunks_exact_mut(2)) + .zip(v_dst.chunks_exact_mut(2)) + .zip(yuy2_src.chunks_exact(4)) + { + let first_y_position = yuy2[yuy2_target.get_first_y_position()]; + let second_y_position = yuy2[yuy2_target.get_second_y_position()]; + let u_value = yuy2[yuy2_target.get_u_position()]; + let v_value = yuy2[yuy2_target.get_v_position()]; + y_dst[0] = first_y_position; + y_dst[1] = second_y_position; + u_dst[0] = u_value; + u_dst[1] = u_value; + v_dst[0] = v_value; + v_dst[1] = v_value; } - _uv_x += match chroma_subsampling { - YuvChromaSubsample::Yuv420 | YuvChromaSubsample::Yuv422 => 1, - YuvChromaSubsample::Yuv444 => 2, - }; - _cx += 2; + if width & 1 != 0 { + let y_dst = y_dst.last_mut().unwrap(); + let u_dst = u_dst.last_mut().unwrap(); + let v_dst = v_dst.last_mut().unwrap(); + let yuy2 = yuy2_src.chunks_exact(4).remainder(); + let yuy2 = &yuy2[0..4]; + *y_dst = yuy2[yuy2_target.get_first_y_position()]; + *u_dst = yuy2[yuy2_target.get_u_position()]; + *v_dst = yuy2[yuy2_target.get_v_position()]; + } + }); + } else if chroma_subsampling == YuvChromaSubsample::Yuv422 { + let iter; + #[cfg(feature = "rayon")] + { + iter = y_plane + .par_chunks_exact_mut(y_stride as usize) + .zip(u_plane.par_chunks_exact_mut(u_stride as usize)) + .zip(v_plane.par_chunks_exact_mut(v_stride as usize)) + .zip(yuy2_store.par_chunks_exact(yuy2_stride as usize)); } + #[cfg(not(feature = "rayon"))] + { + iter = y_plane + .chunks_exact_mut(y_stride as usize) + .zip(u_plane.chunks_exact_mut(u_stride as usize)) + .zip(v_plane.chunks_exact_mut(v_stride as usize)) + .zip(yuy2_store.chunks_exact(yuy2_stride as usize)); + } + iter.for_each(|(((y_dst, u_dst), v_dst), yuy2_src)| { + for (((y_dst, u_dst), v_dst), yuy2) in y_dst + .chunks_exact_mut(2) + .zip(u_dst.iter_mut()) + .zip(v_dst.iter_mut()) + .zip(yuy2_src.chunks_exact(4)) + { + let first_y_position = yuy2[yuy2_target.get_first_y_position()]; + let second_y_position = yuy2[yuy2_target.get_second_y_position()]; + let u_value = yuy2[yuy2_target.get_u_position()]; + let v_value = yuy2[yuy2_target.get_v_position()]; + y_dst[0] = first_y_position; + y_dst[1] = second_y_position; + *u_dst = u_value; + *v_dst = v_value; + } - if width & 1 == 1 { - unsafe { - let u_pos = _uv_x; - let v_pos = _uv_x; - let y_pos = _cx; - let yuy2_x = ((width as usize - 1) / 2) * 4; - - let mut y_dst_ptr = (y_plane.as_mut_ptr() as *mut u8).add(y_offset) as *mut u16; - y_dst_ptr = y_dst_ptr.add(y_pos); - let mut u_dst_ptr = (u_plane.as_mut_ptr() as *mut u8).add(u_offset) as *mut u16; - u_dst_ptr = u_dst_ptr.add(u_pos); - let mut v_dst_ptr = (v_plane.as_mut_ptr() as *mut u8).add(v_offset) as *mut u16; - v_dst_ptr = v_dst_ptr.add(v_pos); - - let mut yuy2_ptr = (yuy2_store.as_ptr() as *const u8).add(yuy_offset) as *const u16; - yuy2_ptr = yuy2_ptr.add(yuy2_x); - - let first_y_position = yuy2_ptr - .add(yuy2_target.get_first_y_position()) - .read_unaligned(); - let u_value = yuy2_ptr.add(yuy2_target.get_u_position()).read_unaligned(); - let v_value = yuy2_ptr.add(yuy2_target.get_v_position()).read_unaligned(); - - y_dst_ptr.write_unaligned(first_y_position); - u_dst_ptr.write_unaligned(u_value); - v_dst_ptr.write_unaligned(v_value); + if width & 1 != 0 { + let y_dst = y_dst.last_mut().unwrap(); + let u_dst = u_dst.last_mut().unwrap(); + let v_dst = v_dst.last_mut().unwrap(); + let yuy2 = yuy2_src.chunks_exact(4).remainder(); + let yuy2 = &yuy2[0..4]; + *y_dst = yuy2[yuy2_target.get_first_y_position()]; + *u_dst = yuy2[yuy2_target.get_u_position()]; + *v_dst = yuy2[yuy2_target.get_v_position()]; } + }); + } else if chroma_subsampling == YuvChromaSubsample::Yuv420 { + let iter; + #[cfg(feature = "rayon")] + { + iter = y_plane + .par_chunks_exact_mut(y_stride as usize * 2) + .zip(u_plane.par_chunks_exact_mut(u_stride as usize)) + .zip(v_plane.par_chunks_exact_mut(v_stride as usize)) + .zip(yuy2_store.par_chunks_exact(yuy2_stride as usize * 2)); } + #[cfg(not(feature = "rayon"))] + { + iter = y_plane + .chunks_exact_mut(y_stride as usize * 2) + .zip(u_plane.chunks_exact_mut(u_stride as usize)) + .zip(v_plane.chunks_exact_mut(v_stride as usize)) + .zip(yuy2_store.chunks_exact(yuy2_stride as usize * 2)); + } + iter.for_each(|(((y_dst, u_dst), v_dst), yuy2_src)| { + for (y, (y_dst, yuy2)) in y_dst + .chunks_exact_mut(y_stride as usize) + .zip(yuy2_src.chunks_exact(yuy2_stride as usize)) + .enumerate() + { + let process_chroma = y & 1 == 0; - y_offset += y_stride as usize; - yuy_offset += yuy2_stride as usize; - match chroma_subsampling { - YuvChromaSubsample::Yuv420 => { - if y & 1 == 1 { - u_offset += u_stride as usize; - v_offset += v_stride as usize; + for (((y_dst, u_dst), v_dst), yuy2) in y_dst + .chunks_exact_mut(2) + .zip(u_dst.iter_mut()) + .zip(v_dst.iter_mut()) + .zip(yuy2.chunks_exact(4)) + { + let first_y_position = yuy2[yuy2_target.get_first_y_position()]; + let second_y_position = yuy2[yuy2_target.get_second_y_position()]; + y_dst[0] = first_y_position; + y_dst[1] = second_y_position; + if process_chroma { + let u_value = yuy2[yuy2_target.get_u_position()]; + let v_value = yuy2[yuy2_target.get_v_position()]; + *u_dst = u_value; + *v_dst = v_value; + } + } + + if width & 1 != 0 { + let y_dst = y_dst.last_mut().unwrap(); + let yuy2 = yuy2_src.chunks_exact(4).remainder(); + let yuy2 = &yuy2[0..4]; + *y_dst = yuy2[yuy2_target.get_first_y_position()]; + if process_chroma { + let u_dst = u_dst.last_mut().unwrap(); + let v_dst = v_dst.last_mut().unwrap(); + *u_dst = yuy2[yuy2_target.get_u_position()]; + *v_dst = yuy2[yuy2_target.get_v_position()]; + } } } - YuvChromaSubsample::Yuv444 | YuvChromaSubsample::Yuv422 => { - u_offset += u_stride as usize; - v_offset += v_stride as usize; - } - } + }); } Ok(())