Skip to content

Commit

Permalink
Benchmarks, YUV 4:2:0, 4:2:2 improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
awxkee committed Nov 23, 2024
1 parent 296895e commit 23af46f
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 77 deletions.
64 changes: 18 additions & 46 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,57 +37,29 @@ cargo add yuvutils-rs
### RGB to YCbCr

```rust
rgb_to_yuv422(&mut y_plane, y_stride,
&mut u_plane, u_width,
&mut v_plane, v_width,
&rgb, rgb_stride,
width, height,
YuvRange::Full, YuvStandardMatrix::Bt709);
```

### RGB to sharp YUV

```rust
rgb_to_sharp_yuv420(&mut y_plane, y_stride,
&mut u_plane, u_width,
&mut v_plane, v_width,
&rgb, rgb_stride,
width, height,
YuvRange::Full, YuvStandardMatrix::Bt709,
SharpYuvGammaTransfer::Srgb);
let mut planar_image =
YuvPlanarImageMut::<u8>::alloc(width as u32, height as u32, YuvChromaSubsampling::Yuv422);
rgb_to_yuv422(
&mut planar_image,
&src_bytes,
rgba_stride as u32,
YuvRange::Limited,
YuvStandardMatrix::Bt601,
)
.unwrap();
```

### YCbCr to RGB

```rust
yuv422_to_rgb(&y_plane, y_stride,
&u_plane, u_stride,
&v_plane, v_stride,
&mut rgb, rgb_stride,
width, height,
YuvRange::Full, YuvStandardMatrix::Bt709);
```

### RGB To YCgCo

```rust
rgb_to_ycgco420(&mut y_plane, y_stride,
&mut cg_plane, cg_width,
&mut co_plane, co_width,
&rgb, rgb_stride,
width, height,
YuvRange::TV);
```

### YCgCo to RGB

```rust
ycgco420_to_rgb(&y_plane, y_stride,
&cg_plane, cg_stride,
&co_plane, co_stride,
&mut rgb, rgb_stride,
width, height,
YuvRange::TV);
yuv420_to_rgb(
&yuv_planar_image,
&mut rgba,
rgba_stride as u32,
YuvRange::Limited,
YuvStandardMatrix::Bt601,
)
.unwrap();
```

## Benchmarks
Expand Down
30 changes: 17 additions & 13 deletions src/avx512bw/yuv_to_rgba.rs
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,11 @@ unsafe fn avx512_yuv_to_rgba_impl<const DESTINATION_CHANNELS: u8, const SAMPLING
let v_g_coeff_1 = _mm512_set1_epi16(transform.g_coeff_1 as i16);
let v_g_coeff_2 = _mm512_set1_epi16(transform.g_coeff_2 as i16);
let v_alpha = _mm512_set1_epi8(255u8 as i8);
let rounding_const = _mm512_set1_epi16(1 << 2);

const SCALE: u32 = 7;
const V_SHR: u32 = 3;

let rounding_const = _mm512_set1_epi16(1 << (V_SHR - 1));

while cx + 64 < width {
let y_values = _mm512_subs_epu8(_mm512_loadu_si512(y_ptr.add(cx) as *const i32), y_corr);
Expand Down Expand Up @@ -120,25 +124,25 @@ unsafe fn avx512_yuv_to_rgba_impl<const DESTINATION_CHANNELS: u8, const SAMPLING
}

let u_high =
_mm512_slli_epi16::<7>(_mm512_sub_epi16(_mm512_cvtepu8_epi16(u_high_u8), uv_corr));
_mm512_slli_epi16::<SCALE>(_mm512_sub_epi16(_mm512_cvtepu8_epi16(u_high_u8), uv_corr));
let v_high =
_mm512_slli_epi16::<7>(_mm512_sub_epi16(_mm512_cvtepu8_epi16(v_high_u8), uv_corr));
_mm512_slli_epi16::<SCALE>(_mm512_sub_epi16(_mm512_cvtepu8_epi16(v_high_u8), uv_corr));
let y_high = _mm512_mulhi_epi16(
_mm512_slli_epi16::<7>(_mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64::<1>(
_mm512_slli_epi16::<SCALE>(_mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64::<1>(
y_values,
))),
v_luma_coeff,
);

let r_high = _mm512_srli_epi16::<3>(_mm512_add_epi16(
let r_high = _mm512_srli_epi16::<V_SHR>(_mm512_add_epi16(
_mm512_add_epi16(y_high, _mm512_mulhi_epi16(v_high, v_cr_coeff)),
rounding_const,
));
let b_high = _mm512_srli_epi16::<3>(_mm512_add_epi16(
let b_high = _mm512_srli_epi16::<V_SHR>(_mm512_add_epi16(
_mm512_add_epi16(y_high, _mm512_mulhi_epi16(u_high, v_cb_coeff)),
rounding_const,
));
let g_high = _mm512_srli_epi16::<3>(_mm512_add_epi16(
let g_high = _mm512_srli_epi16::<V_SHR>(_mm512_add_epi16(
_mm512_sub_epi16(
y_high,
_mm512_add_epi16(
Expand All @@ -150,23 +154,23 @@ unsafe fn avx512_yuv_to_rgba_impl<const DESTINATION_CHANNELS: u8, const SAMPLING
));

let u_low =
_mm512_slli_epi16::<7>(_mm512_sub_epi16(_mm512_cvtepu8_epi16(u_low_u8), uv_corr));
_mm512_slli_epi16::<SCALE>(_mm512_sub_epi16(_mm512_cvtepu8_epi16(u_low_u8), uv_corr));
let v_low =
_mm512_slli_epi16::<7>(_mm512_sub_epi16(_mm512_cvtepu8_epi16(v_low_u8), uv_corr));
_mm512_slli_epi16::<SCALE>(_mm512_sub_epi16(_mm512_cvtepu8_epi16(v_low_u8), uv_corr));
let y_low = _mm512_mulhi_epi16(
_mm512_slli_epi16::<7>(_mm512_cvtepu8_epi16(_mm512_castsi512_si256(y_values))),
_mm512_slli_epi16::<SCALE>(_mm512_cvtepu8_epi16(_mm512_castsi512_si256(y_values))),
v_luma_coeff,
);

let r_low = _mm512_srli_epi16::<3>(_mm512_add_epi16(
let r_low = _mm512_srli_epi16::<V_SHR>(_mm512_add_epi16(
_mm512_add_epi16(y_low, _mm512_mulhi_epi16(v_low, v_cr_coeff)),
rounding_const,
));
let b_low = _mm512_srli_epi16::<3>(_mm512_adds_epi16(
let b_low = _mm512_srli_epi16::<V_SHR>(_mm512_adds_epi16(
_mm512_adds_epi16(y_low, _mm512_mulhi_epi16(u_low, v_cb_coeff)),
rounding_const,
));
let g_low = _mm512_srli_epi16::<3>(_mm512_add_epi16(
let g_low = _mm512_srli_epi16::<V_SHR>(_mm512_add_epi16(
_mm512_sub_epi16(
y_low,
_mm512_add_epi16(
Expand Down
2 changes: 1 addition & 1 deletion src/rgb_to_nv_p16.rs
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ fn rgbx_to_yuv_bi_planar_10_impl<
let transform_precise =
get_forward_transform(max_range, range.range_y, range.range_uv, kr_kb.kr, kr_kb.kb);
let transform = transform_precise.to_integers(8);
const PRECISION: i32 = 8;
const PRECISION: i32 = 12;
const ROUNDING_CONST_BIAS: i32 = 1 << (PRECISION - 1);
let bias_y = range.bias_y as i32 * (1 << PRECISION) + ROUNDING_CONST_BIAS;
let bias_uv = range.bias_uv as i32 * (1 << PRECISION) + ROUNDING_CONST_BIAS;
Expand Down
32 changes: 15 additions & 17 deletions src/rgba_to_nv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -135,23 +135,21 @@ fn rgbx_to_nv<const ORIGIN_CHANNELS: u8, const UV_ORDER: u8, const SAMPLING: u8>

#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
unsafe {
if is_rdm_available {
let offset = neon_wide_row_handler(
y_plane,
0,
uv_plane,
0,
rgba,
0,
width,
&range,
&transform,
_offset.cx,
_offset.ux,
compute_uv_row,
);
_offset = offset
}
let offset = neon_wide_row_handler(
y_plane,
0,
uv_plane,
0,
rgba,
0,
width,
&range,
&transform,
_offset.cx,
_offset.ux,
compute_uv_row,
);
_offset = offset
}
_offset
};
Expand Down

0 comments on commit 23af46f

Please sign in to comment.