From 274d00e291936fd5dbc54d40aee8086adb904d27 Mon Sep 17 00:00:00 2001 From: Kornel Date: Wed, 25 Oct 2023 14:32:38 +0100 Subject: [PATCH] Reduce risk of uninit ac in luma_ac and encode_coeffs (#3271) * Reduce risk of uninit ac in luma_ac * eob can't exceed u16 * Avoid uninitialized data in coeff_contexts * Fix minor type conversion and clippy issues --- src/asm/aarch64/predict.rs | 11 ++++--- src/asm/aarch64/transform/inverse.rs | 2 +- src/asm/shared/predict.rs | 20 ++++++++---- src/asm/shared/transform/inverse.rs | 16 +++++----- src/asm/x86/predict.rs | 14 ++++++--- src/asm/x86/quantize.rs | 16 +++++----- src/asm/x86/transform/inverse.rs | 2 +- src/context/block_unit.rs | 33 +++++++++++-------- src/context/transform_unit.rs | 33 ++++++++++++------- src/encoder.rs | 8 ++--- src/partition.rs | 6 ++++ src/predict.rs | 47 ++++++++++++++++++++-------- src/quantize/mod.rs | 12 +++---- src/rdo.rs | 7 +++-- src/transform/inverse.rs | 2 +- src/transform/mod.rs | 2 +- 16 files changed, 146 insertions(+), 85 deletions(-) diff --git a/src/asm/aarch64/predict.rs b/src/asm/aarch64/predict.rs index 1db55eed2f..218199a617 100644 --- a/src/asm/aarch64/predict.rs +++ b/src/asm/aarch64/predict.rs @@ -30,7 +30,7 @@ macro_rules! decl_cfl_ac_fn { extern { $( fn $f( - ac: *mut i16, src: *const u8, stride: libc::ptrdiff_t, + ac: *mut MaybeUninit, src: *const u8, stride: libc::ptrdiff_t, w_pad: libc::c_int, h_pad: libc::c_int, width: libc::c_int, height: libc::c_int, ); @@ -50,7 +50,7 @@ macro_rules! decl_cfl_ac_hbd_fn { extern { $( fn $f( - ac: *mut i16, src: *const u16, stride: libc::ptrdiff_t, + ac: *mut MaybeUninit, src: *const u16, stride: libc::ptrdiff_t, w_pad: libc::c_int, h_pad: libc::c_int, width: libc::c_int, height: libc::c_int, ); @@ -659,11 +659,14 @@ pub fn dispatch_predict_intra( } } +/// It MUST initialize all `ac` elements. 
#[inline(always)] pub(crate) fn pred_cfl_ac( - ac: &mut [i16], luma: &PlaneRegion<'_, T>, bsize: BlockSize, w_pad: usize, - h_pad: usize, cpu: CpuFeatureLevel, + ac: &mut [MaybeUninit], luma: &PlaneRegion<'_, T>, bsize: BlockSize, + w_pad: usize, h_pad: usize, cpu: CpuFeatureLevel, ) { + debug_assert_eq!(ac.len(), bsize.area()); + if cpu < CpuFeatureLevel::NEON { return rust::pred_cfl_ac::( ac, luma, bsize, w_pad, h_pad, cpu, diff --git a/src/asm/aarch64/transform/inverse.rs b/src/asm/aarch64/transform/inverse.rs index cba27385ae..f98f911217 100644 --- a/src/asm/aarch64/transform/inverse.rs +++ b/src/asm/aarch64/transform/inverse.rs @@ -17,7 +17,7 @@ use crate::asm::shared::transform::inverse::*; use crate::asm::shared::transform::*; pub fn inverse_transform_add( - input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>, eob: usize, + input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>, eob: u16, tx_size: TxSize, tx_type: TxType, bd: usize, cpu: CpuFeatureLevel, ) { if tx_type == TxType::WHT_WHT { diff --git a/src/asm/shared/predict.rs b/src/asm/shared/predict.rs index 7b73157116..0431134541 100644 --- a/src/asm/shared/predict.rs +++ b/src/asm/shared/predict.rs @@ -11,6 +11,7 @@ mod test { use interpolate_name::interpolate_test; use rand::random; + use std::mem::MaybeUninit; use crate::context::MAX_TX_SIZE; use crate::cpu_features::CpuFeatureLevel; @@ -23,7 +24,7 @@ mod test { IntraEdgeFilterParameters, PredictionMode, PredictionVariant, }; use crate::transform::TxSize; - use crate::util::Aligned; + use crate::util::{slice_assume_init_mut, Aligned}; use crate::Pixel; #[test] @@ -188,27 +189,34 @@ mod test { } let luma = &plane.as_region(); - let mut ac_ref = Aligned::new([0i16; 32 * 32]); + let mut ac_ref = Aligned::new([MaybeUninit::new(0x3333i16); 32 * 32]); + let ac_ref = &mut ac_ref.data[..plane_bsize.area()]; let cpu = CpuFeatureLevel::RUST; (match (xdec, ydec) { (0, 0) => rust::pred_cfl_ac::, (1, 0) => rust::pred_cfl_ac::, (_, _) => rust::pred_cfl_ac::, - 
})(&mut ac_ref.data, luma, plane_bsize, w_pad, h_pad, cpu); + })(ac_ref, luma, plane_bsize, w_pad, h_pad, cpu); for &cpu in &CpuFeatureLevel::all()[..=CpuFeatureLevel::default().as_index()] { - let mut ac = Aligned::new([0i16; 32 * 32]); + let mut ac = Aligned::new([MaybeUninit::new(0x7FFFi16); 32 * 32]); + let ac = &mut ac.data[..plane_bsize.area()]; (match (xdec, ydec) { (0, 0) => pred_cfl_ac::, (1, 0) => pred_cfl_ac::, (_, _) => pred_cfl_ac::, - })(&mut ac.data, luma, plane_bsize, w_pad, h_pad, cpu); + })(ac, luma, plane_bsize, w_pad, h_pad, cpu); - assert_eq!(&ac_ref.data[..], &ac.data[..]) + unsafe { + let ac_ref = slice_assume_init_mut(ac_ref); + let ac = slice_assume_init_mut(ac); + + assert_eq!(&ac_ref, &ac); + } } } } diff --git a/src/asm/shared/transform/inverse.rs b/src/asm/shared/transform/inverse.rs index 3547ce9e84..6180f08c5d 100644 --- a/src/asm/shared/transform/inverse.rs +++ b/src/asm/shared/transform/inverse.rs @@ -20,7 +20,7 @@ pub type InvTxfmHBDFunc = pub fn call_inverse_func( func: InvTxfmFunc, input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>, - eob: usize, width: usize, height: usize, bd: usize, + eob: u16, width: usize, height: usize, bd: usize, ) { debug_assert!(bd == 8); @@ -51,7 +51,7 @@ pub fn call_inverse_func( pub fn call_inverse_hbd_func( func: InvTxfmHBDFunc, input: &[T::Coeff], - output: &mut PlaneRegionMut<'_, T>, eob: usize, width: usize, height: usize, + output: &mut PlaneRegionMut<'_, T>, eob: u16, width: usize, height: usize, bd: usize, ) { // Only use at most 32 columns and 32 rows of input coefficients. 
@@ -94,7 +94,7 @@ pub mod test { pub fn pick_eob( coeffs: &mut [T], tx_size: TxSize, tx_type: TxType, sub_h: usize, - ) -> usize { + ) -> u16 { /* From dav1d * copy the topleft coefficients such that the return value (being the * coefficient scantable index for the eob token) guarantees that only @@ -105,14 +105,14 @@ pub mod test { let coeff_h = av1_get_coded_tx_size(tx_size).height(); let sub_high: usize = if sub_h > 0 { sub_h * 8 - 1 } else { 0 }; let sub_low: usize = if sub_h > 1 { sub_high - 8 } else { 0 }; - let mut eob = 0; + let mut eob = 0u16; let mut exit = 0; // Wrap WHT_WHT (16) to DCT_DCT (0) scan table let scan = av1_scan_orders[tx_size as usize][(tx_type as usize) & 15].scan; for (i, &pos) in scan.iter().enumerate() { - exit = i; + exit = i as u16; let rc = pos as usize; let rcx = rc % coeff_h; @@ -121,14 +121,14 @@ pub mod test { if rcx > sub_high || rcy > sub_high { break; } else if eob == 0 && (rcx > sub_low || rcy > sub_low) { - eob = i; + eob = i as u16; } } if eob != 0 { eob += thread_rng().gen_range(0..(exit - eob).min(1)); } - for &pos in scan.iter().skip(eob) { + for &pos in scan.iter().skip(usize::from(eob)) { coeffs[pos as usize] = T::cast_from(0); } @@ -181,7 +181,7 @@ pub mod test { // SAFETY: forward_transform initialized freq let freq = unsafe { slice_assume_init_mut(freq) }; - let eob: usize = pick_eob(freq, tx_size, tx_type, sub_h); + let eob: u16 = pick_eob(freq, tx_size, tx_type, sub_h); let mut rust_dst = dst.clone(); inverse_transform_add( diff --git a/src/asm/x86/predict.rs b/src/asm/x86/predict.rs index 4044fd75b9..227fe08859 100644 --- a/src/asm/x86/predict.rs +++ b/src/asm/x86/predict.rs @@ -17,6 +17,7 @@ use crate::tiling::{PlaneRegion, PlaneRegionMut}; use crate::transform::TxSize; use crate::util::Aligned; use crate::Pixel; +use std::mem::MaybeUninit; use v_frame::pixel::PixelType; macro_rules! decl_angular_ipred_fn { @@ -145,7 +146,7 @@ macro_rules! 
decl_cfl_ac_fn { extern { $( fn $f( - ac: *mut i16, src: *const u8, stride: libc::ptrdiff_t, + ac: *mut MaybeUninit, src: *const u8, stride: libc::ptrdiff_t, w_pad: libc::c_int, h_pad: libc::c_int, width: libc::c_int, height: libc::c_int, ); @@ -168,7 +169,7 @@ macro_rules! decl_cfl_ac_hbd_fn { extern { $( fn $f( - ac: *mut i16, src: *const u16, stride: libc::ptrdiff_t, + ac: *mut MaybeUninit, src: *const u16, stride: libc::ptrdiff_t, w_pad: libc::c_int, h_pad: libc::c_int, width: libc::c_int, height: libc::c_int, ); @@ -871,12 +872,15 @@ pub fn dispatch_predict_intra( } } +// The implementation MUST initialize all `ac` elements #[inline(always)] pub(crate) fn pred_cfl_ac( - ac: &mut [i16], luma: &PlaneRegion<'_, T>, bsize: BlockSize, w_pad: usize, - h_pad: usize, cpu: CpuFeatureLevel, + ac: &mut [MaybeUninit], luma: &PlaneRegion<'_, T>, bsize: BlockSize, + w_pad: usize, h_pad: usize, cpu: CpuFeatureLevel, ) { + debug_assert_eq!(ac.len(), bsize.area()); + + let call_rust = |ac: &mut [MaybeUninit]| { rust::pred_cfl_ac::(ac, luma, bsize, w_pad, h_pad, cpu); }; diff --git a/src/asm/x86/quantize.rs b/src/asm/x86/quantize.rs index b61f5ad392..0331685635 100644 --- a/src/asm/x86/quantize.rs +++ b/src/asm/x86/quantize.rs @@ -22,7 +22,7 @@ use std::mem::MaybeUninit; type DequantizeFn = unsafe fn( qindex: u8, coeffs_ptr: *const i16, - _eob: usize, + _eob: u16, rcoeffs_ptr: *mut i16, tx_size: TxSize, bit_depth: usize, @@ -38,7 +38,7 @@ cpu_function_lookup_table!( #[inline(always)] pub fn dequantize( - qindex: u8, coeffs: &[T], eob: usize, rcoeffs: &mut [MaybeUninit], + qindex: u8, coeffs: &[T], eob: u16, rcoeffs: &mut [MaybeUninit], tx_size: TxSize, bit_depth: usize, dc_delta_q: i8, ac_delta_q: i8, cpu: CpuFeatureLevel, ) { @@ -91,7 +91,7 @@ pub fn dequantize( #[target_feature(enable = "avx2")] unsafe fn dequantize_avx2( - qindex: u8, coeffs_ptr: *const i16, _eob: usize, rcoeffs_ptr: *mut i16, + qindex: u8, coeffs_ptr: *const i16, _eob: 
u16, rcoeffs_ptr: *mut i16, tx_size: TxSize, bit_depth: usize, dc_delta_q: i8, ac_delta_q: i8, ) { let log_tx_scale = _mm256_set1_epi32(get_log_tx_scale(tx_size) as i32); @@ -182,12 +182,12 @@ mod test { // Test the min, max, and random eobs let eobs = { - let mut out = [0usize; 16]; + let mut out = [0u16; 16]; let area: usize = av1_get_coded_tx_size(tx_size).area(); out[0] = 0; - out[1] = area; + out[1] = area as u16; for eob in out.iter_mut().skip(2) { - *eob = rng.gen_range(0..area); + *eob = rng.gen_range(0..area as u16); } out }; @@ -198,7 +198,9 @@ mod test { // Generate quantized coefficients up to the eob let between = Uniform::from(-i16::MAX..=i16::MAX); - for (i, qcoeff) in qcoeffs.data.iter_mut().enumerate().take(eob) { + for (i, qcoeff) in + qcoeffs.data.iter_mut().enumerate().take(eob as usize) + { *qcoeff = between.sample(&mut rng) / if i == 0 { dc_quant } else { ac_quant }; } diff --git a/src/asm/x86/transform/inverse.rs b/src/asm/x86/transform/inverse.rs index df99f4b4b3..f9865ae7ee 100644 --- a/src/asm/x86/transform/inverse.rs +++ b/src/asm/x86/transform/inverse.rs @@ -17,7 +17,7 @@ use crate::asm::shared::transform::inverse::*; use crate::asm::shared::transform::*; pub fn inverse_transform_add( - input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>, eob: usize, + input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>, eob: u16, tx_size: TxSize, tx_type: TxType, bd: usize, cpu: CpuFeatureLevel, ) { if tx_type == TxType::WHT_WHT { diff --git a/src/context/block_unit.rs b/src/context/block_unit.rs index d426132a1d..9a26c8537f 100644 --- a/src/context/block_unit.rs +++ b/src/context/block_unit.rs @@ -7,6 +7,8 @@ // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+use std::mem::MaybeUninit; + use super::*; use crate::predict::PredictionMode; @@ -1781,7 +1783,7 @@ impl<'a> ContextWriter<'a> { pub fn write_coeffs_lv_map( &mut self, w: &mut W, plane: usize, bo: TileBlockOffset, coeffs_in: &[T], - eob: usize, pred_mode: PredictionMode, tx_size: TxSize, tx_type: TxType, + eob: u16, pred_mode: PredictionMode, tx_size: TxSize, tx_type: TxType, plane_bsize: BlockSize, xdec: usize, ydec: usize, use_reduced_tx_set: bool, frame_clipped_txw: usize, frame_clipped_txh: usize, @@ -1792,8 +1794,8 @@ impl<'a> ContextWriter<'a> { let is_inter = pred_mode >= PredictionMode::NEARESTMV; // Note: Both intra and inter mode uses inter scan order. Surprised? - let scan: &[u16] = - &av1_scan_orders[tx_size as usize][tx_type as usize].scan[..eob]; + let scan: &[u16] = &av1_scan_orders[tx_size as usize][tx_type as usize] + .scan[..usize::from(eob)]; let height = av1_get_coded_tx_size(tx_size).height(); // Create a slice with coeffs in scan order @@ -1858,7 +1860,7 @@ impl<'a> ContextWriter<'a> { } fn encode_eob( - &mut self, eob: usize, tx_size: TxSize, tx_class: TxClass, txs_ctx: usize, + &mut self, eob: u16, tx_size: TxSize, tx_class: TxClass, txs_ctx: usize, plane_type: usize, w: &mut W, ) { let (eob_pt, eob_extra) = Self::get_eob_pos_token(eob); @@ -1913,18 +1915,19 @@ impl<'a> ContextWriter<'a> { } fn encode_coeffs( - &mut self, coeffs: &[T], levels: &mut [u8], scan: &[u16], eob: usize, + &mut self, coeffs: &[T], levels: &mut [u8], scan: &[u16], eob: u16, tx_size: TxSize, tx_class: TxClass, txs_ctx: usize, plane_type: usize, w: &mut W, ) { // SAFETY: We write to the array below before reading from it. 
- let mut coeff_contexts: Aligned<[i8; MAX_CODED_TX_SQUARE]> = + let mut coeff_contexts: Aligned<[MaybeUninit; MAX_CODED_TX_SQUARE]> = unsafe { Aligned::uninitialized() }; - self.get_nz_map_contexts( + // get_nz_map_contexts sets coeff_contexts contiguously as a parallel array for scan, not in scan order + let coeff_contexts = self.get_nz_map_contexts( levels, scan, - eob as u16, + eob, tx_size, tx_class, &mut coeff_contexts.data, @@ -1932,24 +1935,28 @@ impl<'a> ContextWriter<'a> { let bhl = Self::get_txb_bhl(tx_size); - for (c, (&pos, &v)) in scan.iter().zip(coeffs.iter()).enumerate().rev() { + let scan_with_ctx = + scan.iter().copied().zip(coeff_contexts.iter().copied()); + for (c, ((pos, coeff_ctx), v)) in + scan_with_ctx.zip(coeffs.iter().copied()).enumerate().rev() + { let pos = pos as usize; - let coeff_ctx = coeff_contexts.data[pos]; + let coeff_ctx = coeff_ctx as usize; let level = v.abs(); - if c == eob - 1 { + if c == usize::from(eob) - 1 { symbol_with_update!( self, w, cmp::min(u32::cast_from(level), 3) - 1, - &self.fc.coeff_base_eob_cdf[txs_ctx][plane_type][coeff_ctx as usize] + &self.fc.coeff_base_eob_cdf[txs_ctx][plane_type][coeff_ctx] ); } else { symbol_with_update!( self, w, cmp::min(u32::cast_from(level), 3), - &self.fc.coeff_base_cdf[txs_ctx][plane_type][coeff_ctx as usize] + &self.fc.coeff_base_cdf[txs_ctx][plane_type][coeff_ctx] ); } diff --git a/src/context/transform_unit.rs b/src/context/transform_unit.rs index 5baf660657..a350cb4a3f 100644 --- a/src/context/transform_unit.rs +++ b/src/context/transform_unit.rs @@ -11,6 +11,7 @@ use super::*; use crate::predict::PredictionMode; use crate::predict::PredictionMode::*; use crate::transform::TxType::*; +use std::mem::MaybeUninit; pub const MAX_TX_SIZE: usize = 64; @@ -804,11 +805,11 @@ impl<'a> ContextWriter<'a> { /// /// - If `eob` is prior to the start of the group #[inline] - pub fn get_eob_pos_token(eob: usize) -> (u32, u32) { + pub fn get_eob_pos_token(eob: u16) -> (u32, u32) { let t = if 
eob < 33 { - eob_to_pos_small[eob] as u32 + eob_to_pos_small[usize::from(eob)] as u32 } else { - let e = cmp::min((eob - 1) >> 5, 16); + let e = usize::from(cmp::min((eob - 1) >> 5, 16)); eob_to_pos_large[e] as u32 }; assert!(eob as i32 >= k_eob_group_start[t as usize] as i32); @@ -905,25 +906,33 @@ impl<'a> ContextWriter<'a> { Self::get_nz_map_ctx_from_stats(stats, coeff_idx, bhl, tx_size, tx_class) } - pub fn get_nz_map_contexts( + /// `coeff_contexts_no_scan` is not in the scan order. + /// Value for `pos = scan[i]` is at `coeff[i]`, not at `coeff[pos]`. + pub fn get_nz_map_contexts<'c>( &self, levels: &mut [u8], scan: &[u16], eob: u16, tx_size: TxSize, - tx_class: TxClass, coeff_contexts: &mut [i8], - ) { + tx_class: TxClass, coeff_contexts_no_scan: &'c mut [MaybeUninit], + ) -> &'c mut [i8] { let bhl = Self::get_txb_bhl(tx_size); let area = av1_get_coded_tx_size(tx_size).area(); - for i in 0..eob { - let pos = scan[i as usize]; - coeff_contexts[pos as usize] = Self::get_nz_map_ctx( + + let scan = &scan[..usize::from(eob)]; + let coeffs = &mut coeff_contexts_no_scan[..usize::from(eob)]; + for (i, (coeff, pos)) in + coeffs.iter_mut().zip(scan.iter().copied()).enumerate() + { + coeff.write(Self::get_nz_map_ctx( levels, pos as usize, bhl, area, - i as usize, - i == eob - 1, + i, + i == usize::from(eob) - 1, tx_size, tx_class, - ) as i8; + ) as i8); } + // SAFETY: every element has been initialized + unsafe { slice_assume_init_mut(coeffs) } } pub fn get_br_ctx( diff --git a/src/encoder.rs b/src/encoder.rs index 7c60b2c0ec..8b2d96e9e9 100644 --- a/src/encoder.rs +++ b/src/encoder.rs @@ -2261,7 +2261,8 @@ pub fn write_tx_blocks( let PlaneConfig { xdec, ydec, .. } = ts.input.planes[1].cfg; // SAFETY: We write to the array below before reading from it. 
- let mut ac: Aligned<[i16; 32 * 32]> = unsafe { Aligned::uninitialized() }; + let mut ac: Aligned<[MaybeUninit; 32 * 32]> = + unsafe { Aligned::uninitialized() }; let mut partition_has_coeff: bool = false; let mut tx_dist = ScaledDistortion::zero(); let do_chroma = @@ -2341,10 +2342,9 @@ pub fn write_tx_blocks( bh_uv /= uv_tx_size.height_mi(); let ac_data = if chroma_mode.is_cfl() { - luma_ac(&mut ac.data, ts, tile_bo, bsize, tx_size, fi); - &ac.data[..] + luma_ac(&mut ac.data, ts, tile_bo, bsize, tx_size, fi) } else { - &[] + [].as_slice() }; let uv_tx_type = if uv_tx_size.width() >= 32 || uv_tx_size.height() >= 32 { diff --git a/src/partition.rs b/src/partition.rs index 66320fe98c..dc89ab8367 100644 --- a/src/partition.rs +++ b/src/partition.rs @@ -237,6 +237,12 @@ impl BlockSize { 1 << self.width_log2() } + /// width * height + #[inline] + pub const fn area(self) -> usize { + self.width() * self.height() + } + #[inline] pub const fn width_log2(self) -> usize { match self { diff --git a/src/predict.rs b/src/predict.rs index 91378d0ee2..4705cf8ad3 100644 --- a/src/predict.rs +++ b/src/predict.rs @@ -11,6 +11,8 @@ #![allow(non_camel_case_types)] #![allow(dead_code)] +use std::mem::MaybeUninit; + cfg_if::cfg_if! { if #[cfg(nasm_x86_64)] { pub use crate::asm::x86::predict::*; @@ -630,17 +632,27 @@ const fn get_scaled_luma_q0(alpha_q3: i16, ac_pred_q3: i16) -> i32 { } } +/// # Returns +/// +/// Initialized luma AC coefficients +/// /// # Panics /// /// - If the block size is invalid for subsampling -pub fn luma_ac( - ac: &mut [i16], ts: &mut TileStateMut<'_, T>, tile_bo: TileBlockOffset, - bsize: BlockSize, tx_size: TxSize, fi: &FrameInvariants, -) { +/// +pub fn luma_ac<'ac, T: Pixel>( + ac: &'ac mut [MaybeUninit], ts: &mut TileStateMut<'_, T>, + tile_bo: TileBlockOffset, bsize: BlockSize, tx_size: TxSize, + fi: &FrameInvariants, +) -> &'ac mut [i16] { use crate::context::MI_SIZE_LOG2; let PlaneConfig { xdec, ydec, .. 
} = ts.input.planes[1].cfg; let plane_bsize = bsize.subsampled_size(xdec, ydec).unwrap(); + + // ensure ac has the right length, so there aren't any uninitialized elements at the end + let ac = &mut ac[..plane_bsize.area()]; + let bo = if bsize.is_sub8x8(xdec, ydec) { let offset = bsize.sub8x8_offset(xdec, ydec); tile_bo.with_offset(offset.0, offset.1) @@ -679,6 +691,9 @@ pub fn luma_ac( (1, 0) => pred_cfl_ac::, (_, _) => pred_cfl_ac::, })(ac, luma, plane_bsize, w_pad, h_pad, cpu); + + // SAFETY: it relies on individual pred_cfl_ac implementations to initialize the ac + unsafe { slice_assume_init_mut(ac) } } pub(crate) mod rust { @@ -1008,16 +1023,21 @@ pub(crate) mod rust { } pub(crate) fn pred_cfl_ac( - ac: &mut [i16], luma: &PlaneRegion<'_, T>, plane_bsize: BlockSize, - w_pad: usize, h_pad: usize, _cpu: CpuFeatureLevel, + ac: &mut [MaybeUninit], luma: &PlaneRegion<'_, T>, + plane_bsize: BlockSize, w_pad: usize, h_pad: usize, _cpu: CpuFeatureLevel, ) { let max_luma_w = (plane_bsize.width() - w_pad * 4) << XDEC; let max_luma_h = (plane_bsize.height() - h_pad * 4) << YDEC; let max_luma_x: usize = max_luma_w.max(8) - (1 << XDEC); let max_luma_y: usize = max_luma_h.max(8) - (1 << YDEC); let mut sum: i32 = 0; - for sub_y in 0..plane_bsize.height() { - for sub_x in 0..plane_bsize.width() { + + let ac = &mut ac[..plane_bsize.area()]; + + for (sub_y, ac_rows) in + ac.chunks_exact_mut(plane_bsize.width()).enumerate() + { + for (sub_x, ac_item) in ac_rows.iter_mut().enumerate() { // Refer to https://aomediacodec.github.io/av1-spec/#predict-chroma-from-luma-process let luma_y = sub_y << YDEC; let luma_x = sub_x << XDEC; @@ -1033,14 +1053,16 @@ pub(crate) mod rust { + i16::cast_from(luma[y + 1][x + 1]); } sample <<= 3 - XDEC - YDEC; - ac[sub_y * plane_bsize.width() + sub_x] = sample; + ac_item.write(sample); sum += sample as i32; } } + // SAFETY: the loop above has initialized all items + let ac = unsafe { assume_slice_init_mut(ac) }; let shift = plane_bsize.width_log2() + 
plane_bsize.height_log2(); let average = ((sum + (1 << (shift - 1))) >> shift) as i16; - for val in &mut ac[..(plane_bsize.height() * plane_bsize.width())] { + for val in ac { *val -= average; } } @@ -1052,8 +1074,7 @@ pub(crate) mod rust { if alpha == 0 { return; } - assert!(32 >= width); - assert!(ac.len() >= 32 * (height - 1) + width); + debug_assert!(ac.len() >= width * height); assert!(output.plane_cfg.stride >= width); assert!(output.rows_iter().len() >= height); @@ -1061,7 +1082,7 @@ pub(crate) mod rust { let avg: i32 = output[0][0].into(); for (line, luma) in - output.rows_iter_mut().zip(ac.chunks(width)).take(height) + output.rows_iter_mut().zip(ac.chunks_exact(width)).take(height) { for (v, &l) in line[..width].iter_mut().zip(luma[..width].iter()) { *v = T::cast_from( diff --git a/src/quantize/mod.rs b/src/quantize/mod.rs index 1555f58db9..d2d3533c0d 100644 --- a/src/quantize/mod.rs +++ b/src/quantize/mod.rs @@ -269,7 +269,7 @@ impl QuantizationContext { #[inline] pub fn quantize( &self, coeffs: &[T], qcoeffs: &mut [T], tx_size: TxSize, tx_type: TxType, - ) -> usize { + ) -> u16 { let scan = av1_scan_orders[tx_size as usize][tx_type as usize].scan; let iscan = av1_scan_orders[tx_size as usize][tx_type as usize].iscan; @@ -299,9 +299,9 @@ impl QuantizationContext { .unwrap_or(0); // We skip the DC coefficient since it has its own quantizer index. if eob_minus_one > 0 { - eob_minus_one as usize + 1 + eob_minus_one + 1 } else { - usize::from(qcoeffs[0] != T::cast_from(0)) + u16::from(qcoeffs[0] != T::cast_from(0)) } }; @@ -317,7 +317,7 @@ impl QuantizationContext { // that tail of zeroes and ones than we do for the larger coefficients. 
let mut level_mode = 1; let ac_quant = self.ac_quant.get() as u32; - for &pos in scan.iter().take(eob).skip(1) { + for &pos in scan.iter().take(usize::from(eob)).skip(1) { let coeff = i32::cast_from(coeffs[pos as usize]) << self.log_tx_scale; let abs_coeff = coeff.unsigned_abs(); @@ -344,7 +344,7 @@ impl QuantizationContext { // Check the eob is correct debug_assert_eq!( - eob, + usize::from(eob), scan .iter() .rposition(|&i| qcoeffs[i as usize] != T::cast_from(0)) @@ -362,7 +362,7 @@ pub mod rust { use std::mem::MaybeUninit; pub fn dequantize( - qindex: u8, coeffs: &[T], _eob: usize, rcoeffs: &mut [MaybeUninit], + qindex: u8, coeffs: &[T], _eob: u16, rcoeffs: &mut [MaybeUninit], tx_size: TxSize, bit_depth: usize, dc_delta_q: i8, ac_delta_q: i8, _cpu: CpuFeatureLevel, ) { diff --git a/src/rdo.rs b/src/rdo.rs index 34d751eba8..42146d5136 100644 --- a/src/rdo.rs +++ b/src/rdo.rs @@ -1610,8 +1610,9 @@ pub fn rdo_cfl_alpha( return None; }; // SAFETY: We write to the array below before reading from it. - let mut ac: Aligned<[i16; 32 * 32]> = unsafe { Aligned::uninitialized() }; - luma_ac(&mut ac.data, ts, tile_bo, bsize, luma_tx_size, fi); + let mut ac: Aligned<[MaybeUninit; 32 * 32]> = + unsafe { Aligned::uninitialized() }; + let ac = luma_ac(&mut ac.data, ts, tile_bo, bsize, luma_tx_size, fi); let best_alpha: ArrayVec = (1..3) .map(|p| { let &PlaneConfig { xdec, ydec, .. 
} = ts.rec.planes[p].plane_cfg; @@ -1640,7 +1641,7 @@ pub fn rdo_cfl_alpha( &mut rec_region, uv_tx_size, fi.sequence.bit_depth, - &ac.data, + ac, IntraParam::Alpha(alpha), None, &edge_buf, diff --git a/src/transform/inverse.rs b/src/transform/inverse.rs index 870e517f37..a85f371029 100644 --- a/src/transform/inverse.rs +++ b/src/transform/inverse.rs @@ -1633,7 +1633,7 @@ pub(crate) mod rust { #[cold_for_target_arch("x86_64", "aarch64")] pub fn inverse_transform_add( - input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>, _eob: usize, + input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>, _eob: u16, tx_size: TxSize, tx_type: TxType, bd: usize, _cpu: CpuFeatureLevel, ) { let width: usize = tx_size.width(); diff --git a/src/transform/mod.rs b/src/transform/mod.rs index 3d17f6d8fe..55f91c155f 100644 --- a/src/transform/mod.rs +++ b/src/transform/mod.rs @@ -506,7 +506,7 @@ mod test { inverse_transform_add( freq, &mut dst.as_region_mut(), - coeff_area, + coeff_area.try_into().unwrap(), tx_size, tx_type, 8,