From 274d00e291936fd5dbc54d40aee8086adb904d27 Mon Sep 17 00:00:00 2001
From: Kornel <kornel@geekhood.net>
Date: Wed, 25 Oct 2023 14:32:38 +0100
Subject: [PATCH] Reduce risk of uninit ac in luma_ac and encode_coeffs (#3271)

* Reduce risk of uninit ac in luma_ac
* eob can't exceed u16
* Avoid uninitialized data in coeff_contexts
* Fix minor type conversion and clippy issues
---
 src/asm/aarch64/predict.rs           | 11 ++++---
 src/asm/aarch64/transform/inverse.rs |  2 +-
 src/asm/shared/predict.rs            | 20 ++++++++----
 src/asm/shared/transform/inverse.rs  | 16 +++++-----
 src/asm/x86/predict.rs               | 14 ++++++---
 src/asm/x86/quantize.rs              | 16 +++++-----
 src/asm/x86/transform/inverse.rs     |  2 +-
 src/context/block_unit.rs            | 33 +++++++++++--------
 src/context/transform_unit.rs        | 33 ++++++++++++-------
 src/encoder.rs                       |  8 ++---
 src/partition.rs                     |  6 ++++
 src/predict.rs                       | 47 ++++++++++++++++++++--------
 src/quantize/mod.rs                  | 12 +++----
 src/rdo.rs                           |  7 +++--
 src/transform/inverse.rs             |  2 +-
 src/transform/mod.rs                 |  2 +-
 16 files changed, 146 insertions(+), 85 deletions(-)
diff --git a/src/asm/aarch64/predict.rs b/src/asm/aarch64/predict.rs
index 1db55eed2f..218199a617 100644
--- a/src/asm/aarch64/predict.rs
+++ b/src/asm/aarch64/predict.rs
@@ -30,7 +30,7 @@ macro_rules! decl_cfl_ac_fn {
     extern {
       $(
         fn $f(
-          ac: *mut i16, src: *const u8, stride: libc::ptrdiff_t,
+          ac: *mut MaybeUninit<i16>, src: *const u8, stride: libc::ptrdiff_t,
           w_pad: libc::c_int, h_pad: libc::c_int,
           width: libc::c_int, height: libc::c_int,
         );
@@ -50,7 +50,7 @@ macro_rules! decl_cfl_ac_hbd_fn {
     extern {
       $(
         fn $f(
-          ac: *mut i16, src: *const u16, stride: libc::ptrdiff_t,
+          ac: *mut MaybeUninit<i16>, src: *const u16, stride: libc::ptrdiff_t,
           w_pad: libc::c_int, h_pad: libc::c_int,
           width: libc::c_int, height: libc::c_int,
         );
@@ -659,11 +659,14 @@ pub fn dispatch_predict_intra<T: Pixel>(
   }
 }
 
+/// The implementation MUST initialize all `ac` elements.
 #[inline(always)]
 pub(crate) fn pred_cfl_ac<T: Pixel, const XDEC: usize, const YDEC: usize>(
-  ac: &mut [i16], luma: &PlaneRegion<'_, T>, bsize: BlockSize, w_pad: usize,
-  h_pad: usize, cpu: CpuFeatureLevel,
+  ac: &mut [MaybeUninit<i16>], luma: &PlaneRegion<'_, T>, bsize: BlockSize,
+  w_pad: usize, h_pad: usize, cpu: CpuFeatureLevel,
 ) {
+  debug_assert_eq!(ac.len(), bsize.area());
+
   if cpu < CpuFeatureLevel::NEON {
     return rust::pred_cfl_ac::<T, XDEC, YDEC>(
       ac, luma, bsize, w_pad, h_pad, cpu,
diff --git a/src/asm/aarch64/transform/inverse.rs b/src/asm/aarch64/transform/inverse.rs
index cba27385ae..f98f911217 100644
--- a/src/asm/aarch64/transform/inverse.rs
+++ b/src/asm/aarch64/transform/inverse.rs
@@ -17,7 +17,7 @@ use crate::asm::shared::transform::inverse::*;
 use crate::asm::shared::transform::*;
 
 pub fn inverse_transform_add<T: Pixel>(
-  input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>, eob: usize,
+  input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>, eob: u16,
   tx_size: TxSize, tx_type: TxType, bd: usize, cpu: CpuFeatureLevel,
 ) {
   if tx_type == TxType::WHT_WHT {
diff --git a/src/asm/shared/predict.rs b/src/asm/shared/predict.rs
index 7b73157116..0431134541 100644
--- a/src/asm/shared/predict.rs
+++ b/src/asm/shared/predict.rs
@@ -11,6 +11,7 @@
 mod test {
   use interpolate_name::interpolate_test;
   use rand::random;
+  use std::mem::MaybeUninit;
 
   use crate::context::MAX_TX_SIZE;
   use crate::cpu_features::CpuFeatureLevel;
@@ -23,7 +24,7 @@ mod test {
     IntraEdgeFilterParameters, PredictionMode, PredictionVariant,
   };
   use crate::transform::TxSize;
-  use crate::util::Aligned;
+  use crate::util::{slice_assume_init_mut, Aligned};
   use crate::Pixel;
 
   #[test]
@@ -188,27 +189,34 @@ mod test {
     }
     let luma = &plane.as_region();
 
-    let mut ac_ref = Aligned::new([0i16; 32 * 32]);
+    let mut ac_ref = Aligned::new([MaybeUninit::new(0x3333i16); 32 * 32]);
+    let ac_ref = &mut ac_ref.data[..plane_bsize.area()];
 
     let cpu = CpuFeatureLevel::RUST;
     (match (xdec, ydec) {
       (0, 0) => rust::pred_cfl_ac::<T, 0, 0>,
       (1, 0) => rust::pred_cfl_ac::<T, 1, 0>,
       (_, _) => rust::pred_cfl_ac::<T, 1, 1>,
-    })(&mut ac_ref.data, luma, plane_bsize, w_pad, h_pad, cpu);
+    })(ac_ref, luma, plane_bsize, w_pad, h_pad, cpu);
 
     for &cpu in
       &CpuFeatureLevel::all()[..=CpuFeatureLevel::default().as_index()]
     {
-      let mut ac = Aligned::new([0i16; 32 * 32]);
+      let mut ac = Aligned::new([MaybeUninit::new(0x7FFFi16); 32 * 32]);
+      let ac = &mut ac.data[..plane_bsize.area()];
 
       (match (xdec, ydec) {
         (0, 0) => pred_cfl_ac::<T, 0, 0>,
         (1, 0) => pred_cfl_ac::<T, 1, 0>,
         (_, _) => pred_cfl_ac::<T, 1, 1>,
-      })(&mut ac.data, luma, plane_bsize, w_pad, h_pad, cpu);
+      })(ac, luma, plane_bsize, w_pad, h_pad, cpu);
 
-      assert_eq!(&ac_ref.data[..], &ac.data[..])
+      unsafe {
+        let ac_ref = slice_assume_init_mut(ac_ref);
+        let ac = slice_assume_init_mut(ac);
+
+        assert_eq!(&ac_ref, &ac);
+      }
     }
   }
 }
diff --git a/src/asm/shared/transform/inverse.rs b/src/asm/shared/transform/inverse.rs
index 3547ce9e84..6180f08c5d 100644
--- a/src/asm/shared/transform/inverse.rs
+++ b/src/asm/shared/transform/inverse.rs
@@ -20,7 +20,7 @@ pub type InvTxfmHBDFunc =
 
 pub fn call_inverse_func<T: Pixel>(
   func: InvTxfmFunc, input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>,
-  eob: usize, width: usize, height: usize, bd: usize,
+  eob: u16, width: usize, height: usize, bd: usize,
 ) {
   debug_assert!(bd == 8);
 
@@ -51,7 +51,7 @@ pub fn call_inverse_func<T: Pixel>(
 
 pub fn call_inverse_hbd_func<T: Pixel>(
   func: InvTxfmHBDFunc, input: &[T::Coeff],
-  output: &mut PlaneRegionMut<'_, T>, eob: usize, width: usize, height: usize,
+  output: &mut PlaneRegionMut<'_, T>, eob: u16, width: usize, height: usize,
   bd: usize,
 ) {
   // Only use at most 32 columns and 32 rows of input coefficients.
@@ -94,7 +94,7 @@ pub mod test {
 
   pub fn pick_eob<T: Coefficient>(
     coeffs: &mut [T], tx_size: TxSize, tx_type: TxType, sub_h: usize,
-  ) -> usize {
+  ) -> u16 {
     /* From dav1d
      * copy the topleft coefficients such that the return value (being the
      * coefficient scantable index for the eob token) guarantees that only
@@ -105,14 +105,14 @@ pub mod test {
     let coeff_h = av1_get_coded_tx_size(tx_size).height();
     let sub_high: usize = if sub_h > 0 { sub_h * 8 - 1 } else { 0 };
     let sub_low: usize = if sub_h > 1 { sub_high - 8 } else { 0 };
-    let mut eob = 0;
+    let mut eob = 0u16;
     let mut exit = 0;
 
     // Wrap WHT_WHT (16) to DCT_DCT (0) scan table
     let scan = av1_scan_orders[tx_size as usize][(tx_type as usize) & 15].scan;
 
     for (i, &pos) in scan.iter().enumerate() {
-      exit = i;
+      exit = i as u16;
 
       let rc = pos as usize;
       let rcx = rc % coeff_h;
@@ -121,14 +121,14 @@ pub mod test {
       if rcx > sub_high || rcy > sub_high {
         break;
       } else if eob == 0 && (rcx > sub_low || rcy > sub_low) {
-        eob = i;
+        eob = i as u16;
       }
     }
 
     if eob != 0 {
       eob += thread_rng().gen_range(0..(exit - eob).min(1));
     }
-    for &pos in scan.iter().skip(eob) {
+    for &pos in scan.iter().skip(usize::from(eob)) {
       coeffs[pos as usize] = T::cast_from(0);
     }
 
@@ -181,7 +181,7 @@ pub mod test {
       // SAFETY: forward_transform initialized freq
       let freq = unsafe { slice_assume_init_mut(freq) };
 
-      let eob: usize = pick_eob(freq, tx_size, tx_type, sub_h);
+      let eob: u16 = pick_eob(freq, tx_size, tx_type, sub_h);
       let mut rust_dst = dst.clone();
 
       inverse_transform_add(
diff --git a/src/asm/x86/predict.rs b/src/asm/x86/predict.rs
index 4044fd75b9..227fe08859 100644
--- a/src/asm/x86/predict.rs
+++ b/src/asm/x86/predict.rs
@@ -17,6 +17,7 @@ use crate::tiling::{PlaneRegion, PlaneRegionMut};
 use crate::transform::TxSize;
 use crate::util::Aligned;
 use crate::Pixel;
+use std::mem::MaybeUninit;
 use v_frame::pixel::PixelType;
 
 macro_rules! decl_angular_ipred_fn {
@@ -145,7 +146,7 @@ macro_rules! decl_cfl_ac_fn {
     extern {
       $(
         fn $f(
-          ac: *mut i16, src: *const u8, stride: libc::ptrdiff_t,
+          ac: *mut MaybeUninit<i16>, src: *const u8, stride: libc::ptrdiff_t,
           w_pad: libc::c_int, h_pad: libc::c_int,
           width: libc::c_int, height: libc::c_int,
         );
@@ -168,7 +169,7 @@ macro_rules! decl_cfl_ac_hbd_fn {
     extern {
       $(
         fn $f(
-          ac: *mut i16, src: *const u16, stride: libc::ptrdiff_t,
+          ac: *mut MaybeUninit<i16>, src: *const u16, stride: libc::ptrdiff_t,
           w_pad: libc::c_int, h_pad: libc::c_int,
           width: libc::c_int, height: libc::c_int,
         );
@@ -871,12 +872,15 @@ pub fn dispatch_predict_intra<T: Pixel>(
   }
 }
 
+// The implementation MUST initialize all `ac` elements
 #[inline(always)]
 pub(crate) fn pred_cfl_ac<T: Pixel, const XDEC: usize, const YDEC: usize>(
-  ac: &mut [i16], luma: &PlaneRegion<'_, T>, bsize: BlockSize, w_pad: usize,
-  h_pad: usize, cpu: CpuFeatureLevel,
+  ac: &mut [MaybeUninit<i16>], luma: &PlaneRegion<'_, T>, bsize: BlockSize,
+  w_pad: usize, h_pad: usize, cpu: CpuFeatureLevel,
 ) {
-  let call_rust = |ac: &mut [i16]| {
+  debug_assert_eq!(ac.len(), bsize.area());
+
+  let call_rust = |ac: &mut [MaybeUninit<i16>]| {
     rust::pred_cfl_ac::<T, XDEC, YDEC>(ac, luma, bsize, w_pad, h_pad, cpu);
   };
 
diff --git a/src/asm/x86/quantize.rs b/src/asm/x86/quantize.rs
index b61f5ad392..0331685635 100644
--- a/src/asm/x86/quantize.rs
+++ b/src/asm/x86/quantize.rs
@@ -22,7 +22,7 @@ use std::mem::MaybeUninit;
 type DequantizeFn = unsafe fn(
   qindex: u8,
   coeffs_ptr: *const i16,
-  _eob: usize,
+  _eob: u16,
   rcoeffs_ptr: *mut i16,
   tx_size: TxSize,
   bit_depth: usize,
@@ -38,7 +38,7 @@ cpu_function_lookup_table!(
 
 #[inline(always)]
 pub fn dequantize<T: Coefficient>(
-  qindex: u8, coeffs: &[T], eob: usize, rcoeffs: &mut [MaybeUninit<T>],
+  qindex: u8, coeffs: &[T], eob: u16, rcoeffs: &mut [MaybeUninit<T>],
   tx_size: TxSize, bit_depth: usize, dc_delta_q: i8, ac_delta_q: i8,
   cpu: CpuFeatureLevel,
 ) {
@@ -91,7 +91,7 @@ pub fn dequantize<T: Coefficient>(
 
 #[target_feature(enable = "avx2")]
 unsafe fn dequantize_avx2(
-  qindex: u8, coeffs_ptr: *const i16, _eob: usize, rcoeffs_ptr: *mut i16,
+  qindex: u8, coeffs_ptr: *const i16, _eob: u16, rcoeffs_ptr: *mut i16,
   tx_size: TxSize, bit_depth: usize, dc_delta_q: i8, ac_delta_q: i8,
 ) {
   let log_tx_scale = _mm256_set1_epi32(get_log_tx_scale(tx_size) as i32);
@@ -182,12 +182,12 @@ mod test {
 
       // Test the min, max, and random eobs
       let eobs = {
-        let mut out = [0usize; 16];
+        let mut out = [0u16; 16];
         let area: usize = av1_get_coded_tx_size(tx_size).area();
         out[0] = 0;
-        out[1] = area;
+        out[1] = area as u16;
         for eob in out.iter_mut().skip(2) {
-          *eob = rng.gen_range(0..area);
+          *eob = rng.gen_range(0..area as u16);
         }
         out
       };
@@ -198,7 +198,9 @@ mod test {
 
         // Generate quantized coefficients up to the eob
         let between = Uniform::from(-i16::MAX..=i16::MAX);
-        for (i, qcoeff) in qcoeffs.data.iter_mut().enumerate().take(eob) {
+        for (i, qcoeff) in
+          qcoeffs.data.iter_mut().enumerate().take(eob as usize)
+        {
           *qcoeff = between.sample(&mut rng)
             / if i == 0 { dc_quant } else { ac_quant };
         }
diff --git a/src/asm/x86/transform/inverse.rs b/src/asm/x86/transform/inverse.rs
index df99f4b4b3..f9865ae7ee 100644
--- a/src/asm/x86/transform/inverse.rs
+++ b/src/asm/x86/transform/inverse.rs
@@ -17,7 +17,7 @@ use crate::asm::shared::transform::inverse::*;
 use crate::asm::shared::transform::*;
 
 pub fn inverse_transform_add<T: Pixel>(
-  input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>, eob: usize,
+  input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>, eob: u16,
   tx_size: TxSize, tx_type: TxType, bd: usize, cpu: CpuFeatureLevel,
 ) {
   if tx_type == TxType::WHT_WHT {
diff --git a/src/context/block_unit.rs b/src/context/block_unit.rs
index d426132a1d..9a26c8537f 100644
--- a/src/context/block_unit.rs
+++ b/src/context/block_unit.rs
@@ -7,6 +7,8 @@
 // Media Patent License 1.0 was not distributed with this source code in the
 // PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 
+use std::mem::MaybeUninit;
+
 use super::*;
 
 use crate::predict::PredictionMode;
@@ -1781,7 +1783,7 @@ impl<'a> ContextWriter<'a> {
 
   pub fn write_coeffs_lv_map<T: Coefficient, W: Writer>(
     &mut self, w: &mut W, plane: usize, bo: TileBlockOffset, coeffs_in: &[T],
-    eob: usize, pred_mode: PredictionMode, tx_size: TxSize, tx_type: TxType,
+    eob: u16, pred_mode: PredictionMode, tx_size: TxSize, tx_type: TxType,
     plane_bsize: BlockSize, xdec: usize, ydec: usize,
     use_reduced_tx_set: bool, frame_clipped_txw: usize,
     frame_clipped_txh: usize,
@@ -1792,8 +1794,8 @@ impl<'a> ContextWriter<'a> {
     let is_inter = pred_mode >= PredictionMode::NEARESTMV;
 
     // Note: Both intra and inter mode uses inter scan order. Surprised?
-    let scan: &[u16] =
-      &av1_scan_orders[tx_size as usize][tx_type as usize].scan[..eob];
+    let scan: &[u16] = &av1_scan_orders[tx_size as usize][tx_type as usize]
+      .scan[..usize::from(eob)];
     let height = av1_get_coded_tx_size(tx_size).height();
 
     // Create a slice with coeffs in scan order
@@ -1858,7 +1860,7 @@ impl<'a> ContextWriter<'a> {
   }
 
   fn encode_eob<W: Writer>(
-    &mut self, eob: usize, tx_size: TxSize, tx_class: TxClass, txs_ctx: usize,
+    &mut self, eob: u16, tx_size: TxSize, tx_class: TxClass, txs_ctx: usize,
     plane_type: usize, w: &mut W,
   ) {
     let (eob_pt, eob_extra) = Self::get_eob_pos_token(eob);
@@ -1913,18 +1915,19 @@ impl<'a> ContextWriter<'a> {
   }
 
   fn encode_coeffs<T: Coefficient, W: Writer>(
-    &mut self, coeffs: &[T], levels: &mut [u8], scan: &[u16], eob: usize,
+    &mut self, coeffs: &[T], levels: &mut [u8], scan: &[u16], eob: u16,
     tx_size: TxSize, tx_class: TxClass, txs_ctx: usize, plane_type: usize,
     w: &mut W,
   ) {
     // SAFETY: We write to the array below before reading from it.
-    let mut coeff_contexts: Aligned<[i8; MAX_CODED_TX_SQUARE]> =
+    let mut coeff_contexts: Aligned<[MaybeUninit<i8>; MAX_CODED_TX_SQUARE]> =
       unsafe { Aligned::uninitialized() };
 
-    self.get_nz_map_contexts(
+    // get_nz_map_contexts sets coeff_contexts contiguously as a parallel array for scan, not in scan order
+    let coeff_contexts = self.get_nz_map_contexts(
       levels,
       scan,
-      eob as u16,
+      eob,
       tx_size,
       tx_class,
       &mut coeff_contexts.data,
@@ -1932,24 +1935,28 @@ impl<'a> ContextWriter<'a> {
 
     let bhl = Self::get_txb_bhl(tx_size);
 
-    for (c, (&pos, &v)) in scan.iter().zip(coeffs.iter()).enumerate().rev() {
+    let scan_with_ctx =
+      scan.iter().copied().zip(coeff_contexts.iter().copied());
+    for (c, ((pos, coeff_ctx), v)) in
+      scan_with_ctx.zip(coeffs.iter().copied()).enumerate().rev()
+    {
       let pos = pos as usize;
-      let coeff_ctx = coeff_contexts.data[pos];
+      let coeff_ctx = coeff_ctx as usize;
       let level = v.abs();
 
-      if c == eob - 1 {
+      if c == usize::from(eob) - 1 {
         symbol_with_update!(
           self,
           w,
           cmp::min(u32::cast_from(level), 3) - 1,
-          &self.fc.coeff_base_eob_cdf[txs_ctx][plane_type][coeff_ctx as usize]
+          &self.fc.coeff_base_eob_cdf[txs_ctx][plane_type][coeff_ctx]
         );
       } else {
         symbol_with_update!(
           self,
           w,
           cmp::min(u32::cast_from(level), 3),
-          &self.fc.coeff_base_cdf[txs_ctx][plane_type][coeff_ctx as usize]
+          &self.fc.coeff_base_cdf[txs_ctx][plane_type][coeff_ctx]
         );
       }
 
diff --git a/src/context/transform_unit.rs b/src/context/transform_unit.rs
index 5baf660657..a350cb4a3f 100644
--- a/src/context/transform_unit.rs
+++ b/src/context/transform_unit.rs
@@ -11,6 +11,7 @@ use super::*;
 use crate::predict::PredictionMode;
 use crate::predict::PredictionMode::*;
 use crate::transform::TxType::*;
+use std::mem::MaybeUninit;
 
 pub const MAX_TX_SIZE: usize = 64;
 
@@ -804,11 +805,11 @@ impl<'a> ContextWriter<'a> {
   ///
   /// - If `eob` is prior to the start of the group
   #[inline]
-  pub fn get_eob_pos_token(eob: usize) -> (u32, u32) {
+  pub fn get_eob_pos_token(eob: u16) -> (u32, u32) {
     let t = if eob < 33 {
-      eob_to_pos_small[eob] as u32
+      eob_to_pos_small[usize::from(eob)] as u32
     } else {
-      let e = cmp::min((eob - 1) >> 5, 16);
+      let e = usize::from(cmp::min((eob - 1) >> 5, 16));
       eob_to_pos_large[e] as u32
     };
     assert!(eob as i32 >= k_eob_group_start[t as usize] as i32);
@@ -905,25 +906,33 @@ impl<'a> ContextWriter<'a> {
     Self::get_nz_map_ctx_from_stats(stats, coeff_idx, bhl, tx_size, tx_class)
   }
 
-  pub fn get_nz_map_contexts(
+  /// `coeff_contexts_no_scan` is filled in parallel with `scan`: the value for
+  /// `pos = scan[i]` is written at index `i`, not at index `pos`.
+  pub fn get_nz_map_contexts<'c>(
     &self, levels: &mut [u8], scan: &[u16], eob: u16, tx_size: TxSize,
-    tx_class: TxClass, coeff_contexts: &mut [i8],
-  ) {
+    tx_class: TxClass, coeff_contexts_no_scan: &'c mut [MaybeUninit<i8>],
+  ) -> &'c mut [i8] {
     let bhl = Self::get_txb_bhl(tx_size);
     let area = av1_get_coded_tx_size(tx_size).area();
-    for i in 0..eob {
-      let pos = scan[i as usize];
-      coeff_contexts[pos as usize] = Self::get_nz_map_ctx(
+
+    let scan = &scan[..usize::from(eob)];
+    let coeffs = &mut coeff_contexts_no_scan[..usize::from(eob)];
+    for (i, (coeff, pos)) in
+      coeffs.iter_mut().zip(scan.iter().copied()).enumerate()
+    {
+      coeff.write(Self::get_nz_map_ctx(
         levels,
         pos as usize,
         bhl,
         area,
-        i as usize,
-        i == eob - 1,
+        i,
+        i == usize::from(eob) - 1,
         tx_size,
         tx_class,
-      ) as i8;
+      ) as i8);
     }
+    // SAFETY: every element has been initialized
+    unsafe { slice_assume_init_mut(coeffs) }
   }
 
   pub fn get_br_ctx(
diff --git a/src/encoder.rs b/src/encoder.rs
index 7c60b2c0ec..8b2d96e9e9 100644
--- a/src/encoder.rs
+++ b/src/encoder.rs
@@ -2261,7 +2261,8 @@ pub fn write_tx_blocks<T: Pixel, W: Writer>(
 
   let PlaneConfig { xdec, ydec, .. } = ts.input.planes[1].cfg;
   // SAFETY: We write to the array below before reading from it.
-  let mut ac: Aligned<[i16; 32 * 32]> = unsafe { Aligned::uninitialized() };
+  let mut ac: Aligned<[MaybeUninit<i16>; 32 * 32]> =
+    unsafe { Aligned::uninitialized() };
   let mut partition_has_coeff: bool = false;
   let mut tx_dist = ScaledDistortion::zero();
   let do_chroma =
@@ -2341,10 +2342,9 @@ pub fn write_tx_blocks<T: Pixel, W: Writer>(
   bh_uv /= uv_tx_size.height_mi();
 
   let ac_data = if chroma_mode.is_cfl() {
-    luma_ac(&mut ac.data, ts, tile_bo, bsize, tx_size, fi);
-    &ac.data[..]
+    luma_ac(&mut ac.data, ts, tile_bo, bsize, tx_size, fi)
   } else {
-    &[]
+    [].as_slice()
   };
 
   let uv_tx_type = if uv_tx_size.width() >= 32 || uv_tx_size.height() >= 32 {
diff --git a/src/partition.rs b/src/partition.rs
index 66320fe98c..dc89ab8367 100644
--- a/src/partition.rs
+++ b/src/partition.rs
@@ -237,6 +237,12 @@ impl BlockSize {
     1 << self.width_log2()
   }
 
+  /// width * height
+  #[inline]
+  pub const fn area(self) -> usize {
+    self.width() * self.height()
+  }
+
   #[inline]
   pub const fn width_log2(self) -> usize {
     match self {
diff --git a/src/predict.rs b/src/predict.rs
index 91378d0ee2..4705cf8ad3 100644
--- a/src/predict.rs
+++ b/src/predict.rs
@@ -11,6 +11,8 @@
 #![allow(non_camel_case_types)]
 #![allow(dead_code)]
 
+use std::mem::MaybeUninit;
+
 cfg_if::cfg_if! {
   if #[cfg(nasm_x86_64)] {
     pub use crate::asm::x86::predict::*;
@@ -630,17 +632,27 @@ const fn get_scaled_luma_q0(alpha_q3: i16, ac_pred_q3: i16) -> i32 {
   }
 }
 
+/// # Returns
+///
+/// Initialized luma AC coefficients
+///
 /// # Panics
 ///
 /// - If the block size is invalid for subsampling
-pub fn luma_ac<T: Pixel>(
-  ac: &mut [i16], ts: &mut TileStateMut<'_, T>, tile_bo: TileBlockOffset,
-  bsize: BlockSize, tx_size: TxSize, fi: &FrameInvariants<T>,
-) {
+///
+pub fn luma_ac<'ac, T: Pixel>(
+  ac: &'ac mut [MaybeUninit<i16>], ts: &mut TileStateMut<'_, T>,
+  tile_bo: TileBlockOffset, bsize: BlockSize, tx_size: TxSize,
+  fi: &FrameInvariants<T>,
+) -> &'ac mut [i16] {
   use crate::context::MI_SIZE_LOG2;
 
   let PlaneConfig { xdec, ydec, .. } = ts.input.planes[1].cfg;
   let plane_bsize = bsize.subsampled_size(xdec, ydec).unwrap();
+
+  // ensure ac has the right length, so there aren't any uninitialized elements at the end
+  let ac = &mut ac[..plane_bsize.area()];
+
   let bo = if bsize.is_sub8x8(xdec, ydec) {
     let offset = bsize.sub8x8_offset(xdec, ydec);
     tile_bo.with_offset(offset.0, offset.1)
@@ -679,6 +691,9 @@ pub fn luma_ac<T: Pixel>(
     (1, 0) => pred_cfl_ac::<T, 1, 0>,
     (_, _) => pred_cfl_ac::<T, 1, 1>,
   })(ac, luma, plane_bsize, w_pad, h_pad, cpu);
+
+  // SAFETY: it relies on individual pred_cfl_ac implementations to initialize the ac
+  unsafe { slice_assume_init_mut(ac) }
 }
 
 pub(crate) mod rust {
@@ -1008,16 +1023,21 @@ pub(crate) mod rust {
   }
 
   pub(crate) fn pred_cfl_ac<T: Pixel, const XDEC: usize, const YDEC: usize>(
-    ac: &mut [i16], luma: &PlaneRegion<'_, T>, plane_bsize: BlockSize,
-    w_pad: usize, h_pad: usize, _cpu: CpuFeatureLevel,
+    ac: &mut [MaybeUninit<i16>], luma: &PlaneRegion<'_, T>,
+    plane_bsize: BlockSize, w_pad: usize, h_pad: usize, _cpu: CpuFeatureLevel,
   ) {
     let max_luma_w = (plane_bsize.width() - w_pad * 4) << XDEC;
     let max_luma_h = (plane_bsize.height() - h_pad * 4) << YDEC;
     let max_luma_x: usize = max_luma_w.max(8) - (1 << XDEC);
     let max_luma_y: usize = max_luma_h.max(8) - (1 << YDEC);
     let mut sum: i32 = 0;
-    for sub_y in 0..plane_bsize.height() {
-      for sub_x in 0..plane_bsize.width() {
+
+    let ac = &mut ac[..plane_bsize.area()];
+
+    for (sub_y, ac_rows) in
+      ac.chunks_exact_mut(plane_bsize.width()).enumerate()
+    {
+      for (sub_x, ac_item) in ac_rows.iter_mut().enumerate() {
         // Refer to https://aomediacodec.github.io/av1-spec/#predict-chroma-from-luma-process
         let luma_y = sub_y << YDEC;
         let luma_x = sub_x << XDEC;
@@ -1033,14 +1053,16 @@ pub(crate) mod rust {
             + i16::cast_from(luma[y + 1][x + 1]);
         }
         sample <<= 3 - XDEC - YDEC;
-        ac[sub_y * plane_bsize.width() + sub_x] = sample;
+        ac_item.write(sample);
         sum += sample as i32;
       }
     }
+    // SAFETY: the loops above wrote every element of `ac` via `ac_item.write`
+    let ac = unsafe { slice_assume_init_mut(ac) };
     let shift = plane_bsize.width_log2() + plane_bsize.height_log2();
     let average = ((sum + (1 << (shift - 1))) >> shift) as i16;
 
-    for val in &mut ac[..(plane_bsize.height() * plane_bsize.width())] {
+    for val in ac {
       *val -= average;
     }
   }
@@ -1052,8 +1074,7 @@ pub(crate) mod rust {
     if alpha == 0 {
       return;
     }
-    assert!(32 >= width);
-    assert!(ac.len() >= 32 * (height - 1) + width);
+    debug_assert!(ac.len() >= width * height);
     assert!(output.plane_cfg.stride >= width);
     assert!(output.rows_iter().len() >= height);
 
@@ -1061,7 +1082,7 @@ pub(crate) mod rust {
     let avg: i32 = output[0][0].into();
 
     for (line, luma) in
-      output.rows_iter_mut().zip(ac.chunks(width)).take(height)
+      output.rows_iter_mut().zip(ac.chunks_exact(width)).take(height)
     {
       for (v, &l) in line[..width].iter_mut().zip(luma[..width].iter()) {
         *v = T::cast_from(
diff --git a/src/quantize/mod.rs b/src/quantize/mod.rs
index 1555f58db9..d2d3533c0d 100644
--- a/src/quantize/mod.rs
+++ b/src/quantize/mod.rs
@@ -269,7 +269,7 @@ impl QuantizationContext {
   #[inline]
   pub fn quantize<T: Coefficient>(
     &self, coeffs: &[T], qcoeffs: &mut [T], tx_size: TxSize, tx_type: TxType,
-  ) -> usize {
+  ) -> u16 {
     let scan = av1_scan_orders[tx_size as usize][tx_type as usize].scan;
     let iscan = av1_scan_orders[tx_size as usize][tx_type as usize].iscan;
 
@@ -299,9 +299,9 @@ impl QuantizationContext {
         .unwrap_or(0);
       // We skip the DC coefficient since it has its own quantizer index.
       if eob_minus_one > 0 {
-        eob_minus_one as usize + 1
+        eob_minus_one + 1
       } else {
-        usize::from(qcoeffs[0] != T::cast_from(0))
+        u16::from(qcoeffs[0] != T::cast_from(0))
       }
     };
 
@@ -317,7 +317,7 @@ impl QuantizationContext {
     // that tail of zeroes and ones than we do for the larger coefficients.
     let mut level_mode = 1;
     let ac_quant = self.ac_quant.get() as u32;
-    for &pos in scan.iter().take(eob).skip(1) {
+    for &pos in scan.iter().take(usize::from(eob)).skip(1) {
       let coeff = i32::cast_from(coeffs[pos as usize]) << self.log_tx_scale;
       let abs_coeff = coeff.unsigned_abs();
 
@@ -344,7 +344,7 @@ impl QuantizationContext {
 
     // Check the eob is correct
     debug_assert_eq!(
-      eob,
+      usize::from(eob),
       scan
         .iter()
         .rposition(|&i| qcoeffs[i as usize] != T::cast_from(0))
@@ -362,7 +362,7 @@ pub mod rust {
   use std::mem::MaybeUninit;
 
   pub fn dequantize<T: Coefficient>(
-    qindex: u8, coeffs: &[T], _eob: usize, rcoeffs: &mut [MaybeUninit<T>],
+    qindex: u8, coeffs: &[T], _eob: u16, rcoeffs: &mut [MaybeUninit<T>],
     tx_size: TxSize, bit_depth: usize, dc_delta_q: i8, ac_delta_q: i8,
     _cpu: CpuFeatureLevel,
   ) {
diff --git a/src/rdo.rs b/src/rdo.rs
index 34d751eba8..42146d5136 100644
--- a/src/rdo.rs
+++ b/src/rdo.rs
@@ -1610,8 +1610,9 @@ pub fn rdo_cfl_alpha<T: Pixel>(
     return None;
   };
   // SAFETY: We write to the array below before reading from it.
-  let mut ac: Aligned<[i16; 32 * 32]> = unsafe { Aligned::uninitialized() };
-  luma_ac(&mut ac.data, ts, tile_bo, bsize, luma_tx_size, fi);
+  let mut ac: Aligned<[MaybeUninit<i16>; 32 * 32]> =
+    unsafe { Aligned::uninitialized() };
+  let ac = luma_ac(&mut ac.data, ts, tile_bo, bsize, luma_tx_size, fi);
   let best_alpha: ArrayVec<i16, 2> = (1..3)
     .map(|p| {
       let &PlaneConfig { xdec, ydec, .. } = ts.rec.planes[p].plane_cfg;
@@ -1640,7 +1641,7 @@ pub fn rdo_cfl_alpha<T: Pixel>(
           &mut rec_region,
           uv_tx_size,
           fi.sequence.bit_depth,
-          &ac.data,
+          ac,
           IntraParam::Alpha(alpha),
           None,
           &edge_buf,
diff --git a/src/transform/inverse.rs b/src/transform/inverse.rs
index 870e517f37..a85f371029 100644
--- a/src/transform/inverse.rs
+++ b/src/transform/inverse.rs
@@ -1633,7 +1633,7 @@ pub(crate) mod rust {
 
   #[cold_for_target_arch("x86_64", "aarch64")]
   pub fn inverse_transform_add<T: Pixel>(
-    input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>, _eob: usize,
+    input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>, _eob: u16,
     tx_size: TxSize, tx_type: TxType, bd: usize, _cpu: CpuFeatureLevel,
   ) {
     let width: usize = tx_size.width();
diff --git a/src/transform/mod.rs b/src/transform/mod.rs
index 3d17f6d8fe..55f91c155f 100644
--- a/src/transform/mod.rs
+++ b/src/transform/mod.rs
@@ -506,7 +506,7 @@ mod test {
     inverse_transform_add(
       freq,
       &mut dst.as_region_mut(),
-      coeff_area,
+      coeff_area.try_into().unwrap(),
       tx_size,
       tx_type,
       8,