@@ -235,38 +235,45 @@ where
235235 . map ( |f| f[ p] . data_origin ( ) )
236236 . collect :: < ArrayVec < _ , TB_SIZE > > ( ) ;
237237
238- for y in ( 0 ..effective_height) . step_by ( INC ) {
239- for x in ( 0 ..=( pad_width - SB_SIZE ) ) . step_by ( INC ) {
240- for z in 0 ..TB_SIZE {
241- self . proc0 (
242- & src_planes[ z] [ x..] ,
243- & self . hw [ ( BLOCK_AREA * z) ..] ,
244- & mut dftr[ ( BLOCK_AREA * z) ..] ,
245- src_stride,
238+ // SAFETY: We know the size of the planes we're working on,
239+ // so we can ensure that every offset passed below stays in bounds.
240+ // Several of the calls in this block (notably `proc0` and `proc1`)
241+ // are `unsafe fn`s that skip bounds checks for optimization purposes.
242+ // They are sound as long as we do not pass out-of-bounds parameters.
243+ unsafe {
244+ for y in ( 0 ..effective_height) . step_by ( INC ) {
245+ for x in ( 0 ..=( pad_width - SB_SIZE ) ) . step_by ( INC ) {
246+ for z in 0 ..TB_SIZE {
247+ self . proc0 (
248+ & src_planes[ z] [ x..] ,
249+ & self . hw [ ( BLOCK_AREA * z) ..] ,
250+ & mut dftr[ ( BLOCK_AREA * z) ..] ,
251+ src_stride,
252+ SB_SIZE ,
253+ self . src_scale ,
254+ ) ;
255+ }
256+
257+ self . real_to_complex_3d ( & dftr, & mut dftc) ;
258+ self . remove_mean ( & mut dftc, & self . dftgc , & mut means) ;
259+
260+ self . filter_coeffs ( & mut dftc) ;
261+
262+ self . add_mean ( & mut dftc, & means) ;
263+ self . complex_to_real_3d ( & dftc, & mut dftr) ;
264+
265+ self . proc1 (
266+ & dftr[ ( TB_MIDPOINT * BLOCK_AREA ) ..] ,
267+ & self . hw [ ( TB_MIDPOINT * BLOCK_AREA ) ..] ,
268+ & mut ebuff[ ( y * ebuff_stride + x) ..] ,
246269 SB_SIZE ,
247- self . src_scale ,
270+ ebuff_stride ,
248271 ) ;
249272 }
250273
251- self . real_to_complex_3d ( & dftr, & mut dftc) ;
252- self . remove_mean ( & mut dftc, & self . dftgc , & mut means) ;
253-
254- self . filter_coeffs ( & mut dftc) ;
255-
256- self . add_mean ( & mut dftc, & means) ;
257- self . complex_to_real_3d ( & dftc, & mut dftr) ;
258-
259- self . proc1 (
260- & dftr[ ( TB_MIDPOINT * BLOCK_AREA ) ..] ,
261- & self . hw [ ( TB_MIDPOINT * BLOCK_AREA ) ..] ,
262- & mut ebuff[ ( y * ebuff_stride + x) ..] ,
263- SB_SIZE ,
264- ebuff_stride,
265- ) ;
266- }
267-
268- for q in 0 ..TB_SIZE {
269- src_planes[ q] = & src_planes[ q] [ ( INC * src_stride) ..] ;
274+ for q in 0 ..TB_SIZE {
275+ src_planes[ q] = & src_planes[ q] [ ( INC * src_stride) ..] ;
276+ }
270277 }
271278 }
272279
@@ -313,6 +320,7 @@ where
313320 hw
314321 }
315322
323+ #[ inline( always) ]
316324 // Hanning windowing
317325 fn spatial_window ( n : f64 ) -> f64 {
318326 0.5 - 0.5 * ( 2.0 * PI * n / SB_SIZE as f64 ) . cos ( )
@@ -345,35 +353,44 @@ where
345353 }
346354 }
347355
348- fn proc0 (
356+ #[ inline]
357+ unsafe fn proc0 (
349358 & self , s0 : & [ T ] , s1 : & [ f32 ] , dest : & mut [ f32 ] , p0 : usize , p1 : usize ,
350359 src_scale : f32 ,
351360 ) {
352- let s0 = s0. chunks ( p0 ) ;
353- let s1 = s1. chunks ( p1 ) ;
354- let dest = dest. chunks_mut ( p1 ) ;
361+ let s0 = s0. as_ptr ( ) ;
362+ let s1 = s1. as_ptr ( ) ;
363+ let dest = dest. as_mut_ptr ( ) ;
355364
356- for ( s0 , ( s1 , dest ) ) in s0 . zip ( s1 . zip ( dest ) ) . take ( p1 ) {
365+ for u in 0 ..p1 {
357366 for v in 0 ..p1 {
358- dest[ v] = u16:: cast_from ( s0[ v] ) as f32 * src_scale * s1[ v] ;
367+ let s0 = s0. add ( u * p0 + v) ;
368+ let s1 = s1. add ( u * p1 + v) ;
369+ let dest = dest. add ( u * p1 + v) ;
370+ dest. write ( u16:: cast_from ( s0. read ( ) ) as f32 * src_scale * s1. read ( ) )
359371 }
360372 }
361373 }
362374
363- fn proc1 (
375+ #[ inline]
376+ unsafe fn proc1 (
364377 & self , s0 : & [ f32 ] , s1 : & [ f32 ] , dest : & mut [ f32 ] , p0 : usize , p1 : usize ,
365378 ) {
366- let s0 = s0. chunks ( p0 ) ;
367- let s1 = s1. chunks ( p0 ) ;
368- let dest = dest. chunks_mut ( p1 ) ;
379+ let s0 = s0. as_ptr ( ) ;
380+ let s1 = s1. as_ptr ( ) ;
381+ let dest = dest. as_mut_ptr ( ) ;
369382
370- for ( s0 , ( s1 , dest ) ) in s0 . zip ( s1 . zip ( dest ) ) . take ( p0 ) {
383+ for u in 0 ..p0 {
371384 for v in 0 ..p0 {
372- dest[ v] += s0[ v] * s1[ v] ;
385+ let s0 = s0. add ( u * p0 + v) ;
386+ let s1 = s1. add ( u * p0 + v) ;
387+ let dest = dest. add ( u * p1 + v) ;
388+ dest. write ( s0. read ( ) . mul_add ( s1. read ( ) , dest. read ( ) ) ) ;
373389 }
374390 }
375391 }
376392
393+ #[ inline]
377394 fn remove_mean (
378395 & self , dftc : & mut [ Complex < f32 > ; COMPLEX_COUNT ] ,
379396 dftgc : & [ Complex < f32 > ; COMPLEX_COUNT ] ,
@@ -389,6 +406,7 @@ where
389406 }
390407 }
391408
409+ #[ inline]
392410 fn add_mean (
393411 & self , dftc : & mut [ Complex < f32 > ; COMPLEX_COUNT ] ,
394412 means : & [ Complex < f32 > ; COMPLEX_COUNT ] ,
@@ -399,6 +417,7 @@ where
399417 }
400418 }
401419
420+ #[ inline]
402421 // Applies a generalized wiener filter
403422 fn filter_coeffs ( & self , dftc : & mut [ Complex < f32 > ; COMPLEX_COUNT ] ) {
404423 for h in 0 ..COMPLEX_COUNT {
@@ -495,11 +514,8 @@ where
495514 for ( ebuff, dest) in ebuff. zip ( dest) . take ( dest_height) {
496515 for x in 0 ..dest_width {
497516 let fval = ebuff[ x] . mul_add ( self . dest_scale , 0.5 ) ;
498- dest[ x] = clamp (
499- T :: cast_from ( fval. round ( ) as u16 ) ,
500- T :: cast_from ( 0u16 ) ,
501- self . peak ,
502- ) ;
517+ dest[ x] =
518+ clamp ( T :: cast_from ( fval as u16 ) , T :: cast_from ( 0u16 ) , self . peak ) ;
503519 }
504520 }
505521 }
@@ -544,6 +560,7 @@ where
544560 }
545561}
546562
563+ #[ inline( always) ]
547564fn extra ( a : usize , b : usize ) -> usize {
548565 if a % b > 0 {
549566 b - ( a % b)
0 commit comments