diff --git a/crates/zune-jpeg/src/bitstream.rs b/crates/zune-jpeg/src/bitstream.rs index 5cd20818..f8392e07 100644 --- a/crates/zune-jpeg/src/bitstream.rs +++ b/crates/zune-jpeg/src/bitstream.rs @@ -56,7 +56,7 @@ use crate::errors::DecodeErrors; use crate::huffman::{HuffmanTable, HUFF_LOOKAHEAD}; use crate::marker::Marker; use crate::mcu::DCT_BLOCK; -use crate::misc::UN_ZIGZAG; +use crate::misc::UN_ZIGZAG_TRANSPOSED; macro_rules! decode_huff { ($stream:tt,$symbol:tt,$table:tt) => { @@ -339,7 +339,7 @@ impl BitStream { if fast_ac != 0 { // FAST AC path pos += ((fast_ac >> 4) & 15) as usize; // run - let t_pos = UN_ZIGZAG[min(pos, 63)] & 63; + let t_pos = UN_ZIGZAG_TRANSPOSED[min(pos, 63)] & 63; block[t_pos] = i32::from(fast_ac >> 8) * (qt_table[t_pos]); // Value self.drop_bits((fast_ac & 15) as u8); @@ -354,7 +354,7 @@ impl BitStream { pos += r as usize; r = self.get_bits(symbol as u8); symbol = huff_extend(r, symbol); - let t_pos = UN_ZIGZAG[pos & 63] & 63; + let t_pos = UN_ZIGZAG_TRANSPOSED[pos & 63] & 63; block[t_pos] = symbol * qt_table[t_pos]; @@ -409,6 +409,7 @@ impl BitStream { *block = (*dc_prediction as i16).wrapping_mul(1_i16 << self.successive_low); return Ok(()); } + #[inline] pub(crate) fn decode_prog_dc_refine( &mut self, reader: &mut ZByteReader, block: &mut i16 @@ -435,6 +436,7 @@ impl BitStream { self.drop_bits(1); return k; } + pub(crate) fn decode_mcu_ac_first( &mut self, reader: &mut ZByteReader, ac_table: &HuffmanTable, block: &mut [i16; 64] ) -> Result @@ -458,7 +460,7 @@ impl BitStream { if fac != 0 { // fast ac path k += ((fac >> 4) & 15) as usize; // run - block[UN_ZIGZAG[min(k, 63)] & 63] = (fac >> 8).wrapping_mul(1 << shift); // value + block[UN_ZIGZAG_TRANSPOSED[min(k, 63)] & 63] = (fac >> 8).wrapping_mul(1 << shift); // value self.drop_bits((fac & 15) as u8); k += 1; } else { @@ -471,7 +473,7 @@ impl BitStream { k += r as usize; r = self.get_bits(symbol as u8); symbol = huff_extend(r, symbol); - block[UN_ZIGZAG[k & 63] & 63] = (symbol as 
i16).wrapping_mul(1 << shift); + block[UN_ZIGZAG_TRANSPOSED[k & 63] & 63] = (symbol as i16).wrapping_mul(1 << shift); k += 1; } else { if r != 15 { @@ -546,7 +548,7 @@ impl BitStream { if k <= self.spec_end { 'advance_nonzero: loop { - let coefficient = &mut block[UN_ZIGZAG[k as usize & 63] & 63]; + let coefficient = &mut block[UN_ZIGZAG_TRANSPOSED[k as usize & 63] & 63]; if *coefficient != 0 { if self.get_bit() == 1 && (*coefficient & bit) == 0 { @@ -578,7 +580,7 @@ impl BitStream { } if symbol != 0 { - let pos = UN_ZIGZAG[k as usize & 63]; + let pos = UN_ZIGZAG_TRANSPOSED[k as usize & 63]; // output new non-zero coefficient. block[pos & 63] = symbol as i16; } @@ -596,7 +598,7 @@ impl BitStream { self.refill(reader)?; while k <= self.spec_end { - let coefficient = &mut block[UN_ZIGZAG[k as usize & 63] & 63]; + let coefficient = &mut block[UN_ZIGZAG_TRANSPOSED[k as usize & 63] & 63]; if *coefficient != 0 && self.get_bit() == 1 { // check if we already modified it, if so do nothing, otherwise diff --git a/crates/zune-jpeg/src/headers.rs b/crates/zune-jpeg/src/headers.rs index 7e772921..3e2ca0de 100644 --- a/crates/zune-jpeg/src/headers.rs +++ b/crates/zune-jpeg/src/headers.rs @@ -22,7 +22,7 @@ use crate::components::Components; use crate::decoder::{ICCChunk, JpegDecoder, MAX_COMPONENTS}; use crate::errors::DecodeErrors; use crate::huffman::HuffmanTable; -use crate::misc::{SOFMarkers, UN_ZIGZAG}; +use crate::misc::{SOFMarkers, UN_ZIGZAG_TRANSPOSED}; ///**B.2.4.2 Huffman table-specification syntax** #[allow(clippy::similar_names, clippy::cast_sign_loss)] @@ -148,8 +148,8 @@ pub(crate) fn parse_dqt(img: &mut JpegDecoder) -> Result<(), DecodeErrors::Format(format!("Could not read symbols into the buffer\n{x}")) })?; qt_length -= (precision_value as u16) + 1 /*QT BIT*/; - // carry out un zig-zag here - un_zig_zag(&qt_values) + // carry out transposed un zig-zag here + un_zig_zag_transposed(&qt_values) } 1 => { // 16 bit quantization tables @@ -160,7 +160,7 @@ pub(crate) 
fn parse_dqt(img: &mut JpegDecoder) -> Result<(), } qt_length -= (precision_value as u16) + 1; - un_zig_zag(&qt_values) + un_zig_zag_transposed(&qt_values) } _ => { return Err(DecodeErrors::DqtError(format!( @@ -530,7 +530,7 @@ pub(crate) fn parse_app2( /// Small utility function to print Un-zig-zagged quantization tables -fn un_zig_zag(a: &[T]) -> [i32; 64] +fn un_zig_zag_transposed(a: &[T]) -> [i32; 64] where T: Default + Copy, i32: core::convert::From @@ -538,7 +538,8 @@ where let mut output = [i32::default(); 64]; for i in 0..64 { - output[UN_ZIGZAG[i]] = i32::from(a[i]); + // Transpose everything so we can still vectorize as they'll be used on transposed data + output[UN_ZIGZAG_TRANSPOSED[i]] = i32::from(a[i]); } output diff --git a/crates/zune-jpeg/src/idct/avx2.rs b/crates/zune-jpeg/src/idct/avx2.rs index 0f2f1953..981e335c 100644 --- a/crates/zune-jpeg/src/idct/avx2.rs +++ b/crates/zune-jpeg/src/idct/avx2.rs @@ -100,21 +100,21 @@ pub unsafe fn idct_int_avx2_inner( // we only care about AC terms let rw8 = _mm256_loadu_si256(in_vector[1..].as_ptr().cast()); - let zero = _mm256_setzero_si256(); - - let mut non_zero = 0; + let or = ( + _mm256_or_si256(rw1, rw8), + _mm256_or_si256(rw2, rw3), + _mm256_or_si256(rw4, rw5), + _mm256_or_si256(rw6, rw7), + ); - non_zero += _mm256_movemask_epi8(_mm256_cmpeq_epi32(rw8, zero)); - non_zero += _mm256_movemask_epi8(_mm256_cmpeq_epi32(rw1, zero)); - non_zero += _mm256_movemask_epi8(_mm256_cmpeq_epi32(rw2, zero)); - non_zero += _mm256_movemask_epi8(_mm256_cmpeq_epi64(rw3, zero)); + let or = ( + _mm256_or_si256(or.0, or.1), + _mm256_or_si256(or.2, or.3), + ); - non_zero += _mm256_movemask_epi8(_mm256_cmpeq_epi64(rw4, zero)); - non_zero += _mm256_movemask_epi8(_mm256_cmpeq_epi64(rw5, zero)); - non_zero += _mm256_movemask_epi8(_mm256_cmpeq_epi64(rw6, zero)); - non_zero += _mm256_movemask_epi8(_mm256_cmpeq_epi64(rw7, zero)); + let or = _mm256_or_si256(or.0, or.1); - if non_zero == -8 { + if _mm256_testz_si256(or, or) != 0 { // AC 
terms all zero, idct of the block is is ( coeff[0] * qt[0] )/8 + 128 (bias) // (and clamped to 255) let idct_value = _mm_set1_epi16(((in_vector[0] >> 3) + 128).clamp(0, 255) as i16); @@ -214,17 +214,14 @@ pub unsafe fn idct_int_avx2_inner( }; } - // Process rows + // process columns dct_pass!(512, 10); transpose( &mut row0, &mut row1, &mut row2, &mut row3, &mut row4, &mut row5, &mut row6, &mut row7 ); - // process columns + // Process rows dct_pass!(SCALE_BITS, 17); - transpose( - &mut row0, &mut row1, &mut row2, &mut row3, &mut row4, &mut row5, &mut row6, &mut row7 - ); // Pack i32 to i16's, // clamp them to be between 0-255 diff --git a/crates/zune-jpeg/src/idct/neon.rs b/crates/zune-jpeg/src/idct/neon.rs index cba9f7d9..3fc36780 100644 --- a/crates/zune-jpeg/src/idct/neon.rs +++ b/crates/zune-jpeg/src/idct/neon.rs @@ -201,17 +201,14 @@ pub unsafe fn idct_int_neon_inner( }; } - // Process rows + // process columns dct_pass!(512, 10); transpose( &mut row0, &mut row1, &mut row2, &mut row3, &mut row4, &mut row5, &mut row6, &mut row7 ); - // process columns + // Process rows dct_pass!(SCALE_BITS, 17); - transpose( - &mut row0, &mut row1, &mut row2, &mut row3, &mut row4, &mut row5, &mut row6, &mut row7 - ); // Pack i32 to i16's, // clamp them to be between 0-255 diff --git a/crates/zune-jpeg/src/idct/scalar.rs b/crates/zune-jpeg/src/idct/scalar.rs index 3120381e..78d8940a 100644 --- a/crates/zune-jpeg/src/idct/scalar.rs +++ b/crates/zune-jpeg/src/idct/scalar.rs @@ -16,7 +16,9 @@ const SCALE_BITS: i32 = 512 + 65536 + (128 << 17); #[allow( clippy::too_many_lines, clippy::op_ref, - clippy::cast_possible_truncation + clippy::cast_possible_truncation, + clippy::erasing_op, + clippy::identity_op, )] pub fn idct_int(in_vector: &mut [i32; 64], out_vector: &mut [i16], stride: usize) { // Temporary variables. 
@@ -57,17 +59,17 @@ pub fn idct_int(in_vector: &mut [i32; 64], out_vector: &mut [i16], stride: usize } else { // because the compiler fails to see that it can be auto_vectorised so i'll // leave it here check out [idct_int_slow, and idct_int_1D to get what i mean ] https://godbolt.org/z/8hqW9z9j9 - for ptr in 0..8 { - let p2 = in_vector[ptr + 16]; - let p3 = in_vector[ptr + 48]; + while i < 64 { + let p2 = in_vector[i + 2]; + let p3 = in_vector[i + 6]; let p1 = (p2 + p3).wrapping_mul(2217); let t2 = p1 + p3 * -7567; let t3 = p1 + p2 * 3135; - let p2 = in_vector[ptr]; - let p3 = in_vector[32 + ptr]; + let p2 = in_vector[i]; + let p3 = in_vector[i + 4]; let t0 = fsh(p2 + p3); let t1 = fsh(p2 - p3); @@ -77,10 +79,10 @@ pub fn idct_int(in_vector: &mut [i32; 64], out_vector: &mut [i16], stride: usize let x2 = t1 - t2 + 512; // odd part - let mut t0 = in_vector[ptr + 56]; - let mut t1 = in_vector[ptr + 40]; - let mut t2 = in_vector[ptr + 24]; - let mut t3 = in_vector[ptr + 8]; + let mut t0 = in_vector[i + 7]; + let mut t1 = in_vector[i + 5]; + let mut t2 = in_vector[i + 3]; + let mut t3 = in_vector[i + 1]; let p3 = t0 + t2; let p4 = t1 + t3; @@ -105,30 +107,32 @@ pub fn idct_int(in_vector: &mut [i32; 64], out_vector: &mut [i16], stride: usize // constants scaled things up by 1<<12; let's bring them back // down, but keep 2 extra bits of precision - in_vector[ptr] = (x0 + t3) >> 10; - in_vector[ptr + 8] = (x1 + t2) >> 10; - in_vector[ptr + 16] = (x2 + t1) >> 10; - in_vector[ptr + 24] = (x3 + t0) >> 10; - in_vector[ptr + 32] = (x3 - t0) >> 10; - in_vector[ptr + 40] = (x2 - t1) >> 10; - in_vector[ptr + 48] = (x1 - t2) >> 10; - in_vector[ptr + 56] = (x0 - t3) >> 10; + in_vector[i] = (x0 + t3) >> 10; + in_vector[i + 1] = (x1 + t2) >> 10; + in_vector[i + 2] = (x2 + t1) >> 10; + in_vector[i + 3] = (x3 + t0) >> 10; + in_vector[i + 4] = (x3 - t0) >> 10; + in_vector[i + 5] = (x2 - t1) >> 10; + in_vector[i + 6] = (x1 - t2) >> 10; + in_vector[i + 7] = (x0 - t3) >> 10; + + i += 8; } 
// This is vectorised in architectures supporting SSE 4.1 - while i < 64 { + for ptr in 0..8 { // We won't try to short circuit here because it rarely works // Even part - let p2 = in_vector[i + 2]; - let p3 = in_vector[i + 6]; + let p2 = in_vector[ptr + 16]; + let p3 = in_vector[ptr + 48]; let p1 = (p2 + p3) * 2217; let t2 = p1 + p3 * -7567; let t3 = p1 + p2 * 3135; - let p2 = in_vector[i]; - let p3 = in_vector[i + 4]; + let p2 = in_vector[ptr]; + let p3 = in_vector[ptr + 32]; let t0 = fsh(p2 + p3); let t1 = fsh(p2 - p3); @@ -143,10 +147,10 @@ pub fn idct_int(in_vector: &mut [i32; 64], out_vector: &mut [i16], stride: usize let x1 = t1 + t2 + SCALE_BITS; let x2 = t1 - t2 + SCALE_BITS; // odd part - let mut t0 = in_vector[i + 7]; - let mut t1 = in_vector[i + 5]; - let mut t2 = in_vector[i + 3]; - let mut t3 = in_vector[i + 1]; + let mut t0 = in_vector[ptr + 56]; + let mut t1 = in_vector[ptr + 40]; + let mut t2 = in_vector[ptr + 24]; + let mut t3 = in_vector[ptr + 8]; let p3 = t0 + t2; let p4 = t1 + t3; @@ -184,8 +188,6 @@ pub fn idct_int(in_vector: &mut [i32; 64], out_vector: &mut [i16], stride: usize out[6] = clamp((x1 - t2) >> 17); out[7] = clamp((x0 - t3) >> 17); - i += 8; - pos += stride; } } diff --git a/crates/zune-jpeg/src/mcu.rs b/crates/zune-jpeg/src/mcu.rs index daabb583..1c243c77 100644 --- a/crates/zune-jpeg/src/mcu.rs +++ b/crates/zune-jpeg/src/mcu.rs @@ -364,8 +364,9 @@ impl JpegDecoder { // iterate over each line, since color-convert needs only // one line for (j, samp) in raw_samples.iter_mut().enumerate().take(comp_len) { - *samp = &samples[j][pos * padded_width..(pos + 1) * padded_width] + *samp = &samples[j][pos * padded_width..(pos + 1) * padded_width]; } + color_convert( &raw_samples, self.color_convert_16, diff --git a/crates/zune-jpeg/src/misc.rs b/crates/zune-jpeg/src/misc.rs index 089ea8e5..1dd8eff7 100644 --- a/crates/zune-jpeg/src/misc.rs +++ b/crates/zune-jpeg/src/misc.rs @@ -66,6 +66,21 @@ pub const UN_ZIGZAG: [usize; 64 + 16] = [ 63, 
63, 63, 63, 63, 63, 63, 63 ]; +#[rustfmt::skip] +pub const UN_ZIGZAG_TRANSPOSED: [usize; 64 + 16] = [ + 0, 8, 1, 2, 9, 16, 24, 17, + 10, 3, 4, 11, 18, 25, 32, 40, + 33, 26, 19, 12, 5, 6, 13, 20, + 27, 34, 41, 48, 56, 49, 42, 35, + 28, 21, 14, 7, 15, 22, 29, 36, + 43, 50, 57, 58, 51, 44, 37, 30, + 23, 31, 38, 45, 52, 59, 60, 53, + 46, 39, 47, 54, 61, 62, 55, 63, + // Padding: 16 extra entries, all 63, so a lookup position in 64..=79 still yields an in-bounds block index + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63 +]; + /// Align data to a 16 byte boundary #[repr(align(16))] #[derive(Clone)]