Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 10 additions & 8 deletions crates/zune-jpeg/src/bitstream.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ use crate::errors::DecodeErrors;
use crate::huffman::{HuffmanTable, HUFF_LOOKAHEAD};
use crate::marker::Marker;
use crate::mcu::DCT_BLOCK;
use crate::misc::UN_ZIGZAG;
use crate::misc::UN_ZIGZAG_TRANSPOSED;

macro_rules! decode_huff {
($stream:tt,$symbol:tt,$table:tt) => {
Expand Down Expand Up @@ -339,7 +339,7 @@ impl BitStream {
if fast_ac != 0 {
// FAST AC path
pos += ((fast_ac >> 4) & 15) as usize; // run
let t_pos = UN_ZIGZAG[min(pos, 63)] & 63;
let t_pos = UN_ZIGZAG_TRANSPOSED[min(pos, 63)] & 63;

block[t_pos] = i32::from(fast_ac >> 8) * (qt_table[t_pos]); // Value
self.drop_bits((fast_ac & 15) as u8);
Expand All @@ -354,7 +354,7 @@ impl BitStream {
pos += r as usize;
r = self.get_bits(symbol as u8);
symbol = huff_extend(r, symbol);
let t_pos = UN_ZIGZAG[pos & 63] & 63;
let t_pos = UN_ZIGZAG_TRANSPOSED[pos & 63] & 63;

block[t_pos] = symbol * qt_table[t_pos];

Expand Down Expand Up @@ -409,6 +409,7 @@ impl BitStream {
*block = (*dc_prediction as i16).wrapping_mul(1_i16 << self.successive_low);
return Ok(());
}

#[inline]
pub(crate) fn decode_prog_dc_refine<T>(
&mut self, reader: &mut ZByteReader<T>, block: &mut i16
Expand All @@ -435,6 +436,7 @@ impl BitStream {
self.drop_bits(1);
return k;
}

pub(crate) fn decode_mcu_ac_first<T>(
&mut self, reader: &mut ZByteReader<T>, ac_table: &HuffmanTable, block: &mut [i16; 64]
) -> Result<bool, DecodeErrors>
Expand All @@ -458,7 +460,7 @@ impl BitStream {
if fac != 0 {
// fast ac path
k += ((fac >> 4) & 15) as usize; // run
block[UN_ZIGZAG[min(k, 63)] & 63] = (fac >> 8).wrapping_mul(1 << shift); // value
block[UN_ZIGZAG_TRANSPOSED[min(k, 63)] & 63] = (fac >> 8).wrapping_mul(1 << shift); // value
self.drop_bits((fac & 15) as u8);
k += 1;
} else {
Expand All @@ -471,7 +473,7 @@ impl BitStream {
k += r as usize;
r = self.get_bits(symbol as u8);
symbol = huff_extend(r, symbol);
block[UN_ZIGZAG[k & 63] & 63] = (symbol as i16).wrapping_mul(1 << shift);
block[UN_ZIGZAG_TRANSPOSED[k & 63] & 63] = (symbol as i16).wrapping_mul(1 << shift);
k += 1;
} else {
if r != 15 {
Expand Down Expand Up @@ -546,7 +548,7 @@ impl BitStream {

if k <= self.spec_end {
'advance_nonzero: loop {
let coefficient = &mut block[UN_ZIGZAG[k as usize & 63] & 63];
let coefficient = &mut block[UN_ZIGZAG_TRANSPOSED[k as usize & 63] & 63];

if *coefficient != 0 {
if self.get_bit() == 1 && (*coefficient & bit) == 0 {
Expand Down Expand Up @@ -578,7 +580,7 @@ impl BitStream {
}

if symbol != 0 {
let pos = UN_ZIGZAG[k as usize & 63];
let pos = UN_ZIGZAG_TRANSPOSED[k as usize & 63];
// output new non-zero coefficient.
block[pos & 63] = symbol as i16;
}
Expand All @@ -596,7 +598,7 @@ impl BitStream {
self.refill(reader)?;

while k <= self.spec_end {
let coefficient = &mut block[UN_ZIGZAG[k as usize & 63] & 63];
let coefficient = &mut block[UN_ZIGZAG_TRANSPOSED[k as usize & 63] & 63];

if *coefficient != 0 && self.get_bit() == 1 {
// check if we already modified it, if so do nothing, otherwise
Expand Down
13 changes: 7 additions & 6 deletions crates/zune-jpeg/src/headers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ use crate::components::Components;
use crate::decoder::{ICCChunk, JpegDecoder, MAX_COMPONENTS};
use crate::errors::DecodeErrors;
use crate::huffman::HuffmanTable;
use crate::misc::{SOFMarkers, UN_ZIGZAG};
use crate::misc::{SOFMarkers, UN_ZIGZAG_TRANSPOSED};

///**B.2.4.2 Huffman table-specification syntax**
#[allow(clippy::similar_names, clippy::cast_sign_loss)]
Expand Down Expand Up @@ -148,8 +148,8 @@ pub(crate) fn parse_dqt<T: ZReaderTrait>(img: &mut JpegDecoder<T>) -> Result<(),
DecodeErrors::Format(format!("Could not read symbols into the buffer\n{x}"))
})?;
qt_length -= (precision_value as u16) + 1 /*QT BIT*/;
// carry out un zig-zag here
un_zig_zag(&qt_values)
// carry out transposed un zig-zag here
un_zig_zag_transposed(&qt_values)
}
1 => {
// 16 bit quantization tables
Expand All @@ -160,7 +160,7 @@ pub(crate) fn parse_dqt<T: ZReaderTrait>(img: &mut JpegDecoder<T>) -> Result<(),
}
qt_length -= (precision_value as u16) + 1;

un_zig_zag(&qt_values)
un_zig_zag_transposed(&qt_values)
}
_ => {
return Err(DecodeErrors::DqtError(format!(
Expand Down Expand Up @@ -530,15 +530,16 @@ pub(crate) fn parse_app2<T: ZReaderTrait>(

/// Small utility function that un-zig-zags a quantization table into an `[i32; 64]` array

fn un_zig_zag<T>(a: &[T]) -> [i32; 64]
fn un_zig_zag_transposed<T>(a: &[T]) -> [i32; 64]
where
T: Default + Copy,
i32: core::convert::From<T>
{
let mut output = [i32::default(); 64];

for i in 0..64 {
output[UN_ZIGZAG[i]] = i32::from(a[i]);
// Transpose everything so we can still vectorize as they'll be used on transposed data
output[UN_ZIGZAG_TRANSPOSED[i]] = i32::from(a[i]);
}

output
Expand Down
31 changes: 14 additions & 17 deletions crates/zune-jpeg/src/idct/avx2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -100,21 +100,21 @@ pub unsafe fn idct_int_avx2_inner(
// we only care about AC terms
let rw8 = _mm256_loadu_si256(in_vector[1..].as_ptr().cast());

let zero = _mm256_setzero_si256();

let mut non_zero = 0;
let or = (
_mm256_or_si256(rw1, rw8),
_mm256_or_si256(rw2, rw3),
_mm256_or_si256(rw4, rw5),
_mm256_or_si256(rw6, rw7),
);

non_zero += _mm256_movemask_epi8(_mm256_cmpeq_epi32(rw8, zero));
non_zero += _mm256_movemask_epi8(_mm256_cmpeq_epi32(rw1, zero));
non_zero += _mm256_movemask_epi8(_mm256_cmpeq_epi32(rw2, zero));
non_zero += _mm256_movemask_epi8(_mm256_cmpeq_epi64(rw3, zero));
let or = (
_mm256_or_si256(or.0, or.1),
_mm256_or_si256(or.2, or.3),
);

non_zero += _mm256_movemask_epi8(_mm256_cmpeq_epi64(rw4, zero));
non_zero += _mm256_movemask_epi8(_mm256_cmpeq_epi64(rw5, zero));
non_zero += _mm256_movemask_epi8(_mm256_cmpeq_epi64(rw6, zero));
non_zero += _mm256_movemask_epi8(_mm256_cmpeq_epi64(rw7, zero));
let or = _mm256_or_si256(or.0, or.1);

if non_zero == -8 {
if _mm256_testz_si256(or, or) != 0 {
// AC terms all zero, idct of the block is ( coeff[0] * qt[0] )/8 + 128 (bias)
// (and clamped to 255)
let idct_value = _mm_set1_epi16(((in_vector[0] >> 3) + 128).clamp(0, 255) as i16);
Expand Down Expand Up @@ -214,17 +214,14 @@ pub unsafe fn idct_int_avx2_inner(
};
}

// Process rows
// process columns
dct_pass!(512, 10);
transpose(
&mut row0, &mut row1, &mut row2, &mut row3, &mut row4, &mut row5, &mut row6, &mut row7
);

// process columns
// Process rows
dct_pass!(SCALE_BITS, 17);
transpose(
&mut row0, &mut row1, &mut row2, &mut row3, &mut row4, &mut row5, &mut row6, &mut row7
);

// Pack i32 to i16's,
// clamp them to be between 0-255
Expand Down
7 changes: 2 additions & 5 deletions crates/zune-jpeg/src/idct/neon.rs
Original file line number Diff line number Diff line change
Expand Up @@ -201,17 +201,14 @@ pub unsafe fn idct_int_neon_inner(
};
}

// Process rows
// process columns
dct_pass!(512, 10);
transpose(
&mut row0, &mut row1, &mut row2, &mut row3, &mut row4, &mut row5, &mut row6, &mut row7
);

// process columns
// Process rows
dct_pass!(SCALE_BITS, 17);
transpose(
&mut row0, &mut row1, &mut row2, &mut row3, &mut row4, &mut row5, &mut row6, &mut row7
);

// Pack i32 to i16's,
// clamp them to be between 0-255
Expand Down
60 changes: 31 additions & 29 deletions crates/zune-jpeg/src/idct/scalar.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@ const SCALE_BITS: i32 = 512 + 65536 + (128 << 17);
#[allow(
clippy::too_many_lines,
clippy::op_ref,
clippy::cast_possible_truncation
clippy::cast_possible_truncation,
clippy::erasing_op,
clippy::identity_op,
)]
pub fn idct_int(in_vector: &mut [i32; 64], out_vector: &mut [i16], stride: usize) {
// Temporary variables.
Expand Down Expand Up @@ -57,17 +59,17 @@ pub fn idct_int(in_vector: &mut [i32; 64], out_vector: &mut [i16], stride: usize
} else {
// because the compiler fails to see that it can be auto_vectorised so i'll
// leave it here check out [idct_int_slow, and idct_int_1D to get what i mean ] https://godbolt.org/z/8hqW9z9j9
for ptr in 0..8 {
let p2 = in_vector[ptr + 16];
let p3 = in_vector[ptr + 48];
while i < 64 {
let p2 = in_vector[i + 2];
let p3 = in_vector[i + 6];

let p1 = (p2 + p3).wrapping_mul(2217);

let t2 = p1 + p3 * -7567;
let t3 = p1 + p2 * 3135;

let p2 = in_vector[ptr];
let p3 = in_vector[32 + ptr];
let p2 = in_vector[i];
let p3 = in_vector[i + 4];
let t0 = fsh(p2 + p3);
let t1 = fsh(p2 - p3);

Expand All @@ -77,10 +79,10 @@ pub fn idct_int(in_vector: &mut [i32; 64], out_vector: &mut [i16], stride: usize
let x2 = t1 - t2 + 512;

// odd part
let mut t0 = in_vector[ptr + 56];
let mut t1 = in_vector[ptr + 40];
let mut t2 = in_vector[ptr + 24];
let mut t3 = in_vector[ptr + 8];
let mut t0 = in_vector[i + 7];
let mut t1 = in_vector[i + 5];
let mut t2 = in_vector[i + 3];
let mut t3 = in_vector[i + 1];

let p3 = t0 + t2;
let p4 = t1 + t3;
Expand All @@ -105,30 +107,32 @@ pub fn idct_int(in_vector: &mut [i32; 64], out_vector: &mut [i16], stride: usize

// constants scaled things up by 1<<12; let's bring them back
// down, but keep 2 extra bits of precision
in_vector[ptr] = (x0 + t3) >> 10;
in_vector[ptr + 8] = (x1 + t2) >> 10;
in_vector[ptr + 16] = (x2 + t1) >> 10;
in_vector[ptr + 24] = (x3 + t0) >> 10;
in_vector[ptr + 32] = (x3 - t0) >> 10;
in_vector[ptr + 40] = (x2 - t1) >> 10;
in_vector[ptr + 48] = (x1 - t2) >> 10;
in_vector[ptr + 56] = (x0 - t3) >> 10;
in_vector[i] = (x0 + t3) >> 10;
in_vector[i + 1] = (x1 + t2) >> 10;
in_vector[i + 2] = (x2 + t1) >> 10;
in_vector[i + 3] = (x3 + t0) >> 10;
in_vector[i + 4] = (x3 - t0) >> 10;
in_vector[i + 5] = (x2 - t1) >> 10;
in_vector[i + 6] = (x1 - t2) >> 10;
in_vector[i + 7] = (x0 - t3) >> 10;

i += 8;
}

// This is vectorised in architectures supporting SSE 4.1
while i < 64 {
for ptr in 0..8 {
// We won't try to short circuit here because it rarely works

// Even part
let p2 = in_vector[i + 2];
let p3 = in_vector[i + 6];
let p2 = in_vector[ptr + 16];
let p3 = in_vector[ptr + 48];

let p1 = (p2 + p3) * 2217;
let t2 = p1 + p3 * -7567;
let t3 = p1 + p2 * 3135;

let p2 = in_vector[i];
let p3 = in_vector[i + 4];
let p2 = in_vector[ptr];
let p3 = in_vector[ptr + 32];

let t0 = fsh(p2 + p3);
let t1 = fsh(p2 - p3);
Expand All @@ -143,10 +147,10 @@ pub fn idct_int(in_vector: &mut [i32; 64], out_vector: &mut [i16], stride: usize
let x1 = t1 + t2 + SCALE_BITS;
let x2 = t1 - t2 + SCALE_BITS;
// odd part
let mut t0 = in_vector[i + 7];
let mut t1 = in_vector[i + 5];
let mut t2 = in_vector[i + 3];
let mut t3 = in_vector[i + 1];
let mut t0 = in_vector[ptr + 56];
let mut t1 = in_vector[ptr + 40];
let mut t2 = in_vector[ptr + 24];
let mut t3 = in_vector[ptr + 8];

let p3 = t0 + t2;
let p4 = t1 + t3;
Expand Down Expand Up @@ -184,8 +188,6 @@ pub fn idct_int(in_vector: &mut [i32; 64], out_vector: &mut [i16], stride: usize
out[6] = clamp((x1 - t2) >> 17);
out[7] = clamp((x0 - t3) >> 17);

i += 8;

pos += stride;
}
}
Expand Down
3 changes: 2 additions & 1 deletion crates/zune-jpeg/src/mcu.rs
Original file line number Diff line number Diff line change
Expand Up @@ -364,8 +364,9 @@ impl<T: ZReaderTrait> JpegDecoder<T> {
// iterate over each line, since color-convert needs only
// one line
for (j, samp) in raw_samples.iter_mut().enumerate().take(comp_len) {
*samp = &samples[j][pos * padded_width..(pos + 1) * padded_width]
*samp = &samples[j][pos * padded_width..(pos + 1) * padded_width];
}

color_convert(
&raw_samples,
self.color_convert_16,
Expand Down
15 changes: 15 additions & 0 deletions crates/zune-jpeg/src/misc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,21 @@ pub const UN_ZIGZAG: [usize; 64 + 16] = [
63, 63, 63, 63, 63, 63, 63, 63
];

/// Lookup table from zig-zag scan index to coefficient position for 8x8
/// blocks that are stored transposed.
///
/// Entry `k` is the standard JPEG zig-zag position `row * 8 + col` of the
/// `k`-th scanned coefficient rewritten as `col * 8 + row`; for example
/// index 1 maps to 8 and index 2 maps to 1.
/// NOTE(review): presumably the transposed counterpart of `UN_ZIGZAG` —
/// confirm against that table.
///
/// The array has 64 real entries followed by 16 padding entries that all
/// map to 63, so any index up to 79 still yields a position inside a
/// 64-element block.
#[rustfmt::skip]
pub const UN_ZIGZAG_TRANSPOSED: [usize; 64 + 16] = [
0, 8, 1, 2, 9, 16, 24, 17,
10, 3, 4, 11, 18, 25, 32, 40,
33, 26, 19, 12, 5, 6, 13, 20,
27, 34, 41, 48, 56, 49, 42, 35,
28, 21, 14, 7, 15, 22, 29, 36,
43, 50, 57, 58, 51, 44, 37, 30,
23, 31, 38, 45, 52, 59, 60, 53,
46, 39, 47, 54, 61, 62, 55, 63,
// Padding to prevent overflowing: indices 64..80 all resolve to the last
// coefficient position (63)
63, 63, 63, 63, 63, 63, 63, 63,
63, 63, 63, 63, 63, 63, 63, 63
];

/// Align data to a 16 byte boundary
#[repr(align(16))]
#[derive(Clone)]
Expand Down