From dc5b807a0c9efde8d6e4314cada5130c4fec1b4a Mon Sep 17 00:00:00 2001 From: neurolabusc Date: Fri, 31 Mar 2023 07:44:21 -0400 Subject: [PATCH 1/5] Faster Apple decompression using UDOT (https://dougallj.wordpress.com/2022/08/20/faster-zlib-deflate-decompression-on-the-apple-m1-and-x86/) --- adler32_simd.c | 159 +++++++++++++++++++++++++++++++++++++++++++++++++ configure | 4 +- 2 files changed, 161 insertions(+), 2 deletions(-) diff --git a/adler32_simd.c b/adler32_simd.c index 5f8c06d0a..d78f16ce3 100644 --- a/adler32_simd.c +++ b/adler32_simd.c @@ -219,8 +219,165 @@ uint32_t ZLIB_INTERNAL adler32_simd_( /* SSSE3 */ #elif defined(ADLER32_SIMD_NEON) +/* __APPLE__ is insufficent, as older iOS devices will not support UDOT, + however all NEON-supporting macOS devices will. */ +#ifdef __APPLE__ +#include +#if TARGET_OS_MAC +#define ADLER32_SIMD_NEON_UDOT +#define __ARM_FEATURE_DOTPROD +#endif +#endif + #include +#ifdef ADLER32_SIMD_NEON_UDOT + +uint32_t ZLIB_INTERNAL adler32_simd_( /* NEON */ + uint32_t adler, + const unsigned char *buf, + unsigned long len) +{ + /* + * Split Adler-32 into component sums. + */ + uint64_t s1 = adler & 0xffff; + uint64_t s2 = adler >> 16; + + /* + * Serially compute s1 & s2, until the data is 16-byte aligned. + */ + if ((uintptr_t)buf & 15) { + while ((uintptr_t)buf & 15) { + s2 += (s1 += *buf++); + --len; + } + + if (s1 >= BASE) + s1 -= BASE; + s2 %= BASE; + } + + /* + * Process the data in blocks. + */ + const unsigned BLOCK_SIZE = 1 << 6; + + unsigned long blocks = len / BLOCK_SIZE; + len -= blocks * BLOCK_SIZE; + + while (blocks) + { + unsigned n = 2902; /* Maximum blocks. */ + if (n > blocks) + n = blocks; + blocks -= n; + + /* + * Process n blocks of data. At most 2902 blocks can be + * processed before s2 must be reduced modulo BASE. This + * is greater than NMAX bytes as we're using 64-bit + * integers. + */ + const unsigned char MULTIPLIERS[0x40] = { + 0x40, 0x3f, 0x3e, 0x3d, 0x3c, 0x3b, 0x3a, 0x39, + 0x38, 0x37, 0x36, 0x35, 0x34, 0x33, 0x32, 0x31, + 0x30, 0x2f, 0x2e, 0x2d, 0x2c, 0x2b, 0x2a, 0x29, + 0x28, 0x27, 0x26, 0x25, 0x24, 0x23, 0x22, 0x21, + 0x20, 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, + 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, + 0x10, 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, + 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, + }; + uint8x16x4_t mul = vld1q_u8_x4(MULTIPLIERS); + + uint32x4_t accs[4] = { 0 }; + uint32x4_t sums[4] = { 0 }; + uint32x4_t extras[4] = { 0 }; + + const unsigned char *end = buf + n * BLOCK_SIZE; + do { + /* + * Load 64 input bytes. + */ + uint8x16x4_t raw = vld1q_u8_x4(buf); + buf += BLOCK_SIZE; + + for (int i = 0; i < 4; i++) { + accs[i] = vaddq_u32(accs[i], sums[i]); + sums[i] = vdotq_u32(sums[i], raw.val[i], vdupq_n_u8(1)); + extras[i] = vdotq_u32(extras[i], raw.val[i], mul.val[i]); + } + } while (buf != end); + + for (int i = 1; i < 4; i++) { + extras[0] = vaddq_u32(extras[0], extras[i]); + sums[0] = vaddq_u32(sums[0], sums[i]); + } + + uint64_t acc = 0; + for (int i = 0; i < 4; i++) { + acc += vaddlvq_u32(accs[i]); + } + uint64_t extra = vaddlvq_u32(extras[0]); + uint64_t sum = vaddlvq_u32(sums[0]); + + s2 += s1 * n * BLOCK_SIZE + acc * BLOCK_SIZE + extra; + s1 += sum; + + /* + * Reduce. + */ + s1 %= BASE; + s2 %= BASE; + } + + /* + * Handle leftover data. + */ + if (len) { + while (len >= 16) { + s2 += (s1 += *buf++); + s2 += (s1 += *buf++); + s2 += (s1 += *buf++); + s2 += (s1 += *buf++); + + s2 += (s1 += *buf++); + s2 += (s1 += *buf++); + s2 += (s1 += *buf++); + s2 += (s1 += *buf++); + + s2 += (s1 += *buf++); + s2 += (s1 += *buf++); + s2 += (s1 += *buf++); + s2 += (s1 += *buf++); + + s2 += (s1 += *buf++); + s2 += (s1 += *buf++); + s2 += (s1 += *buf++); + s2 += (s1 += *buf++); + + len -= 16; + } + + while (len--) { + s2 += (s1 += *buf++); + } + + if (s1 >= BASE) + s1 -= BASE; + s2 %= BASE; + } + + /* + * Return the recombined sums. + */ + return s1 | (s2 << 16); +} + +#else /* ADLER32_SIMD_NEON_UDOT */ + + uint32_t ZLIB_INTERNAL adler32_simd_( /* NEON */ uint32_t adler, const unsigned char *buf, @@ -384,4 +541,6 @@ uint32_t ZLIB_INTERNAL adler32_simd_( /* NEON */ return s1 | (s2 << 16); } +#endif /* ADLER32_SIMD_NEON_UDOT */ + #endif /* ADLER32_SIMD_SSSE3 */ diff --git a/configure b/configure index 6e22cbdf9..b2acdc96f 100755 --- a/configure +++ b/configure @@ -815,8 +815,8 @@ EOF echo "Checking for PCLMUL support ... No" | tee -a configure.log fi -elif [ x$TGT_ARCH = "xaarch64" ] ; then - +# on macOS `uname -m` returns `arm64` +elif [ x$TGT_ARCH = "xaarch64" -o x$TGT_ARCH = "xarm64" ] ; then # Check for NEON and CRC support cat > $test.c << EOF #include From a6cca27ac8f99c9e7bac837bcf04c2f1cb8fa941 Mon Sep 17 00:00:00 2001 From: neurolabusc Date: Sat, 1 Apr 2023 08:08:12 -0400 Subject: [PATCH 2/5] use 32-bit, shifted LDR instructions on AArch64 --- inffast_chunk.c | 183 ++++++++++++++++++++++++++++-------------------- inffast_chunk.h | 13 +++- 2 files changed, 117 insertions(+), 79 deletions(-) diff --git a/inffast_chunk.c b/inffast_chunk.c index 829a8ec3b..82410660b 100644 --- a/inffast_chunk.c +++ b/inffast_chunk.c @@ -35,6 +35,10 @@ # pragma message("Assembler code may have bugs -- use at your own risk") #else +#ifndef INFLATE_CHUNK_READ_64LE +# error INFLATE_CHUNK_SIMD_* requires INFLATE_CHUNK_READ_64LE +#endif + /* Decode literal, length, and distance codes and write out the resulting literal and match bytes until either not enough input or output is @@ -121,6 +125,8 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ unsigned len; /* match length, unused bytes */ unsigned dist; /* match distance */ unsigned char FAR *from; /* where to copy match from */ + unsigned here32; /* table entry as integer */ + inflate_holder_t old; /* look-behind buffer for extra bits */ /* copy state to local variables */ state = (struct inflate_state FAR *)strm->state; @@ -144,97 +150,106 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ lmask = (1U << state->lenbits) - 1; dmask = (1U << state->distbits) - 1; + /* This is extremely latency sensitive, so empty inline assembly blocks are + used to prevent the compiler from reassociating. */ +#define REFILL() do { \ + hold |= read64le(in) << bits; \ + in += 7; \ + asm volatile ("" : "+r"(in)); \ + uint64_t tmp = ((bits >> 3) & 7); \ + asm volatile ("" : "+r"(tmp)); \ + in -= tmp; \ + bits |= 56; \ + } while (0) + +#define TABLE_LOAD(table, index) do { \ + memcpy(&here32, &(table)[(index)], sizeof(code)); \ + memcpy(&here, &here32, sizeof(code)); \ + } while (0) + + if (bits < 10) { + REFILL(); + } + /* decode literals and length/distances until end-of-block or not enough input data or output space */ do { - if (bits < 15) { -#ifdef INFLATE_CHUNK_READ_64LE - hold |= read64le(in) << bits; - in += 6; - bits += 48; -#else - hold += (unsigned long)(*in++) << bits; - bits += 8; - hold += (unsigned long)(*in++) << bits; - bits += 8; -#endif + uint64_t next_hold = hold | (read64le(in) << bits); + in += 7; + uint64_t tmp = ((bits >> 3) & 7); + in -= tmp; + bits |= 56; + TABLE_LOAD(lcode, hold & lmask); + hold = next_hold; + old = hold; + hold >>= here.bits; + bits -= here32; + preloaded: + if (likely(here.op == 0)) { + *out++ = (unsigned char)(here.val); + TABLE_LOAD(lcode, hold & lmask); + old = hold; + hold >>= here.bits; + bits -= here32; + if (likely(here.op == 0)) { + *out++ = (unsigned char)(here.val); + TABLE_LOAD(lcode, hold & lmask); + old = hold; + hold >>= here.bits; + bits -= here32; + } } - here = lcode[hold & lmask]; dolen: - op = (unsigned)(here.bits); - hold >>= op; - bits -= op; op = (unsigned)(here.op); - if (op == 0) { /* literal */ + if (likely(op == 0)) { /* literal */ Tracevv((stderr, here.val >= 0x20 && here.val < 0x7f ? "inflate: literal '%c'\n" : "inflate: literal 0x%02x\n", here.val)); *out++ = (unsigned char)(here.val); } - else if (op & 16) { /* length base */ + else if (likely(op & 16)) { /* length base */ len = (unsigned)(here.val); - op &= 15; /* number of extra bits */ - if (op) { - if (bits < op) { -#ifdef INFLATE_CHUNK_READ_64LE - hold |= read64le(in) << bits; - in += 6; - bits += 48; -#else - hold += (unsigned long)(*in++) << bits; - bits += 8; -#endif - } - len += (unsigned)hold & ((1U << op) - 1); - hold >>= op; - bits -= op; - } + len += ((old & ~((uint64_t)-1 << here.bits)) >> (op & 15)); Tracevv((stderr, "inflate: length %u\n", len)); - if (bits < 15) { -#ifdef INFLATE_CHUNK_READ_64LE - hold |= read64le(in) << bits; - in += 6; - bits += 48; -#else - hold += (unsigned long)(*in++) << bits; - bits += 8; - hold += (unsigned long)(*in++) << bits; - bits += 8; -#endif + TABLE_LOAD(dcode, hold & dmask); + /* we have two fast-path loads: 10+10 + 15+5 = 40, + but we may need to refill here in the worst case */ + if (unlikely((bits & 63) < 15 + 13)) { + REFILL(); } - here = dcode[hold & dmask]; dodist: - op = (unsigned)(here.bits); - hold >>= op; - bits -= op; + old = hold; + hold >>= here.bits; + bits -= here32; op = (unsigned)(here.op); - if (op & 16) { /* distance base */ + if (likely(op & 16)) { /* distance base */ dist = (unsigned)(here.val); - op &= 15; /* number of extra bits */ - if (bits < op) { -#ifdef INFLATE_CHUNK_READ_64LE - hold |= read64le(in) << bits; - in += 6; - bits += 48; -#else - hold += (unsigned long)(*in++) << bits; - bits += 8; - if (bits < op) { - hold += (unsigned long)(*in++) << bits; - bits += 8; - } -#endif - } - dist += (unsigned)hold & ((1U << op) - 1); + dist += ((old & ~((uint64_t)-1 << here.bits)) >> (op & 15)); #ifdef INFLATE_STRICT - if (dist > dmax) { + if (unlikely(dist > dmax)) { strm->msg = (char *)"invalid distance too far back"; state->mode = BAD; break; } #endif - hold >>= op; - bits -= op; + if (unlikely((bits & 63) < 10)) { + REFILL(); + } + + /* preload and shift for next iteration */ + uint64_t next_hold = hold | (read64le(in) << bits); + in += 7; + asm volatile ("" : "+r"(in)); + uint64_t tmp = ((bits >> 3) & 7); + asm volatile ("" : "+r"(tmp)); + in -= tmp; + bits |= 56; + TABLE_LOAD(lcode, hold & lmask); + hold = next_hold; + old = hold; + hold >>= here.bits; + bits -= here32; + Tracevv((stderr, "inflate: distance %u\n", dist)); op = (unsigned)(out - beg); /* max distance in output */ if (dist > op) { /* see if copy from window */ @@ -244,14 +259,14 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ strm->msg = (char *)"invalid distance too far back"; state->mode = BAD; - break; + goto chunk_break; } #ifdef INFLATE_ALLOW_INVALID_DISTANCE_TOOFAR_ARRR if (len <= op - whave) { do { *out++ = 0; } while (--len); - continue; + goto chunk_continue; } len -= op - whave; do { @@ -262,7 +277,7 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ do { *out++ = *from++; } while (--len); - continue; + goto chunk_continue; } #endif } @@ -312,10 +327,21 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ stay within 258 bytes of `out`. */ out = chunkcopy_lapped_relaxed(out, dist, len); + } + + chunk_continue: + if (likely(in < last && out < end)) + goto preloaded; + + chunk_break: + /* undo pre-shift */ + hold = old; + bits += here32; + break; } - else if ((op & 64) == 0) { /* 2nd level distance code */ - here = dcode[here.val + (hold & ((1U << op) - 1))]; + else if (likely((op & 64) == 0)) { /* 2nd level distance code */ + TABLE_LOAD(dcode, here.val + (hold & ((1U << op) - 1))); goto dodist; } else { @@ -324,11 +350,14 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ break; } } - else if ((op & 64) == 0) { /* 2nd level length code */ - here = lcode[here.val + (hold & ((1U << op) - 1))]; + else if (likely((op & 64) == 0)) { /* 2nd level length code */ + TABLE_LOAD(lcode, here.val + (hold & ((1U << op) - 1))); + old = hold; + hold >>= here.bits; + bits -= here32; goto dolen; } - else if (op & 32) { /* end-of-block */ + else if (likely(op & 32)) { /* end-of-block */ Tracevv((stderr, "inflate: end of block\n")); state->mode = TYPE; break; @@ -340,6 +369,8 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ } } while (in < last && out < end); + bits &= 63; + /* return unused bytes (on entry, bits < 8, so in won't go too far back) */ len = bits >> 3; in -= len; diff --git a/inffast_chunk.h b/inffast_chunk.h index de6aa0d8c..d9007ff6b 100644 --- a/inffast_chunk.h +++ b/inffast_chunk.h @@ -37,12 +37,19 @@ we can safely call inflate_fast() with only one up-front bounds check. One length/distance code pair (15 bits for the length code, 5 bits for length extra, 15 bits for the distance code, 13 bits for distance extra) requires - reading up to 48 input bits (6 bytes). The wide input data reading option - requires a little endian machine, and reads 64 input bits (8 bytes). + reading up to 48 input bits (6 bytes). + + For chunked decoding use a hopefully-pesimistic bound of two worst-case + advances: 7 + 7, plus one 8-byte refill. */ #ifdef INFLATE_CHUNK_READ_64LE #undef INFLATE_FAST_MIN_INPUT -#define INFLATE_FAST_MIN_INPUT 8 +#define INFLATE_FAST_MIN_INPUT 22 #endif +/* INFLATE_FAST_MIN_OUTPUT is usually 258, but we can copy two fast-path bytes + as well */ +#undef INFLATE_FAST_MIN_OUTPUT +#define INFLATE_FAST_MIN_OUTPUT 260 + void ZLIB_INTERNAL inflate_fast_chunk_ OF((z_streamp strm, unsigned start)); From a3611494ff192f12230e40e10fc07b8704aeacca Mon Sep 17 00:00:00 2001 From: neurolabusc Date: Sat, 1 Apr 2023 08:25:08 -0400 Subject: [PATCH 3/5] Revert --- inffast_chunk.c | 183 ++++++++++++++++++++---------------------------- inffast_chunk.h | 13 +--- 2 files changed, 79 insertions(+), 117 deletions(-) diff --git a/inffast_chunk.c b/inffast_chunk.c index 82410660b..829a8ec3b 100644 --- a/inffast_chunk.c +++ b/inffast_chunk.c @@ -35,10 +35,6 @@ # pragma message("Assembler code may have bugs -- use at your own risk") #else -#ifndef INFLATE_CHUNK_READ_64LE -# error INFLATE_CHUNK_SIMD_* requires INFLATE_CHUNK_READ_64LE -#endif - /* Decode literal, length, and distance codes and write out the resulting literal and match bytes until either not enough input or output is @@ -125,8 +121,6 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ unsigned len; /* match length, unused bytes */ unsigned dist; /* match distance */ unsigned char FAR *from; /* where to copy match from */ - unsigned here32; /* table entry as integer */ - inflate_holder_t old; /* look-behind buffer for extra bits */ /* copy state to local variables */ state = (struct inflate_state FAR *)strm->state; @@ -150,106 +144,97 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ lmask = (1U << state->lenbits) - 1; dmask = (1U << state->distbits) - 1; - /* This is extremely latency sensitive, so empty inline assembly blocks are - used to prevent the compiler from reassociating. */ -#define REFILL() do { \ - hold |= read64le(in) << bits; \ - in += 7; \ - asm volatile ("" : "+r"(in)); \ - uint64_t tmp = ((bits >> 3) & 7); \ - asm volatile ("" : "+r"(tmp)); \ - in -= tmp; \ - bits |= 56; \ - } while (0) - -#define TABLE_LOAD(table, index) do { \ - memcpy(&here32, &(table)[(index)], sizeof(code)); \ - memcpy(&here, &here32, sizeof(code)); \ - } while (0) - - if (bits < 10) { - REFILL(); - } - /* decode literals and length/distances until end-of-block or not enough input data or output space */ do { - uint64_t next_hold = hold | (read64le(in) << bits); - in += 7; - uint64_t tmp = ((bits >> 3) & 7); - in -= tmp; - bits |= 56; - TABLE_LOAD(lcode, hold & lmask); - hold = next_hold; - old = hold; - hold >>= here.bits; - bits -= here32; - preloaded: - if (likely(here.op == 0)) { - *out++ = (unsigned char)(here.val); - TABLE_LOAD(lcode, hold & lmask); - old = hold; - hold >>= here.bits; - bits -= here32; - if (likely(here.op == 0)) { - *out++ = (unsigned char)(here.val); - TABLE_LOAD(lcode, hold & lmask); - old = hold; - hold >>= here.bits; - bits -= here32; - } + if (bits < 15) { +#ifdef INFLATE_CHUNK_READ_64LE + hold |= read64le(in) << bits; + in += 6; + bits += 48; +#else + hold += (unsigned long)(*in++) << bits; + bits += 8; + hold += (unsigned long)(*in++) << bits; + bits += 8; +#endif } + here = lcode[hold & lmask]; dolen: + op = (unsigned)(here.bits); + hold >>= op; + bits -= op; op = (unsigned)(here.op); - if (likely(op == 0)) { /* literal */ + if (op == 0) { /* literal */ Tracevv((stderr, here.val >= 0x20 && here.val < 0x7f ? "inflate: literal '%c'\n" : "inflate: literal 0x%02x\n", here.val)); *out++ = (unsigned char)(here.val); } - else if (likely(op & 16)) { /* length base */ + else if (op & 16) { /* length base */ len = (unsigned)(here.val); - len += ((old & ~((uint64_t)-1 << here.bits)) >> (op & 15)); + op &= 15; /* number of extra bits */ + if (op) { + if (bits < op) { +#ifdef INFLATE_CHUNK_READ_64LE + hold |= read64le(in) << bits; + in += 6; + bits += 48; +#else + hold += (unsigned long)(*in++) << bits; + bits += 8; +#endif + } + len += (unsigned)hold & ((1U << op) - 1); + hold >>= op; + bits -= op; + } Tracevv((stderr, "inflate: length %u\n", len)); - TABLE_LOAD(dcode, hold & dmask); - /* we have two fast-path loads: 10+10 + 15+5 = 40, - but we may need to refill here in the worst case */ - if (unlikely((bits & 63) < 15 + 13)) { - REFILL(); + if (bits < 15) { +#ifdef INFLATE_CHUNK_READ_64LE + hold |= read64le(in) << bits; + in += 6; + bits += 48; +#else + hold += (unsigned long)(*in++) << bits; + bits += 8; + hold += (unsigned long)(*in++) << bits; + bits += 8; +#endif } + here = dcode[hold & dmask]; dodist: - old = hold; - hold >>= here.bits; - bits -= here32; + op = (unsigned)(here.bits); + hold >>= op; + bits -= op; op = (unsigned)(here.op); - if (likely(op & 16)) { /* distance base */ + if (op & 16) { /* distance base */ dist = (unsigned)(here.val); - dist += ((old & ~((uint64_t)-1 << here.bits)) >> (op & 15)); + op &= 15; /* number of extra bits */ + if (bits < op) { +#ifdef INFLATE_CHUNK_READ_64LE + hold |= read64le(in) << bits; + in += 6; + bits += 48; +#else + hold += (unsigned long)(*in++) << bits; + bits += 8; + if (bits < op) { + hold += (unsigned long)(*in++) << bits; + bits += 8; + } +#endif + } + dist += (unsigned)hold & ((1U << op) - 1); #ifdef INFLATE_STRICT - if (unlikely(dist > dmax)) { + if (dist > dmax) { strm->msg = (char *)"invalid distance too far back"; state->mode = BAD; break; } #endif - if (unlikely((bits & 63) < 10)) { - REFILL(); - } - - /* preload and shift for next iteration */ - uint64_t next_hold = hold | (read64le(in) << bits); - in += 7; - asm volatile ("" : "+r"(in)); - uint64_t tmp = ((bits >> 3) & 7); - asm volatile ("" : "+r"(tmp)); - in -= tmp; - bits |= 56; - TABLE_LOAD(lcode, hold & lmask); - hold = next_hold; - old = hold; - hold >>= here.bits; - bits -= here32; - + hold >>= op; + bits -= op; Tracevv((stderr, "inflate: distance %u\n", dist)); op = (unsigned)(out - beg); /* max distance in output */ if (dist > op) { /* see if copy from window */ @@ -259,14 +244,14 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ strm->msg = (char *)"invalid distance too far back"; state->mode = BAD; - goto chunk_break; + break; } #ifdef INFLATE_ALLOW_INVALID_DISTANCE_TOOFAR_ARRR if (len <= op - whave) { do { *out++ = 0; } while (--len); - goto chunk_continue; + continue; } len -= op - whave; do { @@ -277,7 +262,7 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ do { *out++ = *from++; } while (--len); - goto chunk_continue; + continue; } #endif } @@ -327,21 +312,10 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ stay within 258 bytes of `out`. */ out = chunkcopy_lapped_relaxed(out, dist, len); - } - - chunk_continue: - if (likely(in < last && out < end)) - goto preloaded; - - chunk_break: - /* undo pre-shift */ - hold = old; - bits += here32; - break; } - else if (likely((op & 64) == 0)) { /* 2nd level distance code */ - TABLE_LOAD(dcode, here.val + (hold & ((1U << op) - 1))); + else if ((op & 64) == 0) { /* 2nd level distance code */ + here = dcode[here.val + (hold & ((1U << op) - 1))]; goto dodist; } else { @@ -350,14 +324,11 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ break; } } - else if (likely((op & 64) == 0)) { /* 2nd level length code */ - TABLE_LOAD(lcode, here.val + (hold & ((1U << op) - 1))); - old = hold; - hold >>= here.bits; - bits -= here32; + else if ((op & 64) == 0) { /* 2nd level length code */ + here = lcode[here.val + (hold & ((1U << op) - 1))]; goto dolen; } - else if (likely(op & 32)) { /* end-of-block */ + else if (op & 32) { /* end-of-block */ Tracevv((stderr, "inflate: end of block\n")); state->mode = TYPE; break; @@ -369,8 +340,6 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ } } while (in < last && out < end); - bits &= 63; - /* return unused bytes (on entry, bits < 8, so in won't go too far back) */ len = bits >> 3; in -= len; diff --git a/inffast_chunk.h b/inffast_chunk.h index d9007ff6b..de6aa0d8c 100644 --- a/inffast_chunk.h +++ b/inffast_chunk.h @@ -37,19 +37,12 @@ we can safely call inflate_fast() with only one up-front bounds check. One length/distance code pair (15 bits for the length code, 5 bits for length extra, 15 bits for the distance code, 13 bits for distance extra) requires - reading up to 48 input bits (6 bytes). - - For chunked decoding use a hopefully-pesimistic bound of two worst-case - advances: 7 + 7, plus one 8-byte refill. + reading up to 48 input bits (6 bytes). The wide input data reading option + requires a little endian machine, and reads 64 input bits (8 bytes). */ #ifdef INFLATE_CHUNK_READ_64LE #undef INFLATE_FAST_MIN_INPUT -#define INFLATE_FAST_MIN_INPUT 22 +#define INFLATE_FAST_MIN_INPUT 8 #endif -/* INFLATE_FAST_MIN_OUTPUT is usually 258, but we can copy two fast-path bytes - as well */ -#undef INFLATE_FAST_MIN_OUTPUT -#define INFLATE_FAST_MIN_OUTPUT 260 - void ZLIB_INTERNAL inflate_fast_chunk_ OF((z_streamp strm, unsigned start)); From 21b777cc0a7d007c8c9e8ccb69b510d704645b07 Mon Sep 17 00:00:00 2001 From: neurolabusc Date: Sat, 1 Apr 2023 08:30:49 -0400 Subject: [PATCH 4/5] Revert --- adler32_simd.c | 161 +------------------------------------------------ 1 file changed, 1 insertion(+), 160 deletions(-) diff --git a/adler32_simd.c b/adler32_simd.c index d78f16ce3..09b92428b 100644 --- a/adler32_simd.c +++ b/adler32_simd.c @@ -219,165 +219,8 @@ uint32_t ZLIB_INTERNAL adler32_simd_( /* SSSE3 */ #elif defined(ADLER32_SIMD_NEON) -/* __APPLE__ is insufficent, as older iOS devices will not support UDOT, - however all NEON-supporting macOS devices will. */ -#ifdef __APPLE__ -#include -#if TARGET_OS_MAC -#define ADLER32_SIMD_NEON_UDOT -#define __ARM_FEATURE_DOTPROD -#endif -#endif - #include -#ifdef ADLER32_SIMD_NEON_UDOT - -uint32_t ZLIB_INTERNAL adler32_simd_( /* NEON */ - uint32_t adler, - const unsigned char *buf, - unsigned long len) -{ - /* - * Split Adler-32 into component sums. - */ - uint64_t s1 = adler & 0xffff; - uint64_t s2 = adler >> 16; - - /* - * Serially compute s1 & s2, until the data is 16-byte aligned. - */ - if ((uintptr_t)buf & 15) { - while ((uintptr_t)buf & 15) { - s2 += (s1 += *buf++); - --len; - } - - if (s1 >= BASE) - s1 -= BASE; - s2 %= BASE; - } - - /* - * Process the data in blocks. - */ - const unsigned BLOCK_SIZE = 1 << 6; - - unsigned long blocks = len / BLOCK_SIZE; - len -= blocks * BLOCK_SIZE; - - while (blocks) - { - unsigned n = 2902; /* Maximum blocks. */ - if (n > blocks) - n = blocks; - blocks -= n; - - /* - * Process n blocks of data. At most 2902 blocks can be - * processed before s2 must be reduced modulo BASE. This - * is greater than NMAX bytes as we're using 64-bit - * integers. - */ - const unsigned char MULTIPLIERS[0x40] = { - 0x40, 0x3f, 0x3e, 0x3d, 0x3c, 0x3b, 0x3a, 0x39, - 0x38, 0x37, 0x36, 0x35, 0x34, 0x33, 0x32, 0x31, - 0x30, 0x2f, 0x2e, 0x2d, 0x2c, 0x2b, 0x2a, 0x29, - 0x28, 0x27, 0x26, 0x25, 0x24, 0x23, 0x22, 0x21, - 0x20, 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, - 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, - 0x10, 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, - 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, - }; - uint8x16x4_t mul = vld1q_u8_x4(MULTIPLIERS); - - uint32x4_t accs[4] = { 0 }; - uint32x4_t sums[4] = { 0 }; - uint32x4_t extras[4] = { 0 }; - - const unsigned char *end = buf + n * BLOCK_SIZE; - do { - /* - * Load 64 input bytes. - */ - uint8x16x4_t raw = vld1q_u8_x4(buf); - buf += BLOCK_SIZE; - - for (int i = 0; i < 4; i++) { - accs[i] = vaddq_u32(accs[i], sums[i]); - sums[i] = vdotq_u32(sums[i], raw.val[i], vdupq_n_u8(1)); - extras[i] = vdotq_u32(extras[i], raw.val[i], mul.val[i]); - } - } while (buf != end); - - for (int i = 1; i < 4; i++) { - extras[0] = vaddq_u32(extras[0], extras[i]); - sums[0] = vaddq_u32(sums[0], sums[i]); - } - - uint64_t acc = 0; - for (int i = 0; i < 4; i++) { - acc += vaddlvq_u32(accs[i]); - } - uint64_t extra = vaddlvq_u32(extras[0]); - uint64_t sum = vaddlvq_u32(sums[0]); - - s2 += s1 * n * BLOCK_SIZE + acc * BLOCK_SIZE + extra; - s1 += sum; - - /* - * Reduce. - */ - s1 %= BASE; - s2 %= BASE; - } - - /* - * Handle leftover data. - */ - if (len) { - while (len >= 16) { - s2 += (s1 += *buf++); - s2 += (s1 += *buf++); - s2 += (s1 += *buf++); - s2 += (s1 += *buf++); - - s2 += (s1 += *buf++); - s2 += (s1 += *buf++); - s2 += (s1 += *buf++); - s2 += (s1 += *buf++); - - s2 += (s1 += *buf++); - s2 += (s1 += *buf++); - s2 += (s1 += *buf++); - s2 += (s1 += *buf++); - - s2 += (s1 += *buf++); - s2 += (s1 += *buf++); - s2 += (s1 += *buf++); - s2 += (s1 += *buf++); - - len -= 16; - } - - while (len--) { - s2 += (s1 += *buf++); - } - - if (s1 >= BASE) - s1 -= BASE; - s2 %= BASE; - } - - /* - * Return the recombined sums. - */ - return s1 | (s2 << 16); -} - -#else /* ADLER32_SIMD_NEON_UDOT */ - - uint32_t ZLIB_INTERNAL adler32_simd_( /* NEON */ uint32_t adler, const unsigned char *buf, @@ -541,6 +384,4 @@ uint32_t ZLIB_INTERNAL adler32_simd_( /* NEON */ return s1 | (s2 << 16); } -#endif /* ADLER32_SIMD_NEON_UDOT */ - -#endif /* ADLER32_SIMD_SSSE3 */ +#endif /* ADLER32_SIMD_SSSE3 */ \ No newline at end of file From 88cd361b18c78134a433ba4b4ab58e5ffd84fea6 Mon Sep 17 00:00:00 2001 From: neurolabusc Date: Sat, 1 Apr 2023 08:31:44 -0400 Subject: [PATCH 5/5] EOLN --- adler32_simd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/adler32_simd.c b/adler32_simd.c index 09b92428b..5f8c06d0a 100644 --- a/adler32_simd.c +++ b/adler32_simd.c @@ -384,4 +384,4 @@ uint32_t ZLIB_INTERNAL adler32_simd_( /* NEON */ return s1 | (s2 << 16); } -#endif /* ADLER32_SIMD_SSSE3 */ \ No newline at end of file +#endif /* ADLER32_SIMD_SSSE3 */