From 0836a1cac5461da096074c0125c507f1b3fc0fdb Mon Sep 17 00:00:00 2001 From: greatroar <61184462+greatroar@users.noreply.github.com> Date: Fri, 18 Aug 2023 09:50:32 +0200 Subject: [PATCH] zstd: Remove offset from bitReader (#854) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We can reslice instead of maintaining a separate offset. This gets rid of some bounds checks. Also some other micro-optimizations to bit reading code. Combined results: │ zstd/old │ zstd/new │ │ B/s │ B/s vs base │ Decoder_DecoderSmall/kppkn.gtb.zst/buffered-8 427.6Mi ± 0% 428.2Mi ± 0% +0.13% (p=0.019 n=10) Decoder_DecoderSmall/kppkn.gtb.zst/unbuffered-8 511.6Mi ± 3% 516.9Mi ± 3% ~ (p=0.280 n=10) Decoder_DecoderSmall/geo.protodata.zst/buffered-8 1.110Gi ± 0% 1.110Gi ± 0% ~ (p=0.165 n=10) Decoder_DecoderSmall/geo.protodata.zst/unbuffered-8 824.7Mi ± 2% 827.3Mi ± 2% ~ (p=0.481 n=10) Decoder_DecoderSmall/plrabn12.txt.zst/buffered-8 330.4Mi ± 0% 330.3Mi ± 1% ~ (p=0.645 n=10) Decoder_DecoderSmall/plrabn12.txt.zst/unbuffered-8 533.3Mi ± 4% 538.8Mi ± 5% ~ (p=0.393 n=10) Decoder_DecoderSmall/lcet10.txt.zst/buffered-8 395.0Mi ± 0% 394.6Mi ± 0% -0.10% (p=0.034 n=10) Decoder_DecoderSmall/lcet10.txt.zst/unbuffered-8 556.5Mi ± 6% 546.2Mi ± 8% ~ (p=0.436 n=10) Decoder_DecoderSmall/asyoulik.txt.zst/buffered-8 342.2Mi ± 0% 342.2Mi ± 0% ~ (p=0.956 n=10) Decoder_DecoderSmall/asyoulik.txt.zst/unbuffered-8 436.7Mi ± 2% 435.4Mi ± 3% ~ (p=0.739 n=10) Decoder_DecoderSmall/alice29.txt.zst/buffered-8 335.6Mi ± 2% 337.0Mi ± 0% +0.43% (p=0.000 n=10) Decoder_DecoderSmall/alice29.txt.zst/unbuffered-8 552.6Mi ± 3% 550.7Mi ± 4% ~ (p=1.000 n=10) Decoder_DecoderSmall/html_x_4.zst/buffered-8 2.264Gi ± 0% 2.271Gi ± 0% +0.29% (p=0.035 n=10) Decoder_DecoderSmall/html_x_4.zst/unbuffered-8 1.558Gi ± 4% 1.554Gi ± 3% ~ (p=0.579 n=10) Decoder_DecoderSmall/paper-100k.pdf.zst/buffered-8 3.554Gi ± 5% 3.610Gi ± 0% +1.59% (p=0.000 n=10) Decoder_DecoderSmall/paper-100k.pdf.zst/unbuffered-8 1.701Gi ± 8% 1.709Gi ± 5% ~ (p=0.631 n=10) Decoder_DecoderSmall/fireworks.jpeg.zst/buffered-8 7.891Gi ± 4% 8.070Gi ± 0% +2.26% (p=0.000 n=10) Decoder_DecoderSmall/fireworks.jpeg.zst/unbuffered-8 3.062Gi ± 4% 3.129Gi ± 2% +2.16% (p=0.002 n=10) Decoder_DecoderSmall/urls.10K.zst/buffered-8 525.4Mi ± 6% 553.8Mi ± 0% +5.39% (p=0.000 n=10) Decoder_DecoderSmall/urls.10K.zst/unbuffered-8 763.7Mi ± 6% 819.7Mi ± 2% +7.34% (p=0.000 n=10) Decoder_DecoderSmall/html.zst/buffered-8 894.8Mi ± 0% 898.8Mi ± 2% +0.45% (p=0.043 n=10) Decoder_DecoderSmall/html.zst/unbuffered-8 722.3Mi ± 2% 717.7Mi ± 2% ~ (p=0.912 n=10) Decoder_DecoderSmall/comp-data.bin.zst/buffered-8 386.6Mi ± 2% 390.4Mi ± 0% +1.00% (p=0.000 n=10) Decoder_DecoderSmall/comp-data.bin.zst/unbuffered-8 145.2Mi ± 2% 148.7Mi ± 1% +2.42% (p=0.003 n=10) geomean 770.3Mi 777.5Mi +0.93% --- zstd/_generate/gen.go | 4 +- zstd/bitreader.go | 34 +++++------ zstd/seqdec.go | 17 ++---- zstd/seqdec_amd64.s | 128 ++++++++++++++++++++--------------------- zstd/seqdec_generic.go | 2 +- 5 files changed, 88 insertions(+), 97 deletions(-) diff --git a/zstd/_generate/gen.go b/zstd/_generate/gen.go index 50ce4404f0..7ef9a45aa8 100644 --- a/zstd/_generate/gen.go +++ b/zstd/_generate/gen.go @@ -156,8 +156,8 @@ func (o options) generateBody(name string, executeSingleTriple func(ctx *execute brPointer := GP64() Load(br.Field("value"), brValue) Load(br.Field("bitsRead"), brBitsRead) - Load(br.Field("off"), brOffset) Load(br.Field("in").Base(), brPointer) + Load(br.Field("in").Len(), brOffset) ADDQ(brOffset, brPointer) // Add current offset to read pointer. MOVQ(brPointer, brPointerStash) } @@ -438,7 +438,7 @@ func (o options) generateBody(name string, executeSingleTriple func(ctx *execute br := Dereference(Param("br")) Store(brValue, br.Field("value")) Store(brBitsRead.As8(), br.Field("bitsRead")) - Store(brOffset, br.Field("off")) + Store(brOffset, br.Field("in").Len()) if !o.useSeqs { Comment("Update the context") diff --git a/zstd/bitreader.go b/zstd/bitreader.go index 97299d499c..25ca983941 100644 --- a/zstd/bitreader.go +++ b/zstd/bitreader.go @@ -17,7 +17,6 @@ import ( // for aligning the input. type bitReader struct { in []byte - off uint // next byte to read is at in[off - 1] value uint64 // Maybe use [16]byte, but shifting is awkward. bitsRead uint8 } @@ -28,7 +27,6 @@ func (b *bitReader) init(in []byte) error { return errors.New("corrupt stream: too short") } b.in = in - b.off = uint(len(in)) // The highest bit of the last byte indicates where to start v := in[len(in)-1] if v == 0 { @@ -69,21 +67,19 @@ func (b *bitReader) fillFast() { if b.bitsRead < 32 { return } - // 2 bounds checks. - v := b.in[b.off-4:] - v = v[:4] + v := b.in[len(b.in)-4:] + b.in = b.in[:len(b.in)-4] low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) b.value = (b.value << 32) | uint64(low) b.bitsRead -= 32 - b.off -= 4 } // fillFastStart() assumes the bitreader is empty and there is at least 8 bytes to read. func (b *bitReader) fillFastStart() { - // Do single re-slice to avoid bounds checks. - b.value = binary.LittleEndian.Uint64(b.in[b.off-8:]) + v := b.in[len(b.in)-8:] + b.in = b.in[:len(b.in)-8] + b.value = binary.LittleEndian.Uint64(v) b.bitsRead = 0 - b.off -= 8 } // fill() will make sure at least 32 bits are available. @@ -91,25 +87,25 @@ func (b *bitReader) fill() { if b.bitsRead < 32 { return } - if b.off >= 4 { - v := b.in[b.off-4:] - v = v[:4] + if len(b.in) >= 4 { + v := b.in[len(b.in)-4:] + b.in = b.in[:len(b.in)-4] low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) b.value = (b.value << 32) | uint64(low) b.bitsRead -= 32 - b.off -= 4 return } - for b.off > 0 { - b.value = (b.value << 8) | uint64(b.in[b.off-1]) - b.bitsRead -= 8 - b.off-- + + b.bitsRead -= uint8(8 * len(b.in)) + for len(b.in) > 0 { + b.value = (b.value << 8) | uint64(b.in[len(b.in)-1]) + b.in = b.in[:len(b.in)-1] } } // finished returns true if all bits have been read from the bit stream. func (b *bitReader) finished() bool { - return b.off == 0 && b.bitsRead >= 64 + return len(b.in) == 0 && b.bitsRead >= 64 } // overread returns true if more bits have been requested than is on the stream. @@ -119,7 +115,7 @@ func (b *bitReader) overread() bool { // remain returns the number of bits remaining. func (b *bitReader) remain() uint { - return b.off*8 + 64 - uint(b.bitsRead) + return 8*uint(len(b.in)) + 64 - uint(b.bitsRead) } // close the bitstream and returns an error if out-of-buffer reads occurred. diff --git a/zstd/seqdec.go b/zstd/seqdec.go index 9405fcf101..d7fe6d82d9 100644 --- a/zstd/seqdec.go +++ b/zstd/seqdec.go @@ -245,7 +245,7 @@ func (s *sequenceDecs) decodeSync(hist []byte) error { return io.ErrUnexpectedEOF } var ll, mo, ml int - if br.off > 4+((maxOffsetBits+16+16)>>3) { + if len(br.in) > 4+((maxOffsetBits+16+16)>>3) { // inlined function: // ll, mo, ml = s.nextFast(br, llState, mlState, ofState) @@ -452,18 +452,13 @@ func (s *sequenceDecs) next(br *bitReader, llState, mlState, ofState decSymbol) // extra bits are stored in reverse order. br.fill() - if s.maxBits <= 32 { - mo += br.getBits(moB) - ml += br.getBits(mlB) - ll += br.getBits(llB) - } else { - mo += br.getBits(moB) + mo += br.getBits(moB) + if s.maxBits > 32 { br.fill() - // matchlength+literal length, max 32 bits - ml += br.getBits(mlB) - ll += br.getBits(llB) - } + // matchlength+literal length, max 32 bits + ml += br.getBits(mlB) + ll += br.getBits(llB) mo = s.adjustOffset(mo, ll, moB) return } diff --git a/zstd/seqdec_amd64.s b/zstd/seqdec_amd64.s index b6f4ba6fc5..974b99725f 100644 --- a/zstd/seqdec_amd64.s +++ b/zstd/seqdec_amd64.s @@ -5,11 +5,11 @@ // func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int // Requires: CMOV TEXT ·sequenceDecs_decode_amd64(SB), $8-32 - MOVQ br+8(FP), AX - MOVQ 32(AX), DX - MOVBQZX 40(AX), BX - MOVQ 24(AX), SI - MOVQ (AX), AX + MOVQ br+8(FP), CX + MOVQ 24(CX), DX + MOVBQZX 32(CX), BX + MOVQ (CX), AX + MOVQ 8(CX), SI ADDQ SI, AX MOVQ AX, (SP) MOVQ ctx+16(FP), AX @@ -301,9 +301,9 @@ sequenceDecs_decode_amd64_match_len_ofs_ok: MOVQ R12, 152(AX) MOVQ R13, 160(AX) MOVQ br+8(FP), AX - MOVQ DX, 32(AX) - MOVB BL, 40(AX) - MOVQ SI, 24(AX) + MOVQ DX, 24(AX) + MOVB BL, 32(AX) + MOVQ SI, 8(AX) // Return success MOVQ $0x00000000, ret+24(FP) @@ -336,11 +336,11 @@ error_overread: // func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int // Requires: CMOV TEXT ·sequenceDecs_decode_56_amd64(SB), $8-32 - MOVQ br+8(FP), AX - MOVQ 32(AX), DX - MOVBQZX 40(AX), BX - MOVQ 24(AX), SI - MOVQ (AX), AX + MOVQ br+8(FP), CX + MOVQ 24(CX), DX + MOVBQZX 32(CX), BX + MOVQ (CX), AX + MOVQ 8(CX), SI ADDQ SI, AX MOVQ AX, (SP) MOVQ ctx+16(FP), AX @@ -603,9 +603,9 @@ sequenceDecs_decode_56_amd64_match_len_ofs_ok: MOVQ R12, 152(AX) MOVQ R13, 160(AX) MOVQ br+8(FP), AX - MOVQ DX, 32(AX) - MOVB BL, 40(AX) - MOVQ SI, 24(AX) + MOVQ DX, 24(AX) + MOVB BL, 32(AX) + MOVQ SI, 8(AX) // Return success MOVQ $0x00000000, ret+24(FP) @@ -638,11 +638,11 @@ error_overread: // func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int // Requires: BMI, BMI2, CMOV TEXT ·sequenceDecs_decode_bmi2(SB), $8-32 - MOVQ br+8(FP), CX - MOVQ 32(CX), AX - MOVBQZX 40(CX), DX - MOVQ 24(CX), BX - MOVQ (CX), CX + MOVQ br+8(FP), BX + MOVQ 24(BX), AX + MOVBQZX 32(BX), DX + MOVQ (BX), CX + MOVQ 8(BX), BX ADDQ BX, CX MOVQ CX, (SP) MOVQ ctx+16(FP), CX @@ -892,9 +892,9 @@ sequenceDecs_decode_bmi2_match_len_ofs_ok: MOVQ R11, 152(CX) MOVQ R12, 160(CX) MOVQ br+8(FP), CX - MOVQ AX, 32(CX) - MOVB DL, 40(CX) - MOVQ BX, 24(CX) + MOVQ AX, 24(CX) + MOVB DL, 32(CX) + MOVQ BX, 8(CX) // Return success MOVQ $0x00000000, ret+24(FP) @@ -927,11 +927,11 @@ error_overread: // func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int // Requires: BMI, BMI2, CMOV TEXT ·sequenceDecs_decode_56_bmi2(SB), $8-32 - MOVQ br+8(FP), CX - MOVQ 32(CX), AX - MOVBQZX 40(CX), DX - MOVQ 24(CX), BX - MOVQ (CX), CX + MOVQ br+8(FP), BX + MOVQ 24(BX), AX + MOVBQZX 32(BX), DX + MOVQ (BX), CX + MOVQ 8(BX), BX ADDQ BX, CX MOVQ CX, (SP) MOVQ ctx+16(FP), CX @@ -1152,9 +1152,9 @@ sequenceDecs_decode_56_bmi2_match_len_ofs_ok: MOVQ R11, 152(CX) MOVQ R12, 160(CX) MOVQ br+8(FP), CX - MOVQ AX, 32(CX) - MOVB DL, 40(CX) - MOVQ BX, 24(CX) + MOVQ AX, 24(CX) + MOVB DL, 32(CX) + MOVQ BX, 8(CX) // Return success MOVQ $0x00000000, ret+24(FP) @@ -1797,11 +1797,11 @@ empty_seqs: // func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int // Requires: CMOV, SSE TEXT ·sequenceDecs_decodeSync_amd64(SB), $64-32 - MOVQ br+8(FP), AX - MOVQ 32(AX), DX - MOVBQZX 40(AX), BX - MOVQ 24(AX), SI - MOVQ (AX), AX + MOVQ br+8(FP), CX + MOVQ 24(CX), DX + MOVBQZX 32(CX), BX + MOVQ (CX), AX + MOVQ 8(CX), SI ADDQ SI, AX MOVQ AX, (SP) MOVQ ctx+16(FP), AX @@ -2295,9 +2295,9 @@ handle_loop: loop_finished: MOVQ br+8(FP), AX - MOVQ DX, 32(AX) - MOVB BL, 40(AX) - MOVQ SI, 24(AX) + MOVQ DX, 24(AX) + MOVB BL, 32(AX) + MOVQ SI, 8(AX) // Update the context MOVQ ctx+16(FP), AX @@ -2362,11 +2362,11 @@ error_not_enough_space: // func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int // Requires: BMI, BMI2, CMOV, SSE TEXT ·sequenceDecs_decodeSync_bmi2(SB), $64-32 - MOVQ br+8(FP), CX - MOVQ 32(CX), AX - MOVBQZX 40(CX), DX - MOVQ 24(CX), BX - MOVQ (CX), CX + MOVQ br+8(FP), BX + MOVQ 24(BX), AX + MOVBQZX 32(BX), DX + MOVQ (BX), CX + MOVQ 8(BX), BX ADDQ BX, CX MOVQ CX, (SP) MOVQ ctx+16(FP), CX @@ -2818,9 +2818,9 @@ handle_loop: loop_finished: MOVQ br+8(FP), CX - MOVQ AX, 32(CX) - MOVB DL, 40(CX) - MOVQ BX, 24(CX) + MOVQ AX, 24(CX) + MOVB DL, 32(CX) + MOVQ BX, 8(CX) // Update the context MOVQ ctx+16(FP), AX @@ -2885,11 +2885,11 @@ error_not_enough_space: // func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int // Requires: CMOV, SSE TEXT ·sequenceDecs_decodeSync_safe_amd64(SB), $64-32 - MOVQ br+8(FP), AX - MOVQ 32(AX), DX - MOVBQZX 40(AX), BX - MOVQ 24(AX), SI - MOVQ (AX), AX + MOVQ br+8(FP), CX + MOVQ 24(CX), DX + MOVBQZX 32(CX), BX + MOVQ (CX), AX + MOVQ 8(CX), SI ADDQ SI, AX MOVQ AX, (SP) MOVQ ctx+16(FP), AX @@ -3485,9 +3485,9 @@ handle_loop: loop_finished: MOVQ br+8(FP), AX - MOVQ DX, 32(AX) - MOVB BL, 40(AX) - MOVQ SI, 24(AX) + MOVQ DX, 24(AX) + MOVB BL, 32(AX) + MOVQ SI, 8(AX) // Update the context MOVQ ctx+16(FP), AX @@ -3552,11 +3552,11 @@ error_not_enough_space: // func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int // Requires: BMI, BMI2, CMOV, SSE TEXT ·sequenceDecs_decodeSync_safe_bmi2(SB), $64-32 - MOVQ br+8(FP), CX - MOVQ 32(CX), AX - MOVBQZX 40(CX), DX - MOVQ 24(CX), BX - MOVQ (CX), CX + MOVQ br+8(FP), BX + MOVQ 24(BX), AX + MOVBQZX 32(BX), DX + MOVQ (BX), CX + MOVQ 8(BX), BX ADDQ BX, CX MOVQ CX, (SP) MOVQ ctx+16(FP), CX @@ -4110,9 +4110,9 @@ handle_loop: loop_finished: MOVQ br+8(FP), CX - MOVQ AX, 32(CX) - MOVB DL, 40(CX) - MOVQ BX, 24(CX) + MOVQ AX, 24(CX) + MOVB DL, 32(CX) + MOVQ BX, 8(CX) // Update the context MOVQ ctx+16(FP), AX diff --git a/zstd/seqdec_generic.go b/zstd/seqdec_generic.go index ac2a80d291..2fb35b788c 100644 --- a/zstd/seqdec_generic.go +++ b/zstd/seqdec_generic.go @@ -29,7 +29,7 @@ func (s *sequenceDecs) decode(seqs []seqVals) error { } for i := range seqs { var ll, mo, ml int - if br.off > 4+((maxOffsetBits+16+16)>>3) { + if len(br.in) > 4+((maxOffsetBits+16+16)>>3) { // inlined function: // ll, mo, ml = s.nextFast(br, llState, mlState, ofState)