Skip to content

Commit 230c52c

Browse files
authored
Adds crc32c codec (#613)
* adds crc32c codec * changelog * pyproject.toml * codecov * move to checksum32.py * rm old docs * docs * pre-commit * codecov * codecov * docstrings * crc32c as optional dep * codecov
1 parent 88464f6 commit 230c52c

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

69 files changed

+208
-29
lines changed

.github/workflows/ci.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ jobs:
6161
conda activate env
6262
export DISABLE_NUMCODECS_AVX2=""
6363
# TODO: put back zfpy import when it supports numpy 2.0
64-
python -m pip install -v -e .[test,test_extras,msgpack]
64+
python -m pip install -v -e .[test,test_extras,msgpack,crc32c]
6565
6666
- name: Install pcodec
6767
if: matrix.python-version != '3.13.0'

docs/checksum32.rst

+11

docs/release.rst

+4

fixture/adler32/array.05.npy

7.94 KB
Binary file not shown.

fixture/adler32/array.06.npy

7.94 KB
Binary file not shown.

fixture/adler32/array.07.npy

7.94 KB
Binary file not shown.

fixture/adler32/array.08.npy

7.94 KB
Binary file not shown.

fixture/adler32/array.09.npy

7.94 KB
Binary file not shown.

fixture/adler32/array.10.npy

7.94 KB
Binary file not shown.

fixture/adler32/array.11.npy

7.94 KB
Binary file not shown.

fixture/adler32/array.12.npy

7.94 KB
Binary file not shown.
7.82 KB
Binary file not shown.
7.82 KB
Binary file not shown.
7.82 KB
Binary file not shown.
7.82 KB
Binary file not shown.
7.82 KB
Binary file not shown.
7.82 KB
Binary file not shown.
7.82 KB
Binary file not shown.
7.82 KB
Binary file not shown.

fixture/crc32/array.05.npy

7.94 KB
Binary file not shown.

fixture/crc32/array.06.npy

7.94 KB
Binary file not shown.

fixture/crc32/array.07.npy

7.94 KB
Binary file not shown.

fixture/crc32/array.08.npy

7.94 KB
Binary file not shown.

fixture/crc32/array.09.npy

7.94 KB
Binary file not shown.

fixture/crc32/array.10.npy

7.94 KB
Binary file not shown.

fixture/crc32/array.11.npy

7.94 KB
Binary file not shown.

fixture/crc32/array.12.npy

7.94 KB
Binary file not shown.

fixture/crc32/codec.00/encoded.05.dat

7.82 KB
Binary file not shown.

fixture/crc32/codec.00/encoded.06.dat

7.82 KB
Binary file not shown.

fixture/crc32/codec.00/encoded.07.dat

7.82 KB
Binary file not shown.

fixture/crc32/codec.00/encoded.08.dat

7.82 KB
Binary file not shown.

fixture/crc32/codec.00/encoded.09.dat

7.82 KB
Binary file not shown.

fixture/crc32/codec.00/encoded.10.dat

7.82 KB
Binary file not shown.

fixture/crc32/codec.00/encoded.11.dat

7.82 KB
Binary file not shown.

fixture/crc32/codec.00/encoded.12.dat

7.82 KB
Binary file not shown.

fixture/crc32c/array.00.npy

4.03 KB
Binary file not shown.

fixture/crc32c/array.01.npy

7.94 KB
Binary file not shown.

fixture/crc32c/array.02.npy

7.94 KB
Binary file not shown.

fixture/crc32c/array.03.npy

1.1 KB
Binary file not shown.

fixture/crc32c/array.04.npy

3.05 KB
Binary file not shown.

fixture/crc32c/array.05.npy

7.94 KB
Binary file not shown.

fixture/crc32c/array.06.npy

7.94 KB
Binary file not shown.

fixture/crc32c/array.07.npy

7.94 KB
Binary file not shown.

fixture/crc32c/array.08.npy

7.94 KB
Binary file not shown.

fixture/crc32c/array.09.npy

7.94 KB
Binary file not shown.

fixture/crc32c/array.10.npy

7.94 KB
Binary file not shown.

fixture/crc32c/array.11.npy

7.94 KB
Binary file not shown.

fixture/crc32c/array.12.npy

7.94 KB
Binary file not shown.

fixture/crc32c/codec.00/config.json

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
{
2+
"id": "crc32c"
3+
}
3.91 KB
Binary file not shown.
7.82 KB
Binary file not shown.
7.82 KB
Binary file not shown.
1004 Bytes
Binary file not shown.
2.93 KB
Binary file not shown.
7.82 KB
Binary file not shown.
7.82 KB
Binary file not shown.
7.82 KB
Binary file not shown.
7.82 KB
Binary file not shown.
7.82 KB
Binary file not shown.
7.82 KB
Binary file not shown.
7.82 KB
Binary file not shown.
7.82 KB
Binary file not shown.

fixture/delta/bool/array.00.npy

238 Bytes
Binary file not shown.
+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
{
2+
"astype": "|b1",
3+
"dtype": "|b1",
4+
"id": "delta"
5+
}
110 Bytes
Binary file not shown.

numcodecs/__init__.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -117,9 +117,10 @@
117117

118118
register_codec(MsgPack)
119119

120-
from numcodecs.checksum32 import CRC32, Adler32, JenkinsLookup3
120+
from numcodecs.checksum32 import CRC32, CRC32C, Adler32, JenkinsLookup3
121121

122122
register_codec(CRC32)
123+
register_codec(CRC32C)
123124
register_codec(Adler32)
124125
register_codec(JenkinsLookup3)
125126

numcodecs/checksum32.py

+81-9
Original file line numberDiff line numberDiff line change
@@ -1,42 +1,111 @@
11
import struct
22
import zlib
3+
from typing import Literal
34

45
import numpy as np
56

67
from .abc import Codec
78
from .compat import ensure_contiguous_ndarray, ndarray_copy
89
from .jenkins import jenkins_lookup3
910

11+
CHECKSUM_LOCATION = Literal['start', 'end']
12+
1013

1114
class Checksum32(Codec):
1215
# override in sub-class
1316
checksum = None
17+
location: CHECKSUM_LOCATION = 'start'
18+
19+
def __init__(self, location: CHECKSUM_LOCATION | None = None):
20+
if location is not None:
21+
self.location = location
22+
if self.location not in ['start', 'end']:
23+
raise ValueError(f"Invalid checksum location: {self.location}")
1424

1525
def encode(self, buf):
1626
arr = ensure_contiguous_ndarray(buf).view('u1')
1727
checksum = self.checksum(arr) & 0xFFFFFFFF
1828
enc = np.empty(arr.nbytes + 4, dtype='u1')
19-
enc[:4].view('<u4')[0] = checksum
20-
ndarray_copy(arr, enc[4:])
29+
if self.location == 'start':
30+
checksum_view = enc[:4]
31+
payload_view = enc[4:]
32+
else:
33+
checksum_view = enc[-4:]
34+
payload_view = enc[:-4]
35+
checksum_view.view('<u4')[0] = checksum
36+
ndarray_copy(arr, payload_view)
2137
return enc
2238

2339
def decode(self, buf, out=None):
40+
if len(buf) < 4:
41+
raise ValueError("Input buffer is too short to contain a 32-bit checksum.")
42+
if out is not None:
43+
ensure_contiguous_ndarray(out) # check that out is a valid ndarray
44+
2445
arr = ensure_contiguous_ndarray(buf).view('u1')
25-
expect = arr[:4].view('<u4')[0]
26-
checksum = self.checksum(arr[4:]) & 0xFFFFFFFF
46+
if self.location == 'start':
47+
checksum_view = arr[:4]
48+
payload_view = arr[4:]
49+
else:
50+
checksum_view = arr[-4:]
51+
payload_view = arr[:-4]
52+
expect = checksum_view.view('<u4')[0]
53+
checksum = self.checksum(payload_view) & 0xFFFFFFFF
2754
if expect != checksum:
28-
raise RuntimeError('checksum failed')
29-
return ndarray_copy(arr[4:], out)
55+
raise RuntimeError(
56+
f"Stored and computed {self.codec_id} checksum do not match. Stored: {expect}. Computed: {checksum}."
57+
)
58+
return ndarray_copy(payload_view, out)
3059

3160

3261
class CRC32(Checksum32):
62+
"""Codec that adds a CRC32 checksum to the buffer.
63+
64+
Parameters
65+
----------
66+
location : 'start' or 'end'
67+
Where to place the checksum in the buffer.
68+
"""
69+
3370
codec_id = 'crc32'
3471
checksum = zlib.crc32
72+
location = 'start'
73+
74+
75+
class CRC32C(Checksum32):
76+
"""Codec that adds a CRC32C checksum to the buffer.
77+
78+
Parameters
79+
----------
80+
location : 'start' or 'end'
81+
Where to place the checksum in the buffer.
82+
"""
83+
84+
codec_id = 'crc32c'
85+
86+
def checksum(self, buf):
87+
try:
88+
from crc32c import crc32c as crc32c_
89+
90+
return crc32c_(buf)
91+
except ImportError: # pragma: no cover
92+
raise ImportError("crc32c must be installed to use the CRC32C checksum codec.")
93+
94+
location = 'end'
3595

3696

3797
class Adler32(Checksum32):
98+
"""Codec that adds an Adler-32 checksum to the buffer.
99+
100+
Parameters
101+
----------
102+
location : 'start' or 'end'
103+
Where to place the checksum in the buffer.
104+
"""
105+
38106
codec_id = 'adler32'
39107
checksum = zlib.adler32
108+
location = 'start'
40109

41110

42111
class JenkinsLookup3(Checksum32):
@@ -50,9 +119,12 @@ class JenkinsLookup3(Checksum32):
50119
the data portion and compared with the four-byte checksum, raising
51120
RuntimeError if inconsistent.
52121
53-
Attributes:
54-
initval: initial seed passed to the hash algorithm, default: 0
55-
prefix: bytes prepended to the buffer before evaluating the hash, default: None
122+
Parameters
123+
----------
124+
initval : int
125+
initial seed passed to the hash algorithm, default: 0
126+
prefix : bytes
127+
bytes prepended to the buffer before evaluating the hash, default: None
56128
"""
57129

58130
checksum = jenkins_lookup3

numcodecs/tests/test_checksum32.py

+98-18
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,12 @@
33
import numpy as np
44
import pytest
55

6-
from numcodecs.checksum32 import CRC32, Adler32
6+
from numcodecs.checksum32 import CRC32, CRC32C, Adler32
77
from numcodecs.tests.common import (
88
check_backwards_compatibility,
99
check_config,
1010
check_encode_decode,
11+
check_err_decode_object_buffer,
1112
check_err_encode_object_buffer,
1213
check_repr,
1314
)
@@ -21,38 +22,117 @@
2122
np.random.normal(loc=1000, scale=1, size=(100, 10)),
2223
np.random.randint(0, 2, size=1000, dtype=bool).reshape(100, 10, order='F'),
2324
np.random.choice([b'a', b'bb', b'ccc'], size=1000).reshape(10, 10, 10),
25+
np.random.randint(0, 2**60, size=1000, dtype='u8').view('M8[ns]'),
26+
np.random.randint(0, 2**60, size=1000, dtype='u8').view('m8[ns]'),
27+
np.random.randint(0, 2**25, size=1000, dtype='u8').view('M8[m]'),
28+
np.random.randint(0, 2**25, size=1000, dtype='u8').view('m8[m]'),
29+
np.random.randint(-(2**63), -(2**63) + 20, size=1000, dtype='i8').view('M8[ns]'),
30+
np.random.randint(-(2**63), -(2**63) + 20, size=1000, dtype='i8').view('m8[ns]'),
31+
np.random.randint(-(2**63), -(2**63) + 20, size=1000, dtype='i8').view('M8[m]'),
32+
np.random.randint(-(2**63), -(2**63) + 20, size=1000, dtype='i8').view('m8[m]'),
2433
]
2534

26-
codecs = [CRC32(), Adler32()]
35+
codecs = [
36+
CRC32(),
37+
CRC32(location="end"),
38+
CRC32C(location="start"),
39+
CRC32C(),
40+
Adler32(),
41+
Adler32(location="end"),
42+
]
43+
44+
45+
@pytest.mark.parametrize(("codec", "arr"), itertools.product(codecs, arrays))
46+
def test_encode_decode(codec, arr):
47+
check_encode_decode(arr, codec)
48+
49+
50+
@pytest.mark.parametrize(("codec", "arr"), itertools.product(codecs, arrays))
51+
def test_errors(codec, arr):
52+
enc = codec.encode(arr)
53+
with pytest.raises(RuntimeError):
54+
codec.decode(enc[:-1])
55+
56+
57+
@pytest.mark.parametrize("codec", codecs)
58+
def test_config(codec):
59+
check_config(codec)
60+
61+
62+
@pytest.mark.parametrize("codec", codecs)
63+
def test_err_input_too_small(codec):
64+
buf = b'000' # 3 bytes are too few to hold a 32-bit checksum
65+
with pytest.raises(ValueError):
66+
codec.decode(buf)
2767

2868

29-
def test_encode_decode():
30-
for codec, arr in itertools.product(codecs, arrays):
31-
check_encode_decode(arr, codec)
69+
@pytest.mark.parametrize("codec", codecs)
70+
def test_err_encode_non_contiguous(codec):
71+
# non-contiguous memory
72+
arr = np.arange(1000, dtype='i4')[::2]
73+
with pytest.raises(ValueError):
74+
codec.encode(arr)
3275

3376

34-
def test_errors():
35-
for codec, arr in itertools.product(codecs, arrays):
36-
enc = codec.encode(arr)
37-
with pytest.raises(RuntimeError):
38-
codec.decode(enc[:-1])
77+
@pytest.mark.parametrize("codec", codecs)
78+
def test_err_encode_list(codec):
79+
data = ['foo', 'bar', 'baz']
80+
with pytest.raises(TypeError):
81+
codec.encode(data)
3982

4083

41-
def test_config():
42-
for codec in codecs:
43-
check_config(codec)
84+
def test_err_location():
85+
with pytest.raises(ValueError):
86+
CRC32(location="foo")
87+
with pytest.raises(ValueError):
88+
CRC32C(location="foo")
89+
with pytest.raises(ValueError):
90+
Adler32(location="foo")
4491

4592

4693
def test_repr():
47-
check_repr("CRC32()")
48-
check_repr("Adler32()")
94+
check_repr("CRC32(location='start')")
95+
check_repr("CRC32C(location='start')")
96+
check_repr("Adler32(location='start')")
97+
check_repr("CRC32(location='end')")
98+
check_repr("CRC32C(location='end')")
99+
check_repr("Adler32(location='end')")
49100

50101

51102
def test_backwards_compatibility():
52103
check_backwards_compatibility(CRC32.codec_id, arrays, [CRC32()])
53104
check_backwards_compatibility(Adler32.codec_id, arrays, [Adler32()])
105+
check_backwards_compatibility(CRC32C.codec_id, arrays, [CRC32C()])
106+
107+
108+
@pytest.mark.parametrize("codec", codecs)
109+
def test_err_encode_object_buffer(codec):
110+
check_err_encode_object_buffer(codec)
111+
112+
113+
@pytest.mark.parametrize("codec", codecs)
114+
def test_err_decode_object_buffer(codec):
115+
check_err_decode_object_buffer(codec)
116+
117+
118+
@pytest.mark.parametrize("codec", codecs)
119+
def test_err_out_too_small(codec):
120+
arr = np.arange(10, dtype='i4')
121+
out = np.empty_like(arr)[:-1]
122+
with pytest.raises(ValueError):
123+
codec.decode(codec.encode(arr), out)
124+
125+
126+
def test_crc32c_checksum():
127+
arr = np.arange(0, 64, dtype="uint8")
128+
buf = CRC32C(location="end").encode(arr)
129+
assert np.frombuffer(buf, dtype="<u4", offset=(len(buf) - 4))[0] == np.uint32(4218238699)
54130

55131

56-
def test_err_encode_object_buffer():
57-
check_err_encode_object_buffer(CRC32())
58-
check_err_encode_object_buffer(Adler32())
132+
@pytest.mark.parametrize("codec", codecs)
133+
def test_err_checksum(codec):
134+
arr = np.arange(0, 64, dtype="uint8")
135+
buf = bytearray(codec.encode(arr))
136+
buf[-1] = 0 # corrupt the last byte (checksum for location='end', payload for location='start')
137+
with pytest.raises(RuntimeError):
138+
codec.decode(buf)

pyproject.toml

+3
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,9 @@ zfpy = [
7070
pcodec = [
7171
"pcodec>=0.2.0",
7272
]
73+
crc32c = [
74+
"crc32c>=2.7",
75+
]
7376

7477
[tool.setuptools]
7578
license-files = ["LICENSE.txt"]

0 commit comments

Comments
 (0)