Skip to content

Commit a52048d

Browse files
moradology and dcherian authored
Fix UTF generation for numpy in property-based tests (#2801)
* Fix UTF generation for numpy in property-based tests
* Add changelog entry

Co-authored-by: Deepak Cherian <[email protected]>
1 parent ab5925b commit a52048d

File tree

2 files changed

+22
-1
lines changed

2 files changed

+22
-1
lines changed

changes/2801.bugfix.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Ensure UTF-8-compliant strings are used to construct NumPy arrays in property-based tests.

src/zarr/testing/strategies.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,21 @@ def v2_dtypes() -> st.SearchStrategy[np.dtype]:
5151
)
5252

5353

54+
def safe_unicode_for_dtype(dtype: np.dtype[np.str_]) -> st.SearchStrategy[str]:
    """Build a text strategy whose strings fit inside one element of ``dtype``.

    Parameters
    ----------
    dtype
        String dtype whose ``itemsize`` (in bytes) bounds the generated text.

    Returns
    -------
    st.SearchStrategy[str]
        Strategy producing non-empty strings of at most ``itemsize // 4``
        characters (never fewer than one), drawn from code points >= 32 with
        the surrogate category ("Cs") excluded.
    """
    # The element is treated as UTF-32 encoded, i.e. 4 bytes per character,
    # so the character budget is a quarter of the byte size.
    chars_per_item = dtype.itemsize // 4
    # Always allow at least one character, even for very small item sizes.
    longest = chars_per_item if chars_per_item > 1 else 1

    # Surrogates are *technically allowed* code points but are not valid in
    # UTF-8, so they are filtered out of the alphabet.
    utf8_safe_chars = st.characters(
        blacklist_categories=["Cs"],
        min_codepoint=32,
    )
    return st.text(alphabet=utf8_safe_chars, min_size=1, max_size=longest)
67+
68+
5469
# From https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#node-names
5570
# 1. must not be the empty string ("")
5671
# 2. must not include the character "/"
@@ -86,7 +101,12 @@ def numpy_arrays(
86101
Generate numpy arrays that can be saved in the provided Zarr format.
87102
"""
88103
zarr_format = draw(zarr_formats)
89-
return draw(npst.arrays(dtype=v3_dtypes() if zarr_format == 3 else v2_dtypes(), shape=shapes))
104+
dtype = draw(v3_dtypes() if zarr_format == 3 else v2_dtypes())
105+
if np.issubdtype(dtype, np.str_):
106+
safe_unicode_strings = safe_unicode_for_dtype(dtype)
107+
return draw(npst.arrays(dtype=dtype, shape=shapes, elements=safe_unicode_strings))
108+
109+
return draw(npst.arrays(dtype=dtype, shape=shapes))
90110

91111

92112
@st.composite # type: ignore[misc]

0 commit comments

Comments
 (0)