Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@
[0.2.5] - 2025-XX-XX
--------------------

- In previous versions, if a metadata column contained byte values outside the ASCII
range (greater than 127), the written file raised a ``ValueError`` when decompressed.
This is now fixed, and files written with this bug are now read correctly.
(#115, benjeffery)
- Drop Python 3.9 support, require Python >= 3.10 (#112, benjeffery)
- Support zarr v3 (#114, benjeffery)

Expand Down
Binary file added tests/files/issue95_metadata_bug.tsz
Binary file not shown.
Binary file added tests/files/issue95_metadata_dtype.trees
Binary file not shown.
115 changes: 115 additions & 0 deletions tests/test_compression.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,104 @@ def test_wrong_format(self):
with self.assertRaises(exceptions.FileFormatError):
tszip.decompress(self.path)

def test_struct_metadata_roundtrip(self):
    """Top-level struct-codec metadata must survive compress/decompress."""
    node_map_schema = tskit.MetadataSchema(
        {
            "codec": "struct",
            "type": "object",
            "properties": {
                "reverse_node_map": {
                    "type": "array",
                    # "I" stores each item as an unsigned 32-bit int
                    "items": {
                        "type": "integer",
                        "binaryFormat": "I",
                    },
                }
            },
        }
    )

    tables = msprime.simulate(10, random_seed=1).dump_tables()
    tables.metadata_schema = node_map_schema
    tables.metadata = {
        "reverse_node_map": [847973, 1442881, 356055, 2542708, 285222, 175110]
    }
    original = tables.tree_sequence()

    # Write to the test's scratch path, read back, and compare decoded metadata.
    tszip.compress(original, self.path)
    restored = tszip.decompress(self.path)
    self.assertEqual(restored.metadata, original.metadata)

def test_utf8_time_units_roundtrip(self):
    """Check that a ``time_units`` string with non-ASCII characters round-trips."""
    # Mixes Greek, Nordic and Chinese characters, all of which need multi-byte
    # UTF-8 encoding (encoded byte values > 127).
    unicode_units = "μβrånches per γενεᾱ 世代"

    tables = msprime.simulate(10, random_seed=1).dump_tables()
    tables.time_units = unicode_units
    source_ts = tables.tree_sequence()

    tszip.compress(source_ts, self.path)
    roundtripped_ts = tszip.decompress(self.path)
    self.assertEqual(roundtripped_ts.time_units, source_ts.time_units)

def test_json_metadata_roundtrip(self):
    """Round-trip JSON-codec top-level metadata, including non-ASCII text."""
    expected_metadata = {
        "description": "Test tree sequence with JSON metadata",
        "sample_count": 10,
        "parameters": {
            "Ne": 1000,
            "mutation_rate": 1e-8,
            "recombination_rate": 1e-8,
        },
        "tags": ["test", "simulation", "msprime"],
        "version": 1.0,
        # Non-ASCII characters (code points above 127), including emoji
        "unicode_text": "Héllo Wørld! 你好世界 🧬🌳",
        # Accented Latin characters
        "author": "José María González-Pérez",
    }
    json_schema = tskit.MetadataSchema(
        {
            "codec": "json",
            "type": "object",
            "properties": {
                "description": {"type": "string"},
                "sample_count": {"type": "integer"},
                "parameters": {
                    "type": "object",
                    "properties": {
                        "Ne": {"type": "number"},
                        "mutation_rate": {"type": "number"},
                        "recombination_rate": {"type": "number"},
                    },
                },
                "tags": {"type": "array", "items": {"type": "string"}},
                "version": {"type": "number"},
                "unicode_text": {"type": "string"},
                "author": {"type": "string"},
            },
        }
    )

    tables = msprime.simulate(10, random_seed=1).dump_tables()
    tables.metadata_schema = json_schema
    tables.metadata = expected_metadata
    original = tables.tree_sequence()

    tszip.compress(original, self.path)
    restored = tszip.decompress(self.path)

    # Both the decoded metadata and the schema describing it must be preserved.
    self.assertEqual(restored.metadata, expected_metadata)
    self.assertEqual(restored.metadata_schema, original.metadata_schema)

def test_raw_metadata_with_high_bytes(self):
    """Raw bytes metadata containing values above 127 must round-trip unchanged."""
    # No metadata schema is set here; the payload is assigned as plain bytes.
    # Four of the six values (200, 150, 255, 128) are > 127.
    high_byte_payload = bytes((65, 66, 200, 150, 255, 128))

    tables = msprime.simulate(10, random_seed=1).dump_tables()
    tables.metadata = high_byte_payload
    original = tables.tree_sequence()

    tszip.compress(original, self.path)
    restored = tszip.decompress(self.path)
    self.assertEqual(restored.metadata, high_byte_payload)


class TestFileErrors(unittest.TestCase):
"""
Expand Down Expand Up @@ -411,3 +509,20 @@ def test_open_both(self):
ts = tszip.load(files / "1.0.0.trees.tsz")
ts2 = tszip.load(files / "1.0.0.trees")
assert ts == ts2

def test_issue95_metadata_dtype_regression(self):
    """Regression check: a stored ``.tsz`` with struct metadata still loads.

    Per the original note, the fixture was compressed by a version <=0.2.5
    that stored metadata with the wrong dtype; it must now decompress to the
    same metadata as the uncompressed reference file.
    """
    fixture_dir = pathlib.Path(__file__).parent / "files"

    # Uncompressed reference tree sequence.
    expected_ts = tszip.load(fixture_dir / "issue95_metadata_dtype.trees")
    # Compressed fixture (noted as written with 0.2.5) that should now load.
    loaded_ts = tszip.load(fixture_dir / "issue95_metadata_bug.tsz")

    assert loaded_ts.metadata == expected_ts.metadata
    assert isinstance(loaded_ts.metadata, dict)
    assert "reverse_node_map" in loaded_ts.metadata

    loaded_map = loaded_ts.metadata["reverse_node_map"]
    expected_map = expected_ts.metadata["reverse_node_map"]
    assert len(loaded_map) == len(expected_map)
12 changes: 9 additions & 3 deletions tszip/compression.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,9 +215,9 @@ def compress_zarr(ts, root, variants_only=False):
"reference_sequence/data",
"reference_sequence/url",
]:
columns[name] = np.frombuffer(columns[name].encode("utf-8"), np.int8)
columns[name] = np.frombuffer(columns[name].encode("utf-8"), np.uint8)
if name.endswith("metadata"):
columns[name] = np.frombuffer(columns[name], np.int8)
columns[name] = np.frombuffer(columns[name], np.uint8)

# Some columns benefit from being quantised
coordinates = np.unique(
Expand Down Expand Up @@ -335,7 +335,13 @@ def decompress_zarr(root):
if key.endswith("metadata_schema") or key == "time_units":
dict_repr[key] = bytes(value).decode("utf-8")
elif key.endswith("metadata"):
dict_repr[key] = bytes(value)
# Handle backward compatibility: <=0.2.5 versions stored metadata as int8
# which can have negative values outside the valid byte range (0-255)
try:
dict_repr[key] = bytes(value)
except ValueError:
uint8_value = np.array(value, dtype=np.int8).astype(np.uint8)
dict_repr[key] = bytes(uint8_value)
else:
dict_repr[key] = value
return tskit.TableCollection.fromdict(dict_repr).tree_sequence()
Expand Down
Loading