
Commit 9a11916

Fix metadata encoding issue
1 parent 266b264 commit 9a11916

File tree

5 files changed: +131 -3 lines changed


CHANGELOG.rst

Lines changed: 7 additions & 0 deletions
@@ -2,6 +2,13 @@
 [0.2.5] - 2025-XX-XX
 --------------------
 
+- Fix a metadata encoding bug (#95) that could cause decompression to raise `ValueError`.
+  Previous versions of tszip incorrectly stored metadata as int8 arrays.
+  For plain ASCII text in either the json or struct codec this would not cause any issues;
+  however, for non-ASCII text or struct data encoded with non-ASCII bytes it could cause
+  decompression to fail. This fix writes metadata correctly, and also correctly loads the
+  erroneous int8 metadata written by previous versions of tszip.
+  (benjeffery, #115)
 - Drop Python 3.9 support, require Python >= 3.10 (#112, benjeffery)
 - Support zarr v3 (#114, benjeffery)
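
As background for this entry, here is a minimal numpy-only sketch of the failure mode. It is an illustration rather than the tszip code path itself: the `.tolist()` call stands in for the element-wise conversion that raised, since the exact object passed to `bytes()` depends on the zarr version.

import numpy as np

# "é" encodes to two UTF-8 bytes, both above 127.
raw = "é".encode("utf-8")              # b'\xc3\xa9'

# Stored as signed int8, as earlier tszip releases did, the high bytes
# become negative values.
as_int8 = np.frombuffer(raw, np.int8)  # array([-61, -87], dtype=int8)

# Rebuilding bytes element by element then fails, because bytes() only
# accepts integers in range(0, 256).
try:
    bytes(as_int8.tolist())
except ValueError as err:
    print(err)                         # bytes must be in range(0, 256)

# Reinterpreting the same values as uint8 recovers the original data,
# which is the conversion the backward-compatibility path below applies.
assert bytes(as_int8.astype(np.uint8)) == raw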

Binary file not shown (1.5 MB).

Binary file not shown (5.51 MB).

tests/test_compression.py

Lines changed: 115 additions & 0 deletions
@@ -355,6 +355,104 @@ def test_wrong_format(self):
         with self.assertRaises(exceptions.FileFormatError):
             tszip.decompress(self.path)
 
+    def test_struct_metadata_roundtrip(self):
+        ts = msprime.simulate(10, random_seed=1)
+
+        struct_metadata = {
+            "reverse_node_map": [847973, 1442881, 356055, 2542708, 285222, 175110]
+        }
+
+        tables = ts.dump_tables()
+        schema = {
+            "codec": "struct",
+            "type": "object",
+            "properties": {
+                "reverse_node_map": {
+                    "type": "array",
+                    "items": {
+                        "type": "integer",
+                        "binaryFormat": "I",
+                    },  # unsigned 32-bit int
+                }
+            },
+        }
+        tables.metadata_schema = tskit.MetadataSchema(schema)
+        tables.metadata = struct_metadata
+        ts_with_metadata = tables.tree_sequence()
+        tszip.compress(ts_with_metadata, self.path)
+        ts_decompressed = tszip.decompress(self.path)
+        self.assertEqual(ts_decompressed.metadata, ts_with_metadata.metadata)
+
+    def test_utf8_time_units_roundtrip(self):
+        """Test that time_units with non-ASCII UTF-8 characters work correctly."""
+        ts = msprime.simulate(10, random_seed=1)
+        tables = ts.dump_tables()
+        # Use time_units with characters that require multi-byte UTF-8 encoding (>127)
+        tables.time_units = "μβrånches per γενεᾱ 世代"  # Greek, Nordic, Chinese chars
+        ts_with_unicode_units = tables.tree_sequence()
+
+        tszip.compress(ts_with_unicode_units, self.path)
+        ts_decompressed = tszip.decompress(self.path)
+        self.assertEqual(ts_decompressed.time_units, ts_with_unicode_units.time_units)
+
+    def test_json_metadata_roundtrip(self):
+        ts = msprime.simulate(10, random_seed=1)
+
+        json_metadata = {
+            "description": "Test tree sequence with JSON metadata",
+            "sample_count": 10,
+            "parameters": {
+                "Ne": 1000,
+                "mutation_rate": 1e-8,
+                "recombination_rate": 1e-8,
+            },
+            "tags": ["test", "simulation", "msprime"],
+            "version": 1.0,
+            "unicode_text": "Héllo Wørld! 你好世界 🧬🌳",  # Characters with ASCII > 127
+            "author": "José María González-Pérez",  # Accented characters
+        }
+
+        tables = ts.dump_tables()
+        schema = {
+            "codec": "json",
+            "type": "object",
+            "properties": {
+                "description": {"type": "string"},
+                "sample_count": {"type": "integer"},
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "Ne": {"type": "number"},
+                        "mutation_rate": {"type": "number"},
+                        "recombination_rate": {"type": "number"},
+                    },
+                },
+                "tags": {"type": "array", "items": {"type": "string"}},
+                "version": {"type": "number"},
+                "unicode_text": {"type": "string"},
+                "author": {"type": "string"},
+            },
+        }
+        tables.metadata_schema = tskit.MetadataSchema(schema)
+        tables.metadata = json_metadata
+        ts_with_metadata = tables.tree_sequence()
+        tszip.compress(ts_with_metadata, self.path)
+        ts_decompressed = tszip.decompress(self.path)
+        self.assertEqual(ts_decompressed.metadata, json_metadata)
+        self.assertEqual(
+            ts_decompressed.metadata_schema, ts_with_metadata.metadata_schema
+        )
+
+    def test_raw_metadata_with_high_bytes(self):
+        ts = msprime.simulate(10, random_seed=1)
+        tables = ts.dump_tables()
+        raw_metadata_bytes = bytes([65, 66, 200, 150, 255, 128])  # Contains bytes > 127
+        tables.metadata = raw_metadata_bytes
+        ts_with_metadata = tables.tree_sequence()
+        tszip.compress(ts_with_metadata, self.path)
+        ts_decompressed = tszip.decompress(self.path)
+        self.assertEqual(ts_decompressed.metadata, raw_metadata_bytes)
+
 
 class TestFileErrors(unittest.TestCase):
     """
@@ -411,3 +509,20 @@ def test_open_both(self):
         ts = tszip.load(files / "1.0.0.trees.tsz")
         ts2 = tszip.load(files / "1.0.0.trees")
         assert ts == ts2
+
+    def test_issue95_metadata_dtype_regression(self):
+        # Test that we can decompress files with struct metadata that were compressed by
+        # version <=0.2.5 that stored metadata as the wrong dtype.
+
+        files = pathlib.Path(__file__).parent / "files"
+
+        ts_original = tszip.load(files / "issue95_metadata_dtype.trees")
+        # This file was compressed with 0.2.5 and should now decompress successfully
+        ts_decompressed = tszip.load(files / "issue95_metadata_bug.tsz")
+
+        assert ts_decompressed.metadata == ts_original.metadata
+        assert isinstance(ts_decompressed.metadata, dict)
+        assert "reverse_node_map" in ts_decompressed.metadata
+        assert len(ts_decompressed.metadata["reverse_node_map"]) == len(
+            ts_original.metadata["reverse_node_map"]
+        )

tszip/compression.py

Lines changed: 9 additions & 3 deletions
@@ -215,9 +215,9 @@ def compress_zarr(ts, root, variants_only=False):
             "reference_sequence/data",
             "reference_sequence/url",
         ]:
-            columns[name] = np.frombuffer(columns[name].encode("utf-8"), np.int8)
+            columns[name] = np.frombuffer(columns[name].encode("utf-8"), np.uint8)
         if name.endswith("metadata"):
-            columns[name] = np.frombuffer(columns[name], np.int8)
+            columns[name] = np.frombuffer(columns[name], np.uint8)
 
     # Some columns benefit from being quantised
     coordinates = np.unique(
@@ -335,7 +335,13 @@ def decompress_zarr(root):
         if key.endswith("metadata_schema") or key == "time_units":
            dict_repr[key] = bytes(value).decode("utf-8")
         elif key.endswith("metadata"):
-            dict_repr[key] = bytes(value)
+            # Handle backward compatibility: <=0.2.5 versions stored metadata as int8
+            # which can have negative values outside the valid byte range (0-255)
+            try:
+                dict_repr[key] = bytes(value)
+            except ValueError:
+                uint8_value = np.array(value, dtype=np.int8).astype(np.uint8)
+                dict_repr[key] = bytes(uint8_value)
         else:
             dict_repr[key] = value
     return tskit.TableCollection.fromdict(dict_repr).tree_sequence()
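
As a usage illustration of the change above, here is a minimal round-trip sketch mirroring the new tests. It assumes msprime and tskit are installed, uses "example.tsz" as a placeholder output path, and is a sketch of expected behaviour rather than code from this commit.

import msprime
import tskit
import tszip

# A tree sequence whose top-level metadata contains non-ASCII UTF-8 text,
# the case that previously failed with ValueError on decompression.
ts = msprime.simulate(10, random_seed=1)
tables = ts.dump_tables()
tables.metadata_schema = tskit.MetadataSchema(
    {"codec": "json", "type": "object", "properties": {"author": {"type": "string"}}}
)
tables.metadata = {"author": "José María González-Pérez"}
ts = tables.tree_sequence()

# With metadata now stored as uint8, the metadata round-trips unchanged.
tszip.compress(ts, "example.tsz")
assert tszip.decompress("example.tsz").metadata == ts.metadata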
