Skip to content

Commit 13e427d

Browse files
committed
Fix metadata encoding issue
1 parent 266b264 commit 13e427d

File tree

2 files changed

+100
-2
lines changed

2 files changed

+100
-2
lines changed

tests/test_compression.py

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -355,6 +355,104 @@ def test_wrong_format(self):
355355
with self.assertRaises(exceptions.FileFormatError):
356356
tszip.decompress(self.path)
357357

358+
def test_struct_metadata_roundtrip(self):
    """
    Top-level metadata stored with the ``struct`` codec must be identical
    after a compress/decompress cycle.
    """
    # Each array element is packed as an unsigned 32-bit int ("I"), so the
    # encoded metadata contains bytes above 127 (847973 == 0x0CF065).
    node_map_schema = {
        "type": "array",
        "items": {
            "type": "integer",
            "binaryFormat": "I",
        },
    }
    metadata_schema = tskit.MetadataSchema(
        {
            "codec": "struct",
            "type": "object",
            "properties": {"reverse_node_map": node_map_schema},
        }
    )
    payload = {
        "reverse_node_map": [847973, 1442881, 356055, 2542708, 285222, 175110]
    }

    tables = msprime.simulate(10, random_seed=1).dump_tables()
    tables.metadata_schema = metadata_schema
    tables.metadata = payload
    original = tables.tree_sequence()

    tszip.compress(original, self.path)
    restored = tszip.decompress(self.path)
    self.assertEqual(restored.metadata, original.metadata)
385+
386+
def test_utf8_time_units_roundtrip(self):
    """Check that a ``time_units`` string with non-ASCII text round-trips."""
    tables = msprime.simulate(10, random_seed=1).dump_tables()
    # Greek, Nordic and Chinese characters: all need multi-byte UTF-8 encoding.
    tables.time_units = "μβrånches per γενεᾱ 世代"
    original = tables.tree_sequence()

    tszip.compress(original, self.path)
    restored = tszip.decompress(self.path)
    self.assertEqual(restored.time_units, original.time_units)
397+
398+
def test_json_metadata_roundtrip(self):
    """
    JSON-codec top-level metadata (including non-ASCII strings) and its
    schema must both be preserved by a compress/decompress cycle.
    """
    parameters_schema = {
        "type": "object",
        "properties": {
            "Ne": {"type": "number"},
            "mutation_rate": {"type": "number"},
            "recombination_rate": {"type": "number"},
        },
    }
    metadata_schema = tskit.MetadataSchema(
        {
            "codec": "json",
            "type": "object",
            "properties": {
                "description": {"type": "string"},
                "sample_count": {"type": "integer"},
                "parameters": parameters_schema,
                "tags": {"type": "array", "items": {"type": "string"}},
                "version": {"type": "number"},
                "unicode_text": {"type": "string"},
                "author": {"type": "string"},
            },
        }
    )
    expected = {
        "description": "Test tree sequence with JSON metadata",
        "sample_count": 10,
        "parameters": {
            "Ne": 1000,
            "mutation_rate": 1e-8,
            "recombination_rate": 1e-8,
        },
        "tags": ["test", "simulation", "msprime"],
        "version": 1.0,
        # Non-ASCII text: accented Latin, CJK and emoji.
        "unicode_text": "Héllo Wørld! 你好世界 🧬🌳",
        # Accented Latin characters only.
        "author": "José María González-Pérez",
    }

    tables = msprime.simulate(10, random_seed=1).dump_tables()
    tables.metadata_schema = metadata_schema
    tables.metadata = expected
    original = tables.tree_sequence()

    tszip.compress(original, self.path)
    restored = tszip.decompress(self.path)

    # The decoded metadata is compared against the input dict itself, and
    # the schema against the one held by the uncompressed tree sequence.
    self.assertEqual(restored.metadata, expected)
    self.assertEqual(restored.metadata_schema, original.metadata_schema)
445+
446+
def test_raw_metadata_with_high_bytes(self):
    """
    Raw ``bytes`` metadata (no schema is assigned in this test) that
    contains byte values above 127 must round-trip unchanged.
    """
    # Same six bytes as bytes([65, 66, 200, 150, 255, 128]); the last four
    # are above 127.
    payload = b"AB\xc8\x96\xff\x80"

    tables = msprime.simulate(10, random_seed=1).dump_tables()
    tables.metadata = payload
    original = tables.tree_sequence()

    tszip.compress(original, self.path)
    restored = tszip.decompress(self.path)
    self.assertEqual(restored.metadata, payload)
455+
358456

359457
class TestFileErrors(unittest.TestCase):
360458
"""

tszip/compression.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -215,9 +215,9 @@ def compress_zarr(ts, root, variants_only=False):
215215
"reference_sequence/data",
216216
"reference_sequence/url",
217217
]:
218-
columns[name] = np.frombuffer(columns[name].encode("utf-8"), np.int8)
218+
columns[name] = np.frombuffer(columns[name].encode("utf-8"), np.uint8)
219219
if name.endswith("metadata"):
220-
columns[name] = np.frombuffer(columns[name], np.int8)
220+
columns[name] = np.frombuffer(columns[name], np.uint8)
221221

222222
# Some columns benefit from being quantised
223223
coordinates = np.unique(

0 commit comments

Comments
 (0)