
Commit 9a11916

Fix metadata encoding issue
1 parent 266b264 commit 9a11916

File tree

5 files changed: +131 -3 lines changed


CHANGELOG.rst

Lines changed: 7 additions & 0 deletions
@@ -2,6 +2,13 @@
 [0.2.5] - 2025-XX-XX
 --------------------
 
+- Fix a metadata encoding bug (#95) that could cause decompression to raise `ValueError`.
+  Previous versions of tszip incorrectly stored metadata as int8 arrays.
+  For plain ASCII text in either the json or struct codec this would not cause any issues;
+  however, for non-ASCII text or struct data encoded with non-ASCII bytes it could cause
+  decompression to fail. This fix writes metadata correctly, and also correctly loads the
+  erroneous int8 metadata written by previous versions of tszip.
+  (benjeffery, #115)
 - Drop Python 3.9 support, require Python >= 3.10 (#112, benjeffery)
 - Support zarr v3 (#114, benjeffery)
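
As background for this entry, here is a minimal numpy-only sketch of the failure mode. It is an illustration rather than the tszip code path itself: the `.tolist()` call stands in for the element-wise conversion that raised, since the exact object passed to `bytes()` depends on the zarr version.

import numpy as np

# "é" encodes to two UTF-8 bytes, both above 127.
raw = "é".encode("utf-8")              # b'\xc3\xa9'

# Stored as signed int8, as earlier tszip releases did, the high bytes
# become negative values.
as_int8 = np.frombuffer(raw, np.int8)  # array([-61, -87], dtype=int8)

# Rebuilding bytes element by element then fails, because bytes() only
# accepts integers in range(0, 256).
try:
    bytes(as_int8.tolist())
except ValueError as err:
    print(err)                         # bytes must be in range(0, 256)

# Reinterpreting the same values as uint8 recovers the original data,
# which is the conversion the backward-compatibility path below applies.
assert bytes(as_int8.astype(np.uint8)) == raw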

Binary file not shown (1.5 MB).

Binary file not shown (5.51 MB).

tests/test_compression.py

Lines changed: 115 additions & 0 deletions
@@ -355,6 +355,104 @@ def test_wrong_format(self):
         with self.assertRaises(exceptions.FileFormatError):
             tszip.decompress(self.path)
 
+    def test_struct_metadata_roundtrip(self):
+        ts = msprime.simulate(10, random_seed=1)
+
+        struct_metadata = {
+            "reverse_node_map": [847973, 1442881, 356055, 2542708, 285222, 175110]
+        }
+
+        tables = ts.dump_tables()
+        schema = {
+            "codec": "struct",
+            "type": "object",
+            "properties": {
+                "reverse_node_map": {
+                    "type": "array",
+                    "items": {
+                        "type": "integer",
+                        "binaryFormat": "I",
+                    },  # unsigned 32-bit int
+                }
+            },
+        }
+        tables.metadata_schema = tskit.MetadataSchema(schema)
+        tables.metadata = struct_metadata
+        ts_with_metadata = tables.tree_sequence()
+        tszip.compress(ts_with_metadata, self.path)
+        ts_decompressed = tszip.decompress(self.path)
+        self.assertEqual(ts_decompressed.metadata, ts_with_metadata.metadata)
+
+    def test_utf8_time_units_roundtrip(self):
+        """Test that time_units with non-ASCII UTF-8 characters work correctly."""
+        ts = msprime.simulate(10, random_seed=1)
+        tables = ts.dump_tables()
+        # Use time_units with characters that require multi-byte UTF-8 encoding (>127)
+        tables.time_units = "μβrånches per γενεᾱ 世代"  # Greek, Nordic, Chinese chars
+        ts_with_unicode_units = tables.tree_sequence()
+
+        tszip.compress(ts_with_unicode_units, self.path)
+        ts_decompressed = tszip.decompress(self.path)
+        self.assertEqual(ts_decompressed.time_units, ts_with_unicode_units.time_units)
+
+    def test_json_metadata_roundtrip(self):
+        ts = msprime.simulate(10, random_seed=1)
+
+        json_metadata = {
+            "description": "Test tree sequence with JSON metadata",
+            "sample_count": 10,
+            "parameters": {
+                "Ne": 1000,
+                "mutation_rate": 1e-8,
+                "recombination_rate": 1e-8,
+            },
+            "tags": ["test", "simulation", "msprime"],
+            "version": 1.0,
+            "unicode_text": "Héllo Wørld! 你好世界 🧬🌳",  # Characters with ASCII > 127
+            "author": "José María González-Pérez",  # Accented characters
+        }
+
+        tables = ts.dump_tables()
+        schema = {
+            "codec": "json",
+            "type": "object",
+            "properties": {
+                "description": {"type": "string"},
+                "sample_count": {"type": "integer"},
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "Ne": {"type": "number"},
+                        "mutation_rate": {"type": "number"},
+                        "recombination_rate": {"type": "number"},
+                    },
+                },
+                "tags": {"type": "array", "items": {"type": "string"}},
+                "version": {"type": "number"},
+                "unicode_text": {"type": "string"},
+                "author": {"type": "string"},
+            },
+        }
+        tables.metadata_schema = tskit.MetadataSchema(schema)
+        tables.metadata = json_metadata
+        ts_with_metadata = tables.tree_sequence()
+        tszip.compress(ts_with_metadata, self.path)
+        ts_decompressed = tszip.decompress(self.path)
+        self.assertEqual(ts_decompressed.metadata, json_metadata)
+        self.assertEqual(
+            ts_decompressed.metadata_schema, ts_with_metadata.metadata_schema
+        )
+
+    def test_raw_metadata_with_high_bytes(self):
+        ts = msprime.simulate(10, random_seed=1)
+        tables = ts.dump_tables()
+        raw_metadata_bytes = bytes([65, 66, 200, 150, 255, 128])  # Contains bytes > 127
+        tables.metadata = raw_metadata_bytes
+        ts_with_metadata = tables.tree_sequence()
+        tszip.compress(ts_with_metadata, self.path)
+        ts_decompressed = tszip.decompress(self.path)
+        self.assertEqual(ts_decompressed.metadata, raw_metadata_bytes)
+
 
 class TestFileErrors(unittest.TestCase):
     """
@@ -411,3 +509,20 @@ def test_open_both(self):
         ts = tszip.load(files / "1.0.0.trees.tsz")
         ts2 = tszip.load(files / "1.0.0.trees")
         assert ts == ts2
+
+    def test_issue95_metadata_dtype_regression(self):
+        # Test that we can decompress files with struct metadata that were compressed by
+        # version <=0.2.5 that stored metadata as the wrong dtype.
+
+        files = pathlib.Path(__file__).parent / "files"
+
+        ts_original = tszip.load(files / "issue95_metadata_dtype.trees")
+        # This file was compressed with 0.2.5 and should now decompress successfully
+        ts_decompressed = tszip.load(files / "issue95_metadata_bug.tsz")
+
+        assert ts_decompressed.metadata == ts_original.metadata
+        assert isinstance(ts_decompressed.metadata, dict)
+        assert "reverse_node_map" in ts_decompressed.metadata
+        assert len(ts_decompressed.metadata["reverse_node_map"]) == len(
+            ts_original.metadata["reverse_node_map"]
+        )

tszip/compression.py

Lines changed: 9 additions & 3 deletions
@@ -215,9 +215,9 @@ def compress_zarr(ts, root, variants_only=False):
             "reference_sequence/data",
             "reference_sequence/url",
         ]:
-            columns[name] = np.frombuffer(columns[name].encode("utf-8"), np.int8)
+            columns[name] = np.frombuffer(columns[name].encode("utf-8"), np.uint8)
         if name.endswith("metadata"):
-            columns[name] = np.frombuffer(columns[name], np.int8)
+            columns[name] = np.frombuffer(columns[name], np.uint8)
 
     # Some columns benefit from being quantised
     coordinates = np.unique(
@@ -335,7 +335,13 @@ def decompress_zarr(root):
         if key.endswith("metadata_schema") or key == "time_units":
            dict_repr[key] = bytes(value).decode("utf-8")
         elif key.endswith("metadata"):
-            dict_repr[key] = bytes(value)
+            # Handle backward compatibility: <=0.2.5 versions stored metadata as int8
+            # which can have negative values outside the valid byte range (0-255)
+            try:
+                dict_repr[key] = bytes(value)
+            except ValueError:
+                uint8_value = np.array(value, dtype=np.int8).astype(np.uint8)
+                dict_repr[key] = bytes(uint8_value)
         else:
             dict_repr[key] = value
     return tskit.TableCollection.fromdict(dict_repr).tree_sequence()
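
As a usage illustration of the change above, here is a minimal round-trip sketch mirroring the new tests. It assumes msprime and tskit are installed, uses "example.tsz" as a placeholder output path, and is a sketch of expected behaviour rather than code from this commit.

import msprime
import tskit
import tszip

# A tree sequence whose top-level metadata contains non-ASCII UTF-8 text,
# the case that previously failed with ValueError on decompression.
ts = msprime.simulate(10, random_seed=1)
tables = ts.dump_tables()
tables.metadata_schema = tskit.MetadataSchema(
    {"codec": "json", "type": "object", "properties": {"author": {"type": "string"}}}
)
tables.metadata = {"author": "José María González-Pérez"}
ts = tables.tree_sequence()

# With metadata now stored as uint8, the metadata round-trips unchanged.
tszip.compress(ts, "example.tsz")
assert tszip.decompress("example.tsz").metadata == ts.metadata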
