⚡ Extended symbol set for NEXUS outputs to 62; added support for simi…

…larity and distance matrices in PHYLIP format
jjmccollum · Feb 4, 2025 · 2786a36 · 2786a36
1 parent b7d9cad
commit 2786a36
Show file tree

Hide file tree

Showing 5 changed files with 126 additions and 16 deletions.
diff --git a/docs/advanced.rst b/docs/advanced.rst
@@ -618,9 +618,10 @@ For ``nexus`` outputs, the ``CharStateLabels`` block (which provides human-reada
 This is necessary if you intend to pass your NEXUS-formatted data to phylogenetic programs like MrBayes that do not recognize this block.
 Note that all reading labels will be slugified so that all characters (e.g., Greek characters) are converted to ASCII characters and spaces and other punctuation marks are replaced by underscores; this is to conformance with the recommendations for the NEXUS format.
 
-Note that for the ``nexus``, ``hennig86``, ``phylip``, and ``fasta`` output formats, only up to 32 states (represented by the symbols 0-9 and a-v) are supported at this time.
+Note that for ``hennig86``, ``phylip``, and ``fasta`` output formats, only up to 32 states (represented by the symbols 0-9 and a-v) are supported at this time.
 This is a requirement for Hennig86 format, and some phylogenetic programs that use these formats (such as IQTREE and RAxML) do not support symbols outside of the basic 36 alphanumeric characters or a 32-character alphabet at this time.
 The ``stemma`` output format currently supports up to 62 states.
+Outputs in ``nexus`` format also support up to 62 states to accommodate software like PAUP* and Andrew Edmondson's fork of MrBayes (https://github.com/edmondac/MrBayes), but note that some of the programs listed above will not work with ``nexus`` inputs with a state alphabet this large. 
 
 Collations can also be converted to tabular formats.
 Within Python, the ``collation`` class's ``to_numpy`` method can be invoked to convert a collation to a NumPy ``array`` with rows for variant readings, columns for witnesses, and frequency values in the cells.
@@ -637,12 +638,15 @@ The same class's ``to_long_table`` method produces a NumPy ``array`` with column
 The ``to_dataframe`` method invokes ``to_numpy`` by default, but if the ``table_type`` argument is ``distance``, ``nexus`` or ``long``, then it will invoke ``to_distance_matrix``, ``to_nexus_table`` or ``to_long_table``, respectively.
 It returns a Pandas ``DataFrame`` augmented with row and column labels (or, in the case of a long table, just column labels).
 
-From the command line, the standard reading-witness matrix or long table can be written to a specified CSV, TSV, or Excel (.xlsx) file.
+From the command line, the types of matrices listed above can be written to a specified CSV, TSV, or Excel (.xlsx) file.
 If you specify the output filename with its extension, ``teiphy`` will infer which format to use.
 If you want to write a distance matrix, a similarity matrix, a NEXUS-style table, or a long table to output instead of a reading-witness matrix, then you can do so by specifying the ``--table distance``, ``--table similarity``, ``--table nexus``, or ``--table long`` command-line argument, respectively.
 If you are writing a reading-witness matrix to output, you can set the method's ``split_missing`` argument using the ``--split-missing`` command-line flag.
 If you are writing a distance or similarity matrix to output, then you can set the method's ``proportion`` and ``show_ext`` arguments using using the ``--proportion`` and ``--show-ext`` command-line flags, respectively.
 As with plain NEXUS outputs, if you are writing a NEXUS table to output, then you can set the method's ``ambiguous_as_missing`` argument using the ``--ambiguous-as-missing`` command-line flag.
+You can also write a pairwise distance or similarity matrix to a PHYLIP (.phy, .ph) file if you specify ``--table distance`` or ``--table similarity`` as an option with a PHYLIP output.
+(Note that only these two table types are support for this output format; if you specify any other type of table with a PHYLIP output, then the option will be ignored, and a standard PHYLIP output will be generated instead.)
+The ``--proportion`` and ``--show-ext`` flags are supported for PHYLIP matrix outputs.
 
 Other Options
 -------------

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "teiphy"
-version = "0.1.19"
+version = "0.1.20"
 description = "Converts TEI XML collations to NEXUS and other formats"
 authors = ["Joey McCollum and Robert Turnbull"]
 license = "MIT"

diff --git a/teiphy/collation.py b/teiphy/collation.py
@@ -679,9 +679,11 @@ def get_nexus_symbols(self):
         Returns:
             A list of individual characters representing states in readings.
         """
-        # NOTE: PAUP* allows up to 64 symbols, but IQTREE does not appear to support symbols outside of 0-9 and a-z, and base symbols must be case-insensitive,
-        # so we will settle for a maximum of 32 singleton symbols for now
-        possible_symbols = list(string.digits) + list(string.ascii_lowercase)[:22]
+        # NOTE: IQTREE does not appear to support symbols outside of 0-9 and a-z, and its base symbols must be case-insensitive.
+        # The official version of MrBayes is likewise limited to 32 symbols.
+        # But PAUP* allows up to 64 symbols, and Andrew Edmondson's fork of MrBayes does, as well.
+        # So this method will support symbols from 0-9, a-z, and A-Z (for a total of 62 states)
+        possible_symbols = list(string.digits) + list(string.ascii_lowercase) + list(string.ascii_uppercase)
         # The number of symbols needed is equal to the length of the longest substantive reading vector:
         nsymbols = 0
         # If there are no witnesses, then no symbols are needed at all:
@@ -2200,6 +2202,55 @@ def to_excel(
             return df.to_excel(file_addr, index=False)
         return df.to_excel(file_addr)
 
+    def to_phylip_matrix(
+        self,
+        file_addr: Union[Path, str],
+        drop_constant: bool = False,
+        proportion: bool = False,
+        table_type: TableType = TableType.distance,
+        show_ext: bool = False,
+    ):
+        """Writes this Collation as a PHYLIP-formatted distance/similarity matrix to the file with the given address.
+
+        Args:
+            file_addr: A string representing the path to an output PHYLIP file; the file type should be .ph or .phy.
+            drop_constant: An optional flag indicating whether to ignore variation units with one substantive reading.
+                Default value is False.
+            proportion: An optional flag indicating whether or not to calculate distances as proportions over extant, unambiguous variation units.
+                Default value is False.
+            table_type: A TableType option indicating which type of tabular output to generate.
+                For PHYLIP-formatted outputs, distance and similarity matrices are the only supported table types.
+                Default value is "distance".
+            show_ext: An optional flag indicating whether each cell in a distance or similarity matrix
+                should include the number of their extant, unambiguous variation units after the number of their disagreements/agreements.
+                Only applicable for tabular output formats of type \"distance\" or \"similarity\".
+                Default value is False.
+        """
+        # Convert the collation to a Pandas DataFrame first:
+        matrix = None
+        witness_labels = []
+        # Proceed based on the table type:
+        if table_type == TableType.distance:
+            # Convert the collation to a NumPy array and get its row and column labels first:
+            matrix, witness_labels = self.to_distance_matrix(
+                drop_constant=drop_constant, proportion=proportion, show_ext=show_ext
+            )
+        elif table_type == TableType.similarity:
+            # Convert the collation to a NumPy array and get its row and column labels first:
+            matrix, witness_labels = self.to_similarity_matrix(
+                drop_constant=drop_constant, proportion=proportion, show_ext=show_ext
+            )
+        # Generate all parent folders for this file that don't already exist:
+        Path(file_addr).parent.mkdir(parents=True, exist_ok=True)
+        with open(file_addr, "w", encoding="utf-8") as f:
+            # The first line contains the number of taxa:
+            f.write("%d\n" % len(witness_labels))
+            # Every subsequent line contains a witness label, followed by the values in its row of the matrix:
+            for i, wit_id in enumerate(witness_labels):
+                wit_label = slugify(wit_id, lowercase=False, allow_unicode=True, separator='_')
+                f.write("%s %s\n" % (wit_label, " ".join([str(v) for v in matrix[i]])))
+        return
+
     def get_stemma_symbols(self):
         """Returns a list of one-character symbols needed to represent the states of all substantive readings in STEMMA format.
 
@@ -2434,11 +2485,12 @@ def to_file(
             ancestral_logger (AncestralLogger, optional): An AncestralLogger option indicating which class of logger (if any) to use for ancestral states.
                 This option is intended for inputs to BEAST 2.
             table_type (TableType, optional): A TableType option indicating which type of tabular output to generate.
-                Only applicable for tabular outputs.
+                Only applicable for tabular outputs and PHYLIP outputs.
+                If the output is a PHYLIP file, then the type of tabular output must be "distance" or "similarity"; otherwise, it will be ignored.
                 Default value is "matrix".
             show_ext (bool, optional): An optional flag indicating whether each cell in a distance or similarity matrix
                 should include the number of variation units where both witnesses are extant after the number of their disagreements/agreements.
-                Only applicable for tabular output formats of type \"distance\" or \"similarity\".
+                Only applicable for tabular output formats of type "distance" or "similarity".
                 Default value is False.
             seed (optional, int): A seed for random number generation (for setting initial values of unspecified transcriptional rates in BEAST 2 XML output).
         """
@@ -2463,6 +2515,14 @@ def to_file(
             return self.to_hennig86(file_addr, drop_constant=drop_constant)
 
         if format == format.PHYLIP:
+            if table_type in [TableType.distance, TableType.similarity]:
+                return self.to_phylip_matrix(
+                    file_addr,
+                    drop_constant=drop_constant,
+                    proportion=proportion,
+                    table_type=table_type,
+                    show_ext=show_ext,
+                )
             return self.to_phylip(file_addr, drop_constant=drop_constant)
 
         if format == format.FASTA:

diff --git a/teiphy/main.py b/teiphy/main.py
@@ -81,7 +81,7 @@ def to_file(
     ),
     table: TableType = typer.Option(
         TableType.matrix,
-        help="The type of table to use for CSV/Excel output. If \"matrix\", then the table will have rows for witnesses and columns for all variant readings, with frequency values in cells (the --split-missing flag can be used with this option). If \"distance\", then the table will have rows and columns for witnesses, with the number or proportion of disagreements between each pair in the corresponding cell (the --proportion flag can be used with this option). If \"similarity\", then the table will have rows and columns for witnesses, with the number or proportion of agreements between each pair in the corresponding cell (the --proportion flag can be used with this option). If \"nexus\", then the table will have rows for witnesses and columns for variation units with reading IDs in cells (the --ambiguous-as-missing flag can be used with this option). If \"long\", then the table will consist of repeated rows with column entries for taxa, characters, reading indices, and reading texts.",
+        help="The type of table to use for CSV/TSV/Excel/PHYLIP output. If \"matrix\", then the table will have rows for witnesses and columns for all variant readings, with frequency values in cells (the --split-missing flag can be used with this option). If \"distance\", then the table will have rows and columns for witnesses, with the number or proportion of disagreements between each pair in the corresponding cell (the --proportion flag can be used with this option). If \"similarity\", then the table will have rows and columns for witnesses, with the number or proportion of agreements between each pair in the corresponding cell (the --proportion flag can be used with this option). If \"nexus\", then the table will have rows for witnesses and columns for variation units with reading IDs in cells (the --ambiguous-as-missing flag can be used with this option). If \"long\", then the table will consist of repeated rows with column entries for taxa, characters, reading indices, and reading texts. If the output is a PHYLIP file, then the type of tabular output must be \"distance\" or \"similarity\"; otherwise, it will be ignored.",
     ),
     split_missing: bool = typer.Option(
         False,

diff --git a/tests/test_main.py b/tests/test_main.py
@@ -1415,19 +1415,25 @@ def test_to_csv_show_ext_distance_table():
         print(text)
         assert text.startswith(",UBS,P46,01,02,03,04,06")
         assert "\nUBS," in text
-        assert ",19/41," in text # note that type "lac" readings are not treated as missing with the above inputs, so the only variation not counted for the second part is the one where P46 is ambiguous
+        assert (
+            ",19/41," in text
+        )  # note that type "lac" readings are not treated as missing with the above inputs, so the only variation not counted for the second part is the one where P46 is ambiguous
 
 
 def test_to_csv_proportion_show_ext_distance_table():
     with tempfile.TemporaryDirectory() as tmp_dir:
         output = Path(tmp_dir) / "test.csv"
-        result = runner.invoke(app, ["--verbose", "--table", "distance", "--proportion", "--show-ext", str(input_example), str(output)])
+        result = runner.invoke(
+            app, ["--verbose", "--table", "distance", "--proportion", "--show-ext", str(input_example), str(output)]
+        )
         assert result.exit_code == 0
         assert output.exists()
         text = output.read_text(encoding="utf-8-sig")
         assert text.startswith(",UBS,P46,01,02,03,04,06")
         assert "\nUBS," in text
-        assert ",0.4634146341463415/41," in text # note that type "lac" readings are not treated as missing with the above inputs, so the only variation not counted for the second part is the one where P46 is ambiguous
+        assert (
+            ",0.4634146341463415/41," in text
+        )  # note that type "lac" readings are not treated as missing with the above inputs, so the only variation not counted for the second part is the one where P46 is ambiguous
 
 
 def test_to_csv_similarity_table():
@@ -1459,26 +1465,34 @@ def test_to_csv_proportion_similarity_table():
 def test_to_csv_show_ext_similarity_table():
     with tempfile.TemporaryDirectory() as tmp_dir:
         output = Path(tmp_dir) / "test.csv"
-        result = runner.invoke(app, ["--verbose", "--table", "similarity", "--show-ext", str(input_example), str(output)])
+        result = runner.invoke(
+            app, ["--verbose", "--table", "similarity", "--show-ext", str(input_example), str(output)]
+        )
         assert result.exit_code == 0
         assert output.exists()
         text = output.read_text(encoding="utf-8-sig")
         print(text)
         assert text.startswith(",UBS,P46,01,02,03,04,06")
         assert "\nUBS," in text
-        assert "22/41" in text # note that type "lac" readings are not treated as missing with the above inputs, so the only variation not counted for the second part is the one where P46 is ambiguous
+        assert (
+            "22/41" in text
+        )  # note that type "lac" readings are not treated as missing with the above inputs, so the only variation not counted for the second part is the one where P46 is ambiguous
 
 
 def test_to_csv_proportion_show_ext_similarity_table():
     with tempfile.TemporaryDirectory() as tmp_dir:
         output = Path(tmp_dir) / "test.csv"
-        result = runner.invoke(app, ["--verbose", "--table", "similarity", "--proportion", "--show-ext", str(input_example), str(output)])
+        result = runner.invoke(
+            app, ["--verbose", "--table", "similarity", "--proportion", "--show-ext", str(input_example), str(output)]
+        )
         assert result.exit_code == 0
         assert output.exists()
         text = output.read_text(encoding="utf-8-sig")
         assert text.startswith(",UBS,P46,01,02,03,04,06")
         assert "\nUBS," in text
-        assert "0.5365853658536586/41" in text # note that type "lac" readings are not treated as missing with the above inputs, so the only variation not counted for the second part is the one where P46 is ambiguous
+        assert (
+            "0.5365853658536586/41" in text
+        )  # note that type "lac" readings are not treated as missing with the above inputs, so the only variation not counted for the second part is the one where P46 is ambiguous
 
 
 def test_to_csv_drop_constant_long_table():
@@ -1599,6 +1613,38 @@ def test_to_excel_nexus_table():
         assert output.exists()
 
 
+def test_to_phylip_distance_matrix():
+    parser = et.XMLParser(remove_comments=True)
+    xml = et.parse(input_example, parser=parser)
+    xml_witnesses = xml.xpath("//tei:listWit/tei:witness", namespaces={"tei": tei_ns})
+
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        output = Path(tmp_dir) / "test.phy"
+        result = runner.invoke(app, ["--verbose", "--table", "distance", str(input_example), str(output)])
+        assert result.exit_code == 0
+        assert output.exists()
+        text = output.read_text(encoding="utf-8-sig")
+        assert text.startswith("%d" % (len(xml_witnesses)))
+        assert (
+            "UBS 0 19" in text
+        )  # note that type "lac" readings are not treated as missing with the above inputs, so the only variation not counted as a disagreement is the one where P46 is ambiguous
+
+
+def test_to_phylip_similarity_matrix():
+    parser = et.XMLParser(remove_comments=True)
+    xml = et.parse(input_example, parser=parser)
+    xml_witnesses = xml.xpath("//tei:listWit/tei:witness", namespaces={"tei": tei_ns})
+
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        output = Path(tmp_dir) / "test.phy"
+        result = runner.invoke(app, ["--verbose", "--table", "similarity", str(input_example), str(output)])
+        assert result.exit_code == 0
+        assert output.exists()
+        text = output.read_text(encoding="utf-8-sig")
+        assert text.startswith("%d" % (len(xml_witnesses)))
+        assert "UBS 42 22" in text  # UBS agrees with itself 42 times and agrees with P46 22 times
+
+
 def test_to_stemma():
     with tempfile.TemporaryDirectory() as tmp_dir:
         output = Path(tmp_dir) / "test"