⚡ Added support (plus tests and documentation) for supplying/updating witness date ranges through an external CSV file (#94)
jjmccollum committed Jan 7, 2025
1 parent eca9834 commit c0193ec
Showing 8 changed files with 166 additions and 18 deletions.
9 changes: 9 additions & 0 deletions docs/advanced.rst
@@ -585,6 +585,14 @@ The ``AncestralSequenceLogger`` class (part of the ``BEAST_CLASSIC`` package) re
In writing to BEAST 2.7 XML files, ``teiphy`` can include elements for either (or neither) logger based on the ``--ancestral-logger`` argument.
The default option, ``state``, will include an ``AncestralStateLogger`` element in the XML file, while ``sequence`` will include an ``AncestralSequenceLogger`` element, and ``none`` will not include any logging elements for ancestral states.

Overriding or Supplying Dates from a CSV File
---------------------------------------------

You can also specify date ranges for some witnesses in a separate CSV file.
For the sake of completeness, it is recommended that you specify date ranges for witnesses directly in your TEI XML collation, but you may have pulled your collation data and witness date ranges from different sources, or you may want to override existing date ranges in the collation with updated values.
You can specify the path to the CSV file containing witness IDs and their date ranges using the ``--dates-file`` command-line option.
The CSV file should not have a header row, and every row should be formatted as ``"id",min,max``.
The first column contains a string (encoded as such by being surrounded by double quotes) corresponding to the witness ID, and the other two columns are either empty (if one or both ends of the date range are unknown) or integers corresponding to years (where negative integers are assumed to refer to dates BCE).
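
For example, the ``tests/some_dates.csv`` fixture added in this commit fixes the date of witness UBS at 50, leaves both bounds open for P46, gives only a lower bound for 01, only an upper bound for 02, and a full range of 500 to 600 for 06::

    "UBS",50,50
    "P46",,
    "01",300,
    "02",,500
    "06",500,600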

Supported Output Formats and Options
------------------------------------

@@ -601,6 +609,7 @@ Note that all reading labels will be slugified so that all characters (e.g., Gre

Note that for the ``nexus``, ``hennig86``, ``phylip``, and ``fasta`` output formats, only up to 32 states (represented by the symbols 0-9 and a-v) are supported at this time.
This is a requirement for Hennig86 format, and some phylogenetic programs that use these formats (such as IQTREE and RAxML) do not support symbols outside of the basic 36 alphanumeric characters or a 32-character alphabet.
The ``stemma`` output format currently supports up to 62 states.

Collations can also be converted to tabular formats.
Within Python, the ``Collation`` class's ``to_numpy`` method can be invoked to convert a collation to a NumPy ``array`` with rows for variant readings, columns for witnesses, and frequency values in the cells.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "teiphy"
version = "0.1.14"
version = "0.1.15"
description = "Converts TEI XML collations to NEXUS and other formats"
authors = ["Joey McCollum and Robert Turnbull"]
license = "MIT"
66 changes: 51 additions & 15 deletions teiphy/collation.py
@@ -75,6 +75,7 @@ def __init__(
trivial_reading_types: List[str] = [],
missing_reading_types: List[str] = [],
fill_corrector_lacunae: bool = False,
dates_file: Union[Path, str] = None,
verbose: bool = False,
):
"""Constructs a new Collation instance with the given settings.
@@ -85,6 +86,7 @@ def __init__(
trivial_reading_types: An optional set of reading types (e.g., "reconstructed", "defective", "orthographic", "subreading") whose readings should be collapsed under the previous substantive reading.
missing_reading_types: An optional set of reading types (e.g., "lac", "overlap") whose readings should be treated as missing data.
fill_corrector_lacunae: An optional flag indicating whether or not to fill "lacunae" in witnesses with type "corrector".
dates_file: An optional path to a CSV file containing witness IDs, minimum dates, and maximum dates. If specified, then any existing date ranges in the TEI XML collation for the witnesses listed in the first column will be overridden by the values in this file.
verbose: An optional flag indicating whether or not to print timing and debugging details for the user.
"""
self.manuscript_suffixes = manuscript_suffixes
@@ -111,6 +113,9 @@ def __init__(
self.parse_origin_date_range(xml)
self.parse_list_wit(xml)
self.validate_wits(xml)
# If a dates file was specified, then update the witness date ranges manually:
if dates_file is not None:
self.update_witness_date_ranges_from_dates_file(dates_file)
# If the upper bound on a work's date of origin is not defined, then attempt to assign it an upper bound based on the witness dates;
# otherwise, attempt to assign lower bounds to witness dates based on it:
if self.origin_date_range[1] is None:
@@ -282,6 +287,37 @@ def validate_wits(self, xml: et.ElementTree):
print("Finished witness validation in %0.4fs." % (t1 - t0))
return

def update_witness_date_ranges_from_dates_file(self, dates_file: Union[Path, str]):
    """Given a CSV-formatted dates file, update the date ranges of all witnesses whose IDs are in the first column of the dates file
    (overwriting existing date ranges if necessary).

    Args:
        dates_file: A path to a CSV file containing witness IDs, minimum dates, and maximum dates.
    """
    if self.verbose:
        print("Updating witness dates from file %s..." % (str(dates_file)))
    t0 = time.time()
    # Read the dates file, indexing the rows by the witness IDs in the first column:
    dates_df = pd.read_csv(dates_file, index_col=0, names=["id", "min", "max"])
    for witness in self.witnesses:
        wit_id = witness.id
        if wit_id in dates_df.index:
            # For every witness in the list whose ID is specified in the dates file,
            # update its date range (as long as the date range in the file is well-formed):
            min_date = int(dates_df.loc[wit_id]["min"]) if not np.isnan(dates_df.loc[wit_id]["min"]) else None
            max_date = (
                int(dates_df.loc[wit_id]["max"])
                if not np.isnan(dates_df.loc[wit_id]["max"])
                else datetime.now().year
            )
            if min_date is not None and max_date is not None and min_date > max_date:
                raise ParsingException(
                    "In dates file %s, for witness ID %s, the minimum date %d is greater than the maximum date %d."
                    % (str(dates_file), wit_id, min_date, max_date)
                )
            witness.date_range = [min_date, max_date]
    t1 = time.time()
    if self.verbose:
        print("Finished witness date range updates in %0.4fs." % (t1 - t0))
    return

def update_origin_date_range_from_witness_date_ranges(self):
"""Conditionally updates the upper bound on the date of origin of the work represented by this Collation
based on the bounds on the witnesses' dates.
@@ -627,7 +663,7 @@ def to_nexus(
Args:
file_addr: A string representing the path to an output NEXUS file; the file type should be .nex, .nexus, or .nxs.
drop_constant (bool, optional): An optional flag indicating whether to ignore variation units with one substantive reading.
drop_constant: An optional flag indicating whether to ignore variation units with one substantive reading.
char_state_labels: An optional flag indicating whether or not to include the CharStateLabels block.
frequency: An optional flag indicating whether to use the StatesFormat=Frequency setting
instead of the StatesFormat=StatesPresent setting
@@ -1913,10 +1949,10 @@ def to_csv(
Args:
file_addr: A string representing the path to an output CSV file; the file type should be .csv.
drop_constant (bool, optional): An optional flag indicating whether to ignore variation units with one substantive reading.
drop_constant: An optional flag indicating whether to ignore variation units with one substantive reading.
ambiguous_as_missing: An optional flag indicating whether to treat all ambiguous states as missing data.
proportion (bool, optional): An optional flag indicating whether or not to calculate distances as proportions over extant, unambiguous variation units.
table_type (TableType, optional): A TableType option indicating which type of tabular output to generate.
proportion: An optional flag indicating whether or not to calculate distances as proportions over extant, unambiguous variation units.
table_type: A TableType option indicating which type of tabular output to generate.
Only applicable for tabular outputs.
Default value is "matrix".
split_missing: An optional flag indicating whether or not to treat missing characters/variation units as having a contribution of 1 split over all states/readings; if False, then missing data is ignored (i.e., all states are 0). Default value is True.
@@ -1956,13 +1992,13 @@ def to_excel(
Args:
file_addr: A string representing the path to an output Excel file; the file type should be .xlsx.
drop_constant (bool, optional): An optional flag indicating whether to ignore variation units with one substantive reading.
drop_constant: An optional flag indicating whether to ignore variation units with one substantive reading.
ambiguous_as_missing: An optional flag indicating whether to treat all ambiguous states as missing data.
proportion (bool, optional): An optional flag indicating whether or not to calculate distances as proportions over extant, unambiguous variation units.
table_type (TableType, optional): A TableType option indicating which type of tabular output to generate.
proportion: An optional flag indicating whether or not to calculate distances as proportions over extant, unambiguous variation units.
table_type: A TableType option indicating which type of tabular output to generate.
Only applicable for tabular outputs.
Default value is "matrix".
split_missing (bool, optional): An optional flag indicating whether or not to treat missing characters/variation units as having a contribution of 1 split over all states/readings; if False, then missing data is ignored (i.e., all states are 0). Default value is True.
split_missing: An optional flag indicating whether or not to treat missing characters/variation units as having a contribution of 1 split over all states/readings; if False, then missing data is ignored (i.e., all states are 0). Default value is True.
"""
# Convert the collation to a Pandas DataFrame first:
df = self.to_dataframe(
@@ -2009,7 +2045,7 @@ def to_stemma(self, file_addr: Union[Path, str]):
Args:
file_addr: A string representing the path to an output STEMMA prep file; the file should have no extension.
The accompanying chron file will match this file name, except that it will have "_chron" appended to the end.
drop_constant (bool, optional): An optional flag indicating whether to ignore variation units with one substantive reading.
drop_constant: An optional flag indicating whether to ignore variation units with one substantive reading.
"""
# Populate a list of sites that will correspond to columns of the sequence alignment
# (by default, constant sites are dropped):
@@ -2196,21 +2232,21 @@ def to_file(
with a proportion of disagreements to variation units where both witnesses are extant.
It is only applied if the table_type option is "distance".
Default value is False.
calibrate_dates: An optional flag indicating whether to add an Assumptions block that specifies date distributions for witnesses
calibrate_dates (bool, optional): An optional flag indicating whether to add an Assumptions block that specifies date distributions for witnesses
in NEXUS output.
This option is intended for inputs to BEAST 2.
mrbayes: An optional flag indicating whether to add a MrBayes block that specifies model settings and age calibrations for witnesses
mrbayes (bool, optional): An optional flag indicating whether to add a MrBayes block that specifies model settings and age calibrations for witnesses
in NEXUS output.
This option is intended for inputs to MrBayes.
clock_model: A ClockModel option indicating which type of clock model to use.
clock_model (ClockModel, optional): A ClockModel option indicating which type of clock model to use.
This option is intended for inputs to MrBayes and BEAST 2.
MrBayes does not presently support a local clock model, so it will default to a strict clock model if a local clock model is specified.
ancestral_logger: An AncestralLogger option indicating which class of logger (if any) to use for ancestral states.
ancestral_logger (AncestralLogger, optional): An AncestralLogger option indicating which class of logger (if any) to use for ancestral states.
This option is intended for inputs to BEAST 2.
table_type: A TableType option indicating which type of tabular output to generate.
table_type (TableType, optional): A TableType option indicating which type of tabular output to generate.
Only applicable for tabular outputs.
Default value is "matrix".
seed: A seed for random number generation (for setting initial values of unspecified transcriptional rates in BEAST 2 XML output).
seed (int, optional): A seed for random number generation (for setting initial values of unspecified transcriptional rates in BEAST 2 XML output).
"""
file_addr = Path(file_addr)
format = format or Format.infer(
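
For orientation, here is a minimal sketch of how the new ``dates_file`` parameter can be exercised from Python; the file paths are illustrative, and the ``from teiphy import Collation`` import is assumed from the package's public interface rather than shown in this diff:

    import lxml.etree as et
    from teiphy import Collation  # assumed public import path

    # Parse the TEI XML collation the same way main.py does:
    parser = et.XMLParser(remove_comments=True)
    xml = et.parse("collation.xml", parser=parser)

    # Date ranges in the CSV file override any ranges in the XML
    # for the witnesses listed in its first column:
    coll = Collation(xml, dates_file="some_dates.csv", verbose=True)
    coll.to_file("output.nexus", calibrate_dates=True)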
19 changes: 17 additions & 2 deletions teiphy/main.py
@@ -95,6 +95,16 @@ def to_file(
help="Print the current version.",
),
format: Format = typer.Option(None, case_sensitive=False, help="The output format."),
dates_file: Path = typer.Option(
None,
exists=True,
file_okay=True,
dir_okay=False,
writable=False,
readable=True,
resolve_path=True,
help="CSV file containing witness IDs in the first column and minimum and maximum dates for those witnesses in the next two columns. If specified, then for all witnesses in the first column, any existing date ranges for them in the TEI XML collation will be ignored.",
),
input: Path = typer.Argument(
...,
exists=True,
@@ -119,15 +129,20 @@ def to_file(
# Make sure the input is an XML file:
if input.suffix.lower() != ".xml":
print("Error opening input file: The input file is not an XML file. Make sure the input file type is .xml.")
exit(1)
# If it is, then try to parse it:
xml = None
try:
parser = et.XMLParser(remove_comments=True)
xml = et.parse(input, parser=parser)
except Exception as err:
print(f"Error opening input file: {err}")

coll = Collation(xml, suffixes, trivial_reading_types, missing_reading_types, fill_correctors, verbose)
exit(1)
# Make sure the dates_file input, if specified, is a CSV file:
if dates_file is not None and dates_file.suffix.lower() != ".csv":
print("Error opening dates file: The dates file is not a CSV file. Make sure the dates file type is .csv.")
exit(1)
coll = Collation(xml, suffixes, trivial_reading_types, missing_reading_types, fill_correctors, dates_file, verbose)
coll.to_file(
output,
format=format,
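
The equivalent command-line invocation, assuming the ``teiphy`` console script that the package installs (file names illustrative; this mirrors the new tests below):

    teiphy --verbose --calibrate-dates --dates-file some_dates.csv collation.xml output.nexus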
5 changes: 5 additions & 0 deletions tests/bad_dates.csv
@@ -0,0 +1,5 @@
"UBS",50,50
"P46",,
"01",300,
"02",,500
"06",600,500
5 changes: 5 additions & 0 deletions tests/non_csv_dates.txt
@@ -0,0 +1,5 @@
"UBS",50,50
"P46",,
"01",300,
"02",,500
"06",600,500
5 changes: 5 additions & 0 deletions tests/some_dates.csv
@@ -0,0 +1,5 @@
"UBS",50,50
"P46",,
"01",300,
"02",,500
"06",500,600
73 changes: 73 additions & 0 deletions tests/test_main.py
@@ -28,6 +28,9 @@
intrinsic_odds_excess_indegree_example = test_dir / "intrinsic_odds_excess_indegree_example.xml"
intrinsic_odds_cycle_example = test_dir / "intrinsic_odds_cycle_example.xml"
intrinsic_odds_no_relations_example = test_dir / "intrinsic_odds_no_relations_example.xml"
some_dates_csv_file = test_dir / "some_dates.csv"
bad_dates_csv_file = test_dir / "bad_dates.csv"
non_csv_dates_file = test_dir / "non_csv_dates.txt"


def test_version():
@@ -78,6 +81,76 @@ def test_bad_date_witness_input():
)


def test_dates_file_input():
with tempfile.TemporaryDirectory() as tmp_dir:
output = Path(tmp_dir) / "test.nexus"
result = runner.invoke(
app,
[
"--verbose",
"--calibrate-dates",
"--dates-file",
str(some_dates_csv_file),
str(input_example),
str(output),
],
)
text = output.read_text(encoding="utf-8")
assert "Begin ASSUMPTIONS;" in text
assert (
"CALIBRATE UBS = fixed(%d)" % (datetime.now().year - 80) in text
) # the UBS witness, whose lower and upper bounds equal 50, will have its lower and upper bounds updated to 80 to ensure that it is not earlier than the origin
assert (
"CALIBRATE P46 = uniform(%d,%d)" % (0, datetime.now().year - 80) in text
) # neither bound specified, but both inferred
assert (
"CALIBRATE 01 = uniform(%d,%d)" % (0, datetime.now().year - 300) in text
) # lower bound specified, upper bound inferred
assert (
"CALIBRATE 02 = uniform(%d,%d)" % (datetime.now().year - 500, datetime.now().year - 80) in text
) # upper bound specified, lower bound inferred
assert (
"CALIBRATE 06 = uniform(%d,%d)" % (datetime.now().year - 600, datetime.now().year - 500) in text
) # both bounds specified and distinct


def test_bad_dates_file_input():
with tempfile.TemporaryDirectory() as tmp_dir:
output = Path(tmp_dir) / "test.nexus"
result = runner.invoke(
app,
[
"--verbose",
"--calibrate-dates",
"--dates-file",
str(bad_dates_csv_file),
str(input_example),
str(output),
],
)
assert isinstance(result.exception, ParsingException)
assert "In dates file" in str(result.exception)


def test_non_csv_dates_file_input():
with tempfile.TemporaryDirectory() as tmp_dir:
output = Path(tmp_dir) / "test.nexus"
result = runner.invoke(
app,
[
"--verbose",
"--calibrate-dates",
"--dates-file",
str(non_csv_dates_file),
str(input_example),
str(output),
],
)
assert result.stdout.startswith(
"Error opening dates file: The dates file is not a CSV file. Make sure the dates file type is .csv."
)


def test_to_nexus():
with tempfile.TemporaryDirectory() as tmp_dir:
output = Path(tmp_dir) / "test.nexus"
