⚡ Added support (plus tests and documentation) for supplying/updating witness date ranges through an external CSV file (#94)
jjmccollum committed Jan 7, 2025
1 parent eca9834 commit c0193ec
Showing 8 changed files with 166 additions and 18 deletions.
9 changes: 9 additions & 0 deletions docs/advanced.rst
@@ -585,6 +585,14 @@ The ``AncestralSequenceLogger`` class (part of the ``BEAST_CLASSIC`` package) re
In writing to BEAST 2.7 XML files, ``teiphy`` can include elements for either (or neither) logger based on the ``--ancestral-logger`` argument.
The default option, ``state``, will include an ``AncestralStateLogger`` element in the XML file, while ``sequence`` will include an ``AncestralSequenceLogger`` element, and ``none`` will not include any logging elements for ancestral states.

Overriding or Supplying Dates from a CSV File
---------------------------------------------

You can also specify date ranges for some witnesses in a separate CSV file.
For the sake of completeness, it is recommended that you specify date ranges for witnesses directly in your TEI XML collation, but you may have pulled your collation data and witness date ranges from different sources, or you may want to override existing date ranges in the collation with updated values.
You can specify the path to the CSV file containing witness IDs and their date ranges using the ``--dates-file`` command-line option.
The CSV file should not have a header row, and every row should be formatted as ``"id",min,max``.
The first column contains a string (encoded as such by being surrounded by double quotes) corresponding to the witness ID, and the other two columns are either empty (if one or both ends of the date range are unknown) or integers corresponding to years (where negative integers are assumed to refer to dates BCE).
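
For example, the ``tests/some_dates.csv`` fixture added in this commit fixes the date of witness UBS at 50, leaves both bounds open for P46, gives only a lower bound for 01, only an upper bound for 02, and a full range of 500 to 600 for 06::

    "UBS",50,50
    "P46",,
    "01",300,
    "02",,500
    "06",500,600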

Supported Output Formats and Options
------------------------------------

@@ -601,6 +609,7 @@ Note that all reading labels will be slugified so that all characters (e.g., Gre

Note that for the ``nexus``, ``hennig86``, ``phylip``, and ``fasta`` output formats, only up to 32 states (represented by the symbols 0-9 and a-v) are supported at this time.
This is a requirement for Hennig86 format, and some phylogenetic programs that use these formats (such as IQTREE and RAxML) do not support symbols outside of the basic 36 alphanumeric characters or a 32-character alphabet.
The ``stemma`` output format currently supports up to 62 states.

Collations can also be converted to tabular formats.
Within Python, the ``Collation`` class's ``to_numpy`` method can be invoked to convert a collation to a NumPy ``array`` with rows for variant readings, columns for witnesses, and frequency values in the cells.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "teiphy"
version = "0.1.14"
version = "0.1.15"
description = "Converts TEI XML collations to NEXUS and other formats"
authors = ["Joey McCollum and Robert Turnbull"]
license = "MIT"
66 changes: 51 additions & 15 deletions teiphy/collation.py
@@ -75,6 +75,7 @@ def __init__(
trivial_reading_types: List[str] = [],
missing_reading_types: List[str] = [],
fill_corrector_lacunae: bool = False,
dates_file: Union[Path, str] = None,
verbose: bool = False,
):
"""Constructs a new Collation instance with the given settings.
@@ -85,6 +86,7 @@ def __init__(
trivial_reading_types: An optional set of reading types (e.g., "reconstructed", "defective", "orthographic", "subreading") whose readings should be collapsed under the previous substantive reading.
missing_reading_types: An optional set of reading types (e.g., "lac", "overlap") whose readings should be treated as missing data.
fill_corrector_lacunae: An optional flag indicating whether or not to fill "lacunae" in witnesses with type "corrector".
dates_file: An optional path to a CSV file containing witness IDs, minimum dates, and maximum dates. If specified, then any existing date ranges in the TEI XML collation for the witnesses listed in the first column will be overridden by the values in this file.
verbose: An optional flag indicating whether or not to print timing and debugging details for the user.
"""
self.manuscript_suffixes = manuscript_suffixes
@@ -111,6 +113,9 @@ def __init__(
self.parse_origin_date_range(xml)
self.parse_list_wit(xml)
self.validate_wits(xml)
# If a dates file was specified, then update the witness date ranges manually:
if dates_file is not None:
self.update_witness_date_ranges_from_dates_file(dates_file)
# If the upper bound on a work's date of origin is not defined, then attempt to assign it an upper bound based on the witness dates;
# otherwise, attempt to assign lower bounds to witness dates based on it:
if self.origin_date_range[1] is None:
@@ -282,6 +287,37 @@ def validate_wits(self, xml: et.ElementTree):
print("Finished witness validation in %0.4fs." % (t1 - t0))
return

def update_witness_date_ranges_from_dates_file(self, dates_file: Union[Path, str]):
    """Given a CSV-formatted dates file, update the date ranges of all witnesses whose IDs are in the first column of the dates file
    (overwriting existing date ranges if necessary).

    Args:
        dates_file: A path to a CSV file containing witness IDs, minimum dates, and maximum dates.
    """
    if self.verbose:
        print("Updating witness dates from file %s..." % (str(dates_file)))
    t0 = time.time()
    # Read the dates file, indexing the rows by the witness IDs in the first column:
    dates_df = pd.read_csv(dates_file, index_col=0, names=["id", "min", "max"])
    for witness in self.witnesses:
        wit_id = witness.id
        if wit_id in dates_df.index:
            # For every witness in the list whose ID is specified in the dates file,
            # update its date range (as long as the date range in the file is well-formed):
            min_date = int(dates_df.loc[wit_id]["min"]) if not np.isnan(dates_df.loc[wit_id]["min"]) else None
            max_date = (
                int(dates_df.loc[wit_id]["max"])
                if not np.isnan(dates_df.loc[wit_id]["max"])
                else datetime.now().year
            )
            if min_date is not None and max_date is not None and min_date > max_date:
                raise ParsingException(
                    "In dates file %s, for witness ID %s, the minimum date %d is greater than the maximum date %d."
                    % (str(dates_file), wit_id, min_date, max_date)
                )
            witness.date_range = [min_date, max_date]
    t1 = time.time()
    if self.verbose:
        print("Finished witness date range updates in %0.4fs." % (t1 - t0))
    return

def update_origin_date_range_from_witness_date_ranges(self):
"""Conditionally updates the upper bound on the date of origin of the work represented by this Collation
based on the bounds on the witnesses' dates.
@@ -627,7 +663,7 @@ def to_nexus(
Args:
file_addr: A string representing the path to an output NEXUS file; the file type should be .nex, .nexus, or .nxs.
drop_constant (bool, optional): An optional flag indicating whether to ignore variation units with one substantive reading.
drop_constant: An optional flag indicating whether to ignore variation units with one substantive reading.
char_state_labels: An optional flag indicating whether or not to include the CharStateLabels block.
frequency: An optional flag indicating whether to use the StatesFormat=Frequency setting
instead of the StatesFormat=StatesPresent setting
@@ -1913,10 +1949,10 @@ def to_csv(
Args:
file_addr: A string representing the path to an output CSV file; the file type should be .csv.
drop_constant (bool, optional): An optional flag indicating whether to ignore variation units with one substantive reading.
drop_constant: An optional flag indicating whether to ignore variation units with one substantive reading.
ambiguous_as_missing: An optional flag indicating whether to treat all ambiguous states as missing data.
proportion (bool, optional): An optional flag indicating whether or not to calculate distances as proportions over extant, unambiguous variation units.
table_type (TableType, optional): A TableType option indicating which type of tabular output to generate.
proportion: An optional flag indicating whether or not to calculate distances as proportions over extant, unambiguous variation units.
table_type: A TableType option indicating which type of tabular output to generate.
Only applicable for tabular outputs.
Default value is "matrix".
split_missing: An optional flag indicating whether or not to treat missing characters/variation units as having a contribution of 1 split over all states/readings; if False, then missing data is ignored (i.e., all states are 0). Default value is True.
@@ -1956,13 +1992,13 @@ def to_excel(
Args:
file_addr: A string representing the path to an output Excel file; the file type should be .xlsx.
drop_constant (bool, optional): An optional flag indicating whether to ignore variation units with one substantive reading.
drop_constant: An optional flag indicating whether to ignore variation units with one substantive reading.
ambiguous_as_missing: An optional flag indicating whether to treat all ambiguous states as missing data.
proportion (bool, optional): An optional flag indicating whether or not to calculate distances as proportions over extant, unambiguous variation units.
table_type (TableType, optional): A TableType option indicating which type of tabular output to generate.
proportion: An optional flag indicating whether or not to calculate distances as proportions over extant, unambiguous variation units.
table_type: A TableType option indicating which type of tabular output to generate.
Only applicable for tabular outputs.
Default value is "matrix".
split_missing (bool, optional): An optional flag indicating whether or not to treat missing characters/variation units as having a contribution of 1 split over all states/readings; if False, then missing data is ignored (i.e., all states are 0). Default value is True.
split_missing: An optional flag indicating whether or not to treat missing characters/variation units as having a contribution of 1 split over all states/readings; if False, then missing data is ignored (i.e., all states are 0). Default value is True.
"""
# Convert the collation to a Pandas DataFrame first:
df = self.to_dataframe(
@@ -2009,7 +2045,7 @@ def to_stemma(self, file_addr: Union[Path, str]):
Args:
file_addr: A string representing the path to an output STEMMA prep file; the file should have no extension.
The accompanying chron file will match this file name, except that it will have "_chron" appended to the end.
drop_constant (bool, optional): An optional flag indicating whether to ignore variation units with one substantive reading.
drop_constant: An optional flag indicating whether to ignore variation units with one substantive reading.
"""
# Populate a list of sites that will correspond to columns of the sequence alignment
# (by default, constant sites are dropped):
@@ -2196,21 +2232,21 @@ def to_file(
with a proportion of disagreements to variation units where both witnesses are extant.
It is only applied if the table_type option is "distance".
Default value is False.
calibrate_dates: An optional flag indicating whether to add an Assumptions block that specifies date distributions for witnesses
calibrate_dates (bool, optional): An optional flag indicating whether to add an Assumptions block that specifies date distributions for witnesses
in NEXUS output.
This option is intended for inputs to BEAST 2.
mrbayes: An optional flag indicating whether to add a MrBayes block that specifies model settings and age calibrations for witnesses
mrbayes (bool, optional): An optional flag indicating whether to add a MrBayes block that specifies model settings and age calibrations for witnesses
in NEXUS output.
This option is intended for inputs to MrBayes.
clock_model: A ClockModel option indicating which type of clock model to use.
clock_model (ClockModel, optional): A ClockModel option indicating which type of clock model to use.
This option is intended for inputs to MrBayes and BEAST 2.
MrBayes does not presently support a local clock model, so it will default to a strict clock model if a local clock model is specified.
ancestral_logger: An AncestralLogger option indicating which class of logger (if any) to use for ancestral states.
ancestral_logger (AncestralLogger, optional): An AncestralLogger option indicating which class of logger (if any) to use for ancestral states.
This option is intended for inputs to BEAST 2.
table_type: A TableType option indicating which type of tabular output to generate.
table_type (TableType, optional): A TableType option indicating which type of tabular output to generate.
Only applicable for tabular outputs.
Default value is "matrix".
seed: A seed for random number generation (for setting initial values of unspecified transcriptional rates in BEAST 2 XML output).
seed (int, optional): A seed for random number generation (for setting initial values of unspecified transcriptional rates in BEAST 2 XML output).
"""
file_addr = Path(file_addr)
format = format or Format.infer(
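
For orientation, here is a minimal sketch of how the new ``dates_file`` parameter can be exercised from Python; the file paths are illustrative, and the ``from teiphy import Collation`` import is assumed from the package's public interface rather than shown in this diff:

    import lxml.etree as et
    from teiphy import Collation  # assumed public import path

    # Parse the TEI XML collation the same way main.py does:
    parser = et.XMLParser(remove_comments=True)
    xml = et.parse("collation.xml", parser=parser)

    # Date ranges in the CSV file override any ranges in the XML
    # for the witnesses listed in its first column:
    coll = Collation(xml, dates_file="some_dates.csv", verbose=True)
    coll.to_file("output.nexus", calibrate_dates=True)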
19 changes: 17 additions & 2 deletions teiphy/main.py
@@ -95,6 +95,16 @@ def to_file(
help="Print the current version.",
),
format: Format = typer.Option(None, case_sensitive=False, help="The output format."),
dates_file: Path = typer.Option(
None,
exists=True,
file_okay=True,
dir_okay=False,
writable=False,
readable=True,
resolve_path=True,
help="CSV file containing witness IDs in the first column and minimum and maximum dates for those witnesses in the next two columns. If specified, then for all witnesses in the first column, any existing date ranges for them in the TEI XML collation will be ignored.",
),
input: Path = typer.Argument(
...,
exists=True,
@@ -119,15 +129,20 @@ def to_file(
# Make sure the input is an XML file:
if input.suffix.lower() != ".xml":
print("Error opening input file: The input file is not an XML file. Make sure the input file type is .xml.")
exit(1)
# If it is, then try to parse it:
xml = None
try:
parser = et.XMLParser(remove_comments=True)
xml = et.parse(input, parser=parser)
except Exception as err:
print(f"Error opening input file: {err}")

coll = Collation(xml, suffixes, trivial_reading_types, missing_reading_types, fill_correctors, verbose)
exit(1)
# Make sure the dates_file input, if specified, is a CSV file:
if dates_file is not None and dates_file.suffix.lower() != ".csv":
print("Error opening dates file: The dates file is not a CSV file. Make sure the dates file type is .csv.")
exit(1)
coll = Collation(xml, suffixes, trivial_reading_types, missing_reading_types, fill_correctors, dates_file, verbose)
coll.to_file(
output,
format=format,
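
The equivalent command-line invocation, assuming the ``teiphy`` console script that the package installs (file names illustrative; this mirrors the new tests below):

    teiphy --verbose --calibrate-dates --dates-file some_dates.csv collation.xml output.nexus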
5 changes: 5 additions & 0 deletions tests/bad_dates.csv
@@ -0,0 +1,5 @@
"UBS",50,50
"P46",,
"01",300,
"02",,500
"06",600,500
5 changes: 5 additions & 0 deletions tests/non_csv_dates.txt
@@ -0,0 +1,5 @@
"UBS",50,50
"P46",,
"01",300,
"02",,500
"06",600,500
5 changes: 5 additions & 0 deletions tests/some_dates.csv
@@ -0,0 +1,5 @@
"UBS",50,50
"P46",,
"01",300,
"02",,500
"06",500,600
73 changes: 73 additions & 0 deletions tests/test_main.py
@@ -28,6 +28,9 @@
intrinsic_odds_excess_indegree_example = test_dir / "intrinsic_odds_excess_indegree_example.xml"
intrinsic_odds_cycle_example = test_dir / "intrinsic_odds_cycle_example.xml"
intrinsic_odds_no_relations_example = test_dir / "intrinsic_odds_no_relations_example.xml"
some_dates_csv_file = test_dir / "some_dates.csv"
bad_dates_csv_file = test_dir / "bad_dates.csv"
non_csv_dates_file = test_dir / "non_csv_dates.txt"


def test_version():
@@ -78,6 +81,76 @@ def test_bad_date_witness_input():
)


def test_dates_file_input():
with tempfile.TemporaryDirectory() as tmp_dir:
output = Path(tmp_dir) / "test.nexus"
result = runner.invoke(
app,
[
"--verbose",
"--calibrate-dates",
"--dates-file",
str(some_dates_csv_file),
str(input_example),
str(output),
],
)
text = output.read_text(encoding="utf-8")
assert "Begin ASSUMPTIONS;" in text
assert (
"CALIBRATE UBS = fixed(%d)" % (datetime.now().year - 80) in text
) # the UBS witness, whose lower and upper bounds equal 50, will have its lower and upper bounds updated to 80 to ensure that it is not earlier than the origin
assert (
"CALIBRATE P46 = uniform(%d,%d)" % (0, datetime.now().year - 80) in text
) # neither bound specified, but both inferred
assert (
"CALIBRATE 01 = uniform(%d,%d)" % (0, datetime.now().year - 300) in text
) # lower bound specified, upper bound inferred
assert (
"CALIBRATE 02 = uniform(%d,%d)" % (datetime.now().year - 500, datetime.now().year - 80) in text
) # upper bound specified, lower bound inferred
assert (
"CALIBRATE 06 = uniform(%d,%d)" % (datetime.now().year - 600, datetime.now().year - 500) in text
) # both bounds specified and distinct


def test_bad_dates_file_input():
with tempfile.TemporaryDirectory() as tmp_dir:
output = Path(tmp_dir) / "test.nexus"
result = runner.invoke(
app,
[
"--verbose",
"--calibrate-dates",
"--dates-file",
str(bad_dates_csv_file),
str(input_example),
str(output),
],
)
assert isinstance(result.exception, ParsingException)
assert "In dates file" in str(result.exception)


def test_non_csv_dates_file_input():
with tempfile.TemporaryDirectory() as tmp_dir:
output = Path(tmp_dir) / "test.nexus"
result = runner.invoke(
app,
[
"--verbose",
"--calibrate-dates",
"--dates-file",
str(non_csv_dates_file),
str(input_example),
str(output),
],
)
assert result.stdout.startswith(
"Error opening dates file: The dates file is not a CSV file. Make sure the dates file type is .csv."
)


def test_to_nexus():
with tempfile.TemporaryDirectory() as tmp_dir:
output = Path(tmp_dir) / "test.nexus"
