From 88dcc543bfcde6b7c5bd4d706cbab7e19d8a94b4 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Mon, 13 Dec 2021 15:16:42 +0000 Subject: [PATCH 1/2] Add tests to verify reference sequence untouched Also check that sim_mutations maintains time_units Closes #1951 --- tests/test_mutations.py | 34 ++++++++++++++++++++++++++++++++++ tests/test_simulate_from.py | 24 ++++++++++++++++++++++++ 2 files changed, 58 insertions(+) diff --git a/tests/test_mutations.py b/tests/test_mutations.py index a4b01bdc5..36b3dcb16 100644 --- a/tests/test_mutations.py +++ b/tests/test_mutations.py @@ -2168,3 +2168,37 @@ def test_sim_ancestry_mutate(self): assert ts.num_sites == 0 mts = msprime.mutate(ts, rate=1, random_seed=3) assert mts.num_sites > 0 + + +class TestInputUnmodified: + """ + Check that things that shouldn't be touched by sim_mutations, aren't. + """ + + def test_refseq_just_data(self): + ts1 = msprime.sim_ancestry(2, sequence_length=10, random_seed=1) + tables = ts1.dump_tables() + tables.reference_sequence.data = "A" * 10 + ts2 = msprime.sim_mutations(tables.tree_sequence(), rate=1, random_seed=2) + assert ts2.reference_sequence.data == "A" * 10 + tables.reference_sequence.assert_equals(ts2.reference_sequence) + + def test_refseq_all_fields(self): + ts1 = msprime.sim_ancestry(2, sequence_length=10, random_seed=1) + tables = ts1.dump_tables() + tables.reference_sequence.data = "A" + tables.reference_sequence.metadata_schema = ( + tskit.MetadataSchema.permissive_json() + ) + tables.reference_sequence.metadata = {"a": 1, "b": 2} + tables.reference_sequence.url = "http://stuff.stuff" + ts2 = msprime.sim_mutations(tables.tree_sequence(), rate=1, random_seed=2) + tables.reference_sequence.assert_equals(ts2.reference_sequence) + + @pytest.mark.parametrize("time_units", ["", "generations", "mya"]) + def test_time_units(self, time_units): + ts1 = msprime.sim_ancestry(2, sequence_length=10, random_seed=1) + tables = ts1.dump_tables() + tables.time_units = time_units + ts2 = msprime.sim_mutations(tables.tree_sequence(), rate=1, random_seed=2) + assert ts2.time_units == time_units diff --git a/tests/test_simulate_from.py b/tests/test_simulate_from.py index afaa60cde..1a77c31d5 100644 --- a/tests/test_simulate_from.py +++ b/tests/test_simulate_from.py @@ -1162,6 +1162,30 @@ def test_population_size_set(self): ts3 = msprime.sim_ancestry(2, population_size=100, random_seed=2) assert ts2.equals(ts3, ignore_provenance=True) + def test_refseq_just_data_maintained(self): + ts1 = msprime.sim_ancestry(2, end_time=0, sequence_length=10, random_seed=1) + tables = ts1.dump_tables() + tables.reference_sequence.data = "A" * 10 + ts2 = msprime.sim_ancestry( + initial_state=tables, population_size=100, random_seed=2 + ) + assert ts2.reference_sequence.data == "A" * 10 + tables.reference_sequence.assert_equals(ts2.reference_sequence) + + def test_refseq_all_fields_maintained(self): + ts1 = msprime.sim_ancestry(2, end_time=0, sequence_length=10, random_seed=1) + tables = ts1.dump_tables() + tables.reference_sequence.data = "A" + tables.reference_sequence.metadata_schema = ( + tskit.MetadataSchema.permissive_json() + ) + tables.reference_sequence.metadata = {"a": 1, "b": 2} + tables.reference_sequence.url = "http://stuff.stuff" + ts2 = msprime.sim_ancestry( + initial_state=tables, population_size=100, random_seed=2 + ) + tables.reference_sequence.assert_equals(ts2.reference_sequence) + class TestSlimOutput: """ From 74b701841da4cf584bface6ca50ac95a5e60d135 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Mon, 13 Dec 2021 15:35:19 +0000 Subject: [PATCH 2/2] Error if time_units is uncalibrated in sim_mutations Closes #1877 --- CHANGELOG.md | 13 ++++++++- msprime/__init__.py | 1 + msprime/ancestry.py | 26 ++++++++++++++++++ msprime/mutations.py | 5 ++++ msprime/pedigrees.py | 1 + tests/test_ancestry.py | 54 ++++++++++++++++++++++++++++++++++--- tests/test_mutations.py | 12 +++++++++ tests/test_pedigree.py | 1 + tests/test_simulate_from.py | 8 ++++++ 9 files changed, 117 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dc9e46071..9f1d2fa30 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,14 @@ # Changelog -**New features**: +## [1.1.0] - 2021-12-14 + +**New features** +- Add support for tree sequence ``time_units`` field. The ``time_units`` will + be set to "generations" for the output of ``sim_ancestry`` (and ``simulate``), + unless the ``initial_state`` argument is used. In this case, the + ``time_units`` value will be inherited from the input. + ({pr}`1953`, {issue}`1951`, {issue}`1877`, {issue}`1948`, {user}`jeromekelleher`). **Bug fixes**: @@ -18,6 +25,10 @@ ``Demography.from_old_style`` ({issue}`1950`, {pr}`1954`, {user}`jeromekelleher`) +**Maintenance **: + +- Update tskit to Python 0.4.0 and C 0.99.15. + ## [1.0.4] - 2021-12-01 **New features**: diff --git a/msprime/__init__.py b/msprime/__init__.py index 729e75a56..cebdc4b38 100644 --- a/msprime/__init__.py +++ b/msprime/__init__.py @@ -45,6 +45,7 @@ StandardCoalescent, SweepGenicSelection, FixedPedigree, + TimeUnitsMismatchWarning, ) from msprime.core import __version__ diff --git a/msprime/ancestry.py b/msprime/ancestry.py index b8bdea80d..7b41f5337 100644 --- a/msprime/ancestry.py +++ b/msprime/ancestry.py @@ -30,6 +30,7 @@ import math import sys import tempfile +import warnings from typing import Any from typing import ClassVar @@ -45,6 +46,15 @@ logger: logging.Logger = logging.getLogger(__name__) +TIME_UNITS_GENERATIONS = "generations" + + +class TimeUnitsMismatchWarning(UserWarning): + """ + Warning raise when the time units specified in different parts of a + simulation do not match. + """ + def _model_factory(model: None | str | AncestryModel) -> AncestryModel: """ @@ -195,6 +205,7 @@ def _demography_factory( def _build_initial_tables(*, sequence_length, samples, ploidy, demography): # NOTE: this is only used in the simulate() codepath. tables = tskit.TableCollection(sequence_length) + tables.time_units = TIME_UNITS_GENERATIONS for index, (population, time) in enumerate(samples): tables.nodes.add_row( @@ -804,6 +815,20 @@ def _parse_sim_ancestry( raise TypeError( "initial_state must either be a TreeSequence or TableCollection instance" ) + if initial_state.time_units == tskit.TIME_UNITS_UNCALIBRATED: + raise ValueError( + "Cannot use a tree sequence with uncalibrated time_units as " + "the initial state" + ) + if initial_state.time_units != TIME_UNITS_GENERATIONS: + message = ( + f"The initial_state has time_units={initial_state.time_units} but " + "time is measured in generations in msprime. This may lead to " + "significant discrepancies between the timescales. " + "If you wish to suppress this warning, you can use, e.g., " + "warnings.simplefilter('ignore', msprime.TimeUnitsMismatchWarning)" + ) + warnings.warn(message, TimeUnitsMismatchWarning) if sequence_length is None: # These are all the cases in which we derive the sequence_length @@ -940,6 +965,7 @@ def _parse_sim_ancestry( "Either the samples or initial_state arguments must be provided" ) initial_state = tskit.TableCollection(sequence_length) + initial_state.time_units = TIME_UNITS_GENERATIONS demography.insert_populations(initial_state) if not init_for_debugger: sample_sets = _parse_samples(samples, demography) diff --git a/msprime/mutations.py b/msprime/mutations.py index 6851f8970..a195131fa 100644 --- a/msprime/mutations.py +++ b/msprime/mutations.py @@ -1356,6 +1356,11 @@ def sim_mutations( if not isinstance(rate_map, intervals.RateMap): raise TypeError("rate must be a float or a RateMap") + if tables.time_units == tskit.TIME_UNITS_UNCALIBRATED: + raise ValueError( + "Simulating mutations doesn't make sense when time is uncalibrated" + ) + start_time = -sys.float_info.max if start_time is None else float(start_time) end_time = sys.float_info.max if end_time is None else float(end_time) if start_time > end_time: diff --git a/msprime/pedigrees.py b/msprime/pedigrees.py index 0b8ef8e54..9abf1b649 100644 --- a/msprime/pedigrees.py +++ b/msprime/pedigrees.py @@ -31,6 +31,7 @@ def __init__(self, demography=None): demography = demog_mod.Demography.isolated_model([1]) self.demography = demography self.tables = tskit.TableCollection(0) + self.tables.time_units = "generations" demography.insert_populations(self.tables) assert len(self.tables.individuals) == 0 diff --git a/tests/test_ancestry.py b/tests/test_ancestry.py index 2eb8a5af8..3f97c34b3 100644 --- a/tests/test_ancestry.py +++ b/tests/test_ancestry.py @@ -519,7 +519,9 @@ def test_sequence_length_errors(self): ) # An initial state with a sequence_length that disagrees. - initial_state = tskit.TableCollection(1234).tree_sequence() + initial_state = tskit.TableCollection(1234) + initial_state.time_units = "generations" + initial_state = initial_state.tree_sequence() with pytest.raises(ValueError): ancestry._parse_sim_ancestry(initial_state=initial_state, sequence_length=1) with pytest.raises(ValueError): @@ -753,12 +755,14 @@ def test_dtwf_population_size(self): def test_initial_state_errors(self): tables = tskit.TableCollection(1) + tables.time_units = "generations" tables.populations.add_row() # sequence_length doesn't match. with pytest.raises(ValueError): ancestry._parse_sim_ancestry(initial_state=tables, sequence_length=100) # Must have at least one population tables = tskit.TableCollection(1) + tables.time_units = "generations" with pytest.raises(ValueError): ancestry._parse_sim_ancestry(initial_state=tables) for bad_type in [[], "sdf", {}]: @@ -801,6 +805,7 @@ def test_samples_and_initial_state(self): # Specifying both is also an error. tables = tskit.TableCollection(1) + tables.time_units = "generations" tables.populations.add_row() with pytest.raises(ValueError): ancestry._parse_sim_ancestry(2, initial_state=tables) @@ -822,18 +827,22 @@ def test_pedigree_requires_no_population_size(self): @pytest.mark.parametrize("ploidy", [1, 3]) def test_pedigree_requires_ploidy2(self, ploidy): + tables = tskit.TableCollection(1) + tables.time_units = "generations" with pytest.raises(ValueError, match="must have ploidy=2"): ancestry._parse_sim_ancestry( model="fixed_pedigree", - initial_state=tskit.TableCollection(1), + initial_state=tables, ploidy=ploidy, ) def test_pedigree_requires_no_samples(self): + tables = tskit.TableCollection(1) + tables.time_units = "generations" with pytest.raises(ValueError, match="Cannot specify both samples and"): ancestry._parse_sim_ancestry( model="fixed_pedigree", - initial_state=tskit.TableCollection(1), + initial_state=tables, samples=10, ) @@ -2249,3 +2258,42 @@ def test_sim_ancestry_unknown_mid_plus_flanks(self): ) with pytest.raises(ValueError, match="Missing regions of the genome"): msprime.sim_ancestry(2, recombination_rate=rate_map, random_seed=1) + + +class TestTimeUnits: + def test_simulate_gives_generations(self): + ts = msprime.simulate(2, random_seed=1) + assert ts.time_units == "generations" + + def test_sim_ancestry_gives_generations(self): + ts = msprime.sim_ancestry(2, random_seed=1) + assert ts.time_units == "generations" + + @pytest.mark.parametrize("time_units", ["", "unknown", "mya", "ticks"]) + def test_initial_state_warns_not_generations(self, time_units): + tables = msprime.sim_ancestry(2, end_time=0, random_seed=1).dump_tables() + tables.time_units = time_units + with pytest.warns(msprime.TimeUnitsMismatchWarning, match="time_units"): + ts = msprime.sim_ancestry( + initial_state=tables, population_size=10, random_seed=1 + ) + assert ts.time_units == time_units + + def test_initial_state_suppress_message(self): + tables = msprime.sim_ancestry(2, end_time=0, random_seed=1).dump_tables() + tables.time_units = "ticks" + with warnings.catch_warnings(record=True) as w: + with warnings.catch_warnings(): + warnings.simplefilter("ignore", msprime.TimeUnitsMismatchWarning) + msprime.sim_ancestry( + initial_state=tables, population_size=10, random_seed=1 + ) + assert len(w) == 0 + + def test_initial_state_errors_uncalibrated(self): + tables = msprime.sim_ancestry(2, end_time=0, random_seed=1).dump_tables() + tables.time_units = tskit.TIME_UNITS_UNCALIBRATED + with pytest.raises(ValueError, match="time_units"): + msprime.sim_ancestry( + initial_state=tables, population_size=10, random_seed=1 + ) diff --git a/tests/test_mutations.py b/tests/test_mutations.py index 36b3dcb16..a5c3a55c6 100644 --- a/tests/test_mutations.py +++ b/tests/test_mutations.py @@ -524,6 +524,18 @@ def test_unicode_alleles(self): for mutation in site.mutations: assert mutation.derived_state == alleles[1] + @pytest.mark.parametrize("rate", [0, 1]) + def test_uncalibrated_time_units(self, rate): + ts = msprime.sim_ancestry(8, random_seed=2) + tables = ts.dump_tables() + tables.time_units = "uncalibrated" + ts = tables.tree_sequence() + with pytest.raises(ValueError, match="uncalibrated"): + msprime.sim_mutations(ts, rate=rate, random_seed=1) + # Make sure also works on legacy interface + with pytest.raises(ValueError, match="uncalibrated"): + msprime.mutate(ts, rate=rate, random_seed=1) + def test_zero_mutation_rate(self): ts = msprime.sim_ancestry(10, random_seed=1) mutated = msprime.sim_mutations(ts, 0) diff --git a/tests/test_pedigree.py b/tests/test_pedigree.py index 0e947fa35..23c70edf0 100644 --- a/tests/test_pedigree.py +++ b/tests/test_pedigree.py @@ -266,6 +266,7 @@ def test_valid_pedigree(self): num_generations=10, ) tc = tskit.TableCollection(1) + tc.time_units = "generations" tc.individuals.metadata_schema = tb.metadata_schema for row in tb: tc.individuals.append(row) diff --git a/tests/test_simulate_from.py b/tests/test_simulate_from.py index 1a77c31d5..fca551a06 100644 --- a/tests/test_simulate_from.py +++ b/tests/test_simulate_from.py @@ -598,6 +598,7 @@ def verify_simple_model( model=self.model, ) tables = tskit.TableCollection(ts1.sequence_length) + tables.time_units = "generations" tables.populations.add_row() for _ in range(n): tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0, population=0) @@ -658,6 +659,7 @@ def test_two_populations_migration(self): random_seed=seed, ) tables = tskit.TableCollection(1) + tables.time_units = "generations" tables.populations.add_row() tables.populations.add_row() for _ in range(n): @@ -952,6 +954,7 @@ def test_population_split(self): def test_extra_pops_no_metadata(self): tables = tskit.TableCollection(1) + tables.time_units = "generations" tables.populations.add_row() tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, population=0) tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, population=0) @@ -970,6 +973,7 @@ def test_extra_pops_no_metadata(self): def test_extra_pops_struct_metadata(self): tables = tskit.TableCollection(1) + tables.time_units = "generations" tables.populations.metadata_schema = tskit.MetadataSchema( { "codec": "struct", @@ -999,6 +1003,7 @@ def test_extra_pops_struct_metadata(self): def test_extra_pops_missing_name_metadata(self): tables = tskit.TableCollection(1) + tables.time_units = "generations" tables.populations.metadata_schema = tskit.MetadataSchema( { "codec": "json", @@ -1028,6 +1033,7 @@ def test_extra_pops_missing_name_metadata(self): def test_extra_pops_missing_description_metadata(self): tables = tskit.TableCollection(1) + tables.time_units = "generations" tables.populations.metadata_schema = tskit.MetadataSchema( { "codec": "json", @@ -1057,6 +1063,7 @@ def test_extra_pops_missing_description_metadata(self): def test_extra_pops_minimal_schema(self): tables = tskit.TableCollection(1) + tables.time_units = "generations" tables.populations.metadata_schema = tskit.MetadataSchema.permissive_json() tables.populations.add_row(metadata={"name": "X"}) tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, population=0) @@ -1109,6 +1116,7 @@ def test_extra_pops_missing_extra_metadata(self): def test_extra_pops_set_extra_metadata(self): tables = tskit.TableCollection(1) + tables.time_units = "generations" tables.populations.metadata_schema = tskit.MetadataSchema( { "codec": "json",