diff --git a/c/CHANGELOG.rst b/c/CHANGELOG.rst index 9b5d5e7f38..bdfa633317 100644 --- a/c/CHANGELOG.rst +++ b/c/CHANGELOG.rst @@ -1,3 +1,35 @@ +-------------------- +[1.2.0] - 2025-XX-XX +-------------------- + +**Breaking changes** + +- ``tsk_treeseq_init`` now requires that mutation parents in the table collection + are correct and consistent with the topology of the tree at each mutation site. + Returns ``TSK_ERR_BAD_MUTATION_PARENT`` if this is not the case, or + ``TSK_ERR_MUTATION_PARENT_AFTER_CHILD`` if the mutations are not in an order + compatible with the correct mutation parent. + (:user:`benjeffery`, :issue:`2729`, :issue:`2732`, :pr:`3212`). + +**Features** + +- Add ``TSK_TS_INIT_COMPUTE_MUTATION_PARENTS`` to ``tsk_treeseq_init`` + to compute mutation parents from the tree sequence topology. + Note that the mutations must be in the correct order. + (:user:`benjeffery`, :issue:`2757`, :pr:`3212`). + +- Add ``TSK_CHECK_MUTATION_PARENTS`` option to ``tsk_table_collection_check_integrity`` + to check that mutation parents are consistent with the tree sequence topology. + This option implies ``TSK_CHECK_TREES``. + (:user:`benjeffery`, :issue:`2729`, :issue:`2732`, :pr:`3212`). + +- Add the ``TSK_NO_CHECK_INTEGRITY`` option to ``tsk_table_collection_compute_mutation_parents`` + to skip the integrity checks that are normally run when computing mutation parents. + This is useful for speeding up the computation of mutation parents when the + tree sequence is certainly known to be valid. + (:user:`benjeffery`, :pr:`3212`). + + -------------------- [1.1.4] - 2025-03-31 -------------------- diff --git a/c/tests/test_tables.c b/c/tests/test_tables.c index 6de6675ff6..aebc1632fb 100644 --- a/c/tests/test_tables.c +++ b/c/tests/test_tables.c @@ -9011,7 +9011,7 @@ test_sort_tables_mutation_times(void) CU_ASSERT_EQUAL_FATAL(ret, 0); /* Check to make sure we have legal mutations */ - ret = tsk_treeseq_init(&ts, &tables, 0); + ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_COMPUTE_MUTATION_PARENTS); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_treeseq_copy_tables(&ts, &t1, 0); @@ -10576,6 +10576,64 @@ test_table_collection_check_integrity_bad_indexes(void) tsk_table_collection_free(&tables); } +static void +test_check_integrity_bad_mutation_parent_topology(void) +{ + int ret; + tsk_id_t ret_trees; + tsk_table_collection_t tables; + const char *sites = "0 0\n"; + /* Make a mutation on a parallel branch the parent*/ + const char *bad_mutations = "0 0 1 -1\n" + "0 1 1 0\n"; + + /* A mutation above is set as child*/ + const char *reverse_mutations = "0 0 1 -1\n" + "0 4 1 0\n"; + + const char *reverse_sites = "0.5 0\n" + "0 0\n"; + + ret = tsk_table_collection_init(&tables, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + tables.sequence_length = 1; + parse_nodes(single_tree_ex_nodes, &tables.nodes); + CU_ASSERT_EQUAL_FATAL(tables.nodes.num_rows, 7); + parse_edges(single_tree_ex_edges, &tables.edges); + CU_ASSERT_EQUAL_FATAL(tables.edges.num_rows, 6); + parse_sites(sites, &tables.sites); + CU_ASSERT_EQUAL_FATAL(tables.sites.num_rows, 1); + parse_mutations(bad_mutations, &tables.mutations); + CU_ASSERT_EQUAL_FATAL(tables.mutations.num_rows, 2); + tables.sequence_length = 1.0; + + ret = tsk_table_collection_build_index(&tables, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + ret_trees = tsk_table_collection_check_integrity(&tables, TSK_CHECK_TREES); + CU_ASSERT_EQUAL_FATAL(ret_trees, 1); + ret_trees + = tsk_table_collection_check_integrity(&tables, TSK_CHECK_MUTATION_PARENTS); + CU_ASSERT_EQUAL_FATAL(ret_trees, TSK_ERR_BAD_MUTATION_PARENT); + + parse_mutations(reverse_mutations, &tables.mutations); + ret_trees = tsk_table_collection_check_integrity(&tables, TSK_CHECK_TREES); + CU_ASSERT_EQUAL_FATAL(ret_trees, 1); + ret_trees + = tsk_table_collection_check_integrity(&tables, TSK_CHECK_MUTATION_PARENTS); + CU_ASSERT_EQUAL_FATAL(ret_trees, TSK_ERR_MUTATION_PARENT_AFTER_CHILD); + + /* Now check that TSK_CHECK_MUTATION_PARENTS implies TSK_CHECK_TREES + by triggering an error with reversed sites */ + parse_sites(reverse_sites, &tables.sites); + ret_trees + = tsk_table_collection_check_integrity(&tables, TSK_CHECK_MUTATION_PARENTS); + CU_ASSERT_EQUAL_FATAL(ret_trees, TSK_ERR_UNSORTED_SITES); + + tsk_table_collection_free(&tables); +} + static void test_table_collection_subset_with_options(tsk_flags_t options) { @@ -11658,6 +11716,8 @@ main(int argc, char **argv) test_table_collection_check_integrity_bad_indexes_example }, { "test_table_collection_check_integrity_bad_indexes", test_table_collection_check_integrity_bad_indexes }, + { "test_check_integrity_bad_mutation_parent_topology", + test_check_integrity_bad_mutation_parent_topology }, { "test_table_collection_subset", test_table_collection_subset }, { "test_table_collection_subset_unsorted", test_table_collection_subset_unsorted }, diff --git a/c/tests/test_trees.c b/c/tests/test_trees.c index dcf4d2a69d..b93d9b0865 100644 --- a/c/tests/test_trees.c +++ b/c/tests/test_trees.c @@ -3886,16 +3886,16 @@ test_simplest_mutation_edges(void) "1 2 2 0\n"; const char *sites = "0.5 0\n" "1.5 0\n"; - const char *mutations = "0 1 1\n" + const char *mutations = "0 2 1\n" + "0 1 1\n" "0 0 1\n" - "0 2 1\n" - "1 0 1\n" + "1 2 1\n" "1 1 1\n" - "1 2 1\n"; + "1 0 1\n"; tsk_treeseq_t ts; tsk_tree_t tree; /* We have mutations over roots, samples and just isolated nodes */ - tsk_id_t mutation_edges[] = { -1, 0, -1, 1, -1, -1 }; + tsk_id_t mutation_edges[] = { -1, -1, 0, -1, -1, 1 }; tsk_size_t i, j, k, t; tsk_mutation_t mut; tsk_site_t site; @@ -4238,7 +4238,7 @@ test_single_tree_bad_mutations(void) ret = tsk_treeseq_init(&ts, &tables, load_flags); CU_ASSERT_EQUAL(ret, TSK_ERR_MUTATION_PARENT_AFTER_CHILD); tsk_treeseq_free(&ts); - tables.mutations.parent[3] = TSK_NULL; + tables.mutations.parent[3] = 2; /* time < node time */ tables.mutations.time[2] = 0; @@ -8715,6 +8715,112 @@ test_init_take_ownership_no_edge_metadata(void) tsk_treeseq_free(&ts); } +static void +test_init_compute_mutation_parents(void) +{ + int ret; + tsk_table_collection_t *tables, *tables2; + tsk_treeseq_t ts; + const char *sites = "0 0\n"; + /* Make a mutation on a parallel branch the parent*/ + const char *bad_mutations = "0 0 1 -1\n" + "0 1 1 0\n"; + + tables = tsk_malloc(sizeof(tsk_table_collection_t)); + CU_ASSERT_NOT_EQUAL_FATAL(tables, NULL); + tables2 = tsk_malloc(sizeof(tsk_table_collection_t)); + CU_ASSERT_NOT_EQUAL_FATAL(tables2, NULL); + + CU_ASSERT_FATAL(tables != NULL); + ret = tsk_table_collection_init(tables, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + tables->sequence_length = 1; + parse_nodes(single_tree_ex_nodes, &tables->nodes); + CU_ASSERT_EQUAL_FATAL(tables->nodes.num_rows, 7); + parse_edges(single_tree_ex_edges, &tables->edges); + CU_ASSERT_EQUAL_FATAL(tables->edges.num_rows, 6); + parse_sites(sites, &tables->sites); + CU_ASSERT_EQUAL_FATAL(tables->sites.num_rows, 1); + parse_mutations(bad_mutations, &tables->mutations); + CU_ASSERT_EQUAL_FATAL(tables->mutations.num_rows, 2); + tables->sequence_length = 1.0; + ret = tsk_table_collection_copy(tables, tables2, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + ret = tsk_treeseq_init(&ts, tables, TSK_TS_INIT_BUILD_INDEXES); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_MUTATION_PARENT); + tsk_treeseq_free(&ts); + + ret = tsk_treeseq_init( + &ts, tables, TSK_TS_INIT_BUILD_INDEXES | TSK_TS_INIT_COMPUTE_MUTATION_PARENTS); + CU_ASSERT_EQUAL_FATAL(ret, 0); + tsk_treeseq_free(&ts); + + /* When we use take ownership, the check of parents shouldn't overwrite them*/ + ret = tsk_treeseq_init(&ts, tables, TSK_TAKE_OWNERSHIP | TSK_TS_INIT_BUILD_INDEXES); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_MUTATION_PARENT); + CU_ASSERT_EQUAL(tables->mutations.parent[0], -1); + CU_ASSERT_EQUAL(tables->mutations.parent[1], 0); + tsk_treeseq_free(&ts); + + /* When we use take ownership and compute, the tables are overwritten*/ + ret = tsk_treeseq_init(&ts, tables2, + TSK_TAKE_OWNERSHIP | TSK_TS_INIT_BUILD_INDEXES + | TSK_TS_INIT_COMPUTE_MUTATION_PARENTS); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_EQUAL(tables2->mutations.parent[0], -1); + CU_ASSERT_EQUAL(tables2->mutations.parent[1], -1); + + /* Don't need to free tables as we took ownership */ + tsk_treeseq_free(&ts); +} + +static void +test_init_compute_mutation_parents_errors(void) +{ + int ret; + tsk_id_t row_ret; + tsk_table_collection_t tables; + tsk_treeseq_t ts; + const char *sites = "0.5 0\n" + "0 0\n"; + + ret = tsk_table_collection_init(&tables, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + tables.sequence_length = 1; + parse_nodes(single_tree_ex_nodes, &tables.nodes); + CU_ASSERT_EQUAL_FATAL(tables.nodes.num_rows, 7); + parse_edges(single_tree_ex_edges, &tables.edges); + CU_ASSERT_EQUAL_FATAL(tables.edges.num_rows, 6); + parse_sites(sites, &tables.sites); + CU_ASSERT_EQUAL_FATAL(tables.sites.num_rows, 2); + tables.sequence_length = 1.0; + + ret = tsk_treeseq_init( + &ts, &tables, TSK_TS_INIT_BUILD_INDEXES | TSK_TS_INIT_COMPUTE_MUTATION_PARENTS); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_UNSORTED_SITES); + tsk_treeseq_free(&ts); + + tsk_site_table_clear(&tables.sites); + row_ret = tsk_site_table_add_row(&tables.sites, 0.5, "A", 1, NULL, 0); + CU_ASSERT_EQUAL_FATAL(row_ret, 0); + row_ret = tsk_mutation_table_add_row( + &tables.mutations, 0, 0, TSK_NULL, TSK_UNKNOWN_TIME, "A", 1, NULL, 0); + CU_ASSERT_EQUAL_FATAL(row_ret, 0); + row_ret = tsk_mutation_table_add_row( + &tables.mutations, 0, 4, TSK_NULL, TSK_UNKNOWN_TIME, "A", 1, NULL, 0); + CU_ASSERT_EQUAL_FATAL(row_ret, 1); + + ret = tsk_treeseq_init( + &ts, &tables, TSK_TS_INIT_BUILD_INDEXES | TSK_TS_INIT_COMPUTE_MUTATION_PARENTS); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MUTATION_PARENT_AFTER_CHILD); + tsk_treeseq_free(&ts); + + tsk_table_collection_free(&tables); +} + int main(int argc, char **argv) { @@ -8923,6 +9029,9 @@ main(int argc, char **argv) test_extend_haplotypes_conflicting_times }, { "test_init_take_ownership_no_edge_metadata", test_init_take_ownership_no_edge_metadata }, + { "test_init_compute_mutation_parents", test_init_compute_mutation_parents }, + { "test_init_compute_mutation_parents_errors", + test_init_compute_mutation_parents_errors }, { NULL, NULL }, }; diff --git a/c/tests/testlib.c b/c/tests/testlib.c index 637a59e681..1c59ca02db 100644 --- a/c/tests/testlib.c +++ b/c/tests/testlib.c @@ -767,6 +767,8 @@ tsk_treeseq_from_text(tsk_treeseq_t *ts, double sequence_length, const char *nod tsk_table_collection_t tables; tsk_id_t max_population_id; tsk_size_t j; + tsk_flags_t ts_flags; + bool all_parents_null; CU_ASSERT_FATAL(ts != NULL); CU_ASSERT_FATAL(nodes != NULL); @@ -807,7 +809,21 @@ tsk_treeseq_from_text(tsk_treeseq_t *ts, double sequence_length, const char *nod } } - ret = tsk_treeseq_init(ts, &tables, TSK_TS_INIT_BUILD_INDEXES); + /* If all mutation.parent are TSK_NULL, use TSK_TS_COMPUTE_MUTATION_PARENTS flag too + */ + ts_flags = TSK_TS_INIT_BUILD_INDEXES; + all_parents_null = true; + for (j = 0; j < tables.mutations.num_rows; j++) { + if (tables.mutations.parent[j] != TSK_NULL) { + all_parents_null = false; + break; + } + } + if (all_parents_null) { + ts_flags |= TSK_TS_INIT_COMPUTE_MUTATION_PARENTS; + } + + ret = tsk_treeseq_init(ts, &tables, ts_flags); /* tsk_treeseq_print_state(ts, stdout); */ if (ret != 0) { printf("\nret = %s\n", tsk_strerror(ret)); diff --git a/c/tskit/core.c b/c/tskit/core.c index 175505439e..8adcf1f85d 100644 --- a/c/tskit/core.c +++ b/c/tskit/core.c @@ -347,6 +347,12 @@ tsk_strerror_internal(int err) "(TSK_ERR_DISALLOWED_UNKNOWN_MUTATION_TIME)"; break; + case TSK_ERR_BAD_MUTATION_PARENT: + ret = "A mutation's parent is not consistent with the topology of the tree. " + "Use compute_mutation_parents to set the parents correctly." + "(TSK_ERR_BAD_MUTATION_PARENT)"; + break; + /* Migration errors */ case TSK_ERR_UNSORTED_MIGRATIONS: ret = "Migrations must be sorted by time. (TSK_ERR_UNSORTED_MIGRATIONS)"; diff --git a/c/tskit/core.h b/c/tskit/core.h index a4d022c2dd..bbf5d789fc 100644 --- a/c/tskit/core.h +++ b/c/tskit/core.h @@ -510,6 +510,12 @@ Some mutations have TSK_UNKNOWN_TIME in an algorithm where that's disallowed (use compute_mutation_times?). */ #define TSK_ERR_DISALLOWED_UNKNOWN_MUTATION_TIME -510 + +/** +A mutation's parent was not consistent with the topology of the tree. + */ +#define TSK_ERR_BAD_MUTATION_PARENT -511 + /** @} */ /** diff --git a/c/tskit/tables.c b/c/tskit/tables.c index 4b35ff5c1d..edc416ebf4 100644 --- a/c/tskit/tables.c +++ b/c/tskit/tables.c @@ -10986,11 +10986,159 @@ tsk_table_collection_check_index_integrity(const tsk_table_collection_t *self) return ret; } +static int TSK_WARN_UNUSED +tsk_table_collection_compute_mutation_parents_to_array( + const tsk_table_collection_t *self, tsk_id_t *mutation_parent) +{ + int ret = 0; + const tsk_id_t *I, *O; + const tsk_edge_table_t edges = self->edges; + const tsk_node_table_t nodes = self->nodes; + const tsk_site_table_t sites = self->sites; + const tsk_mutation_table_t mutations = self->mutations; + const tsk_id_t M = (tsk_id_t) edges.num_rows; + tsk_id_t tj, tk; + tsk_id_t *parent = NULL; + tsk_id_t *bottom_mutation = NULL; + tsk_id_t u; + double left, right; + tsk_id_t site; + /* Using unsigned values here avoids potentially undefined behaviour */ + tsk_size_t j, mutation, first_mutation; + + parent = tsk_malloc(nodes.num_rows * sizeof(*parent)); + bottom_mutation = tsk_malloc(nodes.num_rows * sizeof(*bottom_mutation)); + if (parent == NULL || bottom_mutation == NULL) { + ret = tsk_trace_error(TSK_ERR_NO_MEMORY); + goto out; + } + tsk_memset(parent, 0xff, nodes.num_rows * sizeof(*parent)); + tsk_memset(bottom_mutation, 0xff, nodes.num_rows * sizeof(*bottom_mutation)); + tsk_memset(mutation_parent, 0xff, self->mutations.num_rows * sizeof(tsk_id_t)); + + I = self->indexes.edge_insertion_order; + O = self->indexes.edge_removal_order; + tj = 0; + tk = 0; + site = 0; + mutation = 0; + left = 0; + while (tj < M || left < self->sequence_length) { + while (tk < M && edges.right[O[tk]] == left) { + parent[edges.child[O[tk]]] = TSK_NULL; + tk++; + } + while (tj < M && edges.left[I[tj]] == left) { + parent[edges.child[I[tj]]] = edges.parent[I[tj]]; + tj++; + } + right = self->sequence_length; + if (tj < M) { + right = TSK_MIN(right, edges.left[I[tj]]); + } + if (tk < M) { + right = TSK_MIN(right, edges.right[O[tk]]); + } + + /* Tree is now ready. We look at each site on this tree in turn */ + while (site < (tsk_id_t) sites.num_rows && sites.position[site] < right) { + /* Create a mapping from mutations to nodes. If we see more than one + * mutation at a node, the previously seen one must be the parent + * of the current since we assume they are in order. */ + first_mutation = mutation; + while (mutation < mutations.num_rows && mutations.site[mutation] == site) { + u = mutations.node[mutation]; + if (bottom_mutation[u] != TSK_NULL) { + mutation_parent[mutation] = bottom_mutation[u]; + } + bottom_mutation[u] = (tsk_id_t) mutation; + mutation++; + } + /* Make the common case of 1 mutation fast */ + if (mutation > first_mutation + 1) { + /* If we have more than one mutation, compute the parent for each + * one by traversing up the tree until we find a node that has a + * mutation. */ + for (j = first_mutation; j < mutation; j++) { + if (mutation_parent[j] == TSK_NULL) { + u = parent[mutations.node[j]]; + while (u != TSK_NULL && bottom_mutation[u] == TSK_NULL) { + u = parent[u]; + } + if (u != TSK_NULL) { + mutation_parent[j] = bottom_mutation[u]; + } + } + } + } + /* Reset the mapping for the next site */ + for (j = first_mutation; j < mutation; j++) { + u = mutations.node[j]; + bottom_mutation[u] = TSK_NULL; + /* Check that we haven't violated the sortedness property */ + if (mutation_parent[j] > (tsk_id_t) j) { + ret = tsk_trace_error(TSK_ERR_MUTATION_PARENT_AFTER_CHILD); + goto out; + } + } + site++; + } + /* Move on to the next tree */ + left = right; + } + +out: + tsk_safe_free(parent); + tsk_safe_free(bottom_mutation); + return ret; +} + +static int TSK_WARN_UNUSED +tsk_table_collection_check_mutation_parents(const tsk_table_collection_t *self) +{ + int ret = 0; + tsk_mutation_table_t mutations = self->mutations; + tsk_id_t *new_parents = NULL; + tsk_size_t j; + + if (mutations.num_rows == 0) { + return ret; + } + + new_parents = tsk_malloc(mutations.num_rows * sizeof(*new_parents)); + if (new_parents == NULL) { + ret = tsk_trace_error(TSK_ERR_NO_MEMORY); + goto out; + } + + ret = tsk_table_collection_compute_mutation_parents_to_array(self, new_parents); + if (ret != 0) { + goto out; + } + + for (j = 0; j < mutations.num_rows; j++) { + if (mutations.parent[j] != new_parents[j]) { + ret = tsk_trace_error(TSK_ERR_BAD_MUTATION_PARENT); + goto out; + } + } + +out: + tsk_safe_free(new_parents); + return ret; +} + tsk_id_t TSK_WARN_UNUSED tsk_table_collection_check_integrity( const tsk_table_collection_t *self, tsk_flags_t options) { tsk_id_t ret = 0; + int mut_ret = 0; + + if (options & TSK_CHECK_MUTATION_PARENTS) { + /* If we're checking mutation parents, we need to check the trees first */ + options |= TSK_CHECK_TREES; + } if (options & TSK_CHECK_TREES) { /* Checking the trees implies these checks */ @@ -11043,6 +11191,14 @@ tsk_table_collection_check_integrity( if (ret < 0) { goto out; } + /* This check requires tree integrity so do it last */ + if (options & TSK_CHECK_MUTATION_PARENTS) { + mut_ret = tsk_table_collection_check_mutation_parents(self); + if (mut_ret != 0) { + ret = mut_ret; + goto out; + } + } } out: return ret; @@ -12346,117 +12502,25 @@ tsk_table_collection_deduplicate_sites( int TSK_WARN_UNUSED tsk_table_collection_compute_mutation_parents( - tsk_table_collection_t *self, tsk_flags_t TSK_UNUSED(options)) + tsk_table_collection_t *self, tsk_flags_t options) { int ret = 0; - tsk_id_t num_trees; - const tsk_id_t *I, *O; - const tsk_edge_table_t edges = self->edges; - const tsk_node_table_t nodes = self->nodes; - const tsk_site_table_t sites = self->sites; - const tsk_mutation_table_t mutations = self->mutations; - const tsk_id_t M = (tsk_id_t) edges.num_rows; - tsk_id_t tj, tk; - tsk_id_t *parent = NULL; - tsk_id_t *bottom_mutation = NULL; - tsk_id_t u; - double left, right; - tsk_id_t site; - /* Using unsigned values here avoids potentially undefined behaviour */ - tsk_size_t j, mutation, first_mutation; - - /* Set the mutation parent to TSK_NULL so that we don't check the - * parent values we are about to write over. */ - tsk_memset(mutations.parent, 0xff, mutations.num_rows * sizeof(*mutations.parent)); - num_trees = tsk_table_collection_check_integrity(self, TSK_CHECK_TREES); - if (num_trees < 0) { - ret = (int) num_trees; - goto out; - } - parent = tsk_malloc(nodes.num_rows * sizeof(*parent)); - bottom_mutation = tsk_malloc(nodes.num_rows * sizeof(*bottom_mutation)); - if (parent == NULL || bottom_mutation == NULL) { - ret = tsk_trace_error(TSK_ERR_NO_MEMORY); - goto out; - } - tsk_memset(parent, 0xff, nodes.num_rows * sizeof(*parent)); - tsk_memset(bottom_mutation, 0xff, nodes.num_rows * sizeof(*bottom_mutation)); - tsk_memset(mutations.parent, 0xff, self->mutations.num_rows * sizeof(tsk_id_t)); - I = self->indexes.edge_insertion_order; - O = self->indexes.edge_removal_order; - tj = 0; - tk = 0; - site = 0; - mutation = 0; - left = 0; - while (tj < M || left < self->sequence_length) { - while (tk < M && edges.right[O[tk]] == left) { - parent[edges.child[O[tk]]] = TSK_NULL; - tk++; - } - while (tj < M && edges.left[I[tj]] == left) { - parent[edges.child[I[tj]]] = edges.parent[I[tj]]; - tj++; - } - right = self->sequence_length; - if (tj < M) { - right = TSK_MIN(right, edges.left[I[tj]]); - } - if (tk < M) { - right = TSK_MIN(right, edges.right[O[tk]]); + if (!(options & TSK_NO_CHECK_INTEGRITY)) { + /* Safe to cast here as we're not counting trees */ + ret = (int) tsk_table_collection_check_integrity(self, TSK_CHECK_TREES); + if (ret < 0) { + goto out; } + } - /* Tree is now ready. We look at each site on this tree in turn */ - while (site < (tsk_id_t) sites.num_rows && sites.position[site] < right) { - /* Create a mapping from mutations to nodes. If we see more than one - * mutation at a node, the previously seen one must be the parent - * of the current since we assume they are in order. */ - first_mutation = mutation; - while (mutation < mutations.num_rows && mutations.site[mutation] == site) { - u = mutations.node[mutation]; - if (bottom_mutation[u] != TSK_NULL) { - mutations.parent[mutation] = bottom_mutation[u]; - } - bottom_mutation[u] = (tsk_id_t) mutation; - mutation++; - } - /* Make the common case of 1 mutation fast */ - if (mutation > first_mutation + 1) { - /* If we have more than one mutation, compute the parent for each - * one by traversing up the tree until we find a node that has a - * mutation. */ - for (j = first_mutation; j < mutation; j++) { - if (mutations.parent[j] == TSK_NULL) { - u = parent[mutations.node[j]]; - while (u != TSK_NULL && bottom_mutation[u] == TSK_NULL) { - u = parent[u]; - } - if (u != TSK_NULL) { - mutations.parent[j] = bottom_mutation[u]; - } - } - } - } - /* Reset the mapping for the next site */ - for (j = first_mutation; j < mutation; j++) { - u = mutations.node[j]; - bottom_mutation[u] = TSK_NULL; - /* Check that we haven't violated the sortedness property */ - if (mutations.parent[j] > (tsk_id_t) j) { - ret = tsk_trace_error(TSK_ERR_MUTATION_PARENT_AFTER_CHILD); - goto out; - } - } - site++; - } - /* Move on to the next tree */ - left = right; + ret = tsk_table_collection_compute_mutation_parents_to_array( + self, self->mutations.parent); + if (ret != 0) { + goto out; } out: - tsk_safe_free(parent); - tsk_safe_free(bottom_mutation); return ret; } @@ -12494,6 +12558,7 @@ tsk_table_collection_compute_mutation_times( for (j = 0; j < mutations.num_rows; j++) { mutations.time[j] = TSK_UNKNOWN_TIME; } + /* TSK_CHECK_MUTATION_PARENTS isn't needed here as we're not using the parents */ num_trees = tsk_table_collection_check_integrity(self, TSK_CHECK_TREES); if (num_trees < 0) { ret = (int) num_trees; diff --git a/c/tskit/tables.h b/c/tskit/tables.h index 204e39e2db..085f14ba0a 100644 --- a/c/tskit/tables.h +++ b/c/tskit/tables.h @@ -797,6 +797,11 @@ All checks needed to define a valid tree sequence. Note that this implies all of the above checks. */ #define TSK_CHECK_TREES (1 << 7) +/** +Check mutation parents are consistent with topology. +Implies TSK_CHECK_TREES. +*/ +#define TSK_CHECK_MUTATION_PARENTS (1 << 8) /* Leave room for more positive check flags */ /** diff --git a/c/tskit/trees.c b/c/tskit/trees.c index 4638fb18e8..f3713843ed 100644 --- a/c/tskit/trees.c +++ b/c/tskit/trees.c @@ -458,10 +458,29 @@ tsk_treeseq_init( goto out; } } - num_trees = tsk_table_collection_check_integrity(self->tables, TSK_CHECK_TREES); - if (num_trees < 0) { - ret = (int) num_trees; - goto out; + + if (options & TSK_TS_INIT_COMPUTE_MUTATION_PARENTS) { + /* As tsk_table_collection_compute_mutation_parents performs an + integrity check, and we don't wish to do that twice we perform + our own check here */ + num_trees = tsk_table_collection_check_integrity(self->tables, TSK_CHECK_TREES); + if (num_trees < 0) { + ret = (int) num_trees; + goto out; + } + + ret = tsk_table_collection_compute_mutation_parents( + self->tables, TSK_NO_CHECK_INTEGRITY); + if (ret != 0) { + goto out; + } + } else { + num_trees = tsk_table_collection_check_integrity( + self->tables, TSK_CHECK_TREES | TSK_CHECK_MUTATION_PARENTS); + if (num_trees < 0) { + ret = (int) num_trees; + goto out; + } } self->num_trees = (tsk_size_t) num_trees; self->discrete_genome = true; diff --git a/c/tskit/trees.h b/c/tskit/trees.h index bef944fff3..f4b3993be6 100644 --- a/c/tskit/trees.h +++ b/c/tskit/trees.h @@ -71,6 +71,11 @@ when the tree sequence is initialised. Indexes are required for a valid tree sequence, and are not built by default for performance reasons. */ #define TSK_TS_INIT_BUILD_INDEXES (1 << 0) +/** +If specified, mutation parents in the table collection will be overwritten +with those computed from the topology when the tree sequence is initialised. +*/ +#define TSK_TS_INIT_COMPUTE_MUTATION_PARENTS (1 << 1) /** @} */ // clang-format on diff --git a/docs/data-model.md b/docs/data-model.md index a9011d36ed..152da15100 100644 --- a/docs/data-model.md +++ b/docs/data-model.md @@ -507,13 +507,14 @@ requirements for a valid set of mutations are: `time` and equal to or less than the `time` of the `parent` mutation if this mutation has one. If one mutation on a site has `UNKNOWN_TIME` then all mutations at that site must, as a mixture of known and unknown is not valid. -- `parent` must either be the null ID (-1) or a valid mutation ID within the - current table +- `parent` must either be the null ID (-1), if the mutation has no parent, or a + valid mutation ID within the current table. Furthermore, -- If another mutation occurs on the tree above the mutation in - question, its ID must be listed as the `parent`. +- The `parent` value must be consistent with the topology of the tree at the site + of the mutation, such that a path from the child mutation to the parent mutation + exists without passing through any other mutations at the same site. For simplicity and algorithmic efficiency, mutations must also: diff --git a/docs/file-formats.md b/docs/file-formats.md index c83e2e2666..a78295ee2e 100644 --- a/docs/file-formats.md +++ b/docs/file-formats.md @@ -110,8 +110,8 @@ position ancestral_state mutations = """\ site node derived_state time parent 0 0 A 0.5 -1 +1 1 A 1.0 -1 1 0 T 1.5 -1 -1 1 A 1.0 1 """ migrations = """\ diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst index a6f5986e62..7a1da39e39 100644 --- a/python/CHANGELOG.rst +++ b/python/CHANGELOG.rst @@ -2,6 +2,15 @@ [0.6.5] - 2025-0X-XX -------------------- +**Breaking Changes** + +- For a tree seqeunce to be valid mutation parents in the table collection + must be correct and consistent with the topology of the tree at each mutation site. + ``TableCollection.tree_sequence()`` will raise a ``_tskit.LibraryError`` if this + is not the case. + (:user:`benjeffery`, :issue:`2729`, :issue:`2732`, :pr:`3212`). + + **Features** - ``TreeSequence.map_to_vcf_model`` now also returns the transformed positions and diff --git a/python/tests/test_genotypes.py b/python/tests/test_genotypes.py index e2a0920376..dc3ad21af7 100644 --- a/python/tests/test_genotypes.py +++ b/python/tests/test_genotypes.py @@ -342,6 +342,7 @@ def test_silent_mutations(self): site = tables.sites.add_row(position=0, ancestral_state="0") tables.mutations.add_row(site=site, node=u, derived_state="1") tables.mutations.add_row(site=site, node=sample, derived_state="1") + tables.compute_mutation_parents() ts_new = tables.tree_sequence() assert all([v.genotypes[sample] == 1 for v in ts_new.variants()]) @@ -927,8 +928,9 @@ def test_silent_mutations(self): tables.sites.clear() tables.mutations.clear() site = tables.sites.add_row(position=0, ancestral_state="0") - tables.mutations.add_row(site=site, node=u, derived_state="1") tables.mutations.add_row(site=site, node=tree.root, derived_state="1") + tables.mutations.add_row(site=site, node=u, derived_state="1") + tables.compute_mutation_parents() ts_new = tables.tree_sequence() all(h == 1 for h in ts_new.haplotypes()) diff --git a/python/tests/test_highlevel.py b/python/tests/test_highlevel.py index 429d1ecc5f..accb17f821 100644 --- a/python/tests/test_highlevel.py +++ b/python/tests/test_highlevel.py @@ -2692,6 +2692,104 @@ def test_impute_unknown_mutations_time(self, ts): ): ts.impute_unknown_mutations_time(method="foobar") + @pytest.mark.parametrize( + "mutations, error", + [ + ([], None), + ( + [{"node": 0, "parent": -1}, {"node": 1, "parent": -1}], + None, + ), # On parallel branches, no parents + ( + [ + {"node": 4, "parent": -1}, + {"node": 0, "parent": 0}, + {"node": 1, "parent": 0}, + ], + None, + ), # On parallel branches, legal parent + ( + [{"node": 0, "parent": -1}, {"node": 0, "parent": 0}], + None, + ), # On same node + ( + [{"node": 0, "parent": -1}, {"node": 0, "parent": -1}], + "not consistent with the topology", + ), # On same node without parents + ( + [ + {"node": 3, "parent": -1}, + {"node": 0, "parent": 0}, + {"node": 1, "parent": 0}, + ], + "not consistent with the topology", + ), # On parallel branches, parent on parallel branches + ( + [ + {"node": 5, "parent": -1}, + {"node": 0, "parent": 0}, + {"node": 1, "parent": 0}, + ], + "not consistent with the topology", + ), # On parallel branches, parent high on parallel + ( + [ + {"node": 3, "parent": -1}, + {"node": 0, "parent": 0}, + {"node": 7, "parent": 0}, + ], + "not consistent with the topology", + ), # On parallel branches, parent on different root + ( + [ + {"node": 0, "parent": -1}, + {"node": 1, "parent": 0}, + ], + "not consistent with the topology", + ), # parent on parallel branch + ( + [ + {"node": 6, "parent": -1}, + {"node": 6, "parent": 0}, + ], + None, + ), # parent above root + ( + [ + {"node": 6, "parent": -1}, + {"node": 6, "parent": -1}, + ], + "not consistent with the topology", + ), # parent above root, no parents + ], + ) + def test_mutation_parent_errors(self, mutations, error): + tables = tskit.TableCollection(sequence_length=1) + tables.nodes.add_row(time=0, flags=tskit.NODE_IS_SAMPLE) + tables.nodes.add_row(time=0, flags=tskit.NODE_IS_SAMPLE) + tables.nodes.add_row(time=0, flags=tskit.NODE_IS_SAMPLE) + tables.nodes.add_row(time=0, flags=tskit.NODE_IS_SAMPLE) + tables.nodes.add_row(time=1) + tables.nodes.add_row(time=1) + tables.nodes.add_row(time=2) + tables.nodes.add_row(time=3) + tables.edges.add_row(left=0, right=1, parent=4, child=0) + tables.edges.add_row(left=0, right=1, parent=4, child=1) + tables.edges.add_row(left=0, right=1, parent=5, child=2) + tables.edges.add_row(left=0, right=1, parent=5, child=3) + tables.edges.add_row(left=0, right=1, parent=6, child=4) + tables.edges.add_row(left=0, right=1, parent=6, child=5) + tables.sites.add_row(position=0.5, ancestral_state="A") + + for mut in mutations: + tables.mutations.add_row(**{"derived_state": "G", "site": 0, **mut}) + + if error is not None: + with pytest.raises(_tskit.LibraryError, match=error): + tables.tree_sequence() + else: + tables.tree_sequence() + class TestSimplify: # This class was factored out of the old TestHighlevel class 2022-12-13, @@ -2972,6 +3070,8 @@ def test_k_mutations(self, k): tables.nodes.add_row(1, 0) # will not have any mutations => missing for j in range(k): tables.mutations.add_row(site=0, node=0, derived_state=str(j)) + tables.build_index() + tables.compute_mutation_parents() ts = tables.tree_sequence() variant = next(ts.variants()) assert variant.has_missing_data diff --git a/python/tests/test_tables.py b/python/tests/test_tables.py index a4cf64b1f4..a8d415ee25 100644 --- a/python/tests/test_tables.py +++ b/python/tests/test_tables.py @@ -2792,6 +2792,8 @@ def test_sort_mutations_time(self): """\ id is_sample time 0 1 -6 + 1 1 -6 + 2 1 -6 """ ) edges = io.StringIO( @@ -2811,14 +2813,14 @@ def test_sort_mutations_time(self): """\ site node time derived_state parent 2 0 4 a -1 - 2 0 -5 b -1 - 2 0 6 c -1 + 2 1 -5 b -1 + 2 2 6 c -1 1 0 0.5 d -1 - 1 0 0.5 e -1 - 1 0 0.5 f -1 + 1 1 0.5 e -1 + 1 2 0.5 f -1 0 0 1 g -1 - 0 0 2 h -1 - 0 0 3 i -1 + 0 1 2 h -1 + 0 2 3 i -1 """ ) ts = tskit.load_text( @@ -2836,7 +2838,7 @@ def test_sort_mutations_time(self): assert len(sites) == 3 assert len(mutations) == 9 assert list(mutations.site) == [0, 0, 0, 1, 1, 1, 2, 2, 2] - assert list(mutations.node) == [0, 0, 0, 0, 0, 0, 0, 0, 0] + assert list(mutations.node) == [2, 1, 0, 0, 1, 2, 2, 0, 1] # Nans are not equal so swap in -1 times = mutations.time times[np.isnan(times)] = -1 @@ -2928,6 +2930,8 @@ def test_unsorted_times(self): """\ id is_sample time 0 1 0 + 1 1 0 + 2 1 0 """ ) edges = io.StringIO( @@ -2945,8 +2949,8 @@ def test_unsorted_times(self): """\ site node time derived_state parent 0 0 1 0 -1 - 0 0 2 0 -1 - 0 0 3 0 -1 + 0 1 2 0 -1 + 0 2 3 0 -1 """ ) ts = tskit.load_text( diff --git a/python/tests/test_topology.py b/python/tests/test_topology.py index b2eb4ea6a7..adb422a06c 100644 --- a/python/tests/test_topology.py +++ b/python/tests/test_topology.py @@ -1659,9 +1659,9 @@ def test_single_binary_tree_keep_roots_mutations(self): 0 0.1 0 """ mutations_before = """\ - site node derived_state - 0 3 2 - 0 4 1 + site node derived_state parent + 0 4 1 -1 + 0 3 2 0 """ # We sample 0 and 2 @@ -1682,9 +1682,9 @@ def test_single_binary_tree_keep_roots_mutations(self): 0 0.1 0 """ mutations_after = """\ - site node derived_state - 0 2 2 - 0 2 1 + site node derived_state parent + 0 2 1 -1 + 0 2 2 0 """ self.verify_simplify( samples=[0, 1], @@ -1716,10 +1716,10 @@ def test_place_mutations_with_and_without_roots(self): 0 1.0 0 """ mutations_before = """\ - site node derived_state time - 0 0 2 0 - 0 1 1 1 - 0 2 3 2 + site node derived_state time parent + 0 2 3 2 -1 + 0 1 1 1 0 + 0 0 2 0 1 """ # expected result without keep_input_roots nodes_after = """\ @@ -1730,10 +1730,10 @@ def test_place_mutations_with_and_without_roots(self): left right parent child """ mutations_after = """\ - site node derived_state time - 0 0 2 0 - 0 0 1 1 - 0 0 3 2 + site node derived_state time parent + 0 0 3 2 -1 + 0 0 1 1 0 + 0 0 2 0 1 """ # expected result with keep_input_roots nodes_after_keep = """\ @@ -1746,10 +1746,10 @@ def test_place_mutations_with_and_without_roots(self): 0 2 1 0 """ mutations_after_keep = """\ - site node derived_state time - 0 0 2 0 - 0 0 1 1 - 0 1 3 2 + site node derived_state time parent + 0 1 3 2 -1 + 0 0 1 1 0 + 0 0 2 0 1 """ self.verify_simplify( samples=[0], @@ -3778,6 +3778,7 @@ def test_small_tree_back_mutations(self): tables.mutations.add_row(site=0, node=7, derived_state="1") tables.mutations.add_row(site=0, node=5, derived_state="0") tables.mutations.add_row(site=0, node=1, derived_state="1") + tables.compute_mutation_parents() ts = tables.tree_sequence() assert ts.num_sites == 1 assert ts.num_mutations == 3 diff --git a/python/tskit/tables.py b/python/tskit/tables.py index bc078164c0..8eb5ec4b56 100644 --- a/python/tskit/tables.py +++ b/python/tskit/tables.py @@ -3682,8 +3682,8 @@ def compute_mutation_parents(self): valid, and the site and mutation tables must be sorted (see :meth:`TableCollection.sort`). This will produce an error if mutations are not sorted (i.e., if a mutation appears before its mutation parent) - *unless* the two mutations occur on the same branch, in which case - there is no way to detect the error. + *unless* the two mutations occur on the same branch, and have unknown times + in which case there is no way to detect the error. The ``parent`` of a given mutation is the ID of the next mutation encountered traversing the tree upwards from that mutation, or