Skip to content

Check mutation parents on tree sequence init #3212

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 14 commits into
base: main
Choose a base branch
from
32 changes: 32 additions & 0 deletions c/CHANGELOG.rst
Original file line number Diff line number Diff line change
@@ -1,3 +1,35 @@
--------------------
[1.2.0] - 2025-XX-XX
--------------------

**Breaking changes**

- ``tsk_treeseq_init`` now requires that mutation parents in the table collection
are correct and consistent with the topology of the tree at each mutation site.
Returns ``TSK_ERR_BAD_MUTATION_PARENT`` if this is not the case, or
``TSK_ERR_MUTATION_PARENT_AFTER_CHILD`` if the mutations are not in an order
compatible with the correct mutation parent.
(:user:`benjeffery`, :issue:`2729`, :issue:`2732`, :pr:`3212`).

**Features**

- Add ``TSK_TS_INIT_COMPUTE_MUTATION_PARENTS`` to ``tsk_treeseq_init``
to compute mutation parents from the tree sequence topology.
Note that the mutations must be in the correct order.
(:user:`benjeffery`, :issue:`2757`, :pr:`3212`).

- Add ``TSK_CHECK_MUTATION_PARENTS`` option to ``tsk_table_collection_check_integrity``
to check that mutation parents are consistent with the tree sequence topology.
This option implies ``TSK_CHECK_TREES``.
(:user:`benjeffery`, :issue:`2729`, :issue:`2732`, :pr:`3212`).

- Add the ``TSK_NO_CHECK_INTEGRITY`` option to ``tsk_table_collection_compute_mutation_parents``
to skip the integrity checks that are normally run when computing mutation parents.
This is useful for speeding up the computation of mutation parents when the
tree sequence is certainly known to be valid.
(:user:`benjeffery`, :pr:`3212`).


--------------------
[1.1.4] - 2025-03-31
--------------------
Expand Down
62 changes: 61 additions & 1 deletion c/tests/test_tables.c
Original file line number Diff line number Diff line change
Expand Up @@ -9011,7 +9011,7 @@ test_sort_tables_mutation_times(void)
CU_ASSERT_EQUAL_FATAL(ret, 0);

/* Check to make sure we have legal mutations */
ret = tsk_treeseq_init(&ts, &tables, 0);
ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_COMPUTE_MUTATION_PARENTS);
CU_ASSERT_EQUAL_FATAL(ret, 0);

ret = tsk_treeseq_copy_tables(&ts, &t1, 0);
Expand Down Expand Up @@ -10576,6 +10576,64 @@ test_table_collection_check_integrity_bad_indexes(void)
tsk_table_collection_free(&tables);
}

static void
test_check_integrity_bad_mutation_parent_topology(void)
{
int ret;
tsk_id_t ret_trees;
tsk_table_collection_t tables;
const char *sites = "0 0\n";
/* Make a mutation on a parallel branch the parent*/
const char *bad_mutations = "0 0 1 -1\n"
"0 1 1 0\n";

/* A mutation above is set as child*/
const char *reverse_mutations = "0 0 1 -1\n"
"0 4 1 0\n";

const char *reverse_sites = "0.5 0\n"
"0 0\n";

ret = tsk_table_collection_init(&tables, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);

tables.sequence_length = 1;
parse_nodes(single_tree_ex_nodes, &tables.nodes);
CU_ASSERT_EQUAL_FATAL(tables.nodes.num_rows, 7);
parse_edges(single_tree_ex_edges, &tables.edges);
CU_ASSERT_EQUAL_FATAL(tables.edges.num_rows, 6);
parse_sites(sites, &tables.sites);
CU_ASSERT_EQUAL_FATAL(tables.sites.num_rows, 1);
parse_mutations(bad_mutations, &tables.mutations);
CU_ASSERT_EQUAL_FATAL(tables.mutations.num_rows, 2);
tables.sequence_length = 1.0;

ret = tsk_table_collection_build_index(&tables, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);

ret_trees = tsk_table_collection_check_integrity(&tables, TSK_CHECK_TREES);
CU_ASSERT_EQUAL_FATAL(ret_trees, 1);
ret_trees
= tsk_table_collection_check_integrity(&tables, TSK_CHECK_MUTATION_PARENTS);
CU_ASSERT_EQUAL_FATAL(ret_trees, TSK_ERR_BAD_MUTATION_PARENT);

parse_mutations(reverse_mutations, &tables.mutations);
ret_trees = tsk_table_collection_check_integrity(&tables, TSK_CHECK_TREES);
CU_ASSERT_EQUAL_FATAL(ret_trees, 1);
ret_trees
= tsk_table_collection_check_integrity(&tables, TSK_CHECK_MUTATION_PARENTS);
CU_ASSERT_EQUAL_FATAL(ret_trees, TSK_ERR_MUTATION_PARENT_AFTER_CHILD);

/* Now check that TSK_CHECK_MUTATION_PARENTS implies TSK_CHECK_TREES
by triggering an error with reversed sites */
parse_sites(reverse_sites, &tables.sites);
ret_trees
= tsk_table_collection_check_integrity(&tables, TSK_CHECK_MUTATION_PARENTS);
CU_ASSERT_EQUAL_FATAL(ret_trees, TSK_ERR_UNSORTED_SITES);

tsk_table_collection_free(&tables);
}

static void
test_table_collection_subset_with_options(tsk_flags_t options)
{
Expand Down Expand Up @@ -11658,6 +11716,8 @@ main(int argc, char **argv)
test_table_collection_check_integrity_bad_indexes_example },
{ "test_table_collection_check_integrity_bad_indexes",
test_table_collection_check_integrity_bad_indexes },
{ "test_check_integrity_bad_mutation_parent_topology",
test_check_integrity_bad_mutation_parent_topology },
{ "test_table_collection_subset", test_table_collection_subset },
{ "test_table_collection_subset_unsorted",
test_table_collection_subset_unsorted },
Expand Down
121 changes: 115 additions & 6 deletions c/tests/test_trees.c
Original file line number Diff line number Diff line change
Expand Up @@ -3886,16 +3886,16 @@ test_simplest_mutation_edges(void)
"1 2 2 0\n";
const char *sites = "0.5 0\n"
"1.5 0\n";
const char *mutations = "0 1 1\n"
const char *mutations = "0 2 1\n"
"0 1 1\n"
"0 0 1\n"
"0 2 1\n"
"1 0 1\n"
"1 2 1\n"
"1 1 1\n"
"1 2 1\n";
"1 0 1\n";
tsk_treeseq_t ts;
tsk_tree_t tree;
/* We have mutations over roots, samples and just isolated nodes */
tsk_id_t mutation_edges[] = { -1, 0, -1, 1, -1, -1 };
tsk_id_t mutation_edges[] = { -1, -1, 0, -1, -1, 1 };
tsk_size_t i, j, k, t;
tsk_mutation_t mut;
tsk_site_t site;
Expand Down Expand Up @@ -4238,7 +4238,7 @@ test_single_tree_bad_mutations(void)
ret = tsk_treeseq_init(&ts, &tables, load_flags);
CU_ASSERT_EQUAL(ret, TSK_ERR_MUTATION_PARENT_AFTER_CHILD);
tsk_treeseq_free(&ts);
tables.mutations.parent[3] = TSK_NULL;
tables.mutations.parent[3] = 2;

/* time < node time */
tables.mutations.time[2] = 0;
Expand Down Expand Up @@ -8715,6 +8715,112 @@ test_init_take_ownership_no_edge_metadata(void)
tsk_treeseq_free(&ts);
}

static void
test_init_compute_mutation_parents(void)
{
int ret;
tsk_table_collection_t *tables, *tables2;
tsk_treeseq_t ts;
const char *sites = "0 0\n";
/* Make a mutation on a parallel branch the parent*/
const char *bad_mutations = "0 0 1 -1\n"
"0 1 1 0\n";

tables = tsk_malloc(sizeof(tsk_table_collection_t));
CU_ASSERT_NOT_EQUAL_FATAL(tables, NULL);
tables2 = tsk_malloc(sizeof(tsk_table_collection_t));
CU_ASSERT_NOT_EQUAL_FATAL(tables2, NULL);

CU_ASSERT_FATAL(tables != NULL);
ret = tsk_table_collection_init(tables, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);

tables->sequence_length = 1;
parse_nodes(single_tree_ex_nodes, &tables->nodes);
CU_ASSERT_EQUAL_FATAL(tables->nodes.num_rows, 7);
parse_edges(single_tree_ex_edges, &tables->edges);
CU_ASSERT_EQUAL_FATAL(tables->edges.num_rows, 6);
parse_sites(sites, &tables->sites);
CU_ASSERT_EQUAL_FATAL(tables->sites.num_rows, 1);
parse_mutations(bad_mutations, &tables->mutations);
CU_ASSERT_EQUAL_FATAL(tables->mutations.num_rows, 2);
tables->sequence_length = 1.0;
ret = tsk_table_collection_copy(tables, tables2, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);

ret = tsk_treeseq_init(&ts, tables, TSK_TS_INIT_BUILD_INDEXES);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_MUTATION_PARENT);
tsk_treeseq_free(&ts);

ret = tsk_treeseq_init(
&ts, tables, TSK_TS_INIT_BUILD_INDEXES | TSK_TS_INIT_COMPUTE_MUTATION_PARENTS);
CU_ASSERT_EQUAL_FATAL(ret, 0);
tsk_treeseq_free(&ts);

/* When we use take ownership, the check of parents shouldn't overwrite them*/
ret = tsk_treeseq_init(&ts, tables, TSK_TAKE_OWNERSHIP | TSK_TS_INIT_BUILD_INDEXES);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_MUTATION_PARENT);
CU_ASSERT_EQUAL(tables->mutations.parent[0], -1);
CU_ASSERT_EQUAL(tables->mutations.parent[1], 0);
tsk_treeseq_free(&ts);

/* When we use take ownership and compute, the tables are overwritten*/
ret = tsk_treeseq_init(&ts, tables2,
TSK_TAKE_OWNERSHIP | TSK_TS_INIT_BUILD_INDEXES
| TSK_TS_INIT_COMPUTE_MUTATION_PARENTS);
CU_ASSERT_EQUAL_FATAL(ret, 0);
CU_ASSERT_EQUAL(tables2->mutations.parent[0], -1);
CU_ASSERT_EQUAL(tables2->mutations.parent[1], -1);

/* Don't need to free tables as we took ownership */
tsk_treeseq_free(&ts);
}

static void
test_init_compute_mutation_parents_errors(void)
{
int ret;
tsk_id_t row_ret;
tsk_table_collection_t tables;
tsk_treeseq_t ts;
const char *sites = "0.5 0\n"
"0 0\n";

ret = tsk_table_collection_init(&tables, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);

tables.sequence_length = 1;
parse_nodes(single_tree_ex_nodes, &tables.nodes);
CU_ASSERT_EQUAL_FATAL(tables.nodes.num_rows, 7);
parse_edges(single_tree_ex_edges, &tables.edges);
CU_ASSERT_EQUAL_FATAL(tables.edges.num_rows, 6);
parse_sites(sites, &tables.sites);
CU_ASSERT_EQUAL_FATAL(tables.sites.num_rows, 2);
tables.sequence_length = 1.0;

ret = tsk_treeseq_init(
&ts, &tables, TSK_TS_INIT_BUILD_INDEXES | TSK_TS_INIT_COMPUTE_MUTATION_PARENTS);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_UNSORTED_SITES);
tsk_treeseq_free(&ts);

tsk_site_table_clear(&tables.sites);
row_ret = tsk_site_table_add_row(&tables.sites, 0.5, "A", 1, NULL, 0);
CU_ASSERT_EQUAL_FATAL(row_ret, 0);
row_ret = tsk_mutation_table_add_row(
&tables.mutations, 0, 0, TSK_NULL, TSK_UNKNOWN_TIME, "A", 1, NULL, 0);
CU_ASSERT_EQUAL_FATAL(row_ret, 0);
row_ret = tsk_mutation_table_add_row(
&tables.mutations, 0, 4, TSK_NULL, TSK_UNKNOWN_TIME, "A", 1, NULL, 0);
CU_ASSERT_EQUAL_FATAL(row_ret, 1);

ret = tsk_treeseq_init(
&ts, &tables, TSK_TS_INIT_BUILD_INDEXES | TSK_TS_INIT_COMPUTE_MUTATION_PARENTS);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MUTATION_PARENT_AFTER_CHILD);
tsk_treeseq_free(&ts);

tsk_table_collection_free(&tables);
}

int
main(int argc, char **argv)
{
Expand Down Expand Up @@ -8923,6 +9029,9 @@ main(int argc, char **argv)
test_extend_haplotypes_conflicting_times },
{ "test_init_take_ownership_no_edge_metadata",
test_init_take_ownership_no_edge_metadata },
{ "test_init_compute_mutation_parents", test_init_compute_mutation_parents },
{ "test_init_compute_mutation_parents_errors",
test_init_compute_mutation_parents_errors },
{ NULL, NULL },
};

Expand Down
18 changes: 17 additions & 1 deletion c/tests/testlib.c
Original file line number Diff line number Diff line change
Expand Up @@ -767,6 +767,8 @@ tsk_treeseq_from_text(tsk_treeseq_t *ts, double sequence_length, const char *nod
tsk_table_collection_t tables;
tsk_id_t max_population_id;
tsk_size_t j;
tsk_flags_t ts_flags;
bool all_parents_null;

CU_ASSERT_FATAL(ts != NULL);
CU_ASSERT_FATAL(nodes != NULL);
Expand Down Expand Up @@ -807,7 +809,21 @@ tsk_treeseq_from_text(tsk_treeseq_t *ts, double sequence_length, const char *nod
}
}

ret = tsk_treeseq_init(ts, &tables, TSK_TS_INIT_BUILD_INDEXES);
/* If all mutation.parent are TSK_NULL, use TSK_TS_COMPUTE_MUTATION_PARENTS flag too
*/
ts_flags = TSK_TS_INIT_BUILD_INDEXES;
all_parents_null = true;
for (j = 0; j < tables.mutations.num_rows; j++) {
if (tables.mutations.parent[j] != TSK_NULL) {
all_parents_null = false;
break;
}
}
if (all_parents_null) {
ts_flags |= TSK_TS_INIT_COMPUTE_MUTATION_PARENTS;
}

ret = tsk_treeseq_init(ts, &tables, ts_flags);
/* tsk_treeseq_print_state(ts, stdout); */
if (ret != 0) {
printf("\nret = %s\n", tsk_strerror(ret));
Expand Down
6 changes: 6 additions & 0 deletions c/tskit/core.c
Original file line number Diff line number Diff line change
Expand Up @@ -347,6 +347,12 @@ tsk_strerror_internal(int err)
"(TSK_ERR_DISALLOWED_UNKNOWN_MUTATION_TIME)";
break;

case TSK_ERR_BAD_MUTATION_PARENT:
ret = "A mutation's parent is not consistent with the topology of the tree. "
"Use compute_mutation_parents to set the parents correctly."
"(TSK_ERR_BAD_MUTATION_PARENT)";
break;

/* Migration errors */
case TSK_ERR_UNSORTED_MIGRATIONS:
ret = "Migrations must be sorted by time. (TSK_ERR_UNSORTED_MIGRATIONS)";
Expand Down
6 changes: 6 additions & 0 deletions c/tskit/core.h
Original file line number Diff line number Diff line change
Expand Up @@ -510,6 +510,12 @@ Some mutations have TSK_UNKNOWN_TIME in an algorithm where that's
disallowed (use compute_mutation_times?).
*/
#define TSK_ERR_DISALLOWED_UNKNOWN_MUTATION_TIME -510

/**
A mutation's parent was not consistent with the topology of the tree.
*/
#define TSK_ERR_BAD_MUTATION_PARENT -511

/** @} */

/**
Expand Down
Loading
Loading