|
| 1 | +import numpy as np |
| 2 | +import logging |
| 3 | +import copy |
| 4 | +import allel |
| 5 | +import skbio |
| 6 | +from pypopgen3.modules import treetools |
| 7 | + |
| 8 | +logger = logging.getLogger() |
| 9 | +logging.basicConfig( |
| 10 | + format='%(levelname)-8s %(asctime)s %(filename) %(message)s') |
| 11 | +#logging.basicConfig(format='%(levelname)-8s %(asctime)s %(funcName)20s() %(message)s') |
| 12 | +logger.setLevel(logging.WARNING) |
| 13 | + |
| 14 | + |
| 15 | +def pairwise_differences(ac, an, correct_for_missing=True): |
| 16 | + """ |
| 17 | + Calculate pairwise genetic difference between many |
| 18 | + individuals/populations/species from allele count matrix (2D-array). |
| 19 | + Allele counts can be from individual haplotypes (0 or 1), |
| 20 | + genotypes (0,1,2) (for diploids) or populations (0,1,2,..,an). |
| 21 | + an and ac have input dimensions number of variants x number of populations. |
| 22 | + If normalised by the accessible genome size, the diagonal of the output corresponds to within-population |
| 23 | + nucleotide diversity pi (or, for genotypes, heterozygosity), the off-diagonal entries correspond |
| 24 | + to nucleotide divergence d_xy. |
| 25 | +
|
| 26 | + :param ac: 2D-array non-reference (or derived) allele count in haplotype/individual/population. |
| 27 | + Each row corresponds to a variant (e.g., SNP) site and each column corresponds to |
| 28 | + corresponds to a different haplotype/individual/population. |
| 29 | + :param an: 2D array of allele number for each variant in each haplotype/individual/population. |
| 30 | + Same dimensions as ac. Corresponds to the number of samples per haplotype/individual/population. |
| 31 | + E.g., an should be 1 for haplotypes and 2 for genotypes unless the variant call is missing. |
| 32 | + If the call is missing, then it is 0. For populations of diploid individuals an of SNP i in |
| 33 | + population j corresponds to the number of 2x the number of individuals in population j for |
| 34 | + which genotypes could be called at SNP i. |
| 35 | + :return: Symmetric (n populations x n populations) matrix of absolute counts of pairwise genetic difference. |
| 36 | + Each "population" can also just be a single individual or a single haplotype. |
| 37 | +
|
| 38 | + """ |
| 39 | + ac = np.array(ac) |
| 40 | + an = np.array(an).astype(float) |
| 41 | + an[an == 0] = np.nan |
| 42 | + af = ac / an |
| 43 | + af1 = np.nan_to_num(af) |
| 44 | + af1c = np.nan_to_num(1 - af) |
| 45 | + # this corrects for sampling without replacement in self comparisons |
| 46 | + af1c_within = np.nan_to_num(1 - (ac - 1.) / (an - 1)) |
| 47 | + pi = np.zeros((af.shape[1], af.shape[1])) |
| 48 | + np.fill_diagonal(pi, (2 * af1 * af1c_within).sum(axis=0)) |
| 49 | + dxy = np.einsum('ij,ik->jk', af1, af1c) + np.einsum('ij,ik->jk', af1c, af1) |
| 50 | + np.fill_diagonal(dxy, 0) |
| 51 | + pwd = pi + dxy |
| 52 | + |
| 53 | + if correct_for_missing: |
| 54 | + # should we use an for all or only for variable sites to correct for this? |
| 55 | + n_pwc = n_pairwise_comparisons(an) |
| 56 | + pwd = pwd * len(an) / n_pwc |
| 57 | + |
| 58 | + return pwd |
| 59 | +#4 |
| 60 | + |
| 61 | +def n_pairwise_comparisons(an): |
| 62 | + has_data = np.array(an).astype(float) |
| 63 | + has_data[np.isnan(has_data)] = 0 |
| 64 | + has_data[has_data > 0] = 1 |
| 65 | + comparison_number = np.einsum('ij,ik->jk', has_data, has_data) |
| 66 | + return comparison_number |
| 67 | + |
| 68 | +def pairwise_differences_from_gt_array(gt_arr, correct_for_missing=True): |
| 69 | + """ |
| 70 | + Get pairwise genetic differences within and between genotypes from numpy array with genotpyes as loaded by |
| 71 | + allel.read_vcf. |
| 72 | + :param gt_arr: numpy array with genotpyes as loaded by allel.read_vcf |
| 73 | + :return: Symmetric (n individuals x n indiviudals) matrix of absolute counts of pairwise genetic difference. |
| 74 | + The diagonal corresponds to within-indiviudal heterozygosity. |
| 75 | + """ |
| 76 | + gt = gt_arr.astype(float) |
| 77 | + gt[gt < 0] = np.nan |
| 78 | + gt = np.sum(gt, axis=-1) |
| 79 | + an = 2 * (gt == gt).astype(int) |
| 80 | + return pairwise_differences(gt, an, correct_for_missing=correct_for_missing) |
| 81 | + |
| 82 | +def get_nj_tree_newick(pairwise_differences, samples): |
| 83 | + pairwise_differences = copy.deepcopy(pairwise_differences) |
| 84 | + np.fill_diagonal(pairwise_differences, 0) |
| 85 | + |
| 86 | + distance_matrix = skbio.DistanceMatrix(pairwise_differences, |
| 87 | + ids=samples) |
| 88 | + # This computes a neighbour-joining tree and outputs a newick string |
| 89 | + nj_newick = skbio.nj(distance_matrix, result_constructor=str) |
| 90 | + return nj_newick |
| 91 | + |
| 92 | +def get_nj_tree(pairwise_differences, samples, outgroup=None, prune_outgroup=True): |
| 93 | + nj_newick = get_nj_tree_newick(pairwise_differences, samples) |
| 94 | + tree_inferred = treetools.HsTree(nj_newick) |
| 95 | + if outgroup is not None: |
| 96 | + tree_inferred.set_outgroup(outgroup) |
| 97 | + if prune_outgroup: |
| 98 | + tree_inferred.prune([n for n in tree_inferred.get_leaf_names() if n != outgroup]) |
| 99 | + return tree_inferred |
| 100 | + |
0 commit comments