Skip to content

Commit 237292e

Browse files
committed
Ported over functionality from pypopgen2 and new diversity module
1 parent 144a22d commit 237292e

File tree

5 files changed

+761
-0
lines changed

5 files changed

+761
-0
lines changed

modules/diversity.py

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
import numpy as np
2+
import logging
3+
import copy
4+
import allel
5+
import skbio
6+
from pypopgen3.modules import treetools
7+
8+
logger = logging.getLogger()
9+
logging.basicConfig(
10+
format='%(levelname)-8s %(asctime)s %(filename) %(message)s')
11+
#logging.basicConfig(format='%(levelname)-8s %(asctime)s %(funcName)20s() %(message)s')
12+
logger.setLevel(logging.WARNING)
13+
14+
15+
def pairwise_differences(ac, an, correct_for_missing=True):
16+
"""
17+
Calculate pairwise genetic difference between many
18+
individuals/populations/species from allele count matrix (2D-array).
19+
Allele counts can be from individual haplotypes (0 or 1),
20+
genotypes (0,1,2) (for diploids) or populations (0,1,2,..,an).
21+
an and ac have input dimensions number of variants x number of populations.
22+
If normalised by the accessible genome size, the diagonal of the output corresponds to within-population
23+
nucleotide diversity pi (or, for genotypes, heterozygosity), the off-diagonal entries correspond
24+
to nucleotide divergence d_xy.
25+
26+
:param ac: 2D-array non-reference (or derived) allele count in haplotype/individual/population.
27+
Each row corresponds to a variant (e.g., SNP) site and each column corresponds to
28+
corresponds to a different haplotype/individual/population.
29+
:param an: 2D array of allele number for each variant in each haplotype/individual/population.
30+
Same dimensions as ac. Corresponds to the number of samples per haplotype/individual/population.
31+
E.g., an should be 1 for haplotypes and 2 for genotypes unless the variant call is missing.
32+
If the call is missing, then it is 0. For populations of diploid individuals an of SNP i in
33+
population j corresponds to the number of 2x the number of individuals in population j for
34+
which genotypes could be called at SNP i.
35+
:return: Symmetric (n populations x n populations) matrix of absolute counts of pairwise genetic difference.
36+
Each "population" can also just be a single individual or a single haplotype.
37+
38+
"""
39+
ac = np.array(ac)
40+
an = np.array(an).astype(float)
41+
an[an == 0] = np.nan
42+
af = ac / an
43+
af1 = np.nan_to_num(af)
44+
af1c = np.nan_to_num(1 - af)
45+
# this corrects for sampling without replacement in self comparisons
46+
af1c_within = np.nan_to_num(1 - (ac - 1.) / (an - 1))
47+
pi = np.zeros((af.shape[1], af.shape[1]))
48+
np.fill_diagonal(pi, (2 * af1 * af1c_within).sum(axis=0))
49+
dxy = np.einsum('ij,ik->jk', af1, af1c) + np.einsum('ij,ik->jk', af1c, af1)
50+
np.fill_diagonal(dxy, 0)
51+
pwd = pi + dxy
52+
53+
if correct_for_missing:
54+
# should we use an for all or only for variable sites to correct for this?
55+
n_pwc = n_pairwise_comparisons(an)
56+
pwd = pwd * len(an) / n_pwc
57+
58+
return pwd
59+
#4
60+
61+
def n_pairwise_comparisons(an):
62+
has_data = np.array(an).astype(float)
63+
has_data[np.isnan(has_data)] = 0
64+
has_data[has_data > 0] = 1
65+
comparison_number = np.einsum('ij,ik->jk', has_data, has_data)
66+
return comparison_number
67+
68+
def pairwise_differences_from_gt_array(gt_arr, correct_for_missing=True):
69+
"""
70+
Get pairwise genetic differences within and between genotypes from numpy array with genotpyes as loaded by
71+
allel.read_vcf.
72+
:param gt_arr: numpy array with genotpyes as loaded by allel.read_vcf
73+
:return: Symmetric (n individuals x n indiviudals) matrix of absolute counts of pairwise genetic difference.
74+
The diagonal corresponds to within-indiviudal heterozygosity.
75+
"""
76+
gt = gt_arr.astype(float)
77+
gt[gt < 0] = np.nan
78+
gt = np.sum(gt, axis=-1)
79+
an = 2 * (gt == gt).astype(int)
80+
return pairwise_differences(gt, an, correct_for_missing=correct_for_missing)
81+
82+
def get_nj_tree_newick(pairwise_differences, samples):
83+
pairwise_differences = copy.deepcopy(pairwise_differences)
84+
np.fill_diagonal(pairwise_differences, 0)
85+
86+
distance_matrix = skbio.DistanceMatrix(pairwise_differences,
87+
ids=samples)
88+
# This computes a neighbour-joining tree and outputs a newick string
89+
nj_newick = skbio.nj(distance_matrix, result_constructor=str)
90+
return nj_newick
91+
92+
def get_nj_tree(pairwise_differences, samples, outgroup=None, prune_outgroup=True):
93+
nj_newick = get_nj_tree_newick(pairwise_differences, samples)
94+
tree_inferred = treetools.HsTree(nj_newick)
95+
if outgroup is not None:
96+
tree_inferred.set_outgroup(outgroup)
97+
if prune_outgroup:
98+
tree_inferred.prune([n for n in tree_inferred.get_leaf_names() if n != outgroup])
99+
return tree_inferred
100+

modules/treetools.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -803,6 +803,8 @@ def align_fbranch_with_tree(fbranch, tree, outgroup, ladderize=False):
803803
tree_no = copy.deepcopy(tree)
804804
# tree_no.ladderize()
805805
# remove outgroup
806+
if outgroup not in tree_no.get_leaf_names():
807+
print("WARNING: outrgroup {} not in tree".format(outgroup))
806808
tree_no.prune([n for n in tree_no.get_leaf_names() if n != outgroup])
807809
#
808810
if ladderize:

0 commit comments

Comments
 (0)