-
Notifications
You must be signed in to change notification settings - Fork 42
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
6 changed files
with
953 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
# Recombination Simulation | ||
|
||
This repository contains scripts to create a multiple-sequence alignment of recombinant genomes. It was designed to test the recombination module in [UShER](https://usher-wiki.readthedocs.io/en/latest/), but may be used for other purposes as well. | ||
|
||
Note: I have been using specifically commit a6f65ade7a6606ef75902ee290585c6db23aeba6 as after this, the format for the output of `matUtils extract -S` has changed. To get this version of user, you can use `wget https://github.com/yatisht/usher/archive/a6f65ade7a6606ef75902ee290585c6db23aeba6.zip` and follow the remainder of the install instructions at [the UShER Wiki](https://usher-wiki.readthedocs.io/en/latest/). | ||
|
||
## Summary: | ||
|
||
### makeMutsFile.py | ||
generates a MSA of all nodes of interest. By default, this will produce a diff file for each internal node with at least 10 descendants, but you may set -t to 0 to get this for all internal nodes and leaves, or set -f to output a full MSA as well. | ||
|
||
the internal nodes from the initial tree with at least 10 descendants. | ||
Options: | ||
- -s (--samples): Sample mutation paths file, generated by e.g. *matUtils extract -i <input.pb> -A <sample-paths.tsv>*. | ||
- -l (--leaves): Leaves per node, generated by e.g. *matUtils extract -i <input.pb> -L <num-leaves.tsv>*. | ||
- -t (--threshold): By default, this script includes only nodes with at least 10 descendant leaves. To use a different minimum, specify here. (Default = 10). | ||
- -r (--reference): Reference genome, used to generate VCF. Default = 'wuhan.ref.fa' (in this repository, corresponds to [the reference genome of SARS-CoV-2](https://www.ncbi.nlm.nih.gov/nuccore/1798174254) with [problematic sites](https://raw.githubusercontent.com/W-L/ProblematicSites_SARS-CoV2/master/problematic_sites_sarsCov2.vcf) masked.). | ||
- -f (--fasta): Write fasta file containing all samples according to their mutation paths. This makes the script significantly slower. (Default = False) | ||
|
||
### makeRandomRecombinants.py | ||
generates a MSA of recombinant samples from the internal node sequences. | ||
Options: | ||
- -b (--breakpoints): Number of breakpoints that each recombinant sample will have. Must be 1 or 2 (Default = 1). | ||
- -s (--samples): Number of recombinant samples to create (Default = 100). | ||
- -c (--copies): Number of identical copies to make for each recombinant sample (Default = 10). | ||
- -d (--differences): Minimum mutational distance for acceptor/donor samples (Default = 10). | ||
- -m (--commonMutations): Number of mutations to add to each copy, shared by all in a set. (Default = 0). | ||
- -M (--randomMutations): Number of mutations to add to each copy, randomly chosen for each copy. (Default = 0). | ||
- -f (--fasta): Fasta file containing sequences for acceptor/donor samples. [REQUIRED] | ||
- -r (--reference): Fasta file containing reference genome for use in creating VCF. (Default = 'wuhan.ref.fa'). | ||
- -S (--separate): If enabled, will produce one MSA as a .fasta file for each set of recombinants to the argument directory. If not enabled, will not produce these files. | ||
|
||
## Example pipeline: | ||
|
||
``` | ||
matUtils extract -i input.pb -S samples.tsv -L num_leaves.tsv | ||
python makeMutsFile.py -s samples.tsv -l num-leaves.txt -t 10 -r wuhan.ref.fa | ||
python makeRandomRecombinants.py -b 1 -s 100 -c 10 -t 10 -d allNodeToMutsT10.txt -r wuhan.ref.fa -S 1BP_0M_10C | ||
faToVcf recombination_1.msa.fa recombination_1.msa.vcf | ||
usher -i input.pb -v recombination_1.msa.vcf -o recombination_1.msa.pb | ||
findRecombination -i recombination_1.msa.pb | ||
``` |
200 changes: 200 additions & 0 deletions
200
scripts/recombination/simulation/makeInternalNodesMSA.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,200 @@ | ||
#!/usr/bin/env python3 | ||
# Name: Bryan Thornlow | ||
# Date: 2/1/2018 | ||
# compareDatabases.py | ||
|
||
import sys | ||
import os | ||
import datetime | ||
import numpy | ||
from numpy import random | ||
import gzip | ||
import math | ||
import argparse | ||
|
||
########################## | ||
###### COMMAND LINE ###### | ||
########################## | ||
|
||
class CommandLine(object): | ||
""" | ||
Handles the input arguments from the command line. Manages | ||
the argument parser. | ||
""" | ||
|
||
def __init__(self, inOpts=None): | ||
''' | ||
CommandLine constructor. | ||
Implements a parser to interpret the command line input using argparse. | ||
''' | ||
self.parser = argparse.ArgumentParser() | ||
self.parser.add_argument("-s", "--samples", help="Sample mutation paths file. To generate: matUtils extract -i <input.pb> -A <sample-paths.tsv>. [REQUIRED]", default='') | ||
self.parser.add_argument("-l", "--leaves", help="File containing number of leaves for each node. Only nodes with at least 10 descendants will be used, "+ | ||
"as this is required for detection. To generate: matUtils extract -i <input.pb -L <num-leaves.tsv>. [REQUIRED]", default='') | ||
self.parser.add_argument("-t", "--threshold", help="By default, this script includes only nodes with at least 10 descendant leaves. To use a different minimum, specify here.",default=10) | ||
self.parser.add_argument("-r", "--ref", help="Fasta file containing reference genome. (Default = 'wuhan.ref.fa').", default='wuhan.ref.fa') | ||
|
||
if inOpts is None: | ||
self.args = vars(self.parser.parse_args()) | ||
else: | ||
self.args = vars(self.parser.parse_args(inOpts)) | ||
self.args = self.parser.parse_args() | ||
|
||
########################## | ||
##### MAIN FUNCTIONS ##### | ||
########################## | ||
|
||
def getMutationsFile(myS, myL, myT, myR): | ||
myReference = '' | ||
with open(myR) as f: | ||
for line in f: | ||
l = line.strip() | ||
if not l.startswith('>'): | ||
myReference += l.strip().upper() | ||
|
||
nodeToMuts = {1:[]} | ||
with open(myS) as f: | ||
for line in f: | ||
splitLine = (line.strip()).split('\t') | ||
myPath = stripEach(splitLine[1].split('>')) | ||
pastMuts = [] | ||
for k in myPath: | ||
if k.endswith(')'): # we don't care about mutations on sample leafs | ||
s = k.split() | ||
if len(s) == 1: | ||
myNode = int((s[0])[1:-1]) | ||
mutSplit = [] | ||
else: | ||
myNode = int((s[1])[1:-1]) | ||
mutSplit = s[0].split(',') | ||
if not myNode in nodeToMuts: | ||
nodeToMuts[myNode] = [] | ||
for m in pastMuts: | ||
nodeToMuts[myNode].append(m) | ||
for m in mutSplit: | ||
nodeToMuts[myNode].append(m) | ||
for m in mutSplit: | ||
pastMuts.append(m) | ||
|
||
|
||
goodNodes = {} | ||
with open(myL) as f: | ||
for line in f: | ||
splitLine = (line.strip()).split('\t') | ||
if splitLine[0].isdigit() and int(splitLine[1]) >= myT: | ||
goodNodes[int(splitLine[0])] = True | ||
|
||
myOutMSA = '' | ||
myOutString = '' | ||
for n in nodeToMuts: | ||
if n in goodNodes and n != 1: # don't use the root -- it has no mutations and is not useful for our purposes | ||
myOutMSA += '>NODE'+str(n)+'\n'+makeChanges(myReference, nodeToMuts[n])+'\n' | ||
myOutString += str(n)+'\t'+','.join(nodeToMuts[n])+'\n' | ||
open('internal_nodes.msa.fa','w').write(myOutMSA) | ||
open('nodeToMuts.txt','w').write(myOutString) | ||
|
||
|
||
########################## | ||
#### HELPER FUNCTIONS #### | ||
########################## | ||
|
||
def stripEach(myList): | ||
myReturn = [] | ||
for k in myList: | ||
myReturn.append(k.strip()) | ||
return(myReturn) | ||
|
||
def makeChanges(ref, muts): | ||
myReturn = [] | ||
for k in list(ref): | ||
myReturn.append(k) | ||
for k in muts: | ||
myCoord = int(k[1:-1])-1 | ||
if not myReturn[myCoord] == k[0]: | ||
print(k, myReturn[myCoord]) | ||
myReturn[myCoord] = str(k[-1]) | ||
if not myReturn[myCoord] == k[-1]: | ||
print(k, myReturn[myCoord]) | ||
return(''.join(myReturn)) | ||
|
||
def getPos(myInds, intLineNumToPos): | ||
myReturn = [] | ||
for k in myInds: | ||
myReturn.append(intLineNumToPos[k]) | ||
return(myReturn) | ||
|
||
def joiner(entry): | ||
newList = [] | ||
for k in entry: | ||
newList.append(str(k)) | ||
return '\t'.join(newList) | ||
|
||
def joinerU(entry): | ||
newList = [] | ||
for k in entry: | ||
newList.append(str(k)) | ||
return '_'.join(newList) | ||
|
||
def joinerC(entry): | ||
newList = [] | ||
for k in entry: | ||
newList.append('"'+str(k)+'"') | ||
return ','.join(newList) | ||
|
||
######################### | ||
##### FUNCTION CALL ##### | ||
######################### | ||
|
||
def main(myCommandLine=None): | ||
""" | ||
Initializes a CommandLine object and passes the provided | ||
arguments into a new fileConverter object and calls main method. | ||
""" | ||
myCommandLine = CommandLine() | ||
|
||
# Necessary files: | ||
if myCommandLine.args.samples: | ||
myS = myCommandLine.args.samples | ||
if myCommandLine.args.leaves: | ||
myL = myCommandLine.args.leaves | ||
if myCommandLine.args.threshold: | ||
myT = int(myCommandLine.args.threshold) | ||
else: | ||
myT = 10 | ||
if myCommandLine.args.ref: | ||
myR = myCommandLine.args.ref | ||
|
||
getMutationsFile(myS, myL, myT, myR) | ||
|
||
|
||
if __name__ == "__main__": | ||
""" | ||
Calls main when program is run by user. | ||
""" | ||
main(); | ||
raise SystemExit | ||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
Oops, something went wrong.