diff --git a/scripts/recombination/simulation/README.md b/scripts/recombination/simulation/README.md new file mode 100644 index 00000000..fbd80c05 --- /dev/null +++ b/scripts/recombination/simulation/README.md @@ -0,0 +1,42 @@ +# Recombination Simulation + +This repository contains scripts to create a multiple-sequence alignment of recombinant genomes. It was designed to test the recombination module in [UShER](https://usher-wiki.readthedocs.io/en/latest/), but may be used for other purposes as well. + +Note: I have been using specifically commit a6f65ade7a6606ef75902ee290585c6db23aeba6 as after this, the format for the output of `matUtils extract -S` has changed. To get this version of user, you can use `wget https://github.com/yatisht/usher/archive/a6f65ade7a6606ef75902ee290585c6db23aeba6.zip` and follow the remainder of the install instructions at [the UShER Wiki](https://usher-wiki.readthedocs.io/en/latest/). + +## Summary: + +### makeMutsFile.py +generates a MSA of all nodes of interest. By default, this will produce a diff file for each internal node with at least 10 descendants, but you may set -t to 0 to get this for all internal nodes and leaves, or set -f to output a full MSA as well. + +the internal nodes from the initial tree with at least 10 descendants. +Options: +- -s (--samples): Sample mutation paths file, generated by e.g. *matUtils extract -i -A *. +- -l (--leaves): Leaves per node, generated by e.g. *matUtils extract -i -L *. +- -t (--threshold): By default, this script includes only nodes with at least 10 descendant leaves. To use a different minimum, specify here. (Default = 10). +- -r (--reference): Reference genome, used to generate VCF. Default = 'wuhan.ref.fa' (in this repository, corresponds to [the reference genome of SARS-CoV-2](https://www.ncbi.nlm.nih.gov/nuccore/1798174254) with [problematic sites](https://raw.githubusercontent.com/W-L/ProblematicSites_SARS-CoV2/master/problematic_sites_sarsCov2.vcf) masked.). +- -f (--fasta): Write fasta file containing all samples according to their mutation paths. This makes the script significantly slower. (Default = False) + +### makeRandomRecombinants.py +generates a MSA of recombinant samples from the internal node sequences. +Options: +- -b (--breakpoints): Number of breakpoints that each recombinant sample will have. Must be 1 or 2 (Default = 1). +- -s (--samples): Number of recombinant samples to create (Default = 100). +- -c (--copies): Number of identical copies to make for each recombinant sample (Default = 10). +- -d (--differences): Minimum mutational distance for acceptor/donor samples (Default = 10). +- -m (--commonMutations): Number of mutations to add to each copy, shared by all in a set. (Default = 0). +- -M (--randomMutations): Number of mutations to add to each copy, randomly chosen for each copy. (Default = 0). +- -f (--fasta): Fasta file containing sequences for acceptor/donor samples. [REQUIRED] +- -r (--reference): Fasta file containing reference genome for use in creating VCF. (Default = 'wuhan.ref.fa'). +- -S (--separate): If enabled, will produce one MSA as a .fasta file for each set of recombinants to the argument directory. If not enabled, will not produce these files. + +## Example pipeline: + +``` +matUtils extract -i input.pb -S samples.tsv -L num_leaves.tsv +python makeMutsFile.py -s samples.tsv -l num-leaves.txt -t 10 -r wuhan.ref.fa +python makeRandomRecombinants.py -b 1 -s 100 -c 10 -t 10 -d allNodeToMutsT10.txt -r wuhan.ref.fa -S 1BP_0M_10C +faToVcf recombination_1.msa.fa recombination_1.msa.vcf +usher -i input.pb -v recombination_1.msa.vcf -o recombination_1.msa.pb +findRecombination -i recombination_1.msa.pb +``` diff --git a/scripts/recombination/simulation/makeInternalNodesMSA.py b/scripts/recombination/simulation/makeInternalNodesMSA.py new file mode 100644 index 00000000..235c3977 --- /dev/null +++ b/scripts/recombination/simulation/makeInternalNodesMSA.py @@ -0,0 +1,200 @@ +#!/usr/bin/env python3 +# Name: Bryan Thornlow +# Date: 2/1/2018 +# compareDatabases.py + +import sys +import os +import datetime +import numpy +from numpy import random +import gzip +import math +import argparse + +########################## +###### COMMAND LINE ###### +########################## + +class CommandLine(object): + """ + Handles the input arguments from the command line. Manages + the argument parser. + """ + + def __init__(self, inOpts=None): + ''' + CommandLine constructor. + Implements a parser to interpret the command line input using argparse. + ''' + self.parser = argparse.ArgumentParser() + self.parser.add_argument("-s", "--samples", help="Sample mutation paths file. To generate: matUtils extract -i -A . [REQUIRED]", default='') + self.parser.add_argument("-l", "--leaves", help="File containing number of leaves for each node. Only nodes with at least 10 descendants will be used, "+ + "as this is required for detection. To generate: matUtils extract -i . [REQUIRED]", default='') + self.parser.add_argument("-t", "--threshold", help="By default, this script includes only nodes with at least 10 descendant leaves. To use a different minimum, specify here.",default=10) + self.parser.add_argument("-r", "--ref", help="Fasta file containing reference genome. (Default = 'wuhan.ref.fa').", default='wuhan.ref.fa') + + if inOpts is None: + self.args = vars(self.parser.parse_args()) + else: + self.args = vars(self.parser.parse_args(inOpts)) + self.args = self.parser.parse_args() + +########################## +##### MAIN FUNCTIONS ##### +########################## + +def getMutationsFile(myS, myL, myT, myR): + myReference = '' + with open(myR) as f: + for line in f: + l = line.strip() + if not l.startswith('>'): + myReference += l.strip().upper() + + nodeToMuts = {1:[]} + with open(myS) as f: + for line in f: + splitLine = (line.strip()).split('\t') + myPath = stripEach(splitLine[1].split('>')) + pastMuts = [] + for k in myPath: + if k.endswith(')'): # we don't care about mutations on sample leafs + s = k.split() + if len(s) == 1: + myNode = int((s[0])[1:-1]) + mutSplit = [] + else: + myNode = int((s[1])[1:-1]) + mutSplit = s[0].split(',') + if not myNode in nodeToMuts: + nodeToMuts[myNode] = [] + for m in pastMuts: + nodeToMuts[myNode].append(m) + for m in mutSplit: + nodeToMuts[myNode].append(m) + for m in mutSplit: + pastMuts.append(m) + + + goodNodes = {} + with open(myL) as f: + for line in f: + splitLine = (line.strip()).split('\t') + if splitLine[0].isdigit() and int(splitLine[1]) >= myT: + goodNodes[int(splitLine[0])] = True + + myOutMSA = '' + myOutString = '' + for n in nodeToMuts: + if n in goodNodes and n != 1: # don't use the root -- it has no mutations and is not useful for our purposes + myOutMSA += '>NODE'+str(n)+'\n'+makeChanges(myReference, nodeToMuts[n])+'\n' + myOutString += str(n)+'\t'+','.join(nodeToMuts[n])+'\n' + open('internal_nodes.msa.fa','w').write(myOutMSA) + open('nodeToMuts.txt','w').write(myOutString) + + +########################## +#### HELPER FUNCTIONS #### +########################## + +def stripEach(myList): + myReturn = [] + for k in myList: + myReturn.append(k.strip()) + return(myReturn) + +def makeChanges(ref, muts): + myReturn = [] + for k in list(ref): + myReturn.append(k) + for k in muts: + myCoord = int(k[1:-1])-1 + if not myReturn[myCoord] == k[0]: + print(k, myReturn[myCoord]) + myReturn[myCoord] = str(k[-1]) + if not myReturn[myCoord] == k[-1]: + print(k, myReturn[myCoord]) + return(''.join(myReturn)) + +def getPos(myInds, intLineNumToPos): + myReturn = [] + for k in myInds: + myReturn.append(intLineNumToPos[k]) + return(myReturn) + +def joiner(entry): + newList = [] + for k in entry: + newList.append(str(k)) + return '\t'.join(newList) + +def joinerU(entry): + newList = [] + for k in entry: + newList.append(str(k)) + return '_'.join(newList) + +def joinerC(entry): + newList = [] + for k in entry: + newList.append('"'+str(k)+'"') + return ','.join(newList) + +######################### +##### FUNCTION CALL ##### +######################### + +def main(myCommandLine=None): + """ + Initializes a CommandLine object and passes the provided + arguments into a new fileConverter object and calls main method. + """ + myCommandLine = CommandLine() + + # Necessary files: + if myCommandLine.args.samples: + myS = myCommandLine.args.samples + if myCommandLine.args.leaves: + myL = myCommandLine.args.leaves + if myCommandLine.args.threshold: + myT = int(myCommandLine.args.threshold) + else: + myT = 10 + if myCommandLine.args.ref: + myR = myCommandLine.args.ref + + getMutationsFile(myS, myL, myT, myR) + + +if __name__ == "__main__": + """ + Calls main when program is run by user. + """ + main(); + raise SystemExit + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/scripts/recombination/simulation/makeMutsFile.py b/scripts/recombination/simulation/makeMutsFile.py new file mode 100644 index 00000000..2f9dd9e5 --- /dev/null +++ b/scripts/recombination/simulation/makeMutsFile.py @@ -0,0 +1,236 @@ +#!/usr/bin/env python3 +# Name: Bryan Thornlow +# Date: 2/1/2018 +# compareDatabases.py + +import sys +import os +import datetime +import numpy +from numpy import random +import gzip +import math +import argparse + +""" +Note: Newest matUtils extract -S does not always output all nodes regardless of whether they house a mutation or not. +Using commit a6f65ade7a6606ef75902ee290585c6db23aeba6 for usher +""" + +########################## +###### COMMAND LINE ###### +########################## + +class CommandLine(object): + """ + Handles the input arguments from the command line. Manages + the argument parser. + """ + + def __init__(self, inOpts=None): + ''' + CommandLine constructor. + Implements a parser to interpret the command line input using argparse. + ''' + self.parser = argparse.ArgumentParser() + self.parser.add_argument("-s", "--samples", help="Sample mutation paths file. To generate: matUtils extract -i -S . [REQUIRED]", default='') + self.parser.add_argument("-l", "--leaves", help="File containing number of leaves for each node. Only nodes with at least 10 descendants will be used, "+ + "as this is required for detection. To generate: matUtils extract -i -L . Required if --threshold > 0.", default='') + self.parser.add_argument("-t", "--threshold", help="By default, this script includes only nodes with at least 10 descendant leaves. To use a different minimum, specify here.",default=10) + self.parser.add_argument("-r", "--ref", help="Fasta file containing reference genome. (Default = 'wuhan.ref.fa').", default='wuhan.ref.fa') + self.parser.add_argument("-f", "--fasta", help="Write fasta file containing all samples according to their mutation paths. This makes the script significantly slower. (Default = False)",default=False) + + if inOpts is None: + self.args = vars(self.parser.parse_args()) + else: + self.args = vars(self.parser.parse_args(inOpts)) + self.args = self.parser.parse_args() + +########################## +##### MAIN FUNCTIONS ##### +########################## + +def getMutationsFile(myS,myL,myT,myR,myF): + myReference = '' + with open(myR) as f: + for line in f: + l = line.strip() + if not l.startswith('>'): + myReference += l.strip().upper() + sys.stderr.write("Finished reading in reference.\n") + + nodeToMuts = {1:[]} + nodeToPars = {} + lineCounter = 0 + with open(myS) as f: + for line in f: + splitLine = (line.strip()).split('\t') + myPath = stripEach(splitLine[1].split('>')) + pastMuts = [] + for k in myPath: + if k.endswith(')'): # for internal nodes + s = k.split() + if len(s) == 1: + myNode = str((s[0])[1:-1]) + mutSplit = [] + else: + myNode = str((s[1])[1:-1]) + mutSplit = s[0].split(',') + nodeToPars[myNode] = len(mutSplit) + if not myNode in nodeToMuts: + nodeToMuts[myNode] = [] + for m in pastMuts: + nodeToMuts[myNode].append(m) + for m in mutSplit: + nodeToMuts[myNode].append(m) + for m in mutSplit: + pastMuts.append(m) + else: # this is the sample leaf + mutSplit = k.split(',') + nodeToMuts[str(splitLine[0])] = [] + nodeToPars[splitLine[0]] = len(mutSplit) + for m in pastMuts: + nodeToMuts[splitLine[0]].append(m) + for m in mutSplit: + nodeToMuts[splitLine[0]].append(m) + sys.stderr.write("Finished storing all mutations.\n") + + myOutMSA = '' + myOutString = '' + myOutPars = '' + if myT > 0: # If threshold specified for leaves, read leaves file and only output nodes passing threshold + with open(myL) as f: + for line in f: + splitLine = (line.strip()).split('\t') + if splitLine[0].isdigit() and int(splitLine[1]) >= myT and int(splitLine[0]) != 1: + if myF != False: + myOutMSA += '>'+str(splitLine[0])+'\n'+makeChanges(myReference, nodeToMuts[str(splitLine[0])])+'\n' + myOutString += str(splitLine[0])+'\t'+','.join(nodeToMuts[str(splitLine[0])])+'\n' + myOutPars += str(splitLine[0])+'\t'+str(nodeToPars[str(splitLine[0])])+'\n' + + else: # Otherwise, output all nodes + for n in nodeToMuts: + if n != 1: + if myF: + myOutMSA += '>'+str(n)+'\n'+makeChanges(myReference, nodeToMuts[n])+'\n' + myOutString += str(n)+'\t'+','.join(nodeToMuts[n])+'\n' + myOutPars += str(n)+'\t'+str(nodeToPars[n])+'\n' + if myF: + open('allNodesT'+str(myT)+'.msa.fa','w').write(myOutMSA) + open('allNodeToMutsT'+str(myT)+'.txt','w').write(myOutString) + open('allNodeToParsT'+str(myT)+'.txt','w').write(myOutPars) + sys.stderr.write("Finished writing all output files.\n") + + +########################## +#### HELPER FUNCTIONS #### +########################## + +def stripEach(myList): + myReturn = [] + for k in myList: + myReturn.append(k.strip()) + return(myReturn) + +def makeChanges(ref, muts): + myReturn = [] + for k in list(ref): + myReturn.append(k) + for k in muts: + myCoord = int(k[1:-1])-1 + if not myReturn[myCoord] == k[0]: + print(k, myReturn[myCoord]) + myReturn[myCoord] = str(k[-1]) + if not myReturn[myCoord] == k[-1]: + print(k, myReturn[myCoord]) + return(''.join(myReturn)) + +def getPos(myInds, intLineNumToPos): + myReturn = [] + for k in myInds: + myReturn.append(intLineNumToPos[k]) + return(myReturn) + +def joiner(entry): + newList = [] + for k in entry: + newList.append(str(k)) + return '\t'.join(newList) + +def joinerU(entry): + newList = [] + for k in entry: + newList.append(str(k)) + return '_'.join(newList) + +def joinerC(entry): + newList = [] + for k in entry: + newList.append('"'+str(k)+'"') + return ','.join(newList) + +######################### +##### FUNCTION CALL ##### +######################### + +def main(myCommandLine=None): + """ + Initializes a CommandLine object and passes the provided + arguments into a new fileConverter object and calls main method. + """ + myCommandLine = CommandLine() + + # Necessary files: + if myCommandLine.args.samples: + myS = myCommandLine.args.samples + if myCommandLine.args.leaves: + myL = myCommandLine.args.leaves + if myCommandLine.args.threshold: + myT = int(myCommandLine.args.threshold) + if myT == 0: + myL = '' + else: + myT = 10 + if myCommandLine.args.ref: + myR = myCommandLine.args.ref + else: + myR = 'wuhan.ref.fa' + if myCommandLine.args.fasta: + myF = myCommandLine.args.fasta + else: + myF = False + + getMutationsFile(myS, myL, myT, myR, myF) + + +if __name__ == "__main__": + """ + Calls main when program is run by user. + """ + main(); + raise SystemExit + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/scripts/recombination/simulation/makeRandomRecombinants.py b/scripts/recombination/simulation/makeRandomRecombinants.py new file mode 100644 index 00000000..00621dee --- /dev/null +++ b/scripts/recombination/simulation/makeRandomRecombinants.py @@ -0,0 +1,381 @@ +#!/usr/bin/env python3 +# Name: Bryan Thornlow +# Date: 2/1/2018 +# compareDatabases.py + +import sys +import os +import datetime +import numpy +from numpy import random +import gzip +import math +import argparse + +########################## +###### COMMAND LINE ###### +########################## + +class CommandLine(object): + """Handles the input arguments from the command line. Manages + the argument parser.""" + + def __init__(self, inOpts=None): + ''' + CommandLine constructor. + Implements a parser to interpret the command line input using argparse. + ''' + self.parser = argparse.ArgumentParser() + self.parser.add_argument("-b", "--breakpoints", help="Number of breakpoints that each recombinant sample will have. Must be [1..4] (Default = 1).", default=1, type=int) + self.parser.add_argument("-s", "--samples", help="Number of recombinant samples to create (Default = 100).", default=100, type=int) + self.parser.add_argument("-c", "--copies", help="Number of identical copies to make for each recombinant sample (Default = 10).", default=10, type=int) + self.parser.add_argument("-m", "--commonMutations", help="Number of mutations to add to each copy, shared by all in a set. (Default = 0).", default=0, type=int) + self.parser.add_argument("-M", "--randomMutations", help="Number of mutations to add to each copy, randomly chosen for each copy. (Default = 0).", default=0, type=int) + self.parser.add_argument("-t", "--threshold", help="Minimum mutational distance for acceptor/donor samples (Default = 10).", default=10, type=int) + self.parser.add_argument("-f", "--fasta", help="Fasta file containing sequences for acceptor/donor samples. [Must include either -f or -d!]", default='') + self.parser.add_argument("-d", "--differences", help="File containing all mutations relative to reference for each sample (allNodeToMuts.py output by makeMutsFile.py) [Must include either -f or -d!]", default='') + self.parser.add_argument("-r", "--ref", help="Fasta file containing reference genome for use in creating VCF. (Default = 'wuhan.ref.fa').", default='wuhan.ref.fa') + self.parser.add_argument("-S", "--separate", help="If enabled, will produce one MSA as a .fasta file for each set of recombinants to the argument directory. If not enabled, will not produce these files.", default=False) + if inOpts is None: + self.args = vars(self.parser.parse_args()) + else: + self.args = vars(self.parser.parse_args(inOpts)) + self.args = self.parser.parse_args() + if self.args.breakpoints < 1 or self.args.breakpoints > 4: + sys.stderr.write("Please retry with between 1 and 4 breakpoints.\n") + sys.exit(1) + if self.args.fasta == '' and self.args.differences == '': + sys.stderr.write("Please supply either a MSA via --fasta or a diff file via --differences.\n") + sys.exit(1) + +########################## +##### MAIN FUNCTIONS ##### +########################## + +def makeExamples(myS, myB, myC, myD, myF, myT, mym, myM, myR, mySep): + + ### Read in reference sequence + posToRef = {} + with open(myR) as f: + for line in f: + l = line.strip() + if not l.startswith('>'): + myReference = l.upper() + for i in range(0,len(myReference)): + posToRef[i] = myReference[i] + else: + myRefName = l[1:] + sys.stderr.write("Finished reading in reference.\n") + + + ### Read in either differences file or MSA to get our node sequences + nodeToDiffs = {} + if myD != '': + with open(myD) as f: + for line in f: + splitLine = (line.strip()).split('\t') + if len(splitLine) == 1: + nodeToDiffs[splitLine[0]] = [] + else: + nodeToDiffs[splitLine[0]] = splitLine[1].split(',') + + sampleToSeq = {} + if myF != '': + with open(myF) as f: + for line in f: + l = line.strip() + if l.startswith('>'): + mySample = l[1:] + else: + sampleToSeq[mySample] = l + sys.stderr.write("Finished reading in Diff/MSA input file.\n") + + recSampleToLog = {} + recSampleToSeq = {} + recSampleToDiffBetweenBps = {} + while len(recSampleToSeq.keys()) < myS: + + ### Get samples for 2 sequences that will make up our recombinant + if myD == '': + samples = numpy.random.choice(list(sampleToSeq.keys()), size=2, replace=False) + mySampleName = 'RECOMB_'+str(myB)+'_'+str(len(recSampleToSeq))+'_'+str(samples[0])+'_'+str(samples[1]) + s1 = samples[0] + s2 = samples[1] + elif myF == '': + samples = numpy.random.choice(list(nodeToDiffs.keys()), size=2, replace=False) + mySampleName = 'RECOMB_'+str(myB)+'_'+str(len(recSampleToSeq))+'_'+str(samples[0])+'_'+str(samples[1]) + s1 = samples[0] + sampleToSeq[s1] = addMuts(myReference, nodeToDiffs[s1]) + s2 = samples[1] + sampleToSeq[s2] = addMuts(myReference, nodeToDiffs[s2]) + + ### Create recombinant sequence from our two samples + myTotalDiff = getDiff(sampleToSeq[s1],sampleToSeq[s2], 0) + if myB == 1: + bps = numpy.random.choice(sorted(list(posToRef.keys())),size=1, replace=False) + bp1 = bps[0] + mySeq = sampleToSeq[s1][:bp1]+sampleToSeq[s2][bp1:] + myDiff = minLen(getDiff(sampleToSeq[s1][:bp1], sampleToSeq[s2][:bp1], 0), getDiff(sampleToSeq[s1][bp1:], sampleToSeq[s2][bp1:], bp1)) + print(myDiff, myT) + elif myB == 2: + bps = sorted(numpy.random.choice(sorted(list(posToRef.keys())),size=2, replace=False)) + while bps[1]-bps[0] <= 1000: + bps = sorted(numpy.random.choice(sorted(list(posToRef.keys())),size=2, replace=False)) + bp1 = bps[0] + bp2 = bps[1] + mySeq = sampleToSeq[s1][:bp1]+sampleToSeq[s2][bp1:bp2]+sampleToSeq[s1][bp2:] + myDiff = minLen(getDiff(sampleToSeq[s1][bp1:bp2], sampleToSeq[s2][bp1:bp2], bp1), getDiff(sampleToSeq[s1][:bp1]+sampleToSeq[s1][bp2:], sampleToSeq[s2][:bp1]+sampleToSeq[s2][bp2:], 0)) + elif myB == 3: + bps = sorted(numpy.random.choice(sorted(list(posToRef.keys())),size=3, replace=False)) + while bps[1]-bps[0] <= 1000 or bps[2]-bps[1] <= 1000: + bps = sorted(numpy.random.choice(sorted(list(posToRef.keys())),size=3, replace=False)) + bp1 = bps[0] + bp2 = bps[1] + bp3 = bps[2] + mySeq = sampleToSeq[s1][:bp1]+sampleToSeq[s2][bp1:bp2]+sampleToSeq[s1][bp2:bp3]+sampleToSeq[s2][bp3:] + diff1 = getDiff(sampleToSeq[s1][bp1:bp2]+sampleToSeq[s1][bp3:], sampleToSeq[s2][bp1:bp2]+sampleToSeq[s2][bp3:], bp1) + diff2 = getDiff(sampleToSeq[s1][:bp1]+sampleToSeq[s1][bp2:bp3], sampleToSeq[s2][:bp1]+sampleToSeq[s2][bp2:bp3], 0) + myDiff = minLen(diff1, diff2) + elif myB == 4: + bps = sorted(numpy.random.choice(sorted(list(posToRef.keys())),size=4, replace=False)) + while bps[1]-bps[0] <= 1000 or bps[2]-bps[1] <= 1000 or bps[3]-bps[2] <= 1000: + bps = sorted(numpy.random.choice(sorted(list(posToRef.keys())),size=4, replace=False)) + bp1 = bps[0] + bp2 = bps[1] + bp3 = bps[2] + bp4 = bps[3] + mySeq = sampleToSeq[s1][:bp1]+sampleToSeq[s2][bp1:bp2]+sampleToSeq[s1][bp2:bp3]+sampleToSeq[s2][bp3:bp4]+sampleToSeq[s1][bp4:] + diff1 = getDiff(sampleToSeq[s1][bp1:bp2]+sampleToSeq[s1][bp3:bp4], sampleToSeq[s2][bp1:bp2]+sampleToSeq[s2][bp3:bp4], bp1) + diff2 = getDiff(sampleToSeq[s1][:bp1]+sampleToSeq[s1][bp2:bp3]+sampleToSeq[s1][bp4:], sampleToSeq[s2][:bp1]+sampleToSeq[s2][bp2:bp3]+sampleToSeq[s2][bp4:], 0) + myDiff = minLen(diff1, diff2) + + ### If the differences between the parents of our recombinant is above the threshold between all adjacent breakpoint pairs, keep it + if len(myDiff) >= myT: + recSampleToLog[mySampleName] = [joiner(samples), joiner(bps), len(myTotalDiff), len(myDiff)] + if mym > 0: ### Add common mutations here, prior to making copies + myMuts = [] + for m in range(0, mym): + mySeq, myMut = addMut(mySeq, numpy.random.choice(sorted(list(posToRef.keys())),size=1, replace=False)[0]) + myMuts.append(myMut) + recSampleToLog[mySampleName].append(joiner(myMuts)) + recSampleToSeq[mySampleName] = mySeq + recSampleToDiffBetweenBps[mySampleName] = myDiff + sys.stderr.write("Generated "+str(len(recSampleToSeq.keys()))+" recombinant sequences.\n") + sys.stderr.write("Finished generating recombinant sequences.\n") + + bpToHeader = {} + bpToHeader[1] = 'recombinant_sample\tparent1\tparent2\tgenetic_distance\tmutations_in_recomb_tract\tbp1\n' + bpToHeader[2] = 'recombinant_sample\tparent1\tparent2\tgenetic_distance\tmutations_in_recomb_tract\tbp1\tbp2\n' + bpToHeader[3] = 'recombinant_sample\tparent1\tparent2\tgenetic_distance\tmutations_in_recomb_tract\tbp1\tbp2\tbp3\n' + bpToHeader[4] = 'recombinant_sample\tparent1\tparent2\tgenetic_distance\tmutations_in_recomb_tract\tbp1\tbp2\tbp3\tbp4\n' + + ### Write our MSA, which we will use to create a VCF to add to the starting .pb via faToVcf and UShER + myOutMSA = '>'+myRefName+'\n'+myReference+'\n' + mySepMSAs = [] + myOutLog = bpToHeader[myB] + myOutDiff = '' + for s in recSampleToSeq: + tempMSA = '>'+myRefName+'\n'+myReference+'\n' ### Fasta should have reference sequence first, for use with faToVcf + for x in range(0,myC): + if myM > 0: + myMuts = [] + mySeq = recSampleToSeq[s] + for m in range(0, myM): + mySeq, myMut = addMut(mySeq, numpy.random.choice(sorted(list(posToRef.keys())),size=1, replace=False)[0]) + myMuts.append(myMut) + myOutMSA += '>'+s+'_X'+str(x)+'\n'+mySeq+'\n' + tempMSA += '>'+s+'_X'+str(x)+'\n'+mySeq+'\n' + myOutLog += s+'_X'+str(x)+'\t'+doubleJoiner(recSampleToLog[s])+'\t'+joiner(myMuts)+'\n' + myOutDiff += s+'_X'+str(x)+'\t'+joinerC(recSampleToDiffBetweenBps[s])+'\n' + else: + myOutMSA += '>'+s+'_X'+str(x)+'\n'+recSampleToSeq[s]+'\n' + tempMSA += '>'+s+'_X'+str(x)+'\n'+recSampleToSeq[s]+'\n' + myOutLog += s+'_X'+str(x)+'\t'+doubleJoiner(recSampleToLog[s])+'\n' + myOutDiff += s+'_X'+str(x)+'\t'+joinerC(recSampleToDiffBetweenBps[s])+'\n' + mySepMSAs.append(tempMSA) + open('recombination_'+str(myB)+'_'+str(myC)+'_'+str(myM)+'.msa.fa','w').write(myOutMSA) + open('recombination_'+str(myB)+'_'+str(myC)+'_'+str(myM)+'.log','w').write(myOutLog) + open('recombination_'+str(myB)+'_'+str(myC)+'_'+str(myM)+'.differences.txt','w').write(myOutDiff) + + myOutFaToVcf = '' + if mySep != False: + if not os.path.exists(mySep): + os.mkdir('./'+mySep) + for i in range(1,len(mySepMSAs)+1): + open('./'+mySep+'/recombinant_set_'+str(i)+'.fa','w').write(mySepMSAs[i-1]) + myOutFaToVcf += 'faToVcf recombinant_set_'+str(i)+'.fa recombinant_set_'+str(i)+'.vcf\n' + open('./'+mySep+'/makeSetVCFs.sh','w').write(myOutFaToVcf) + +########################## +#### HELPER FUNCTIONS #### +########################## + +def addMut(seq, pos): + #print(pos, len(seq)) + myReturn = [] + for i in range(0,len(seq)): + if i != int(pos): + myReturn.append(seq[i]) + else: + if seq[i] == 'A': + mySub = numpy.random.choice(['C','G','T'], size=1)[0] + elif seq[i] == 'C': + mySub = numpy.random.choice(['A','G','T'], size=1)[0] + elif seq[i] == 'G': + mySub = numpy.random.choice(['C','A','T'], size=1)[0] + elif seq[i] == 'T': + mySub = numpy.random.choice(['A','G','C'], size=1)[0] + else: + mySub = numpy.random.choice(['A','G','C','T'], size=1)[0] + myMut = seq[i]+str(pos)+mySub + myReturn.append(mySub) + return(''.join(myReturn), myMut) + +def addMuts(ref, muts): + myReturn = [] + for k in list(ref): + myReturn.append(k) + if len(muts) > 0: + for k in muts: + myCoord = int(k[1:-1])-1 + if not myReturn[myCoord] == k[0]: + print(k, myReturn[myCoord]) + myReturn[myCoord] = str(k[-1]) + if not myReturn[myCoord] == k[-1]: + print(k, myReturn[myCoord]) + return(''.join(myReturn)) + +def getDiff(s1, s2, add): + myReturn = [] + for i in range(0,len(s1)): + if s1[i] != s2[i]: + myReturn.append(add+i) + return(myReturn) + +def minLen(l1, l2): + if len(l1) <= len(l2): + return(l1) + else: + return(l2) + +def doubleJoiner(myList): + myReturn = [] + for k in myList: + if type(k) == list: + myReturn.append(joiner(k)) + else: + myReturn.append(k) + return(joiner(myReturn)) + +def replaceSymbols(myEntry): + myEntry = myEntry.replace('|', '_') + myEntry = myEntry.replace('/', '_') + return(myEntry) + +def joiner(entry): + newList = [] + for k in entry: + newList.append(str(k)) + return '\t'.join(newList) + +def joinerU(entry): + newList = [] + for k in entry: + newList.append(str(k)) + return '_'.join(newList) + +def joinerC(entry): + newList = [] + for k in entry: + newList.append(str(k)) + return(','.join(newList)) + +######################### +##### FUNCTION CALL ##### +######################### + +def main(myCommandLine=None): + """ + Initializes a CommandLine object and passes the provided + arguments into a new fileConverter object and calls main method. + """ + myCommandLine = CommandLine() + + # Necessary files: + if myCommandLine.args.samples: + myS = myCommandLine.args.samples + else: + myS = 100 + if myCommandLine.args.breakpoints: + myB = myCommandLine.args.breakpoints + else: + myB = 1 + if myCommandLine.args.copies: + myC = myCommandLine.args.copies + else: + myC = 10 + if str(myCommandLine.args.threshold): + myT = myCommandLine.args.threshold + else: + myT = 10 + if myCommandLine.args.fasta: + myF = myCommandLine.args.fasta + else: + myF = '' + if myCommandLine.args.commonMutations: + mym = myCommandLine.args.commonMutations + else: + mym = 0 + if myCommandLine.args.randomMutations: + myM = myCommandLine.args.randomMutations + else: + myM = 0 + if myCommandLine.args.ref: + myR = myCommandLine.args.ref + else: + myR = 'wuhan.ref.fa' + if myCommandLine.args.separate: + mySep = myCommandLine.args.separate + else: + mySep = False + if myCommandLine.args.differences: + myD = myCommandLine.args.differences + else: + myD = '' + + makeExamples(myS, myB, myC, myD, myF, myT, mym, myM, myR, mySep) + +if __name__ == "__main__": + """ + Calls main when program is run by user. + """ + main(); + raise SystemExit + +if __name__ == "__main__": + """ + Calls main when program is run by user. + """ + main(); + raise SystemExit + + + + + + + + + + + + + + + + + + + + + diff --git a/scripts/recombination/simulation/makeSampleFiles.py b/scripts/recombination/simulation/makeSampleFiles.py new file mode 100644 index 00000000..80be5c60 --- /dev/null +++ b/scripts/recombination/simulation/makeSampleFiles.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python3 +# Name: Bryan Thornlow +# Date: 2/1/2018 +# compareDatabases.py + +import sys +import os +import datetime +import numpy +from numpy import random +import gzip +import math +import re + +########################## +##### MAIN FUNCTIONS ##### +########################## + +def makeSampleFiles(): + myParallelJobs = '' + bmToDir = {'10':'S1_500/','11':'S1_500_1/','12':'S1_500_2/','13':'S1_500_3/','20':'S2_500/','21':'S2_500_1/','22':'S2_500_2/','23':'S2_500_3/','30':'S3_500/','31':'S3_500_1/','32':'S3_500_2/','33':'S3_500_3/','40':'S4_500/','41':'S4_500_1/','42':'S4_500_2/','43':'S4_500_3/'} + for b in ['1','2','3','4']: + for m in ['0','1','2','3']: + l = 0 + if os.path.exists('recombination_'+b+'_1_'+m+'.log'): + with open('recombination_'+b+'_1_'+m+'.log') as f: + for line in f: + splitLine = (line.strip()).split('\t') + if not splitLine[0] == 'recombinant_sample': + l += 1 + # print(splitLine[0]) + open('TEMP_SAMPLES/temp_sample_'+b+m+'_'+str(l)+'.txt','w').write(splitLine[0]+'\n') + if not os.path.exists('SIM_DATA/recombination_'+b+m+'_'+str(l)+'.tsv'): + myBashSh = '/HOME/usher/build/usher -v ../'+bmToDir[b+m]+'recombinant_set_'+str(l)+'.vcf.gz -i ../public-latest.all.masked.pb -o ../temp_'+b+m+'_'+str(l)+'.pb -T 32 2> ../LOGS/temp_'+b+m+'_'+str(l)+'.log\n' + myBashSh += 'grep Parsimony ../LOGS/temp_'+b+m+'_'+str(l)+'.log > ../SIM_DATA/temp_'+b+m+'_'+str(l)+'.parsimony\nmkdir ../'+b+m+'_'+str(l)+'/\n' + myBashSh += '/usr/bin/time -o '+b+'_'+m+'_'+str(l)+'.time -f "%E %M" /HOME/usher/build/ripples -i ../temp_'+b+m+'_'+str(l)+'.pb -s ../TEMP_SAMPLES/temp_sample_'+b+m+'_'+str(l)+'.txt -d ../'+b+m+'_'+str(l)+'/ -T 32 2> ../LOGS/findRec_'+b+m+'_'+str(l)+'.log\n' + myBashSh += 'mv ../'+b+m+'_'+str(l)+'/recombination.tsv ../SIM_DATA/recombination_'+b+m+'_'+str(l)+'.tsv\n' + myBashSh += 'mv ../'+b+m+'_'+str(l)+'/descendants.tsv ../SIM_DATA/descendants_'+b+m+'_'+str(l)+'.tsv\nrm ../temp_'+b+m+'_'+str(l)+'.pb\n' + open('SIM_SCRIPTS/'+b+m+'_'+str(l)+'.sh','w').write(myBashSh) + myParallelJobs += 'bash '+b+m+'_'+str(l)+'.sh\n' + open('SIM_SCRIPTS/myParallelJobs.sh','w').write(myParallelJobs) + + + +########################## +#### HELPER FUNCTIONS #### +########################## + +def getPos(myInds, intLineNumToPos): + myReturn = [] + for k in myInds: + myReturn.append(intLineNumToPos[k]) + return(myReturn) + +def joiner(entry): + newList = [] + for k in entry: + newList.append(str(k)) + return('\t'.join(newList)) + +def toInt(myList): + myReturn = [] + for k in myList: + myReturn.append(int(k)) + return(myReturn) + +def joinerU(entry): + newList = [] + for k in entry: + newList.append(str(k)) + return '_'.join(newList) + +def joinerC(entry): + newList = [] + for k in entry: + newList.append(str(k)) + return ','.join(newList) + +######################### +##### FUNCTION CALL ##### +######################### + +def main(): + makeSampleFiles() + + +if __name__ == "__main__": + """ + Calls main when program is run by user. + """ + main(); + raise SystemExit \ No newline at end of file diff --git a/scripts/recombination/simulation/wuhan.ref.fa b/scripts/recombination/simulation/wuhan.ref.fa new file mode 100644 index 00000000..07abb44d --- /dev/null +++ b/scripts/recombination/simulation/wuhan.ref.fa @@ -0,0 +1,2 @@ +>NC_045512v2 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNgatctgttctctaaacgaactttaaaatctgtgtggctgtcactcggctgcatgcttagtgcactcacgcagtataattaataactaattactgNcgNtgacaggacacgagtaactcgtctatcttctgcaggctgcttacggtttcgtccgtgttgcagccgatcatcagcacatctaggtttcgtccgggtgtgaccgaaaggtaagatggagagccttgtccctggtttcaacgagaaaacacacgtccaactcagtttgcctgttttacaggttcgcgacgtgctcgtacgtggctttggagactccgtggaggaggtcttatcagaggcacgtcaacatcttaaagatggcacttgtggcttagtagaagttgaaaaaggcgttttgcctcaacttgaacagccctatgtgttcatcaaacgttcggatgctcgaactgcacctcatggtcatgttatggttgagctggtagcagaactcgaaggcattcagtacggtcgtagtggtgagacacttggtgtccttgtccctcatgtgggcgaaataccagtggcttaccgcaaggttcttcttNgtaagaacggtaataaaggagctggtggccatagttacggcgccgatctaaagtcatttgacttaggcgacgagcttggcactgatccttatgaagattttcaagaaaactggaacactaaacatagcagtggtgttacccgtgaactcatgcgtgagcttaacggaggggcatacactcgctatgtcgataacaacttctgtggccctgatggctaccctcttgagtgcattaaagaccttctagcacgtgctggtaaagcttcatgcactttgtccgaacaactggactttattgacactaagaggggtgtatactgctgccgtgaacatgagcatgaaattgcttggtacacggaacgttctgaaaagagctatgaattgcagacaccttttgaaattaaattggcaaagaaatttgacaccttcaatggggaatgtccaaattttgtatttcccttaaattccataatcaagactattcaaccaagggttgaaaagaaaaagcttgatggctttatgggtagaattcgatctgtctatccagttgcgtcaccaaatgaatgcaaccaaatgtgcctttcaactctcatgaagtgtgatcattgtggtgaaacttcatggcagacgggcgattttgttaaagccacttgcgaattttgtggcactgagaatttgactaaagaaggtgccactacttgtggttacttaccccaaaatgctgttgttaaaatttattgtccagcatgtcacaattcagaagtaggacctgagcatagtcttgccgaataccataatgaatctggcttgaaaaccattcttcgtaagggtggtcgcactattgcctttggaggctgtgtgttctcttatgttggttgccataacaagtgtgcctattgggttccacgtgctagcgctaacataggttgtaaccatacaggtgttgttggagaaggttccgaaggtcttaatgacaaccttcttgaaatactccaaaaagagaaagtcaacatcaatattgttggtgactttaaacttaatgaagagatcgccattattttggcatctttttNtgcttccacaagtgcttttgtggaaactgtgaaaggtttggattataaagcattcaaacaaattgttgaatcctgtggtaattttaaagttacaaaaggaaaagctaaaaaaggtgcctggaatattggtgaacagaaatcaatactgagtcctctttatgcatttgcatcagaggctgctcgtgttNtacgatcaattttctcccgcactcttgaaactgctcaaaattctgtgcgtgttttacagaaggccgctataacaatactagatggaatttcacagtattcactgagactcattgatgctatgatgttcacatctgatttggctactaacaatctagttgtaatggcctacattacaggtggtgttgttcagttgaNttNgcagtggctaactaacatctttggcactgtttatgaaaaactcaaacccgtccttgattggcttgaagagaagtttaaggaaggtgtagagtttcttagagacNgttgggaaattgttaaatttatctcaacctgtgcttgtgaaattgtcggtggacaaattgtcacctgtgcaaaggaaattaaggagagtgttcagacattctttaagcttgtaaataaatttttggctttgtgtgctgactctatcattattggtggagctaaacttaaagccttgaatttaggtgaaacatttgtcacgcactcaaagggattgtacagaaagtgtgttaaatccagagaagaaactggcctactcatgcctctaaaagccccaaaagaaattatcttcttagagggagaaacacttcccacagaagtgttaacagaggaagttgtcttgaaaactggtgatttacaaccattagaacaacctactagtgaagctgttgaagctccattggttgNtacaccagtttgtattaacgggcttatgttgctcgaaatcaaagacacagaaaagtactgtgcccttgcacctaatatgatggtaacaaacaataccttcacactcaaaggcggtgcaccaacaaaggttacttttggtgatgacactgtgatagaagtgcaaggttacaagagtgtgaatatcacttttgaacttgatgaaaggattgataaagtacttaatgagaagtgctctgcctatacagttgaactcggtacagaagtaaatgagttcgcctgtgttgtggcagatgctgtcataaaaactttgcaaccagtatctgaattacttacaccactgggcattgatttagatgagtggagtatggctacatactacttatttgatgagtctggtgagtttaaattggcttcacatatgtattgttctttctaccctccagatgaggatgaagaagaaggtgattgtgaagaagaagagtttgagccatcaactcaatatgagtatggtactgaagatgattaccaaggtaaacctttNgaatttggtgccacttctgctgctcttcaacctgaagaagagcaagaagaagattggttagatgatgatagtcaacaaactgttggtcaacaagacggcagtgaggacaatcagacaactactattcaaacaattgttgaggttcaacctcaattagagatggaacttacaccagttgttcagactattgaagtgaatagttttagtggttatttaaaacttactgacaatgtatacattaaaaatgcagacattgtggaagaagctaaaaaggtaaaaccaacagtggttgttaatgcagccaatgtttaccttaaacatggaggaggtgttgcaggagccttaaataaggctactaacaatgccatgcaagttgaatctgatgattacatagctactaatggaccacttaaagtggNtggtagttgtgttttaagcggacacaatcttgctaaacactgtcttcatgttgtcggcccaaatgttaacaaagNtgaagacattcaacttcttaagagtgcttatgaaaattttaatcagcacgaagttctacttgcaccattattatcagctggtatttttggtgctgaccctatacattctttaagagtttgtgtagatactgttcgcacNaatgtctacttagctgtctttgataaaaatctctatgacaaacttgtttcaagctttttggaaatgaagagtgaaaagcaagttgaacaaaagatcgctgagattcctaaagaggaagttaagccatttataactgaaagtaaaccttcagttgaacagagaaaacaagatgataagaaaatcaaagcttgtgttgaagaagttacaacaactctggaagaaactaagttcctcacagaaaacttgttactttatattgacattaatggcaNtcttcatccagattctgccactcttgttagtgacattgacatcactttcttaaagaaagatgctccatatatagtgggtgatgttgttcaagagggtgttttaactgctgtggttatacctactaaaaaggctggtggcactactgaaatgctagcgaaagctttgagaaNagtgccaacagacaattatataaccacttacccgggtcagggtttaaatggttacactgtagaggaggcaaagacagtgcttaaaaagtgtaaaagtgccttttacattctaccatctattatctctaatgagaagcaagaaattcttggaactgtttcttggaatttgcgagaaatgcttgcacatgcagaagaaacacgcaaattaatgcctgtctgtgtggaaactaaagccatagtttcaactatacagcgtaaatataagggtattaaaatacaagagggtgtggttgattatggtgctagattttacttttacaccagtaaaacaactgtagcgtcacttatcaacacacttaacgatctaaatgaaactcttgttacaatgccacttggctatgtaacacatggcttaaatttggaagaagctgctcggtatatgagatctctcaaagtgccagctacagtttctgtttcttcacctgatgctgttacagcgtataatggttatcttacttcttcttctaaaacacctgaagaacattttattgaaaccatctcacttgctggttcctataaagattggtcctattctggacaatctacacaactaggtatagaatttcttaagagaggtgataaaagtgtatattacactagtaatcctaccacattccacctagatggtgaagttatcacctttgacaatcttaagacacttctttctttgagagaagtgaggactattaaggtgtttacaacagtagacaacattaacctccacacgcaNgttgtggacatgtcaatgacatatggacaacagtttggtccaacttatttggatggagctgatgttactaaaataaaacctcataattcacatgaaggtaaaacattttatgttttacctaatgatgacactctacgtgttgaggcttttgagtactaccacacaactgatcctagttttctgggtaggtacatgtcagcattaaatcacactaaaaagtggaaatacccacaagttaatggtttNacttctattaaatgggcagataacaactgttatcttgccactgcattgttaacactccaacaaatagagttgaagtttaatccacctgctctacaagatgcttattacagagcaagggctggtgaagctgctaacttttgtgcacttatcttagcctactgtaataagacagtaggtgagttaggtgatgttagagaaacaatgagttacttgtttcaacatgccaatttagattcttgcaaaagagtcttgaacgtggtgtgtaaaacttgtggacaacagcagacaacccttaagggtgtagaagctgttatgtacatgggcacactttcttatgaacaatttaagaaaggtgttcagataccttgtacgtgtggtaaacaagctacaaaatatctagtacaacaggagtcaccttttgttatgatgtcagcaccacctgctcagtatgaacttaagcatggtacatttacttgtgNtagtgaNNacactggtaattaccagtgtggtcactataaacatataacttctaaagaaactttgtattgcatagacggtgctttacttacaaagtcctcagaatacaaaggtcctattacggatgttttctacaaagaaaacagttacacaacaaccataaaaccagttacttataaattggatggtgttgtttgtacagaaattgaccctaagttggacaattattataagaaagacaattcttatttcacagagcaaccaattgatcttgtaccaaaccaaccatatccaaacgcaagcttcgataattttaagtttgtatgtgataatatcaaatttgctgatgatttaaaccagttaactggttataagaaacctgcttcaagagagcttaaagttacatttttccctgacttaaatggtgatgtgNtggctattgattataaacactacacaccctcttttaagaaaggagctaaattgttacataaacctattgtttggcatgttaacaatgNaactaataaagccacgtataaaccaaatacctggtgtatacgttgtctttggagcacaaaaccagttgaaacatcaaattcgtttgatgtactgaagtcagaggacgcgcagggaatggataatcttgcctgcgaagatctaaaaccagtctctgaagaagtagtggaaaatcctaccatacagaaagacgttcttgagtgtaatgtgaaaactaccgaagttgtaggagacattatacttaaaccagcaaataatagtttaaaaattacagaagaggttggccacacagatctaatggctgcttatgtagacaattctagtcttactattaagaaacctaatgaattatctagagtattaggtttgaaaacccttgctactcatggtttagctgctgttaatagtgtcccttgggatactatagctaattatgctaagccttttcttaacaaagttgttagtacaactactaacatagttacacggtgtttaaaccgtgtttgtactaattatatgccttatttctttactttattgctacaattgtgtacttttactagaagtacaaattctagaattaaagcatctatgccgactactatagcaaagaatNctgtNaagagtgtcggtaaattttgtctagaggcttcatttaattatttgaagtcacctaatttttctaaactgataaatattataatttggtttttactattaagtgtttgcctaggttctttaatctactcaaccgctgctttaggtgttttaatgtctaatttaggcatgccttcttactgtactggttacagagaaggctatttgaactctactaatgtcactattgcaacctactgtactggttctataccttgtagtgtttgtcttagtggtttagattctttagacacctatccttctttagaaactatacaaattaccatttcatcttttaaatgggatttaactgcttttggcttagttgcagagtggtttttggcatatattcttttcactaggtttttctatgtacttggattggctgcaatcatgcaattgtttttcagctattttgcagtacattttattagtaattcttggcttatgtggttaataattaatcttgtacaaatggccccgatttcagctatggttagaatgtacatcttctttgcatcattttattatgtatggaaaagttatgtgcatgttgtagacggttgtaattcatcaacttgtatgatgtgttacaaacgtaatagagcaacaagagtcgaatgtacaactattgttaatggtgttagaaggtccttttatgtctatgctaatggaggtaaaggcttttgcaaactacacaattggaattgtgttaattgtgatacattctgtgctggtagtacatttattagtgatgaagttgcgagagacttgtcactacagtttaaaagaccaataaatcctactgaccagtcttcttacatcgttgatagtgttacagtgaagaatggttccatccatctttactttgataaagctggtcaaaagacttatgaaagacattctctctctcattttgttaacttagacaacctgagagctaataacactaaaggttcattgcctattaatgttatagtttttgatggtaaatcaaaatgtgaagaatcatctgcaaaatcagcgtctgtttactacagtcagcttatgtgtcaacctatactgttactagatcaggcattagtgtctgatgttggtgatagtgcggaagNtgcNgttaaaatgtttgatgcttacgttaatacgttttcatcaacttttaacgtaccaatggaaaaactcaaaacactagttgcaactgcagaagctgaacttgcaaagaatgtgtccttagacaatgtcttatctacttttatttcagcagctcggcaagggtttgttgattcagatgtagaaactaaagatgttgttgaatgtcttaaattgtcacatcaatctgacatagaagttactggcgatagttgtaataactatatgctcacctataacaaagttgaaaacatgacaccccgtgaccNtggtgcttgtattgactgtagtgcgcgtcatattaatgcgcaggtagcaaaaagtcacaacattgctttgatatggaacgttaaagatttcatgtcattgtctgaacaactacgaaaacaaatacgtagtgctgctaaaaagaataacttaccttttaagttgacatgtgcaactactagacaagttgttaatgttgtaacaacaaagatagcacttaagggtggtaaaattgttaataattggttgaagcagttaattaaagttacacttgtgttcctttttgttgctgctattttctatttaataacacctgttcatgtcatgtctaaacatactgacttttcaagtgaaatcataggatacaaggctattgatggtggtgtcactcgtgacatagcatctacagatacttgttttgctaacaaacatgctgattttgacacatggtttagccagcgtgNtggtagttatactaatgacaaagcttgcccattgatNNctgcagtcataacaagagaagtgggttttgtcgtgcctggtttgcctggcacgatatNNcgcacaactaatggtgactttttgcatttcttacctagagtttttagtgcagttggtaacatctgttacacaccatcaaaacttatagagtacactgactttgcaacatcagcttgtgttttggctgctgaatgtacaatttttaaagatgNttctggtaagccagtaccatattgttatgataccaatgtactagaaggttctgttgcttatgaaagtttacgccctgacacacgttatgtgctcatggatggctctattattcaatttcctaacacctaccttgaaggttctgttagagtggtaacaacttttgattctgagtactgtaggcacggcacttgtgaaagatcagaagctggtgtttgtgtatctactagtggtagatgggtacttaacaatgattattacagatctttaccaggagttttctgtggtgtagatgctgtaaatttacttactaatatgtttacaccactaattcaacctattggtgctttggacatatcagcatctatagtagctggtggtattgtagctatcgtagtaacatgccttgcctactattttatgaggtttagaagagcttttggtgaatacagtcatgtagttgcctttaatactttactattccttatgtcattcactgtactctgtttaacaccagtttactcattcttacctggtgtttattctgttatttacttgtacttgacattttatcttactaatgatgtttcttttttagcacatattcagtggatggttatgttcacacctttagtacctttctggataacaattgcttatatcatttgtatttccacaaagcatttctattggttctttagtaattacctaaagagacgtgtagtctttaatggtgtttcctttagtacttttgaagaagctgcgctgtgcacctttttgttaaataaagaaatgtatctaaagttgcgtagtgatgtgctattacctcttacgcaatataatagatacttagctctttataataagtacaagtattttagtggagcaatggatacaactagctacagagaagctgcttgttgtcatctcgcaaaggctctcaatgacttcagtaactcaggttctgatgttctttaccaaccaccacaaacctctatcacctcagctgttttgcagagtggttttagaaaaatggcattcccatctggtaaagttgagggttgtatggtacaagtaacttgtggtacaacNacacttaacggtctttggcttgatgacgtagtttactgtccaagacatgtgatctgcacctctgaagacatgcttaaccctaattatgaagatttactcattcgtaagtNtaatcataatttcttggtacaggctggtaatgttcaactcagggttattggacattctatgcaaaattgtgtacttaagcttaaggttgatacagccaatcctaagacacctaagtataagtttgttcgcattcaaccaggacagactttttcagtgttagcttgttacaatggttcaccatctggtgtttaccaatgtgctatgaggcccaatttcactattaagggttcattccttaatggttcatgtggtagtgttggttttaacatagattatgactgtgtctctttttgttacatgcaccatatggaatNaccaactggagttcatgctggcacagacttagaaggtaacttttatggaccttttgttgacaggcaaacagcacaagcagctggtacggacacaactattacagttaatgttttagcttggttgtacgctgctgttataaatggagacaggtggtttctcaatcgatttaccacaactcttaatgactttaaccttgtggctatgaagtacaattatgaacctctaacacaagaccatgttgacatactaggacctctttctgctcaaactggaattgccgttttagatatgtgtgcttcattaaaagaattactgcaaaatggtatgaatggacgtaccatattgggtagtgctttattagaagatgaatttacaccttttgatgttgttagacaatgctcaggtgttactttccaaagtgcagtgaaaagaacaatcaagggtacacaccactggttgttactcacaattttgacttcacttttagttttagtccagagtactcaatggtctttgttNttttttttNtatgaaaatgcctttttaccttttgctatgggtattattgctatgtctgcttttgcaatgatgtttgtcaaacataagcatgcatttctctgtttgtttttgttaccttctcttgccactgtagcttattttaatatggtctatatgcctgctagttgggtgatgcgtattatgacatggttggatatggttgatactagtttgtctggttttaagctaaaagactgtgttatgtatgcatcagctgtagtgttactaatccttatgacagcaagaactgtgtatgatgatggtgctaggagagtgtggacacttatgaatgtcttgacactcgtttataaagtttattatggtaatgctttagatcaagccatttccatgtgggctcttataatctctgttacttctaactactcaggtgtagttacaactgtcatgtttttggccagagNtattgtttttatgtgtgttgagtattgccctattttcttcataactggtaatacacttcagtgtataatgctagtttattgtttcttaggctatttttgtacttgttactttggcctcttttgtttactcaaccgctactttagactgactcttggtgtttatgattacttagtttctacacaggagtttagatatatgaattcacagggactactcccacccaagaatagcatagatgccttcaaactcaacattaaattgttgggtgttggtggcaaaccttgtatcaaagtagccactgtacagtctaaaatgtcagatgtaaagtgcacatcagtagtcttactctcagttttgcaacaactcagagtagaatcatcatctaaattgtgggctcaatgtgtccagttacacaatgacattctcttagctaaagatactactgaagcctttgaaaaaatggtttcactactttctgttttgctttccatgcagggtgctgtagacataaacaagctttgtgaagaaatgctggacaacagggcaaccttacaagctatagcctcagagtttagttcccttccatcatatgcagcttttgctactgctcaagaagcttatgagcaggctgttgctaatggtgattctgaagttgttcttaaaaagttgaagaagtctttgaatgtggctaaatctgaatttgaccgtgatgcagccatgcaacgtaagttggaaaagatggctgatcaagctatgacccaaatgtataaacaggctagatctgaggacaagagggcaaaagttactagtgctatgcagacaatgcttttcactatgcttagaaagttggataatgatgcactcaacaacattatcaacaatgcaagagatggttgtgttcccttgaacataatacctcttacaacagcagccaaactaatggttgtcataccagactataacacatataaaaatacgtgtgatggtacaacatttacttatgcatcagcattgtgggaaatccaacaggttgtagatgcagatagtaaaattgttcaacttagtgaaattagtatggacaattcacctaatttagcatggcctcttattgtaacagctttaagggccaattctgctgtcaaattacagaataatgagcttagtcctgttgcactacgacagatgtcttgtgctgccggtactacacaaactgcttgcactgatgacaatgcgttagcttactacaacacaacaaagggaggtaggtttgtacttgcactgttatccgatttacaggatttgaaatgggctagattccctaagagtgatggaactggtactatctatacagaactggaaccaccttgtaggtttgttacagacacacctaaaggtcctaaagtgaagtatttatactttattaaaggattaaacaacctaaatagaggtatggtacttggtagtttagctgccacagtacgtctacaagctggtaatgcaacagaagtgcctgccaattcaactgtattatctttctgtgcttttgctgtagatgctgctaaagcttacaaagattatctagctagtgggggacaaccaatcactaattgtgttaagatgttgtgtacacacactggtactggtcaggcaataacagttacaccggaagccaatatggatcaagaatcctttggtggtgcatcgtgttgtctgtactgccgttgccacatagatcatccaaatcctaaaggattttgtgacttaaaaggtaagtatgtacaaatacctacaacttgtgctaatgaccctgtgggttttacacttaaaaacacagtctgtaccgtctgcggtatgtggaaaggttaNggctgNagttgtgatcaactccgcgaacccatgcttcagtcagctgatgcacaatcgtttttaaacgggtttgNggtgtaagtgcagcccgtcttacaccgtgcggcacaggcactagtactgatgtcgtatacagggcttttgacatctacaatgataaagtagctgNttttgctaaattcctaaaaactaattgNtgtcgcttccaagaaaaggacgaagatgacaatttaattgattcttactttgtagttaagagacacactttctctaactaccaacatNaagaaacaatttataatttacttaaggattgtccagctgttgctaaacatgacttctttaagtttagaatagacggtgacatggtaccacatatatcacgtcaacgtcttactaaatacacaatggcagacctcgtctatgctttaaggcattttgatgaaggtaattgtgacacattaaaagaaatacttgtcacatacaattgttgtgatgatgattatttcaataaaaaggactggtatgattttgtagaaaacccagatatattacgcgtatacgccaacttaggtgaacgtgtacgccaagctttgttaaaaacagtacaattctgtgatgccatgcgaaatgctggtattgttggtgtactgacattagataatcaagatctcaatggtaactggtatgatttcggtgatttcatacaaaccacgccaggtagtggagttcctgttgtagattcttattattcattgttaatgcctatattaaccttgaccagggctttaactgcagagtcacatgttgacactgactNNaNaaagccttacattaagtgggatttgttaaaatatgacttcacggaagagagNttaaaactctttgaccgttattttaaatattgggatcagacataccacccaaattgtgttaactgtttggatgacagatgcattctgcattgtgcaaactttaatgttttattctctacagtgttcccacctacaagttttggaccactagtgagaaaaatatttgttgatggtgttccatttgtagtttcaactggataccacttcagagagctaggtgttgtacataatcaggatgtaaacttacatagctctagacttagttttaaggaattacttgtgtatgctgctgaccctgctatgcacgctgcttctggtaatctattactagataaacgcactacgtgcttttcagtagctgcacttactaacaatgttgcttttcaaactgtcaaacccggtaattttaacaaagacttctatgactttgctgtgtctaagggtttctttaaggaaggaagttctgttgaattaaaacacttcttctttgctcaggatggtaatgctgctatcagcgattatgactactatcgttataatctaccaacaatgtgtgatatcagacaactactatttgtagttgaagttgttgataagtactttgattgttacgatggtggctgtattaatgctaaccaagtcatcgtcaacaacctagacaaatcagctggttttccatttaataaatggggtaaggctagactttattatgattcaatgagttatgaggatcaagatgcacttttcgcatatacaaaacgtaatgtcatccctactataactcaaatgaatcttaagtatgccattagtgcaaagaatagagctcgcaccgtagctggtgtctctatctgtagtactatgaccaatagacagtttcatcaaaaattattgaaatcaatagccgccactagaggagctactgtagtaattggaacaagcaaattctatggtggttggcacaacatgttaaaaactgtttatagtgatgtagaaaaccctcaccttatgggttgggattatcctaaatgtgatagagccatgcctaacatgcttagaattatggcctcacttgttcttgctcgcaaacatacaacgtgttgtagcttgtcacaccgtttctatagattagctaatgagtgtgctcaagtattgagtgaNatggtcatgtgtggcggttcactatatgttaaaccaggtggaacctcatcaggagatgccacaactgcttatgctaatagtgtttttaacatttgtcaagctgtcacggccaatgttaatgcacttttatctactgatggtaacaaaattgccgataagtatgtccgcaatttacaacacagactttatgagtgtctctatagaaatagagatgttgacacagactttgtgaatgagttttacgcatatttgcgtaaacatttctcaatgatgatactctctgacgatgctgttgtgtgtttcaatagcacttatgcatctcaaggtctagtggctagcataaagaactttaagtcagttctttattatcaaaacaatgtttttatgtctgaagcaaaatgttggactgagactgaccttactaaaggacctcatgaattttgctctcaacatacaatgctagttaaacagggtgatgattatgtgNaccttccttacccagatccatcaagaatcctaggggccggctgttttgtagatgatatcgtaaaaacagatggtacacttatgattgaacggttcgtgtctttagctatagatgcttacccacttactaaacatcctaatcaggagtatgctgatgtctttcatttgtacttacaatacataagaaagctacatgatgagttaacaggacacatgttagacatgtattctgttatgcttactaatgataacacttcaaggtattgggaacctgagttttatgaggctatgtacacaccgcatacagtcttacaggctgttggggcttgtgttctttgcaattcacagacttcattaagatgtggtgcNtgcatacgtagaccattcttatgttgtaaatgctgttacgaccatgtcatatcaacatcacataaattagtcttgtctgttaatccgtatgtttgcaatgctccaggttgtgatgtcacagatgtgactcaactttacttaggaggtatgagctattattgtaaatcacataaaccacccattagttttccattgtgtgctaatggacaagtttttggtttatataaaaatacatgtgttggtagcgataatgttactgactttaatgcaattgcaacatgtgactggacaaatgctggtgattacattttagctaacacctgtactgaaagactcaagctttttgcagcagaaacgctcaaagctactgaggagacatttaaactgtcttatggtattgctactgtacgtgaagtgctgtctgacagagaattacatctttcatgggaagttggtaaacctagaccaccacttaaccgaaattatgtctttactggttatcgtgtaactaaaaacagtaaagtacaaataggagagtacacctttgaaaaaggtgactatggtgatgctgttgtttaccgaggtacaacaacttaNaaattaaatgttggtgattattttgtgctgacatcacatacagtaatgccattaagtgcacctacactagtgccacaagagcactatgttagaattactggcttatacccaacactcaatatctcagatgagttttctagcaatgttgcaaattatcaaaaggttggtatgcaaaagtattctacactccagggaccacctggtactggtaagagtcattttgctattggcctagctctctactacccttctgctcgcatagtgtatacagcttgctctcatgccgctgtNNatNcactatgtgagaaggcattaaaatatttgcctatagataaatgtagtagaattatacctgcacgtgctcgtgtagagtgttttgataaattcaaagtgaattcaacattagaacagtatgtcttttgtactgtaaatgcattgcctgagacgacagcagatatagttgtctttgatgaaatttcaatggccacaaattatgatttgagtgttgtcaatgccagattacgtgctaagcactatgtgtacattggcgaccctgctcaattacctgcaccacgcacattgctaactaagggcacactagaaccagaatatttcaattcagtgtgtagacttatgaaaactataggtccagacatgttcctcggaacttgtcggcgttgtcctgctgaaattgttgacactgtgagtgctttggtttatgataataagcttaaagcacataaagacaaatcagctcaatgctttaaaatgttttataagggtgttatcacgcatgatgtttcatctgcaattaacaggccacaaataggcgtggtaagagaattccttacacgtaaccctgcttggagaaaagctgtctttatttcaccttataattcacagaatgctgtagcctcaaagattttgggactaccaactcaaactgttgattcatcacagggctcagaatatgactatgtcatattcactcaaaccactgaaacagctcactcttgtaatgtaaacagatttaatgttgctattaccagagcaaaagtaggcatactttgcataatgtctgatagagacctttatgacaagttgcaatttacaagtcttgaaattccacgtaggaatgtggcaactttacaagctgaaaatgtaacaggactctttaaagattgtagtaaggtaatcactgggttacatcctacacaggcacctacacacctcagtgttgacactaaattcaaaactgaaggtttatgtgttgacatacctggcatacctaaggacatgacctatagaagactcatctctatgatgggttttaaaatgaattatcaagttaatggttaccctaacatgtttatcacccgcgaagaagctataagacatgtacgtgcatggattggcttcgatgtcgaggggtgtcatgctactagagaagctgttggtaccaatttacctttacagctaggtttttctacaggtgttaacctagttgctgtacctacaggttatgttgatacacctaataatacagatttttccagagttagtgctaaaccaccgcctggagatcaatttaaacacctcataccacttatgtacaaaggacttccttggaatgtagtgcgtataaagattgtacaaatgttaagtgacacacttaaaaatctctctgacagagtcgtatttgtcttatgggcacatggctttgagttgacatctatgaagtattttgtgaaaataggacctgagcgcacctgttgtctatgtgatagacgtgccacatgcttttccactgcttcagacacttatgcctgttggcatcattctattggatttgattacgtctataatccgtttatgattgatgttcaacaatggggttttacaggtaacctacaaagcaaccatgatctgtattgtcaagtccatggtaatgcacatgtagctagttgtgatgcaatcatgactaggtgtctagctgtccacgagtgctttgttaagcgtgttgactggactattgaatatcctataattggtgatgaactgaagattaatgcggcttgtagaaaggttcaacacatggttgttaaagctgcattattagcagacaaattcccagttcttcacgacattggtaaccctaaagctattaagtgtgtacctcaagctgatgtagaatggaagttctatgatgcacagccttgtagtgacaaagcttataaaatagaagaattattctattcttatgccacacattctgacaaattcacagatggtgtatgcctattttggaattgcaatgtcgatagatatcctgctaattccattgtttgtagatttgacactagagtgctatctaaccttaacttgcctggttgtgatggtggcagtttgtNNgtaaataaacatgcattccacacaccagcttttgataaaagtgcttttgttaatttaaaacaattaccatttttctattactctgacagtccatgtgagtctcatggaaaacaagtagtgtcagatatagattatgtaccactaaagtctgctacgtgtataacacgttgcaatttaggtggtgNtgtctgtagacatcatgctaatgagtacagattgtatctcgatgcttataacatgatgatctcNgctggctttagcttgtgggtttacaaacaatttgatacttataacctctggaacacttttacaagacttcagagtttagaaaatgtggcttttaatgttgtaaataagggacactttgatggacaacagggtgaagtaccagtttctatcattaataacactgtttacacaaaagttgatggtgttgatgtagaattgtttgaaaataaaacaacattacctgttaatgtagcatttgagctttgggctaagcgcaacattaaaccagtaccagaggtgaaaatactcaataatttgggtgtggacattgctgctaatactgtgatctgggactacaaaagagatgctccagcacatatatctactattggtgtttgttctatgactgacatagccaagaaaccaactgaaacgatttgtgcaccactcactgtcttttttgatggtagagttgatggtcaagtagacttatttagaaatgcccgtaatggtgttcttattacagaaNgtagtgttaaaggtttacaaccatctgtaggtcccaaacaagctagtcttaatggagtcacattaaNtggagaagccgtaaaaacacagttcaattattataagaaagttgatggtgttgtccaacaattacctgaaacttactttactcagagtagaaatttacaagaatttaaacccaggagtcaaatggaaattgatttcttagaattagctatggatgaattcattgaacggtataaattagaaggctatgccttcgaacatatcgtttatggagattttagtcatagtcagttaggtggtttacatctactgattggactagctaaacgttttaaggaatcaccttttgaattagaagattttattcctatggacagtacagttaaaaactatttcataacagNtgcgcaaacaggttcatctaagtgtgtgtgttctgttattgatttattacttgatgattttgttgaaataataaaatcccaagatttatctgtagtttctaaggttgtcaaagtgactattgactatacagaaatttcatttatgctttggtgtaaagatggccatgtagaaacattttacccaaaattacaatctagtcaagcgtggcaaccgggtgttgctatgcctaatctttacaaaatgcaaagaatgctattagaaaagtgtgaccttcaaaattatggtgatagtgcaacattacctaaaggcataatgatgaatgtcgcaaaatatactcaactgtgtcaatatttaaacacattaacattagctgtaccctataatatgagagttatacattttggtgctggttctgataaaggagttgcaccaggtacagctgttttaagacagtggttgcctacgggtacgctgcttgtcgattcagatcttaatgactttgtctctgatgcagattcaactttgattggtgattgtgcaactgtacatacagctaataaatgggatctcattattagtgatatgtacgaccctaagactaaaaatgttacaaaagaaaatgactctaaagagggttttttcacttacatttgtgggtttatacaacaaaagctagctcttgNaNgttccgtggctataaagataacagaacattcttggaatgctgatctttataagctcaNggNacacttcgcatggtggacagcctttgttactaatgtgaatgcgtcatcatctgaagcatttttaattggatgtaattatcttggcaaaccacgcgaacaaatagatggttatgtcatgcatgcaaattacatattttggaggaatacaaatccaattcagttgtcttcctattctttatttgacatgagtaaatttccccttaaattaaggggtactgctgttatgtctttaaaagaaggtcaaatcaatgatatgattttatctcttcttagtaaaggtagacttataattagagaaaacaacagagttgttatttctagtgatgttcttgttaacNNctaaacgaacaatgtttgtttttNttgttttattgccactagtctctagtcagtgtgttaatcttacaaccagaactcaattaccccctgcatacactaattctttcacacgtggtgtttattaccctgacaaagttttcagatcctcagttttacattcaactcaggacttgttcttacctttcttttccaatgttacttggttccatgctatacatgtctctgggaccaatggtactaagaggtttgataaccctgtcctaccatttaatgatggtgtttattttgcttccactgagaagtctaacataataagaggctggatttttggtactactttagattcgaagacccagtccctacttattgttaataacgctactaatgttgttattaaagtctgtgaatttcaattttgtaatgatccatttttgggtgtttattaccacaaaaacaacaaaagttggatggaaagtgagttcagagtttattctagtgcgaataattgcacttttgaatatgtctctcagccttttcttatggaccttgaaggaaaacagggtaatttcaaaaatcttagggaatttgtgtttaagaatattgatggttattttaaaatatattctaagcacacgcctattaatttagtgcgtgatctccctcagggtttttcggctttagaaccattggtagatttgccaataggtattaacatcactaggtttcaaactttacttgctttacatagaagttatttgactcctggtgattcttcttcaggttNgacagctggtgctgcagcttattatgtgggttatcttcaacctaggacttttctattaaaatataatgaaaatggaaccattacagatgctgtagactgtgcacttgaccctctctcagaaacaaagtgtacgttgaaatccttcactgtagaaaaaggaatctatcaaacttctaacttNagagNccaaccaacagaatctattgttagatttcctaatattacaaacttgtgcccttttggtgaagtttttaacgccaccagatttgcatctgtttatgcttggaacaggaagagaatcagcaactgtgttgctgattattctNtcctatataattccgcatcattttccacttttaagtgttatggagtgtctcctactaaattaaatgatctctgctttactaatgtctatgcagattcatttgtaattagaggtgatgaagtcagacaaatcgctccagggNaaactggaaagattgctgattataattataaattaccagatgattttacaggctgcgttatagcttggaattctaacaatcttgattctaaggttggtggtaattataattacctgtatagattgtttaggaagtctaatctcaaaccttttgagagagatatttcaactgaaatctatcaggccggtagcacaccttgtaatggtgttgaaggttttaattgttactttcctttacaatcatatggtttccaacccactaatggtgttggttaccaaccatacagagtagtagtactttcttttgaacttctacatgcaccagcaactgtttgtggacctaaaaagtctactaatttggttaaaaacaaatgtgtcaatttcaacttcaatggtttaacaggcacaggtgttcttactgagtctaacaaaaagtttctgcctttccaacaatttggcagagacattgctgacactactgatgctgtccgtgatccacagacacttgagattcttgacattacaccatgttcttttggtggtgtcagtgttataacaccaggaacaaatacttctaaccaggttgctgttctttatcaggatgttaactgcacagaagtccctgttgctattcatgcagatcaacttactcctacttggcgtgtttattctacaggttctaatgtttttcaaacacgtgcaggctgtttaataggggctgaacatgtcaacaactcatatgagtgtgacatacccattggtgcaggtatatgcgctagttatcagactcagactaattctcctcggcgggcacgtagtgtagctagtcaatccatcattgcctacactatgtcacttggtgcagaaaattcagttgcttactctaataactctattgccatacccacaaattttactattagtgttaccacagaaattctaccagtgtctatgaccaagacatcagtagattgtacaatgtacatttgtggtgattcaactgaatgcagcaatcttttgttgcaatatggcagtttttgtacacaattaaaccgtgctttaactggaatagctgttgaacaagacaaaaacacccaagaagtttttgcacaagtcaaacaaatttacaaaacaccaccaattaaagattttggtggttttaatttttcacaaatattaccagatccatcaaaaccaagcaagaggtcatttattgaagatctacttttcaacaaagtgacacttgcagatgctggcttcatcaaacaatatggtgattgccttggtgatattgctgctagagacctcatttgtgcacaaaagtttaacggccttactgttttgccacctttgctcacagatgaaatgattgctcaatacacttctgcactgttagcgggtacaatcacttctggttggacctttggtgcaggtgctgcattacaaataccatttgctatgcaaatggcttataggtttaatggtattggagttacacagaatgttctctatgagaaccaaaaattgattgccaaccaatttaatagtgctattggcaaaattcaagactcactttcttccacagcaNNtgcacttggaaaacttcaagatgtggtcaaccaaaatgcacaagctttaaacacgcttgttaaacaacttagctccaattttggtgcaatttcaagtgttttaaatgatatcctttcacgtcttgacaaagttgaggctgaagtgcaaattgataggttgatcacaggcagacttcaaagtttgcagacatatgtgactcaacaattaattagagctgcagaaatcagagcNtctgctaatcttgctgctactaaaatgtcagagtgtgtacttggacaatcaaaaagagttgatttttgtggaaagggctatcatcttatgtccttccctcagtcagcacctcatggtgtagtcttcttgcatgtgacttatgtccctgcacaagaaaagaacttcacaactgctcctgccatttgtcatgatggaaaagcacactttcctcgtgaaggtgtctttgtttcaaatggcacacactggtttgtaacacaaaggaatttttatgaaccacaaatcattactacagacaacacatttgtgtctgNtaactgtgatgttgtaataggaattgtcaacaacacagtttatgatcctttgcaacctgaattagactcattcaaggaggagttagataaatattttaagaatcatacatcaccagatgttgatttaggtgacatctctggcattaatgcttcagttgtaaacattcaaaaagaaattgaccgcctcaatgaggttgccaagaatttaaatgaatctctcatcgatctccaagaacttggaaagtatgagcagtatataaaatggccaNggtacatttggctaggttttatagctggcttgattgccatagtaatggtgacaattatgctttgctgtatgaccagttgctgtagttgtctcaagggctgttgttcttgtggatcctgctgcaaatttgatgaagacgactctgagccagtgctcaaaggagtcaaattacattacacNNaaacgaacttatggatttgtttatgagaatcttcacaattggaactgtaactttgaagcaaggtgaaatcaaggatgctactccttcagattttgttcgcgctactgcaacgataccgatacaagcctcactccctttcggatggcttattgttggcgttgcacttcttgctgtttttcagagcgcttccaaaatcataaccctcaaaaagagatggcaactagcactctccaagggtgttcactttgtttgcaacttgctgttgttgtttgtaacagtttactcacaccttttgctcgttgctgctggccttgaagccccttttctctatctttatgctttagtctacttcttgcagagtataaactttgtaagaataataatgaggctttggctttgctggaaatgccgttccaaaaacccattactttatgatgccaactattttctttgctggcatactaattgttacgactattgtataccttacaatagtgtaacttcttcaattgtcattacttcaggtgatggcacaacaagtcctatttctgaacatgactaccagattggtggttatactgaaaaatgggaatctggagtaaaagactgtgttgtattacacagttacttcacttcagactattaccagctgtactcaactcaattgagtacagacactggtgttgaacatgttaccttcttcatctacaataaaattgttgatgagcctgaagaacatgtccaaattcacacaatcgacggttcatccggagttgttaatccagtaatggaaccaatttatgatgaaccgacgacgactactagcgtgcctttgtaagcacaagctgatgagtacgaacttatgtactcattcgtttcggaagagacaggtacgttaatagttaatagcgtacttctttttcttgctttcgtggtattcttgctagttacactagccatccttactgcgcttcgattgtgtgcgtactgctgcaatattgttaacgtgagtcttgtaaaaccttctttttacgtttactctcgtgttaaaaatctgaattcttctagagttcctgatcttctggtctaaacgaactaaatattatattagtttttctgtttggaactttaattttagccatggcagattccaacggtactattacNgttgaagagcttaaaaagctccttgaacaatggaacctagtaataggtttcctattccttacatggatttgtcttctacaatttgcctatgccaacaggaataggtttttgtatataattaagttaattttcctctggctgttatggccagtaactttagcttgttttgtgcttgctgctgtttacagaataaattggatcaccggtggaattgctatcgcaatggcttgtcttgtaggcttgatgtggctcagctacttcattgcttctttcagactgtttgcgcgtacgcgttccatgtggtcattcaatccagaaactaacattcttctcaacgtgccactccatggcactattctgaccagaccgcttctagaaagtgaactcgtaatcggagctgtgatccttcgtggacatcttcgtattgctggacaccatctaggacgctgtgacatcaaggacctgcctaaagaaatcactgttgctacatcacgaacgctttcttattacaaattgggagcttcgcagcgtgtagcaggtgactcaggttttgctgcatacagtcgctacaggattggcaactataaattaaacacagaccattccagtagcagtgacaatattgctttgcttgtacagtaagtgacaacagatgtttcatctcgttgactttcaggttactatagcagagatattactaattattatgaggacttttaaagtttccatttggaatcttgattacatcataaacctcataattaaaaatttatctaagtcactaactgagaataaatattctcaattagatgaagagcaaccaatggagattgattaaacgaacatgaaaattattcttttcttggcactgataacactcgctacttgtgagctttatcactaccaagagtgtgttagaggtacaacagtacttttaaaagaaccttgctcttctggaacatacgagggcaattcaccatttcatcctctagctgataacaaatttgcactgacttgctttagcactcaatttgcttttgcttgtcctgacggcgtaaaacacgtctatcagttacgtgccagatcagtttcacctaaactgttcatcNgNcaagaggaagttcaagaactttactctccaatttttcttattgttgcggcaatagtgtttataacactttgcttcacactcaaaagaaagacagaatgaNNgaactttcattaattgacttctNtttgtgctttttagcctttctgctattccttgttttaattatgcttattatcttttggttctcacttgaactgcaagatcataatgaaacttgtcacgcctaaacgaacatgaaatttcttgttttcttaggaatcatcacaactgtagctgcatttcaccaagaatgtagtttacagtcatgtactcaacatcaaccatatgtagttgatgacccgtgtcctattcacttctattctaaatggtatattagagtaggagctagaaaatcagcacctttaattgaattgtgcgtggatgaggctggttctaaatcacccattcagtacatcgatatcggtaattatacagtttcctgtttaccttttacaattaattgccaggaacctaaattgggtagNcttgtagtgcgttgttcgttctatgaagactttttagagtatcatgacgttcgtgttgttttagatttNatctaaacgaacaaactaaaatgtctgataatggaccccaaaatcagcgaaatgcaccccgcattacgtttggtggaccctcagattcaactggcagtaaccagaatggagaacgcagtggggcgcgatcaaaacaacgtcggccccaaggtttacccaataatactgcgtcttggttcaccgctctcactcaacatggcaaggaagaccttaaattccctcgaggacaaggcgttccaattaacaccaatagcagtccagatgaccaaattggctactaccgaagagctaccagacgaattcgtggtggtgacggtaaaatgaaagatctcagtccaagatggtatttctactacctaggaactgggccagaagctggacttccctatggtgctaacaaagacggcatcatatgggttgcaactgagggagccttgaatacaccaaaagatcacattggcacccgcaatcctgctaacaatgctgcaatcgtgctacaacttcctcaaggaacaacattgccaaaaggcttctacgcagaagggagcagaggcggcagtcaagcctcttctcgttcctcatcacgtagtcgcaacagttcaagaaattcaactccaggcagcagtaggggaacttctcctgctagaatggctggcaatggcggtgatgctgctcttgctttgctgctgcttgacagattgaaccagcttgagagcaaaatgtctggtaaaNgccaacaacaacaaggccaaactgtcactaagaaatctgctgctgaggcttNtNagaagcctcggcaaaaacgtactgccactaaagcatacaatgtaacacaagctttcggcagacgtggtccagaacaaacccaaggaaattttggggaccaggaactaatcagacaaggaactgattacaaacattggccgcaaattgcacaatttgcccccagcgcttcagcgttcttcggaatgtcgcgcattggcatggaagtcacaccttcgggaacgtggttgacctacacaggtgccatcaaattggatgacaaagatccaaatttcaaagatcaagtcattttgctgaataagcatattgacgcatacaaaacattcccaccaacagagcctaaaaaggacaaaaagaagaaggctgatgaaactcaagccttaccgcaNagacagaagaaacagcaaactgtgactcttcttcctgctgcagatttggatgatttctccaaacaattgcaacaatccatgagcagtgctgactcaactcaggcctaaactcatgcagaccacacaaNgcagatgggctatataaacgttttcgcttttccgtttacgNtatatagtctactcttgtgcagaatgaattctcgtaactacatagcacaagtagatgtagttaactttaatctcacatagcaatctttaatcagtgtgtaacattagggaggacttgaaagagccaccacattttcaccgaggccacgcggagtacgatcgagtgtacagtgaacaatgctagggagagctgcctatatggaagagcccNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN