diff --git a/haplomap/README.md b/haplomap/README.md index 630cf50..5148b10 100644 --- a/haplomap/README.md +++ b/haplomap/README.md @@ -136,11 +136,31 @@ build/bin/haplomap ghmap -p ${HOME}/data/test_traits.txt \ By default. Output result are gene-summrized (see `haplomap ghmap --help`). Recommend adding `-a` flag, which will output gene-oriented format results. -**NOTE 2:** strain order in (-p) should keep the same to the (-b). That's, eblocks (-s) +**NOTE 2:** strain order in (-p) should keep the same to the eblocks (-s) + ## Input see `example` folder for test cases. +### convert +vcf file from variant calling tool. +vcf must be genotyped (has the GT field) + +**Encoding** +1. SNV: A->A, T->T, G->G, C->C +2. INDEL/SV: + - reference: A + - deletion: G + - duplication or tanderm duplication: G + - insertion: G + - inversion: G + - breakend: G + + +### annotate + +For SVs, must contain SVTYPE, and SVLEN + ### eblocks: - Strain file (-s): - Tree column txt file: "#Abbrev \t (Optional) \t Values " diff --git a/haplomap/include/haplolib.h b/haplomap/include/haplolib.h index 49338d4..bb92688 100644 --- a/haplomap/include/haplolib.h +++ b/haplomap/include/haplolib.h @@ -68,12 +68,12 @@ class SNPInfo std::string name; int chrIdx; // chromosome index int position; // location on chromosome - char ref; + char ref; // reference allele char // string of alleles "ACGTD?" "D" is deletion, "?" is unspecified. char *alleles; // string of haplotype numbers (not printable). char *pattern; - std::map geneCodonMap; // Map gene names to codons + std::map geneCodonMap; // Map gene names to codons/annotations bool frozen; // when true, pattern is uniquified and should not be changed. bool used; // Is this SNP in a chosen block? bool qMarks; //Are there any unkowns in this SNP? diff --git a/haplomap/include/vcf.h b/haplomap/include/vcf.h index 38b4808..2d6b071 100644 --- a/haplomap/include/vcf.h +++ b/haplomap/include/vcf.h @@ -55,7 +55,7 @@ struct VCFOptions class Variant { public: - bool isSNP; + int TYPE; // snv: 1, indel: 2, sv: 3, unknown: 0 std::string CHROM; int POS; std::string ID; @@ -75,7 +75,8 @@ class Variant { private: void parseFORMAT(const std::vector & rec); - bool issnp(); + bool isSNP(); + bool isSV(); // is structral variant ? float max(const std::string & fmt); }; @@ -91,14 +92,14 @@ class VCF void parseRecords(); void parseHeader(); void checkSamples(); - bool parseStructralVariant(std::vector & alleles, + bool parseStructralVariant(std::string& alleles, std::vector& alts, std::vector& hasAlt); - bool parseSNP(std::vector & alleles, + bool parseSNP(std::string& alleles, std::vector& alts, std::vector& hasAlt); - void writeSNP(std::vector &alleles); - void writeTPED(std::vector &alleles); + void writeSNP(std::string& alleles); + void writeTPED(std::string& alleles); void writeTFAM(); - void writeStructralVariant(std::vector &alleles, const char* varType); + void writeStructralVariant(std::string& alleles); private: std::string line; @@ -113,6 +114,8 @@ class VCF // read input file std::istream* input; VCFOptions* opts; + char getAllele(std::string& alleles, int i) { return alleles[i]; } + void setAllele(std::string& alleles, int i, char a) { alleles[i] = a; } }; #endif \ No newline at end of file diff --git a/haplomap/src/eblocks.cpp b/haplomap/src/eblocks.cpp index ff44c80..d9906be 100755 --- a/haplomap/src/eblocks.cpp +++ b/haplomap/src/eblocks.cpp @@ -229,7 +229,21 @@ void readAlleleInfoCompact(char *fname, char* refgenome) { std::cout<<"Reference Genome is found: "< Variant database: "< Variant database has names: "<= 0) { diff --git a/haplomap/src/ehaploblocks.cpp b/haplomap/src/ehaploblocks.cpp index a6aeb87..4143238 100755 --- a/haplomap/src/ehaploblocks.cpp +++ b/haplomap/src/ehaploblocks.cpp @@ -173,7 +173,7 @@ EblockOptions *parseEblockOptions(int argc, char **argv) if (argc == 1) { std::cout<allelesFileName) { @@ -229,11 +229,11 @@ int main_eblocks(int argc, char **argv) // Can't do this earlier because numStrains is used in constructor. snpVec.reserve(approxNumSNPs); - // FIXME: Add option for "compact SNP file" + // read variant database beginPhase("reading compact alleles file"); readAlleleInfoCompact(opts->allelesFileName, opts->refStrainName); endPhase(); - std::string chr = chromosomes.eltOf(0); + // std::string chr = chromosomes.eltOf(0); // if no gene names file specified, don't read it, omit gene names. if (opts->genesFileName) { diff --git a/haplomap/src/haplolib.cpp b/haplomap/src/haplolib.cpp index 6f4a719..57a6954 100644 --- a/haplomap/src/haplolib.cpp +++ b/haplomap/src/haplolib.cpp @@ -208,10 +208,12 @@ std::string SNPInfo::allelesToPattern(void) map[(int)'G'] = '?'; map[(int)'T'] = '?'; map[(int)'?'] = '?'; + map[(int)'N'] = '?'; // ref allele, used in indel or sv map[(int)'D'] = '?'; // deletion - map[(int)'U'] = '?'; // duplication - map[(int)'I'] = '?'; // insertion + map[(int)'U'] = '?'; // duplication, or tanderm duplication + map[(int)'S'] = '?'; // insertion map[(int)'V'] = '?'; // inversion + map[(int)'K'] = '?'; // breakend // update Ref if defined int eqclass = 0; if (this->ref != '?') diff --git a/haplomap/src/vcf.cpp b/haplomap/src/vcf.cpp index c6d41f2..19f3f3b 100644 --- a/haplomap/src/vcf.cpp +++ b/haplomap/src/vcf.cpp @@ -19,8 +19,16 @@ Variant::Variant(std::string record) QUAL = std::stof(rec[5]); FILTER = rec[6]; INFO = rec[7]; - isSNP = issnp(); // format dict + TYPE = 0; + if (this->isSNP()) + { + TYPE = 1; // snv + } else if (this->isSV()) { + TYPE = 3; // sv + } else if (REF.size() != ALT.size() ){ + TYPE = 2; // indel + } parseFORMAT(rec); } @@ -91,11 +99,11 @@ void Variant::parseFORMAT(const std::vector & rec) return; } -bool Variant::issnp() +bool Variant::isSNP() { - if (this->REF.size() > 1) - return false; - + if (this->REF.size() > 1) return false; + if (this->REF == "N") return false; + // e.g. if (this->ALT.find("<") != std::string::npos || this->ALT.find(">") != std::string::npos) return false; @@ -108,6 +116,13 @@ bool Variant::issnp() return true; } +bool Variant::isSV() +{ + if (this->INFO.find("SVTYPE=") != std::string::npos) + return true; + return false; +} + float Variant::max(const std::string & fmt) { if (this->FORMATS.find(fmt) == this->FORMATS.end() ) @@ -247,7 +262,7 @@ void VCF::checkSamples() } -bool VCF::parseSNP(std::vector & alleles, +bool VCF::parseSNP(std::string & alleles, std::vector& alts, std::vector& hasAlt) { if (variant.REF == "N") @@ -339,18 +354,22 @@ bool VCF::parseSNP(std::vector & alleles, if (GTs[0] > 0) { if ((alts[GTs[0] - 1]) != "*") // GATK has *, covert to '?' - alleles[s] = alts[GTs[0] - 1]; + { + std::string c = alts[GTs[0] - 1]; // do a copy + alleles[s] = c[0]; // string to char + } hasAlt[GTs[0] - 1] = 1; //sampleAlts[s] = 1; } else { - alleles[s] = variant.REF; + alleles[s] = (char) variant.REF[0]; } } } return true; } -bool VCF::parseStructralVariant(std::vector & alleles, std::vector& alts, std::vector& hasAlt) +bool VCF::parseStructralVariant(std::string& alleles, std::vector& alts, std::vector& hasAlt) { + for (int s=0; s < strains.size(); s++) { std::string gt = variant.FORMATS["GT"][s]; @@ -365,7 +384,7 @@ bool VCF::parseStructralVariant(std::vector & alleles, std::vector< if (gts[0] != gts[1]) continue; // allele == '?' - // genotype 0/0, 0/1, 1/1, 1/2,2/2 + // genotype 0/0, 0/1, 1/1, 2/2 std::vector GTs; // string to int, vectorize std::transform(gts.begin(), gts.end(), std::back_inserter(GTs), @@ -378,7 +397,7 @@ bool VCF::parseStructralVariant(std::vector & alleles, std::vector< alleles[s] = 'G'; hasAlt[GTs[0] - 1] = 1; } else { - alleles[s] = 'A'; + alleles[s] = 'A'; // ref } } @@ -386,7 +405,7 @@ bool VCF::parseStructralVariant(std::vector & alleles, std::vector< } -void VCF::writeSNP(std::vector &alleles) +void VCF::writeSNP(std::string& alleles) { output <<"SNP_"< &alleles) } } else { - for (auto &a: alleles) - output< &alleles) +void VCF::writeTPED(std::string &alleles) { - tped < 1) tped <<"A A"; // convert to A as ref // write allele pattern if (!samples.empty()) @@ -419,15 +454,15 @@ void VCF::writeTPED(std::vector &alleles) for(auto &s: samples) { int strIdx = strains.indexOf(s); - std::string a = alleles[strIdx]; - if (a == "?") a = "0"; + char a = alleles[strIdx]; + if (a == '?') a = '0'; tped<<" "< &alleles, const char* vartype) -{ +void VCF::writeStructralVariant(std::string &alleles) +{ + std::string vartype; + switch (variant.TYPE) { + case 1: + vartype = "SNP"; + break; + case 2: + vartype = "INDEL"; + break; + case 3: + vartype = "SV"; + break; + default: + vartype = "UNKNOWN"; + } std::unordered_map INFO = variant.getINFO(); output << vartype <<"_"< &alleles, const char* v } } else { - for (auto &a: alleles) - output<qual) - continue; + if (variant.QUAL < opts->qual) continue; - //std::string alleles("?", strains.size()); // not easy to do inplace replacement + std::string alleles(strains.size(), '?'); // allele pattern std::vector alts = split(variant.ALT, ','); std::vector hasAlt(alts.size(), 0); // number of alternates std::vector sampleAlts(strains.size(), 0); // sample is alt or not - if (alts.size() > 1) // only allow 1 alternate + if (variant.TYPE == 1 && alts.size() > 1) // snp only allow 1 alternate continue; // string find not found, skip // std::unordered_map INFO = variant.getINFO(); - std::vector alleles(strains.size(),"?"); - if (variant.isSNP && std::strcmp(opts->variantType, "SNV") == 0) + + if (variant.TYPE == 1 && std::strcmp(opts->variantType, "SNV") == 0) { bool ret = parseSNP(alleles, alts, hasAlt); - if (!ret) - continue; + if (!ret) continue; int numGoodAlt = std::accumulate(hasAlt.begin(), hasAlt.end(), 0); if (numGoodAlt == 1) { writeSNP(alleles); if (opts->plink) writeTPED(alleles); - } } - else if (!variant.isSNP && (std::strcmp(opts->variantType, "SV") == 0 || std::strcmp(opts->variantType, "INDEL") == 0 )) + else if ( variant.TYPE > 1 && (std::strcmp(opts->variantType, "SV") == 0 || std::strcmp(opts->variantType, "INDEL") == 0 )) { bool ret = parseStructralVariant(alleles, alts, hasAlt); - if (!ret) - continue; + if (!ret) continue; int numGoodAlt = std::accumulate(hasAlt.begin(), hasAlt.end(), 0); if (numGoodAlt == 1) { - writeStructralVariant(alleles, opts->variantType); + writeStructralVariant(alleles); if (opts->plink) writeTPED(alleles); } }