fixed ghmap -e, this should be an optional input

zqfang · Jul 9, 2024 · 5d96aca · 5d96aca
1 parent 563315e
commit 5d96aca
Show file tree

Hide file tree

Showing 7 changed files with 50 additions and 72 deletions.
diff --git a/example/MPD_000/trait.000.txt b/example/MPD_000/trait.000.txt
diff --git a/haplomap/src/quantTraitMap.cpp b/haplomap/src/quantTraitMap.cpp
@@ -30,7 +30,7 @@ struct GhmapOptions
     GhmapOptions() : isCategorical(false), filterCoding(false), haploBlocks(false),
                      geneBlocks(false), geneAllBlocks(false), pvalueCutoff(0.05),datasetName((char *)"Unnamed_dataset"),
                      phenotypeFileName(NULL), blocksFileName(NULL), outputFileName(NULL), geneName(NULL),
-                     equalFile(NULL), goTermFile(NULL), goFilter(NULL),
+                     expressionFile(NULL), equalFile(NULL), goTermFile(NULL), goFilter(NULL),
                      geneticRelationMatrix(NULL) {};
 };
 

diff --git a/workflows/config.2.yaml b/workflows/config.2.yaml
@@ -57,7 +57,7 @@ HBCGM:
     # KNOWNGENE: "/data/bases/shared/haplomap/PELTZ_20210609/SNP_Annotation/mm10_knownGene.txt" 
 
     # genetic relation file from PLink output
-    GENETIC_REL: "/data/bases/shared/haplomap/PELTZ_20210609/mouse54_grm.rel"
+    GENETIC_REL: "/data/bases/shared/haplomap/PELTZ_20210609/mouse_grm.dist"
     # gene expression file
     GENE_EXPRS: "/data/bases/shared/haplomap/PELTZ_20210609/mus.compact.exprs.txt"
     # MeSH mapping

diff --git a/workflows/config.indel.yaml b/workflows/config.indel.yaml
@@ -25,7 +25,7 @@ dbSNP: "/home/fangzq/genome/mouse/mouse.dbsnp.vcf.gz" # mm10
 
 HBCGM:
     # working directory
-    WORKSPACE: "/data/bases/shared/haplomap/MPD_MeSH_Indel_example"
+    WORKSPACE: "/data/bases/shared/haplomap/MPD_MeSH_Indel"
     # path to haplomap
     BIN: "/home/fangzq/github/HBCGM/build/bin"
     # path to GNNHap repo
@@ -34,7 +34,7 @@ HBCGM:
     GNNHAP_BUNDLE: "/home/fangzq/github/GNNHap/bundle"
 
     # MPD trait ids 
-    TRAIT_IDS: "/home/fangzq/github/HBCGM/example/MPD_MeSH.example.txt"
+    TRAIT_IDS: "/data/bases/shared/haplomap/PELTZ_20210609/MPD_MeSH_suffixed.txt"
     # set to true if input individual animal data. Default: use strain means.   
     USE_RAWDATA: false 
     # use custom database instead of using MPD API to get data.
@@ -47,7 +47,7 @@ HBCGM:
     # Ensembl-vep output after variant calling step
     VEP_DIR: "/data/bases/shared/haplomap/PELTZ_20210609/VEP"
     # path to indel database
-    SNPS_DIR: "/data/bases/shared/haplomap/PELTZ_20210609/INDELs"
+    VAR_DIR: "/data/bases/shared/haplomap/PELTZ_20210609/HBCGM_VAR_DB"
 
     # genetic relation file from PLink output
     GENETIC_REL: "/data/bases/shared/haplomap/PELTZ_20210609/mouse54_grm.rel"

diff --git a/workflows/config.yaml b/workflows/config.yaml
@@ -25,7 +25,7 @@ dbSNP: "/home/fangzq/genome/mouse/mouse.dbsnp.vcf.gz" # mm10
 
 HBCGM:
     # working directory
-    WORKSPACE: "/data/bases/shared/haplomap/MPD_MeSH_example"
+    WORKSPACE: "/data/bases/fangzq/20220711_pain/TGR/TGR_05-30" #"/data/bases/shared/haplomap/MPD_MeSH"
     # path to haplomap
     BIN: "/home/fangzq/github/HBCGM/build/bin"
     # path to GNNHap repo
@@ -34,7 +34,7 @@ HBCGM:
     GNNHAP_BUNDLE: "/home/fangzq/github/GNNHap/bundle"
 
     # MPD trait ids 
-    TRAIT_IDS: "/home/fangzq/github/HBCGM/example/MPD_MeSH.example.txt"
+    TRAIT_IDS: "/data/bases/fangzq/20220711_pain/TGR/TGR_05-30/trait_list.txt"
     # set to true if input individual animal data. Default: use strain means.   
     USE_RAWDATA: false 
     # use custom database instead of using MPD API to get data.
@@ -49,7 +49,7 @@ HBCGM:
     VEP_DIR: "/data/bases/shared/haplomap/PELTZ_20210609/VEP"
 
     # path to SNP database
-    SNPS_DIR: "/data/bases/shared/haplomap/PELTZ_20210609/SNPs"
+    VAR_DIR: "/data/bases/shared/haplomap/PELTZ_20210609/HBCGM_VAR_DB"
     # PATH to SNP annotations for all genes
     # ANNOVAR: "/data/bases/shared/haplomap/PELTZ_20210609/SNP_Annotation" 
     # # snp, geneid,genename mapping
@@ -61,7 +61,7 @@ HBCGM:
     # gene expression file
     GENE_EXPRS: "/data/bases/shared/haplomap/PELTZ_20210609/mus.compact.exprs.txt"
     # MeSH mapping
-    MPD2MeSH: "/data/bases/shared/haplomap/PELTZ_20210609/mpd2mesh.json"
+    MPD2MeSH: "/data/bases/fangzq/20220711_pain/TGR/TGR_05-30/trait_mesh.json"
 
     ######### others #########
     # open chromatin regions to annotate haploblocks.

diff --git a/workflows/haplomap.indel.smk b/workflows/haplomap.indel.smk
@@ -37,7 +37,7 @@ with open(TRAIT_IDS, 'r') as t:
 # filter id in MeSH
 with open(MPD2MeSH, 'r') as j:
     MESH_DICT = json.load(j)
-IDS = [i for i in IDS_ if i.split("-")[0] in MESH_DICT ]
+IDS = [i for i in IDS_ if i.replace("MPD_", "").split("-")[0] in MESH_DICT ]
 # filter id that already run
 # pat = config['HBCGM']['WORKSPACE'] + "MPD_{ids}_indel.results.mesh.txt"
 # IDS = [mnum for mnum in IDS if not os.path.exists(pat.format(ids = mnum))]
@@ -62,26 +62,26 @@ rule target:
 #     ouput: os.path.join(OUTPUT_DIR, "mpd.ids.txt")
 #     shell: 
 #         "cut -d, -f1 {input} | uniq | sed '1d' > {output.txt}"
-rule traits: 
-    output: temp(expand("MPD_{ids}/strain.{ids}.temp", ids=IDS))
-    run:
-        for out in output:
-            shell("touch %s"%out)
+# rule traits: 
+#     output: temp(expand("MPD_{ids}/strain.{ids}.temp", ids=IDS))
+#     run:
+#         for out in output:
+#             shell("touch %s"%out)
 
-rule strain2trait:
-    input: 
-        strain = STRAIN_ANNO,
-        ids = "MPD_{ids}/strain.{ids}.temp"
-    output: 
-        "MPD_{ids}/strain.{ids}.txt",
-        "MPD_{ids}/trait.{ids}.txt",
-    params:
-        trait = TRAIT_DATA,
-        outdir = config['HBCGM']['WORKSPACE'],
-        traitid = "{ids}",
-        rawdata = config['HBCGM']['USE_RAWDATA']
-    script:
-        "../scripts/strain2traits.py"
+# rule strain2trait:
+#     input: 
+#         strain = STRAIN_ANNO,
+#         ids = "MPD_{ids}/strain.{ids}.temp"
+#     output: 
+#         "MPD_{ids}/strain.{ids}.txt",
+#         "MPD_{ids}/trait.{ids}.txt",
+#     params:
+#         trait = TRAIT_DATA,
+#         outdir = config['HBCGM']['WORKSPACE'],
+#         traitid = "{ids}",
+#         rawdata = config['HBCGM']['USE_RAWDATA']
+#     script:
+#         "../scripts/strain2traits.py"
 
 rule strainOrder:
     output: "strain.order.snpdb.txt"

diff --git a/workflows/haplomap.smk b/workflows/haplomap.smk
@@ -37,7 +37,7 @@ with open(TRAIT_IDS, 'r') as t:
 # filter id in MeSH
 with open(MPD2MeSH, 'r') as j:
     MESH_DICT = json.load(j)
-IDS = [i for i in IDS_ if i.split("-")[0] in MESH_DICT ]
+IDS = [i for i in IDS_ if i.replace("MPD_","").split("-")[0] in MESH_DICT ]
 # filter id that already run
 pat = config['HBCGM']['WORKSPACE'] + "MPD_{ids}_snp.results.txt"
 IDS = [mnum for mnum in IDS if not os.path.exists(pat.format(ids = mnum))]
@@ -47,39 +47,39 @@ CHROMSOME = [str(i) for i in range (1, 20)] + ['X'] # NO 'Y'
 # SNPDB = expand("SNPs/chr{i}.txt", i=CHROMOSOMES)
 HBCGM =  expand("MPD_{ids}/chr{i}.snp.results.txt", ids=IDS, i=CHROMSOME)
 HBCGM_NONCODING = expand("MPD_{ids}/chr{i}.open_region.bed", ids=IDS, i=CHROMSOME)
-HBCGM_MESH = expand("MPD_{ids}_snp.results.mesh.txt", ids=IDS)
+# HBCGM_MESH = expand("MPD_{ids}_snp.results.mesh.txt", ids=IDS)
 # rules that not work in a new node
 #localrules: target, traits, strain2trait  
 
 
 rule target:
-    input: HBCGM_MESH, #HBCGM_NONCODING
+    input: HBCGM, #HBCGM_NONCODING
 
 ########################### Prepare Phenotypic DATA (from MPD API) ############################
 # rule pheno:
 #     input: TRAIT_DATA
 #     ouput: os.path.join(OUTPUT_DIR, "mpd.ids.txt")
 #     shell: 
 #         "cut -d, -f1 {input} | uniq | sed '1d' > {output.txt}"
-rule traits: 
-    output: temp(expand("MPD_{ids}/strain.{ids}.temp", ids=IDS))
-    run:
-        for out in output:
-            shell("touch %s"%out)
+# rule traits: 
+#     output: temp(expand("MPD_{ids}/strain.{ids}.temp", ids=IDS))
+#     run:
+#         for out in output:
+#             shell("touch %s"%out)
 
-rule strain2trait:
-    input: 
-        strain = STRAIN_ANNO,
-        ids = "MPD_{ids}/strain.{ids}.temp"
-    output: 
-        "MPD_{ids}/trait.{ids}.txt",
-    params:
-        trait = TRAIT_DATA,
-        outdir = config['HBCGM']['WORKSPACE'],
-        traitid = "{ids}",
-        rawdata = config['HBCGM']['USE_RAWDATA']
-    script:
-        "../scripts/strain2traits.py"
+# rule strain2trait:
+#     input: 
+#         strain = STRAIN_ANNO,
+#         ids = "MPD_{ids}/strain.{ids}.temp"
+#     output: 
+#         "MPD_{ids}/trait.{ids}.txt",
+#     params:
+#         trait = TRAIT_DATA,
+#         outdir = config['HBCGM']['WORKSPACE'],
+#         traitid = "{ids}",
+#         rawdata = config['HBCGM']['USE_RAWDATA']
+#     script:
+#         "../scripts/strain2traits.py"
 
 
 ############################ Convert VCF to niehs, tped, tfam  ########################################