[scripts] Add tredprepare.py

humanlongevity · Oct 10, 2017 · 2903f22 · 2903f22
1 parent 368c406
commit 2903f22
Show file tree

Hide file tree

Showing 4 changed files with 145 additions and 2 deletions.
diff --git a/sites/PIEZO1.json b/sites/PIEZO1.json
@@ -0,0 +1,28 @@
+{
+  "PIEZO1": {
+    "allele_freq": "",
+    "cutoff_prerisk": 7,
+    "cutoff_risk": 7,
+    "gene_location": "chr16:88715338-88785211:-",
+    "gene_name": "PIEZO1",
+    "gene_part": "coding region",
+    "id": "chr16_88733965_TCC",
+    "inheritance": "AD",
+    "motif": "CAG",
+    "mutation_nature": "decrease",
+    "normal": "8 repeats",
+    "omim_id": "",
+    "prefix": "AGCCCCTCGTCCCTGGAG",
+    "prerisk": "7 repeats",
+    "repeat": "TCC",
+    "repeat_location": "chr16:88733965-88733988",
+    "repeat_location.hg19": "chr16:88800373-88800396",
+    "risk": "7- repeats",
+    "src": "wikipedia",
+    "suffix": "TGCTGCTGCTGCTGATGC",
+    "symptom": "Common PIEZO1 allele in African populations causes red blood cell dehydration and attenuates Plasmodium infection.",
+    "title": "",
+    "url": "",
+    "variant_type": "short tandem repeats"
+  }
+}
diff --git a/tredparse/meta.py b/tredparse/meta.py
@@ -13,19 +13,22 @@
 
 import json
 import pandas as pd
+from glob import glob
 
-from .utils import datafile
+from .utils import byteify, datafile
 
 
 REF = "hg38"
 REPO = datafile("TREDs.meta.csv")
 ALTS = datafile("TREDs.alts.csv")
 HLI_BAMS = datafile("HLI_bams.csv.gz")
+#SITES = datafile("sites")
+SITES = "sites"
 
 
 class TREDsRepo(dict):
 
-    def __init__(self, ref=REF, toy=False):
+    def __init__(self, ref=REF, toy=False, use_sites=True):
 
         # Parse ALTS first
         alts = self.get_alts(ref)
@@ -37,6 +40,15 @@ def __init__(self, ref=REF, toy=False):
             self.names.append(name)
         self.df = df
 
+        # Get all user loci names by searching within sites folder
+        if use_sites:
+            for sites in glob("{}/*.json".format(SITES)):
+                sites = byteify(json.load(open(sites)))
+                for name, row in sites.items():
+                    name = str(name)  # json.loads gets a unicode
+                    self[name] = TRED(name, row, ref=ref, alt=alts.get(name, []))
+                    self.names.append(name)
+
         if toy:
             tr = self.get("HD")
             tr.name = "toy"
@@ -82,6 +94,12 @@ def get_alts(self, ref):
             alts[name] = regions
         return alts
 
+    def create_tred(self):
+        """ Make a dictionary keyed by name into another dictionary keyed by the
+        dataframe columns.
+        """
+        return dict((x, "") for x in self.df.columns)
+
 
 class TRED(object):
 

diff --git a/tredparse/utils.py b/tredparse/utils.py
@@ -178,3 +178,20 @@ def sh(cmd, infile=None, outfile=None, errfile=None,
         logger.debug(cmd)
 
     return call(cmd, shell=True, executable=shell)
+
+
+def byteify(input):
+    """ Python's json.loads returns unicode which I sometimes need them to be str
+
+    Borrowed from:
+    <https://stackoverflow.com/questions/956867/how-to-get-string-objects-instead-of-unicode-from-json>
+    """
+    if isinstance(input, dict):
+        return {byteify(key): byteify(value)
+                for key, value in input.iteritems()}
+    elif isinstance(input, list):
+        return [byteify(element) for element in input]
+    elif isinstance(input, unicode):
+        return input.encode('utf-8')
+    else:
+        return input
diff --git a/tredprepare.py b/tredprepare.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python
+# -*- coding: UTF-8 -*-
+
+"""
+Copyright (c) 2015-2017 Human Longevity Inc.
+
+Author: Haibao Tang <[email protected]>
+License: Non-Commercial Use Only. For details, see `LICENSE` file
+
+Prepare details of a STR locus for computation with TREDPARSE.
+
+The script progresses by asking a few key questions:
+- Locus name
+- Motif
+- Expected number of motifs
+- Chromosomal start location
+"""
+
+import sys
+import json
+
+from pyfaidx import Fasta
+from tredparse.jcvi import mkdir
+from tredparse.meta import TREDsRepo
+
+
+def survey_var(s, cast=str, default=""):
+    """ Convenience function to get variable from command line survey.
+    """
+    res = raw_input("{} [{}]: ".format(s, default))
+    if res == "":
+        res = default
+    return cast(res)
+
+
+def main():
+    # Conduct survey
+    name = survey_var("Enter the locus name", default="HD")
+    repeat = survey_var("Sequence motif", default="CAG")
+    nrepeat = survey_var("Number of motifs in reference", cast=int, default=19)
+    location = survey_var("Chromosomal start location (1-based)",
+                           default="chr4:3074877")
+    ref = survey_var("Enter the FASTA path", default="/mnt/ref/hg38.upper.fa")
+
+    # Extract sequences
+    f = Fasta(ref)
+    c, start = location.split(":")
+    start = int(start)
+    size = len(repeat) * nrepeat
+    end = start + size - 1
+    tract = f[c][start - 1: end].seq.lower()
+    FLANK = 18
+    prefix = f[c][start - FLANK - 1: start - 1].seq.upper()
+    suffix = f[c][end: end + FLANK].seq.upper()
+    print >> sys.stderr, "|".join((prefix, tract, suffix))
+
+    # Populate the tred
+    repo = TREDsRepo()
+    s = repo.create_tred()
+    s["id"] = "{}_{}_{}".format(c, start, repeat)
+    s["prefix"] = prefix
+    s["suffix"] = suffix
+    s["repeat"] = "CAG"
+    s["repeat_location"] = "{}:{}-{}".format(c, start, end)
+    tred = { name: s }
+
+    # Serialize
+    mkdir("sites")
+    outfile = "sites/{}.json".format(name)
+    fw = open(outfile, "w")
+    print >> fw, json.dumps(tred, sort_keys=True, indent=2)
+    fw.close()
+
+    print >> sys.stderr
+    print >> sys.stderr, "Template json file is written to `{}`".format(outfile)
+    print >> sys.stderr, "Please manually fill in the remaining details"
+
+
+if __name__ == '__main__':
+    main()