From 2903f226267b0f5bec0de437e4358103349f7c13 Mon Sep 17 00:00:00 2001 From: Haibao Tang Date: Mon, 9 Oct 2017 17:46:58 -0700 Subject: [PATCH] [scripts] Add tredprepare.py --- sites/PIEZO1.json | 28 ++++++++++++++++ tredparse/meta.py | 22 +++++++++++-- tredparse/utils.py | 17 ++++++++++ tredprepare.py | 80 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 145 insertions(+), 2 deletions(-) create mode 100644 sites/PIEZO1.json create mode 100755 tredprepare.py diff --git a/sites/PIEZO1.json b/sites/PIEZO1.json new file mode 100644 index 0000000..bd90c6a --- /dev/null +++ b/sites/PIEZO1.json @@ -0,0 +1,28 @@ +{ + "PIEZO1": { + "allele_freq": "", + "cutoff_prerisk": 7, + "cutoff_risk": 7, + "gene_location": "chr16:88715338-88785211:-", + "gene_name": "PIEZO1", + "gene_part": "coding region", + "id": "chr16_88733965_TCC", + "inheritance": "AD", + "motif": "CAG", + "mutation_nature": "decrease", + "normal": "8 repeats", + "omim_id": "", + "prefix": "AGCCCCTCGTCCCTGGAG", + "prerisk": "7 repeats", + "repeat": "TCC", + "repeat_location": "chr16:88733965-88733988", + "repeat_location.hg19": "chr16:88800373-88800396", + "risk": "7- repeats", + "src": "wikipedia", + "suffix": "TGCTGCTGCTGCTGATGC", + "symptom": "Common PIEZO1 allele in African populations causes red blood cell dehydration and attenuates Plasmodium infection.", + "title": "", + "url": "", + "variant_type": "short tandem repeats" + } +} diff --git a/tredparse/meta.py b/tredparse/meta.py index 20fe802..845f1a5 100644 --- a/tredparse/meta.py +++ b/tredparse/meta.py @@ -13,19 +13,22 @@ import json import pandas as pd +from glob import glob -from .utils import datafile +from .utils import byteify, datafile REF = "hg38" REPO = datafile("TREDs.meta.csv") ALTS = datafile("TREDs.alts.csv") HLI_BAMS = datafile("HLI_bams.csv.gz") +#SITES = datafile("sites") +SITES = "sites" class TREDsRepo(dict): - def __init__(self, ref=REF, toy=False): + def __init__(self, ref=REF, toy=False, use_sites=True): # Parse ALTS first alts = self.get_alts(ref) @@ -37,6 +40,15 @@ def __init__(self, ref=REF, toy=False): self.names.append(name) self.df = df + # Get all user loci names by searching within sites folder + if use_sites: + for sites in glob("{}/*.json".format(SITES)): + sites = byteify(json.load(open(sites))) + for name, row in sites.items(): + name = str(name) # json.loads gets a unicode + self[name] = TRED(name, row, ref=ref, alt=alts.get(name, [])) + self.names.append(name) + if toy: tr = self.get("HD") tr.name = "toy" @@ -82,6 +94,12 @@ def get_alts(self, ref): alts[name] = regions return alts + def create_tred(self): + """ Make a dictionary keyed by name into another dictionary keyed by the + dataframe columns. + """ + return dict((x, "") for x in self.df.columns) + class TRED(object): diff --git a/tredparse/utils.py b/tredparse/utils.py index 67ed461..c80f1de 100644 --- a/tredparse/utils.py +++ b/tredparse/utils.py @@ -178,3 +178,20 @@ def sh(cmd, infile=None, outfile=None, errfile=None, logger.debug(cmd) return call(cmd, shell=True, executable=shell) + + +def byteify(input): + """ Python's json.loads returns unicode which I sometimes need them to be str + + Borrowed from: + + """ + if isinstance(input, dict): + return {byteify(key): byteify(value) + for key, value in input.iteritems()} + elif isinstance(input, list): + return [byteify(element) for element in input] + elif isinstance(input, unicode): + return input.encode('utf-8') + else: + return input diff --git a/tredprepare.py b/tredprepare.py new file mode 100755 index 0000000..b2269ef --- /dev/null +++ b/tredprepare.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- + +""" +Copyright (c) 2015-2017 Human Longevity Inc. + +Author: Haibao Tang +License: Non-Commercial Use Only. For details, see `LICENSE` file + +Prepare details of a STR locus for computation with TREDPARSE. + +The script progresses by asking a few key questions: +- Locus name +- Motif +- Expected number of motifs +- Chromosomal start location +""" + +import sys +import json + +from pyfaidx import Fasta +from tredparse.jcvi import mkdir +from tredparse.meta import TREDsRepo + + +def survey_var(s, cast=str, default=""): + """ Convenience function to get variable from command line survey. + """ + res = raw_input("{} [{}]: ".format(s, default)) + if res == "": + res = default + return cast(res) + + +def main(): + # Conduct survey + name = survey_var("Enter the locus name", default="HD") + repeat = survey_var("Sequence motif", default="CAG") + nrepeat = survey_var("Number of motifs in reference", cast=int, default=19) + location = survey_var("Chromosomal start location (1-based)", + default="chr4:3074877") + ref = survey_var("Enter the FASTA path", default="/mnt/ref/hg38.upper.fa") + + # Extract sequences + f = Fasta(ref) + c, start = location.split(":") + start = int(start) + size = len(repeat) * nrepeat + end = start + size - 1 + tract = f[c][start - 1: end].seq.lower() + FLANK = 18 + prefix = f[c][start - FLANK - 1: start - 1].seq.upper() + suffix = f[c][end: end + FLANK].seq.upper() + print >> sys.stderr, "|".join((prefix, tract, suffix)) + + # Populate the tred + repo = TREDsRepo() + s = repo.create_tred() + s["id"] = "{}_{}_{}".format(c, start, repeat) + s["prefix"] = prefix + s["suffix"] = suffix + s["repeat"] = "CAG" + s["repeat_location"] = "{}:{}-{}".format(c, start, end) + tred = { name: s } + + # Serialize + mkdir("sites") + outfile = "sites/{}.json".format(name) + fw = open(outfile, "w") + print >> fw, json.dumps(tred, sort_keys=True, indent=2) + fw.close() + + print >> sys.stderr + print >> sys.stderr, "Template json file is written to `{}`".format(outfile) + print >> sys.stderr, "Please manually fill in the remaining details" + + +if __name__ == '__main__': + main()