Skip to content

Commit

Permalink
[scripts] Add tredprepare.py
Browse files Browse the repository at this point in the history
  • Loading branch information
tanghaibao committed Oct 10, 2017
1 parent 368c406 commit 2903f22
Show file tree
Hide file tree
Showing 4 changed files with 145 additions and 2 deletions.
28 changes: 28 additions & 0 deletions sites/PIEZO1.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
{
"PIEZO1": {
"allele_freq": "",
"cutoff_prerisk": 7,
"cutoff_risk": 7,
"gene_location": "chr16:88715338-88785211:-",
"gene_name": "PIEZO1",
"gene_part": "coding region",
"id": "chr16_88733965_TCC",
"inheritance": "AD",
"motif": "CAG",
"mutation_nature": "decrease",
"normal": "8 repeats",
"omim_id": "",
"prefix": "AGCCCCTCGTCCCTGGAG",
"prerisk": "7 repeats",
"repeat": "TCC",
"repeat_location": "chr16:88733965-88733988",
"repeat_location.hg19": "chr16:88800373-88800396",
"risk": "7- repeats",
"src": "wikipedia",
"suffix": "TGCTGCTGCTGCTGATGC",
"symptom": "Common PIEZO1 allele in African populations causes red blood cell dehydration and attenuates Plasmodium infection.",
"title": "",
"url": "",
"variant_type": "short tandem repeats"
}
}
22 changes: 20 additions & 2 deletions tredparse/meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,19 +13,22 @@

import json
import pandas as pd
from glob import glob

from .utils import datafile
from .utils import byteify, datafile


REF = "hg38"
REPO = datafile("TREDs.meta.csv")
ALTS = datafile("TREDs.alts.csv")
HLI_BAMS = datafile("HLI_bams.csv.gz")
#SITES = datafile("sites")
SITES = "sites"


class TREDsRepo(dict):

def __init__(self, ref=REF, toy=False):
def __init__(self, ref=REF, toy=False, use_sites=True):

# Parse ALTS first
alts = self.get_alts(ref)
Expand All @@ -37,6 +40,15 @@ def __init__(self, ref=REF, toy=False):
self.names.append(name)
self.df = df

# Get all user loci names by searching within sites folder
if use_sites:
for sites in glob("{}/*.json".format(SITES)):
sites = byteify(json.load(open(sites)))
for name, row in sites.items():
name = str(name) # json.loads gets a unicode
self[name] = TRED(name, row, ref=ref, alt=alts.get(name, []))
self.names.append(name)

if toy:
tr = self.get("HD")
tr.name = "toy"
Expand Down Expand Up @@ -82,6 +94,12 @@ def get_alts(self, ref):
alts[name] = regions
return alts

def create_tred(self):
""" Make a dictionary keyed by name into another dictionary keyed by the
dataframe columns.
"""
return dict((x, "") for x in self.df.columns)


class TRED(object):

Expand Down
17 changes: 17 additions & 0 deletions tredparse/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,3 +178,20 @@ def sh(cmd, infile=None, outfile=None, errfile=None,
logger.debug(cmd)

return call(cmd, shell=True, executable=shell)


def byteify(input):
""" Python's json.loads returns unicode which I sometimes need them to be str
Borrowed from:
<https://stackoverflow.com/questions/956867/how-to-get-string-objects-instead-of-unicode-from-json>
"""
if isinstance(input, dict):
return {byteify(key): byteify(value)
for key, value in input.iteritems()}
elif isinstance(input, list):
return [byteify(element) for element in input]
elif isinstance(input, unicode):
return input.encode('utf-8')
else:
return input
80 changes: 80 additions & 0 deletions tredprepare.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-

"""
Copyright (c) 2015-2017 Human Longevity Inc.
Author: Haibao Tang <[email protected]>
License: Non-Commercial Use Only. For details, see `LICENSE` file
Prepare details of a STR locus for computation with TREDPARSE.
The script progresses by asking a few key questions:
- Locus name
- Motif
- Expected number of motifs
- Chromosomal start location
"""

import sys
import json

from pyfaidx import Fasta
from tredparse.jcvi import mkdir
from tredparse.meta import TREDsRepo


def survey_var(s, cast=str, default=""):
""" Convenience function to get variable from command line survey.
"""
res = raw_input("{} [{}]: ".format(s, default))
if res == "":
res = default
return cast(res)


def main():
# Conduct survey
name = survey_var("Enter the locus name", default="HD")
repeat = survey_var("Sequence motif", default="CAG")
nrepeat = survey_var("Number of motifs in reference", cast=int, default=19)
location = survey_var("Chromosomal start location (1-based)",
default="chr4:3074877")
ref = survey_var("Enter the FASTA path", default="/mnt/ref/hg38.upper.fa")

# Extract sequences
f = Fasta(ref)
c, start = location.split(":")
start = int(start)
size = len(repeat) * nrepeat
end = start + size - 1
tract = f[c][start - 1: end].seq.lower()
FLANK = 18
prefix = f[c][start - FLANK - 1: start - 1].seq.upper()
suffix = f[c][end: end + FLANK].seq.upper()
print >> sys.stderr, "|".join((prefix, tract, suffix))

# Populate the tred
repo = TREDsRepo()
s = repo.create_tred()
s["id"] = "{}_{}_{}".format(c, start, repeat)
s["prefix"] = prefix
s["suffix"] = suffix
s["repeat"] = "CAG"
s["repeat_location"] = "{}:{}-{}".format(c, start, end)
tred = { name: s }

# Serialize
mkdir("sites")
outfile = "sites/{}.json".format(name)
fw = open(outfile, "w")
print >> fw, json.dumps(tred, sort_keys=True, indent=2)
fw.close()

print >> sys.stderr
print >> sys.stderr, "Template json file is written to `{}`".format(outfile)
print >> sys.stderr, "Please manually fill in the remaining details"


if __name__ == '__main__':
main()

0 comments on commit 2903f22

Please sign in to comment.