diff --git a/.travis.yml b/.travis.yml
index 9745db7..db969b3 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -31,6 +31,6 @@ install:
 
 # command to run tests
 script:
-  - for VERSION in $PYTHON_VERSIONS ; do source activate $VERSION ; done
+  - for VERSION in $PYTHON_VERSIONS ; do source activate $VERSION ; py.test tests.py ; done
 
 sudo: false
diff --git a/README.md b/README.md
index 7b53d3f..c42d89d 100755
--- a/README.md
+++ b/README.md
@@ -92,7 +92,7 @@ A `.report.txt` file will also be generated that contains a summary of number
 of people affected by over-expanded TREDs as well as population allele
 frequency.
 
-To better understand the uncertainties in the prediction, one call plot the
+To better understand the uncertainties in the prediction, we can plot the
 likelihood surface based on the model. Using the same example as above at the
 Huntington disease case, we can run a command on the JSON output, with option
 `--tred HD` to specify the locus.
diff --git a/tests.py b/tests.py
new file mode 100644
index 0000000..09b2966
--- /dev/null
+++ b/tests.py
@@ -0,0 +1,23 @@
+#!/usr/bin/env python
+# -*- coding: UTF-8 -*-
+
+
+def test_tred():
+    """ Run tred.py on sample CSV file and generate TSV file with the genotype
+    """
+    from tredparse.tred import main
+    main(["tests/samples.csv", "--workdir", "work"])
+
+
+def test_tredreport():
+    """ Highlight the potential risk individuals
+    """
+    from tredparse.tredreport import main
+    main(["work/t001.json", "work/t002.json", "--tsv", "work.tsv"])
+
+
+def test_tredplot():
+    """ Plot the likelihood surface based on the model
+    """
+    from tredparse.tredplot import likelihood
+    likelihood(["work/t001.json", "--tred", "HD"])
diff --git a/tred.py b/tred.py
index c0ca6ed..18ebfa6 100755
--- a/tred.py
+++ b/tred.py
@@ -12,515 +12,9 @@ data.
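# A hedged sketch of the command the README paragraph above refers to: the likelihood
# plot is drawn from a JSON call file, either through the package entry point (exactly
# what test_tredplot() above does) or through the thin tredplot.py wrapper. The path
# work/t001.json is the test fixture named above, not a file shipped in this patch;
# the output image name is derived from the JSON basename inside likelihood() below.
#
#   python tredplot.py likelihood work/t001.json --tred HD
#
from tredparse.tredplot import likelihood
likelihood(["work/t001.json", "--tred", "HD"])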
""" -import argparse -import shutil -import os -import os.path as op -import json -import gzip import sys -import time -import logging - -from tredparse import __version__ -from tredparse.utils import DefaultHelpParser, InputParams, \ - mkdir, ls_s3, push_to_s3 -from tredparse.bam_parser import BamDepth, BamReadLen, BamParser, \ - BamParserResults, SPAN, read_alignment -from tredparse.models import IntegratedCaller -from tredparse.meta import TREDsRepo -from datetime import datetime as dt, timedelta -from multiprocessing import Pool, cpu_count - -logging.basicConfig() -logger = logging.getLogger(__name__) - - -INFO = """##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -""" - - -def set_argparse(): - TRED_NAMES = TREDsRepo().names - - p = DefaultHelpParser(description=__doc__, prog=op.basename(__file__), - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - p.add_argument('infile', nargs='?', help="Input path (BAM, list of BAMs, or csv format)") - p.add_argument('--ref', help='Reference genome version', - choices=("hg38", "hg38_nochr", "hg19", "hg19_nochr"), default='hg38') - p.add_argument('--tred', help='STR disorder, default is to run all', - action='append', choices=sorted(TRED_NAMES), default=None) - p.add_argument('--haploid', help='Treat these chromosomes as haploid', action='append') - p.add_argument('--useclippedreads', default=False, action="store_true", - help='Include clipped reads in inference') - p.add_argument('--noalts', default=False, action="store_true", - help='Do not scan extra sites for mismapped reads, '\ - 'faster but less accurate') - p.add_argument('--norepeatpairs', default=False, action="store_true", - help='Exclude pairs of repeat-only reads from evidence') - p.add_argument('--log', choices=("INFO", "DEBUG"), default="INFO", - help='Print debug logs, DEBUG=verbose') - p.add_argument('--version', action='version', version="%(prog)s " + __version__) - p.add_argument('--toy', help=argparse.SUPPRESS, action="store_true") - - g = p.add_argument_group("Performance options") - g.add_argument('--cpus', help='Number of CPUs to use', type=int, default=cpu_count()) - g.add_argument('--maxinsert', default=300, type=int, - help="Maximum number of repeats") - g.add_argument('--fullsearch', default=False, action="store_true", - help="Full grid search, could be slow") - - g = p.add_argument_group("I/O options") - g.add_argument("--workdir", default=os.getcwd(), help="Specify work dir") - g.add_argument('--cleanup', default=False, action="store_true", - help="Cleanup the workdir after done") - g.add_argument('--checkexists', default=False, action="store_true", - help="Do not run if JSON output exists") - g.add_argument('--no-output', default=False, action="store_true", - help="Do not write JSON and VCF output") - set_aws_opts(p) - return p - - -def set_aws_opts(p): - g = p.add_argument_group("AWS and Docker options") - # https://github.com/hlids/infrastructure/wiki/Docker-calling-convention - g.add_argument("--sample_id", help="Sample ID") - g.add_argument("--workflow_execution_id", help="Workflow execution ID") - g.add_argument("--input_bam_path", help="Input s3 path, override infile") - g.add_argument("--output_path", help="Output s3 path") - - -def bam_path(bam): - ''' - Is the file network-based? 
- ''' - if bam.startswith("s3://") or bam.startswith("http://") or \ - bam.startswith("ftp://") or bam.startswith("https://"): - return bam - else: - return op.abspath(bam) - - -def check_bam(bam): - # Check indices - remove if found, otherwise distinct bams may try to use - # the previous index, leading to an error - bbam = op.basename(bam) - baifile = bbam + ".bai" - altbaifile = bbam.rsplit(".", 1)[0] + ".bai" - - for bai in (baifile, altbaifile): - if op.exists(bai): - logger.debug("Remove index `{}`".format(bai)) - os.remove(bai) - - # Does the file exist? - logger.debug("Working on `{}`".format(bam)) - try: - read_alignment(bam) - except (IOError, ValueError) as e: - logger.error("Cannot retrieve file `{}` ({})".format(bam, e)) - return None - - return bam - - -def counter_s(c): - return ";".join(["{}|{}".format(k, int(v)) for k, v in sorted(c.items())]) - - -def runBam(inputParams): - ''' - Convenience function: parses one bam and runs all callers on it - :param inputParams: InputParams - :return: BamParserResult - ''' - maxinsert = inputParams.kwargs["maxinsert"] - fullsearch = inputParams.kwargs["fullsearch"] - bp = BamParser(inputParams) - bp.parse() - - # find the integrated likelihood calls - integratedCaller = IntegratedCaller(bp, maxinsert=maxinsert, - fullsearch=fullsearch) - integratedCaller.call(**inputParams.kwargs) - - return BamParserResults(inputParams, bp, integratedCaller) - - -def cleanup(cwd, samplekey): - """ - Change back to the parent folder and remove the samplekey folder after done - """ - os.chdir(cwd) - shutil.rmtree(samplekey) - - -def run(arg): - ''' - Run Tred Caller on a list of treds - :param bam: path to bam file - :param: referenceVersion, hg19 or hg38 - :return: dict of calls - ''' - samplekey, bam, repo, tredNames, maxinsert, fullsearch, clip, alts, \ - repeatpairs, log = arg - cwd = os.getcwd() - mkdir(samplekey) - os.chdir(samplekey) - gender = 'Unknown' - ydepth = -1 - - tredCalls = {"inferredGender": gender, "depthY": ydepth} - if check_bam(bam) is None: - cleanup(cwd, samplekey) - return {'samplekey': samplekey, 'bam': bam, 'tredCalls': tredCalls} - - # Infer gender based on depth on chrY - if any(repo[tred].is_xlinked for tred in tredNames): - try: - bd = BamDepth(bam, repo.ref, logger) - ydepth = bd.get_Y_depth() - if ydepth > 1: - gender = 'Male' - else: - gender = 'Female' - except: - pass - logger.debug("Inferred gender: {} (depthY={})".format(gender, ydepth)) - tredCalls["inferredGender"] = gender - tredCalls["depthY"] = ydepth - - # Get read length - READLEN = 150 - try: - brl = BamReadLen(bam, logger) - READLEN = brl.readlen - except: - pass - logger.debug("Read length: {}bp".format(READLEN)) - tredCalls["readLen"] = READLEN - - for tred in tredNames: - # Infer local read depth - bd = BamDepth(bam, repo.ref, logger) - xtred = repo[tred] - WINDOW_START = max(0, xtred.repeat_start - SPAN) - WINDOW_END = xtred.repeat_end + SPAN - try: - depth = bd.region_depth(xtred.chr, WINDOW_START, WINDOW_END) - except Exception as e: - depth = 30 - logger.error("Exception on `{}` {} ({}). 
Set depth={}"\ - .format(bam, tred, e, depth)) - - logger.debug("Inferred depth at locus {}: {}".format(tred, depth)) - ip = InputParams(bam=bam, READLEN=READLEN, tredName=tred, - repo=repo, maxinsert=maxinsert, fullsearch=fullsearch, - gender=gender, depth=depth, clip=clip, alts=alts, - repeatpairs=repeatpairs, log=log) - - #tpResult = runBam(ip) - try: - tpResult = runBam(ip) - except Exception as e: - logger.error("Exception on `{}` {} ({})".format(bam, tred, e)) - continue - - alleles = tpResult.alleles - tredCalls[tred + ".1"] = alleles[0] # .1 is the shorter allele - tredCalls[tred + ".2"] = alleles[1] # .2 is the longer allele - tredCalls[tred + ".FR"] = counter_s(tpResult.counts["FULL"]) - tredCalls[tred + ".PR"] = counter_s(tpResult.counts["PREF"]) - tredCalls[tred + ".RR"] = counter_s(tpResult.counts["REPT"]) - tredCalls[tred + ".DP"] = depth # Average read depth - tredCalls[tred + ".FDP"] = tpResult.FDP # Full spanning depth - tredCalls[tred + ".PDP"] = tpResult.PDP # Partial depth - tredCalls[tred + ".RDP"] = tpResult.RDP # Repeat read depth - tredCalls[tred + ".PEDP"] = tpResult.PEDP # PE depth - tredCalls[tred + ".PEG"] = tpResult.PEG # PE global estimate - tredCalls[tred + ".PET"] = tpResult.PET # PE target estimate - tredCalls[tred + ".CI"] = tpResult.CI # Confidence interval - tredCalls[tred + ".PP"] = tpResult.PP # Prob(disease) - tredCalls[tred + ".label"] = tpResult.label # Disease status - - # Following output are relatively big array of numbers that mostly - # specify probability distribution, only available in JSON output - tredCalls[tred + ".details"] = tpResult.details - tredCalls[tred + ".P_h1"] = tpResult.P_h1 - tredCalls[tred + ".P_h2"] = tpResult.P_h2 - tredCalls[tred + ".P_h1h2"] = tpResult.P_h1h2 - tredCalls[tred + ".P_PEG"] = tpResult.P_PEG - tredCalls[tred + ".P_PET"] = tpResult.P_PET - - cleanup(cwd, samplekey) - return {'samplekey': samplekey, 'bam': bam, 'tredCalls': tredCalls} - - -def vcfstanza(sampleid, bam, tredCalls, ref): - # VCF spec - m = "##fileformat=VCFv4.1\n" - m += "##fileDate={}{:02d}{:02d}\n".format(dt.now().year, dt.now().month, dt.now().day) - m += "##source={} {}\n".format(__file__, bam) - m += "##reference={}\n".format(ref) - m += "##inferredGender={} depthY={}\n".format( - tredCalls["inferredGender"], tredCalls["depthY"]) - m += "##readLen={}bp\n".format(tredCalls["readLen"]) - m += INFO - header = "CHROM POS ID REF ALT QUAL FILTER INFO FORMAT\n".split() + [sampleid] - m += "#" + "\t".join(header) - return m - - -def to_json(results, ref, repo, treds=["HD"], store=None): - sampleid = results['samplekey'] - bam = results['bam'] - calls = results['tredCalls'] - if not calls: - logger.debug("No calls are found for {} `{}`".format(sampleid, bam)) - return - - jsonfile = ".".join((sampleid, "json")) - js = json.dumps(results, sort_keys=True, - indent=4, separators=(',', ': ')) - print js - fw = open(jsonfile, "w") - print >> fw, js - fw.close() - - if store: - push_to_s3(store, jsonfile) - - -def to_vcf(results, ref, repo, treds=["HD"], store=None): - registry = {} - for tred in treds: - tr = repo.get_info(tred) - registry[tred] = tr - - sampleid = results['samplekey'] - bam = results['bam'] - calls = results['tredCalls'] - - if not calls: - logger.debug("No calls are found for {} `{}`".format(sampleid, bam)) - return - - vcffile = ".".join((sampleid, "tred.vcf.gz")) - contents = [] - for tred in treds: - if tred + ".1" not in calls: - continue - a = calls[tred + ".1"] - b = calls[tred + ".2"] - chr, start, ref_copy, repeat, info = 
registry[tred] - alleles = set([a, b]) - refv = set([ref_copy]) - rpa = sorted(alleles - refv) - alt = ",".join(x * repeat for x in rpa) if (rpa and rpa[0] != -1) else "." - if rpa: - info += ";RPA={}".format(",".join((str(x) for x in rpa))) - if ref_copy in alleles: - gt = "0/1" - elif len(rpa) == 1: - gt = "1/1" - else: - gt = "1/2" - else: - gt = "0/0" - gb = "{}/{}".format(a, b) - fields = "{}:{}:{}:{}:{}:{}:{}:{}:{}:{}:{}:{:.4g}:{}".format(gt, gb, - calls[tred + ".FR"], calls[tred + ".PR"], - calls[tred + ".RR"], calls[tred + ".DP"], - calls[tred + ".FDP"], calls[tred + ".PDP"], - calls[tred + ".RDP"], calls[tred + ".PEDP"], - calls[tred + ".CI"], calls[tred + ".PP"], - calls[tred + ".label"]) - m = "\t".join(str(x) for x in ( - chr, start, tred, ref_copy * repeat, alt, ".", ".", info, - "GT:GB:FR:PR:RR:DP:FDP:PDP:RDP:PEDP:CI:PP:LABEL", fields)) - contents.append((chr, start, m)) - - fw = gzip.open(vcffile, "w") - print >> fw, vcfstanza(sampleid, bam, calls, ref) - contents.sort() - for chr, start, m in contents: - print >> fw, m - fw.close() - logger.debug("VCF file written to `{}`".format(vcffile)) - - if store: - push_to_s3(store, vcffile) - - -def get_HLI_bam(samplekey): - """ - From @176449128, retrieve the S3 path of the BAM - """ - import pandas as pd - from tredparse.meta import HLI_BAMS - - df = pd.read_csv(HLI_BAMS, index_col=0, dtype=str, header=None, - names=["SampleKey", "BAM"]) - return df.ix[samplekey]["BAM"] - - -def read_csv(csvfile, args): - # Mode 0: See if this is just a HLI id, starting with @ - if csvfile[0] == '@': - samplekey = csvfile[1:] - bam = get_HLI_bam(samplekey) - return [(samplekey, bam, None)] - - # Mode 1: See if this is just a BAM file - if csvfile.endswith(".bam") or csvfile.endswith(".cram"): - bam = csvfile - bam = bam_path(bam) - if args.workflow_execution_id and args.sample_id: - samplekey = "_".join((args.workflow_execution_id, args.sample_id)) - else: - samplekey = op.basename(bam).rsplit(".", 1)[0] - return [(samplekey, bam, None)] - - fp = open(csvfile) - # Mode 2: See if the file contains JUST list of BAM files - header = fp.next().strip() - contents = [] - if header.endswith(".bam") and header.count(",") == 0: - fp.seek(0) - for row in fp: - bam = row.strip() - bam = bam_path(bam) - samplekey = op.basename(bam).rsplit(".", 1)[0] - contents.append((samplekey, bam, None)) - return contents - - # Mode 3: Continue reading, this is a CSV file - fp.seek(0) - for row in fp: - atoms = row.strip().split(",") - samplekey, bam = atoms[:2] - tred = atoms[2] if len(atoms) == 3 else None - bam = bam_path(bam) - if bam.endswith(".bam"): - contents.append((samplekey, bam, tred)) - return contents - - -def write_vcf_json(results, ref, repo, treds, store): - try: - to_vcf(results, ref, repo, treds=treds, store=store) - to_json(results, ref, repo, treds=treds, store=store) - except Exception as e: - print >> sys.stderr, "Error writing: {} ({})".format(results, e) +from tredparse.tred import main if __name__ == '__main__': - p = set_argparse() - args = p.parse_args() - - loglevel = getattr(logging, args.log.upper(), "INFO") - logger.setLevel(loglevel) - logger.debug('Commandline Arguments:{}'.format(vars(args))) - - start = time.time() - workdir = args.workdir - cwd = os.getcwd() - - if workdir != cwd: - mkdir(workdir, logger=logger) - - infile = args.input_bam_path or args.infile - if not infile: - sys.exit(not p.print_help()) - - samples = read_csv(infile, args) - store = args.output_path - - # Check available results - if store: - computed = ls_s3(store) 
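# A hedged illustration of the inputs read_csv() above accepts: an "@" HLI sample key,
# a single BAM/CRAM path, a plain list of BAM paths, or CSV rows of "SampleKey,BAM[,TRED]".
# The snippet writes a minimal CSV of the last form; the sample keys and BAM paths are
# invented, and the optional third column restricts a sample to one locus.
rows = [
    ("t001", "/data/t001.bam", "HD"),   # genotype only the HD locus for this sample
    ("t002", "/data/t002.bam", None),   # no third column: run all TREDs
]
with open("samples.csv", "w") as fw:
    for samplekey, bam, tred in rows:
        atoms = [samplekey, bam] + ([tred] if tred else [])
        fw.write(",".join(atoms) + "\n")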
- computed = [op.basename(x).split('.')[0] for x in computed if \ - x.endswith(".tred.vcf.gz")] - remaining_samples = [x for x in samples if x[0] not in computed] - - logger.debug("Already computed on `{}`: {}".\ - format(store, len(samples) - len(remaining_samples))) - samples = remaining_samples - - logger.debug("Total samples: {}".format(len(samples))) - - task_args = [] - os.chdir(workdir) - - ref = args.ref - repo = TREDsRepo(ref=ref, toy=args.toy) - repo.set_ploidy(args.haploid) - TRED_NAMES = repo.names - treds = args.tred or TRED_NAMES - - if args.toy: - treds = ["HD"] - - samplekey_index = {} - # Parallel processing - for i, (samplekey, bam, tred) in enumerate(samples): - jsonfile = ".".join((samplekey, "json")) - if args.checkexists and op.exists(jsonfile): - logger.debug("File `{}` exists. Skipped computation."\ - .format(jsonfile)) - continue - _treds = [tred] if tred else treds - task_args.append((samplekey, bam, repo, _treds, - args.maxinsert, args.fullsearch, - args.useclippedreads, (not args.noalts), - (not args.norepeatpairs), args.log)) - samplekey_index[samplekey] = i - - cpus = min(args.cpus, len(task_args)) - if cpus == 0: - logger.debug("All jobs already completed.") - sys.exit(0) - - logger.debug("Starting {} threads for {} jobs.".format(cpus, len(task_args))) - - if cpus == 1: # Serial - for ta in task_args: - results = run(ta) - if args.no_output: - continue - write_vcf_json(results, ref, repo, treds, store) - else: - p = Pool(processes=cpus) - for results in p.imap(run, task_args): - if args.no_output: - continue - write_vcf_json(results, ref, repo, treds, store) - - print >> sys.stderr, "Elapsed time={}"\ - .format(timedelta(seconds=time.time() - start)) - os.chdir(cwd) - - if args.cleanup: - shutil.rmtree(workdir) + main(sys.argv[1:]) diff --git a/tredparse/tred.py b/tredparse/tred.py new file mode 100755 index 0000000..5674dd7 --- /dev/null +++ b/tredparse/tred.py @@ -0,0 +1,526 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- + +""" +Copyright (c) 2015-2017 Human Longevity Inc. + +Author: Haibao Tang +License: Non-Commercial Use Only. For details, see `LICENSE` file + +HLI TRED caller: parse the target regions from the input BAM file, extract full and partial +reads, build probabilistic model and calculate the most likely repeat sizes given the +data. +""" + +import argparse +import shutil +import os +import os.path as op +import json +import gzip +import sys +import time +import logging + +from . 
import __version__ +from .utils import DefaultHelpParser, InputParams, \ + mkdir, ls_s3, push_to_s3 +from .bam_parser import BamDepth, BamReadLen, BamParser, \ + BamParserResults, SPAN, read_alignment +from .models import IntegratedCaller +from .meta import TREDsRepo +from datetime import datetime as dt, timedelta +from multiprocessing import Pool, cpu_count + +logging.basicConfig() +logger = logging.getLogger(__name__) + + +INFO = """##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +""" + + +def set_argparse(): + TRED_NAMES = TREDsRepo().names + + p = DefaultHelpParser(description=__doc__, prog=op.basename(__file__), + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + p.add_argument('infile', nargs='?', help="Input path (BAM, list of BAMs, or csv format)") + p.add_argument('--ref', help='Reference genome version', + choices=("hg38", "hg38_nochr", "hg19", "hg19_nochr"), default='hg38') + p.add_argument('--tred', help='STR disorder, default is to run all', + action='append', choices=sorted(TRED_NAMES), default=None) + p.add_argument('--haploid', help='Treat these chromosomes as haploid', action='append') + p.add_argument('--useclippedreads', default=False, action="store_true", + help='Include clipped reads in inference') + p.add_argument('--noalts', default=False, action="store_true", + help='Do not scan extra sites for mismapped reads, '\ + 'faster but less accurate') + p.add_argument('--norepeatpairs', default=False, action="store_true", + help='Exclude pairs of repeat-only reads from evidence') + p.add_argument('--log', choices=("INFO", "DEBUG"), default="INFO", + help='Print debug logs, DEBUG=verbose') + p.add_argument('--version', action='version', version="%(prog)s " + __version__) + p.add_argument('--toy', help=argparse.SUPPRESS, action="store_true") + + g = p.add_argument_group("Performance options") + g.add_argument('--cpus', help='Number of CPUs to use', type=int, default=cpu_count()) + g.add_argument('--maxinsert', default=300, type=int, + help="Maximum number of repeats") + g.add_argument('--fullsearch', default=False, action="store_true", + help="Full grid search, could be slow") + + g = p.add_argument_group("I/O options") + g.add_argument("--workdir", default=os.getcwd(), help="Specify work dir") + g.add_argument('--cleanup', default=False, action="store_true", + help="Cleanup the workdir after done") + g.add_argument('--checkexists', default=False, action="store_true", + help="Do not run if JSON output exists") + g.add_argument('--no-output', default=False, action="store_true", + help="Do not write JSON and VCF output") + set_aws_opts(p) + return p + + +def set_aws_opts(p): + g = p.add_argument_group("AWS and Docker options") + # https://github.com/hlids/infrastructure/wiki/Docker-calling-convention + g.add_argument("--sample_id", help="Sample ID") + g.add_argument("--workflow_execution_id", help="Workflow execution ID") + g.add_argument("--input_bam_path", help="Input s3 path, override infile") + g.add_argument("--output_path", help="Output s3 path") + + +def bam_path(bam): + ''' + Is the file network-based? 
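# A minimal sketch of driving the parser built by set_argparse() above; the flags are
# the ones registered there, while the input CSV and the work directory are assumptions.
# --tred uses action='append', so repeating the flag accumulates loci.
from tredparse.tred import main
main(["samples.csv", "--ref", "hg38", "--tred", "HD",
      "--workdir", "work", "--cpus", "2", "--checkexists"])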
+ ''' + if bam.startswith("s3://") or bam.startswith("http://") or \ + bam.startswith("ftp://") or bam.startswith("https://"): + return bam + else: + return op.abspath(bam) + + +def check_bam(bam): + # Check indices - remove if found, otherwise distinct bams may try to use + # the previous index, leading to an error + bbam = op.basename(bam) + baifile = bbam + ".bai" + altbaifile = bbam.rsplit(".", 1)[0] + ".bai" + + for bai in (baifile, altbaifile): + if op.exists(bai): + logger.debug("Remove index `{}`".format(bai)) + os.remove(bai) + + # Does the file exist? + logger.debug("Working on `{}`".format(bam)) + try: + read_alignment(bam) + except (IOError, ValueError) as e: + logger.error("Cannot retrieve file `{}` ({})".format(bam, e)) + return None + + return bam + + +def counter_s(c): + return ";".join(["{}|{}".format(k, int(v)) for k, v in sorted(c.items())]) + + +def runBam(inputParams): + ''' + Convenience function: parses one bam and runs all callers on it + :param inputParams: InputParams + :return: BamParserResult + ''' + maxinsert = inputParams.kwargs["maxinsert"] + fullsearch = inputParams.kwargs["fullsearch"] + bp = BamParser(inputParams) + bp.parse() + + # find the integrated likelihood calls + integratedCaller = IntegratedCaller(bp, maxinsert=maxinsert, + fullsearch=fullsearch) + integratedCaller.call(**inputParams.kwargs) + + return BamParserResults(inputParams, bp, integratedCaller) + + +def cleanup(cwd, samplekey): + """ + Change back to the parent folder and remove the samplekey folder after done + """ + os.chdir(cwd) + shutil.rmtree(samplekey) + + +def run(arg): + ''' + Run Tred Caller on a list of treds + :param bam: path to bam file + :param: referenceVersion, hg19 or hg38 + :return: dict of calls + ''' + samplekey, bam, repo, tredNames, maxinsert, fullsearch, clip, alts, \ + repeatpairs, log = arg + cwd = os.getcwd() + mkdir(samplekey) + os.chdir(samplekey) + gender = 'Unknown' + ydepth = -1 + + tredCalls = {"inferredGender": gender, "depthY": ydepth} + if check_bam(bam) is None: + cleanup(cwd, samplekey) + return {'samplekey': samplekey, 'bam': bam, 'tredCalls': tredCalls} + + # Infer gender based on depth on chrY + if any(repo[tred].is_xlinked for tred in tredNames): + try: + bd = BamDepth(bam, repo.ref, logger) + ydepth = bd.get_Y_depth() + if ydepth > 1: + gender = 'Male' + else: + gender = 'Female' + except: + pass + logger.debug("Inferred gender: {} (depthY={})".format(gender, ydepth)) + tredCalls["inferredGender"] = gender + tredCalls["depthY"] = ydepth + + # Get read length + READLEN = 150 + try: + brl = BamReadLen(bam, logger) + READLEN = brl.readlen + except: + pass + logger.debug("Read length: {}bp".format(READLEN)) + tredCalls["readLen"] = READLEN + + for tred in tredNames: + # Infer local read depth + bd = BamDepth(bam, repo.ref, logger) + xtred = repo[tred] + WINDOW_START = max(0, xtred.repeat_start - SPAN) + WINDOW_END = xtred.repeat_end + SPAN + try: + depth = bd.region_depth(xtred.chr, WINDOW_START, WINDOW_END) + except Exception as e: + depth = 30 + logger.error("Exception on `{}` {} ({}). 
Set depth={}"\ + .format(bam, tred, e, depth)) + + logger.debug("Inferred depth at locus {}: {}".format(tred, depth)) + ip = InputParams(bam=bam, READLEN=READLEN, tredName=tred, + repo=repo, maxinsert=maxinsert, fullsearch=fullsearch, + gender=gender, depth=depth, clip=clip, alts=alts, + repeatpairs=repeatpairs, log=log) + + #tpResult = runBam(ip) + try: + tpResult = runBam(ip) + except Exception as e: + logger.error("Exception on `{}` {} ({})".format(bam, tred, e)) + continue + + alleles = tpResult.alleles + tredCalls[tred + ".1"] = alleles[0] # .1 is the shorter allele + tredCalls[tred + ".2"] = alleles[1] # .2 is the longer allele + tredCalls[tred + ".FR"] = counter_s(tpResult.counts["FULL"]) + tredCalls[tred + ".PR"] = counter_s(tpResult.counts["PREF"]) + tredCalls[tred + ".RR"] = counter_s(tpResult.counts["REPT"]) + tredCalls[tred + ".DP"] = depth # Average read depth + tredCalls[tred + ".FDP"] = tpResult.FDP # Full spanning depth + tredCalls[tred + ".PDP"] = tpResult.PDP # Partial depth + tredCalls[tred + ".RDP"] = tpResult.RDP # Repeat read depth + tredCalls[tred + ".PEDP"] = tpResult.PEDP # PE depth + tredCalls[tred + ".PEG"] = tpResult.PEG # PE global estimate + tredCalls[tred + ".PET"] = tpResult.PET # PE target estimate + tredCalls[tred + ".CI"] = tpResult.CI # Confidence interval + tredCalls[tred + ".PP"] = tpResult.PP # Prob(disease) + tredCalls[tred + ".label"] = tpResult.label # Disease status + + # Following output are relatively big array of numbers that mostly + # specify probability distribution, only available in JSON output + tredCalls[tred + ".details"] = tpResult.details + tredCalls[tred + ".P_h1"] = tpResult.P_h1 + tredCalls[tred + ".P_h2"] = tpResult.P_h2 + tredCalls[tred + ".P_h1h2"] = tpResult.P_h1h2 + tredCalls[tred + ".P_PEG"] = tpResult.P_PEG + tredCalls[tred + ".P_PET"] = tpResult.P_PET + + cleanup(cwd, samplekey) + return {'samplekey': samplekey, 'bam': bam, 'tredCalls': tredCalls} + + +def vcfstanza(sampleid, bam, tredCalls, ref): + # VCF spec + m = "##fileformat=VCFv4.1\n" + m += "##fileDate={}{:02d}{:02d}\n".format(dt.now().year, dt.now().month, dt.now().day) + m += "##source={} {}\n".format(__file__, bam) + m += "##reference={}\n".format(ref) + m += "##inferredGender={} depthY={}\n".format( + tredCalls["inferredGender"], tredCalls["depthY"]) + m += "##readLen={}bp\n".format(tredCalls["readLen"]) + m += INFO + header = "CHROM POS ID REF ALT QUAL FILTER INFO FORMAT\n".split() + [sampleid] + m += "#" + "\t".join(header) + return m + + +def to_json(results, ref, repo, treds=["HD"], store=None): + sampleid = results['samplekey'] + bam = results['bam'] + calls = results['tredCalls'] + if not calls: + logger.debug("No calls are found for {} `{}`".format(sampleid, bam)) + return + + jsonfile = ".".join((sampleid, "json")) + js = json.dumps(results, sort_keys=True, + indent=4, separators=(',', ': ')) + print js + fw = open(jsonfile, "w") + print >> fw, js + fw.close() + + if store: + push_to_s3(store, jsonfile) + + +def to_vcf(results, ref, repo, treds=["HD"], store=None): + registry = {} + for tred in treds: + tr = repo.get_info(tred) + registry[tred] = tr + + sampleid = results['samplekey'] + bam = results['bam'] + calls = results['tredCalls'] + + if not calls: + logger.debug("No calls are found for {} `{}`".format(sampleid, bam)) + return + + vcffile = ".".join((sampleid, "tred.vcf.gz")) + contents = [] + for tred in treds: + if tred + ".1" not in calls: + continue + a = calls[tred + ".1"] + b = calls[tred + ".2"] + chr, start, ref_copy, repeat, info = 
registry[tred] + alleles = set([a, b]) + refv = set([ref_copy]) + rpa = sorted(alleles - refv) + alt = ",".join(x * repeat for x in rpa) if (rpa and rpa[0] != -1) else "." + if rpa: + info += ";RPA={}".format(",".join((str(x) for x in rpa))) + if ref_copy in alleles: + gt = "0/1" + elif len(rpa) == 1: + gt = "1/1" + else: + gt = "1/2" + else: + gt = "0/0" + gb = "{}/{}".format(a, b) + fields = "{}:{}:{}:{}:{}:{}:{}:{}:{}:{}:{}:{:.4g}:{}".format(gt, gb, + calls[tred + ".FR"], calls[tred + ".PR"], + calls[tred + ".RR"], calls[tred + ".DP"], + calls[tred + ".FDP"], calls[tred + ".PDP"], + calls[tred + ".RDP"], calls[tred + ".PEDP"], + calls[tred + ".CI"], calls[tred + ".PP"], + calls[tred + ".label"]) + m = "\t".join(str(x) for x in ( + chr, start, tred, ref_copy * repeat, alt, ".", ".", info, + "GT:GB:FR:PR:RR:DP:FDP:PDP:RDP:PEDP:CI:PP:LABEL", fields)) + contents.append((chr, start, m)) + + fw = gzip.open(vcffile, "w") + print >> fw, vcfstanza(sampleid, bam, calls, ref) + contents.sort() + for chr, start, m in contents: + print >> fw, m + fw.close() + logger.debug("VCF file written to `{}`".format(vcffile)) + + if store: + push_to_s3(store, vcffile) + + +def get_HLI_bam(samplekey): + """ + From @176449128, retrieve the S3 path of the BAM + """ + import pandas as pd + from .meta import HLI_BAMS + + df = pd.read_csv(HLI_BAMS, index_col=0, dtype=str, header=None, + names=["SampleKey", "BAM"]) + return df.ix[samplekey]["BAM"] + + +def read_csv(csvfile, args): + # Mode 0: See if this is just a HLI id, starting with @ + if csvfile[0] == '@': + samplekey = csvfile[1:] + bam = get_HLI_bam(samplekey) + return [(samplekey, bam, None)] + + # Mode 1: See if this is just a BAM file + if csvfile.endswith(".bam") or csvfile.endswith(".cram"): + bam = csvfile + bam = bam_path(bam) + if args.workflow_execution_id and args.sample_id: + samplekey = "_".join((args.workflow_execution_id, args.sample_id)) + else: + samplekey = op.basename(bam).rsplit(".", 1)[0] + return [(samplekey, bam, None)] + + fp = open(csvfile) + # Mode 2: See if the file contains JUST list of BAM files + header = fp.next().strip() + contents = [] + if header.endswith(".bam") and header.count(",") == 0: + fp.seek(0) + for row in fp: + bam = row.strip() + bam = bam_path(bam) + samplekey = op.basename(bam).rsplit(".", 1)[0] + contents.append((samplekey, bam, None)) + return contents + + # Mode 3: Continue reading, this is a CSV file + fp.seek(0) + for row in fp: + atoms = row.strip().split(",") + samplekey, bam = atoms[:2] + tred = atoms[2] if len(atoms) == 3 else None + bam = bam_path(bam) + if bam.endswith(".bam"): + contents.append((samplekey, bam, tred)) + return contents + + +def write_vcf_json(results, ref, repo, treds, store): + try: + to_vcf(results, ref, repo, treds=treds, store=store) + to_json(results, ref, repo, treds=treds, store=store) + except Exception as e: + print >> sys.stderr, "Error writing: {} ({})".format(results, e) + + +def main(args): + p = set_argparse() + args = p.parse_args(args) + + loglevel = getattr(logging, args.log.upper(), "INFO") + logger.setLevel(loglevel) + logger.debug('Commandline Arguments:{}'.format(vars(args))) + + start = time.time() + workdir = args.workdir + cwd = os.getcwd() + + if workdir != cwd: + mkdir(workdir, logger=logger) + + infile = args.input_bam_path or args.infile + if not infile: + sys.exit(not p.print_help()) + + samples = read_csv(infile, args) + store = args.output_path + + # Check available results + if store: + computed = ls_s3(store) + computed = [op.basename(x).split('.')[0] 
for x in computed if \ + x.endswith(".tred.vcf.gz")] + remaining_samples = [x for x in samples if x[0] not in computed] + + logger.debug("Already computed on `{}`: {}".\ + format(store, len(samples) - len(remaining_samples))) + samples = remaining_samples + + logger.debug("Total samples: {}".format(len(samples))) + + task_args = [] + os.chdir(workdir) + + ref = args.ref + repo = TREDsRepo(ref=ref, toy=args.toy) + repo.set_ploidy(args.haploid) + TRED_NAMES = repo.names + treds = args.tred or TRED_NAMES + + if args.toy: + treds = ["HD"] + + samplekey_index = {} + # Parallel processing + for i, (samplekey, bam, tred) in enumerate(samples): + jsonfile = ".".join((samplekey, "json")) + if args.checkexists and op.exists(jsonfile): + logger.debug("File `{}` exists. Skipped computation."\ + .format(jsonfile)) + continue + _treds = [tred] if tred else treds + task_args.append((samplekey, bam, repo, _treds, + args.maxinsert, args.fullsearch, + args.useclippedreads, (not args.noalts), + (not args.norepeatpairs), args.log)) + samplekey_index[samplekey] = i + + cpus = min(args.cpus, len(task_args)) + if cpus == 0: + logger.debug("All jobs already completed.") + sys.exit(0) + + logger.debug("Starting {} threads for {} jobs.".format(cpus, len(task_args))) + + if cpus == 1: # Serial + for ta in task_args: + results = run(ta) + if args.no_output: + continue + write_vcf_json(results, ref, repo, treds, store) + else: + p = Pool(processes=cpus) + for results in p.imap(run, task_args): + if args.no_output: + continue + write_vcf_json(results, ref, repo, treds, store) + + print >> sys.stderr, "Elapsed time={}"\ + .format(timedelta(seconds=time.time() - start)) + os.chdir(cwd) + + if args.cleanup: + shutil.rmtree(workdir) diff --git a/tredparse/tredplot.py b/tredparse/tredplot.py new file mode 100755 index 0000000..17c409f --- /dev/null +++ b/tredparse/tredplot.py @@ -0,0 +1,311 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- + +""" +Copyright (c) 2015-2017 Human Longevity Inc. + +Author: Haibao Tang +License: Non-Commercial Use Only. For details, see `LICENSE` file + +Related scripts for the HLI-STR (TREDPARSE) paper. 
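# A stripped-down sketch of the dispatch pattern used in main() above: each work item
# is a plain tuple so it pickles cleanly, a single CPU falls back to a serial loop, and
# otherwise results stream back through Pool.imap and are handled as they arrive.
# run_one() and the tuples below are placeholders, not the real task_args.
from multiprocessing import Pool

def run_one(arg):
    samplekey, bam = arg
    return {"samplekey": samplekey, "bam": bam, "tredCalls": {}}

task_args = [("t001", "/data/t001.bam"), ("t002", "/data/t002.bam")]
cpus = min(2, len(task_args))
if cpus == 1:
    results = [run_one(ta) for ta in task_args]
else:
    pool = Pool(processes=cpus)
    results = list(pool.imap(run_one, task_args))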
+""" + +import os.path as op +import sys +import json +import logging +import numpy as np + +from .jcvi import FancyArrow, normalize_axes, panel_labels, plt, \ + savefig, OptionParser, ActionDispatcher + + +# Huntington risk allele +infected_thr = 40 +ref_thr = 19 +SIMULATED_HAPLOID = r'Simulated haploid $\mathit{h}$' +SIMULATED_DIPLOID = r"Simulated diploid $\mathit{20/h}$" +lsg = "lightslategray" + + +def main(): + + actions = ( + # Plotting + ('likelihood', 'plot likelihood surface and marginals'), + ('likelihood2', 'plot likelihood surface and marginals for two settings'), + # Diagram + ('diagram', 'plot the predictive power of various evidences'), + ) + p = ActionDispatcher(actions) + p.dispatch(globals()) + + +def mask_upper_triangle(data): + mask = np.zeros_like(data) + mask[np.triu_indices_from(mask)] = True + data = np.ma.array(data, mask=mask) + return data + + +def ax_plot(ax, P_h, h_hat, CI_h, xlabel, ylabel, ticks=True): + max_P = max(P_h.values()) + a, b = CI_h + + ax.plot([h_hat, h_hat], [0, max_P], ":", color=lsg, lw=2) + ax.set_xlabel(r"$%s$" % xlabel) + ax.set_ylabel(ylabel) + + data = [] + for k in xrange(301): + v = P_h.get(str(k), 0) + data.append((k, v)) + data.sort() + x, y = zip(*data) + x = np.array(x) + curve, = ax.plot(x, y, "-", color=lsg, lw=2) + title = "Marginal distribution for $%s$" % xlabel + ax.set_title(title) + if not ticks: + ax.set_yticks([]) + + if a == b: + ax.plot([h_hat, h_hat], [0, max_P], "-", color=lsg, lw=2) + else: + ax.fill_between(x, [0] * len(x), y, where=(x >= a) & (x <= b), + color=lsg, alpha=.5) + ax.set_xlim(0, 300) + + ymin, ymax = ax.get_ylim() + if h_hat < 150: + ax.text(h_hat + 20, ymax * 4. / 5, r"$\hat{%s}=%d$" % (xlabel, h_hat), + color=lsg, va="center") + ax.text(h_hat + 20, ymax * 3. / 5, "95$\%$ CI" + r"$=%s-%s$" % (a, b), + color=lsg, va="center") + else: + ax.text(h_hat - 30, ymax * 4. / 5, r"$\hat{%s}=%d$" % (xlabel, h_hat), + color=lsg, ha="right", va="center") + ax.text(h_hat - 30, ymax * 3. / 5, "95$\%$ CI" + r"$=%s-%s$" % (a, b), + color=lsg, ha="right", va="center") + + ymin, ymax = ax.get_ylim() + ax.set_ylim(ymin, ymax * 1.05) + + +def ax_imshow(ax, P_h1h2, cmap, label, h1_hat, h2_hat, samplekey, + r=4, draw_circle=True, ticks=True): + im = ax.imshow(P_h1h2, cmap=cmap, origin="lower") + + from mpl_toolkits.axes_grid1 import make_axes_locatable + divider = make_axes_locatable(ax) + cax = divider.append_axes("right", size="5%", pad=.05) + cb = plt.colorbar(im, cax) + cb.set_label(label) + if not ticks: + cb.set_ticks([]) + + if draw_circle: + circle = plt.Circle((h1_hat, h2_hat), r, ec='w', fill=False) + ax.add_artist(circle) + + annotation = "$\hat{h_1}=%d, \hat{h_2}=%d$" % (h1_hat, h2_hat) + ax.text(200, 100, annotation, color=lsg, ha="center", va="center") + + ax.set_xlabel(r"$h_1$") + ax.set_ylabel(r"$h_2$") + title = "Joint probability density for $h_1$ and $h_2$ ({})"\ + .format(samplekey) + ax.set_title(title) + + +def likelihood(args): + """ + %prog likelihood 100_20.json + + Plot the likelihood surface and marginal distributions. 
+ """ + from matplotlib import gridspec + + p = OptionParser(likelihood.__doc__) + p.add_option("--tred", default="HD", + help="TRED name to extract") + opts, args, iopts = p.set_image_options(args, figsize="10x5", + style="white", cmap="coolwarm") + + if len(args) != 1: + sys.exit(not p.print_help()) + + jsonfile, = args + fig = plt.figure(figsize=(iopts.w, iopts.h)) + gs = gridspec.GridSpec(2, 2) + ax1 = fig.add_subplot(gs[:, 0]) + ax2 = fig.add_subplot(gs[0, 1]) + ax3 = fig.add_subplot(gs[1, 1]) + plt.tight_layout(pad=3) + pf = plot_panel(jsonfile, ax1, ax2, ax3, opts.cmap, tred=opts.tred) + if not pf: + logging.error("NO data found in P_h1h2. Plotting aborted.") + return + + root = fig.add_axes([0, 0, 1, 1]) + normalize_axes(root) + + image_name = "likelihood.{}.".format(pf) + iopts.format + savefig(image_name, dpi=iopts.dpi, iopts=iopts) + + +def likelihood2(args): + """ + %prog likelihood2 200_20.json 200_100.json + + Plot the likelihood surface and marginal distributions for two settings. + """ + from matplotlib import gridspec + + p = OptionParser(likelihood2.__doc__) + p.add_option("--tred", default="HD", + help="TRED name to extract") + opts, args, iopts = p.set_image_options(args, figsize="10x10", + style="white", cmap="coolwarm") + if len(args) != 2: + sys.exit(not p.print_help()) + + jsonfile1, jsonfile2 = args + fig = plt.figure(figsize=(iopts.w, iopts.h)) + gs = gridspec.GridSpec(9, 2) + ax1 = fig.add_subplot(gs[:4, 0]) + ax2 = fig.add_subplot(gs[:2, 1]) + ax3 = fig.add_subplot(gs[2:4, 1]) + ax4 = fig.add_subplot(gs[5:, 0]) + ax5 = fig.add_subplot(gs[5:7, 1]) + ax6 = fig.add_subplot(gs[7:, 1]) + plt.tight_layout(pad=2) + + plot_panel(jsonfile1, ax1, ax2, ax3, opts.cmap, tred=opts.tred) + plot_panel(jsonfile2, ax4, ax5, ax6, opts.cmap, tred=opts.tred) + + root = fig.add_axes([0, 0, 1, 1]) + pad = .02 + panel_labels(root, ((pad, 1 - pad, "A"), (pad, 4. / 9, "B"))) + normalize_axes(root) + + image_name = "likelihood2." + iopts.format + savefig(image_name, dpi=iopts.dpi, iopts=iopts) + + +def plot_panel(jsonfile, ax1, ax2, ax3, cmap, tred='toy'): + j = json.load(open(jsonfile)) + calls = j["tredCalls"] + P_h1h2 = calls[tred + ".P_h1h2"] + if not P_h1h2: + return + + data = np.zeros((301, 301)) + data = mask_upper_triangle(data) + for k, v in P_h1h2.items(): + a, b = k.split(",") + a, b = int(a), int(b) + if a < b: + a, b = b, a + data[a, b] = v + + label = "Probability density" + h1_hat, h2_hat = calls[tred + ".1"], calls[tred + ".2"] + pf = op.basename(jsonfile).split(".")[0] + ax_imshow(ax1, data, cmap, label, h1_hat, h2_hat, pf, + draw_circle=False, ticks=False) + + CI = calls[tred + ".CI"] + CI_h1, CI_h2 = CI.split("|") + CI_h1 = [int(x) for x in CI_h1.split('-')] + CI_h2 = [int(x) for x in CI_h2.split('-')] + P_h1 = calls[tred + ".P_h1"] + P_h2 = calls[tred + ".P_h2"] + + ax_plot(ax2, P_h1, h1_hat, CI_h1, "h_1", label, ticks=False) + ax_plot(ax3, P_h2, h2_hat, CI_h2, "h_2", label, ticks=False) + + return pf + + +def diagram(args): + """ + %prog diagram + + Plot the predictive power of various evidences. 
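# A hedged sketch of the JSON fields plot_panel() above consumes. Keys are namespaced
# by TRED (here "HD"); all numbers below are invented toy values, not real calls.
calls = {
    "HD.1": 18, "HD.2": 41,                # shorter / longer allele estimates
    "HD.CI": "17-19|39-44",                # confidence intervals for h1 | h2
    "HD.P_h1": {"18": 0.8, "19": 0.2},     # marginal over h1, keyed by str(repeat count)
    "HD.P_h2": {"41": 0.7, "42": 0.3},     # marginal over h2
    "HD.P_h1h2": {"18,41": 0.6, "18,42": 0.2, "19,41": 0.2},   # joint surface, "h1,h2" keys
}
CI_h1, CI_h2 = calls["HD.CI"].split("|")
CI_h1 = [int(x) for x in CI_h1.split("-")]   # -> [17, 19], exactly as plot_panel() does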
+ """ + p = OptionParser(diagram.__doc__) + opts, args, iopts = p.set_image_options(args, figsize="8x4", format="png") + + if len(args) != 0: + sys.exit(not p.print_help()) + + fig = plt.figure(1, (iopts.w, iopts.h)) + root = fig.add_axes([0, 0, 1, 1]) + + # Gauge on top, this is log-scale + yy = .7 + yinterval = .1 + height = .05 + yp = yy - yinterval - height + canvas = .95 + xstart = .025 + convert = lambda x: xstart + x * canvas / 600 + # Symbols + root.text(.5, .9, r"$L$: Read length, $F$: Flank size, $V$: Pair distance", ha="center") + root.text(.5, .85, r"ex. $L=150bp, F=9bp, V=500bp$", ha="center") + root.text(xstart + canvas, yy - height, "STR repeat length", ha="center", + color=lsg, size=10) + + # Mark the key events + pad = .02 + arrowlen = canvas * 1.05 + arrowprops = dict(length_includes_head=True, width=.01, fc=lsg, lw=0, + head_length=arrowlen * .12, head_width=.04) + p = FancyArrow(xstart, yy, arrowlen, 0, shape="right", **arrowprops) + root.add_patch(p) + + ppad = 30 + keyevents = (( 0, 0, -1, r"$0$"), + (150 - 18, 150 - 18 - ppad, 0, r"$L - 2F$"), + (150 - 9, 150 - 9, 1, r"$L - F$"), + (150, 150 + ppad, 2, r"$L$"), + (500 - 9, 500 - 9, 3, r"$V - F$"), + ) + for event, pos, i, label in keyevents: + _event = convert(event) + _pos = convert(pos) + root.plot((_event, _event), (yy - height / 4, yy + height / 4), + '-', color='k') + root.text(_pos, yy + pad, label, rotation=45, va="bottom", size=8) + if i < 0: + continue + ystart = yp - i * yinterval + root.plot((_event, _event), (ystart, yy - height / 4), ':', color=lsg) + + # Range on bottom. These are simple 4 rectangles, with the range indicating + # the predictive range. + CLOSED, OPEN = range(2) + ranges = ((0, 150 - 18, CLOSED, "Spanning reads"), + (9, 150 - 9, OPEN, "Partial reads"), + (150, 500 - 9, CLOSED, "Repeat reads"), + (0, 500 - 9, CLOSED, "Paired-end reads"), + ) + for start, end, starttag, label in ranges: + _start = convert(start) + _end = convert(end) + data = [[0., 1.], [0., 1.]] if starttag == OPEN else \ + [[1., 0.], [1., 0.]] + root.imshow(data, interpolation='bicubic', cmap=plt.cm.Greens, + extent=[_start, _end, yp, yp + height]) + root.text(_end + pad, yp + height / 2, label, va="center") + yp -= yinterval + + normalize_axes(root) + + image_name = "diagram." + iopts.format + savefig(image_name, dpi=iopts.dpi, iopts=iopts) + + +if __name__ == '__main__': + main() diff --git a/tredparse/tredreport.py b/tredparse/tredreport.py new file mode 100755 index 0000000..8dcc32f --- /dev/null +++ b/tredparse/tredreport.py @@ -0,0 +1,302 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- + +""" +Copyright (c) 2015-2017 Human Longevity Inc. + +Author: Haibao Tang +License: Non-Commercial Use Only. For details, see `LICENSE` file + +Report signficant calls - predisease and disease. When VCF or JSON files are +given, a new tsv file will be written - otherwise will summarize calls based on +the given tsv file. +""" + +import argparse +import os.path as op +import json +import math +import sys +import pandas as pd + +from collections import Counter +from multiprocessing import Pool, cpu_count + +from . import __version__ +from .meta import TREDsRepo +from .utils import DefaultHelpParser + + +def left_truncate_text(a, maxcol=30): + trim = lambda t: (t if not isinstance(t, basestring) \ + or len(t) <= maxcol else "..." 
+ t[-(maxcol - 3):]) + return [trim(x) for x in list(a)] + + +def get_tred_summary(df, tred, repo, minPP=.5, casesfw=None, detailsfw=None): + pf1 = tred + ".1" + pf2 = tred + ".2" + tr = repo[tred] + row = tr.row + title = row["title"] + inheritance = row["inheritance"] + repeat = row["repeat"] + repeat_location = row["repeat_location"] + + cutoff_prerisk, cutoff_risk = tr.cutoff_prerisk, tr.cutoff_risk + label = tred + ".label" + pp = tred + ".PP" + prerisk = df[df[label] == "prerisk"] + risk = df[(df[label] == "risk") & (df[pp] > minPP)] + if tr.is_expansion: + carrier = df[(df[label] != "risk") & (df[pf2] >= cutoff_risk)] + else: + carrier = df[(df[label] != "risk") & (df[pf2] <= cutoff_risk) \ + & (df[pf2] > 0)] + + risk = risk.copy() + n_prerisk = prerisk.shape[0] + n_risk = risk.shape[0] + n_carrier = carrier.shape[0] + calls = tred + ".calls" + + core = ["SampleKey", "inferredGender", calls] + columns = core + [tred + ".FR", tred + ".PR", tred + ".RR", pp] + # Truncate the display of FR/PR + risk[tred + ".FR"] = left_truncate_text(risk[tred + ".FR"]) + risk[tred + ".PR"] = left_truncate_text(risk[tred + ".PR"]) + risk[tred + ".RR"] = left_truncate_text(risk[tred + ".RR"]) + + if detailsfw: + details_columns = core[:] + details_columns.extend([tred + ".FDP", tred + ".PDP", + tred + ".RDP", tred + ".PEDP"]) + for idx, row in risk[details_columns].iterrows(): + if tred == "AR": + break + + samplekey, sex, calls, fdp, pdp, rdp, pedp = row + fdp = int(fdp) + pdp = int(pdp) + rdp = int(rdp) + pedp = int(pedp) + atoms = [tred, tr.inheritance, samplekey, sex, calls, fdp, pdp, rdp, pedp] + print >> detailsfw, "\t".join(str(x) for x in atoms) + + pt = risk[columns] + if n_risk: + print >> casesfw, "[{}] - {}".format(tred, title) + print >> casesfw, "rep={}".format(repeat), "inherit={}".format(inheritance),\ + "cutoff={}".format(cutoff_risk), \ + "n_risk={}".format(n_risk), \ + "n_carrier={}".format(n_carrier), \ + "loc={}".format(repeat_location) + print >> casesfw, pt.to_string(index=False) + print >> casesfw + + # Allele frequency + cnt = Counter() + cnt.update(df[tred + ".1_"]) + cnt.update(df[tred + ".2_"]) + del cnt[-1] + + return tr, n_prerisk, n_risk, n_carrier, counts_to_af(cnt) + + +def counts_to_af(counts): + return "{" + ",".join("{}:{}".format(k, v) for k, v in \ + sorted(counts.items()) if not (k == '.' or math.isnan(k))) + "}" + + +def df_to_tsv(df, tsvfile, extra_columns=[], jsonformat=True, + ref="hg38"): + + df = df.fillna(-1) + dd = ["SampleKey"] + if jsonformat: + dd += ["inferredGender"] + + repo = TREDsRepo(ref) + for tred in repo.names: + tr = repo[tred] + if tred + ".1" not in df.columns: + continue + df[tred + ".1_"] = df[tred + ".1"].astype("int") + df[tred + ".2_"] = df[tred + ".2"].astype("int") + if jsonformat and tr.is_xlinked: + df.loc[(df["inferredGender"] == "Male"), tred + ".2_"] = "." + df[tred + ".calls"] = ["{}|{}".format(a, b) for (a, b) in + zip(df[tred + ".1_"], df[tred + ".2_"])] + + all_columns = ["calls", "label"] + extra_columns + columns = dd + sorted([x for x in df.columns if (x not in dd) and \ + any(x.endswith("." 
+ z) for z in all_columns)]) + + tf = df.reindex_axis(columns, axis='columns') + tf.sort_values("SampleKey") + tf.to_csv(tsvfile, sep='\t', index=False) + print >> sys.stderr, "TSV output written to `{}` (# samples={})"\ + .format(tsvfile, tf.shape[0]) + + return df + + +def vcf_to_df_worker(vcffile): + """ Convert VCF to data frame, single thread + """ + import vcf + + samplekey = op.basename(vcffile).split(".")[0] + reader = vcf.Reader(open(vcffile, "rb")) + d = {'SampleKey': samplekey} + for rec in reader: + tr = rec.ID + sample = rec.samples[0] + a, b = sample["GB"].split("/") + d[tr + ".1"] = int(a) + d[tr + ".2"] = int(b) + for k in ["PP", "FR", "PR"]: + d[tr + "." + k] = sample[k] + d[tr + ".label"] = sample["LABEL"] + return d + + +def vcf_to_df(vcffiles, tsvfile, cpus): + """ + Compile a number of vcf files into tsv file for easier manipulation. + """ + df = pd.DataFrame() + p = Pool(processes=cpus) + results = [] + r = p.map_async(vcf_to_df_worker, vcffiles, + callback=results.append) + r.wait() + + for res in results: + df = df.append(res, ignore_index=True) + return df + + +def json_to_df_worker(jsonfile): + js = json.load(open(jsonfile)) + samplekey = js['samplekey'] + results = js['tredCalls'] + # We'll infer sample key from file names for now since the keys are not + # properly populated in v0.6.4 and v0.6.5 + samplekey = op.basename(jsonfile).split(".")[0] + d = {'SampleKey': samplekey} + d.update(results) + + return d + + +def json_to_df(jsonfiles, tsvfile, cpus): + """ + Compile a number of json files into tsv file for easier manipulation. + """ + df = pd.DataFrame() + p = Pool(processes=cpus) + results = [] + r = p.map_async(json_to_df_worker, jsonfiles, + callback=results.append) + r.wait() + + for res in results: + df = df.append(res, ignore_index=True) + return df + + +def main(args): + p = DefaultHelpParser(description=__doc__, prog=op.basename(__file__), + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + p.add_argument("files", nargs="*") + p.add_argument('--ref', help='Reference genome version', + choices=("hg38", "hg38_nochr", "hg19", "hg19_nochr"), default='hg38') + p.add_argument('--tsv', default="out.tsv", + help="Path to the tsv file") + p.add_argument('--columns', + help="Columns to extract, use comma to separate") + p.add_argument('--minPP', default=.5, type=float, + help="Minimum Prob(pathological) to report cases") + p.add_argument('--cpus', default=cpu_count(), + help='Number of threads') + p.add_argument('--version', action='version', version="%(prog)s " + __version__) + args = p.parse_args(args) + + files = args.files + ref = args.ref + tsvfile = args.tsv + columns = args.columns.split(",") if args.columns else [] + + repo = TREDsRepo(ref) + alltreds = repo.names + if files: + nfiles = len(files) + cpus = min(nfiles, args.cpus) + jsonformat = files[0].endswith(".json") + suffix = "JSON" if jsonformat else "VCF" + print >> sys.stderr, "Using {} cpus to parse {} {} files"\ + .format(cpus, nfiles, suffix) + if jsonformat: + df = json_to_df(files, tsvfile, cpus) + else: + df = vcf_to_df(files, tsvfile, cpus) + df = df_to_tsv(df, tsvfile, extra_columns=columns, jsonformat=jsonformat, + ref=ref) + else: + if op.exists(tsvfile): + df = pd.read_csv(tsvfile, sep="\t") + else: + sys.exit(not p.print_help()) + + if df.empty: + print >> sys.stderr, "Dataframe empty - check input files" + sys.exit(1) + + reportfile = tsvfile + ".report.txt" + summary = pd.DataFrame() + total_prerisk = total_risk = total_carrier = total_loci = 0 + + # Outlier cases and 
associated read count details + cases = tsvfile + ".cases.txt" + casesfw = open(cases, "w") + details = tsvfile + ".details.txt" + detailsfw = open(details, "w") + header = "Locus,Inheritance,SampleKey,Sex,Calls,FullReads,PartialReads,"\ + "RepeatReads,PairedReads" + print >> detailsfw, "\t".join(header.split(',')) + + for tred in alltreds: + if tred + ".label" not in df.columns: + continue + tr, n_prerisk, n_risk, n_carrier, af = \ + get_tred_summary(df, tred, repo, minPP=args.minPP, + casesfw=casesfw, detailsfw=detailsfw) + total_prerisk += n_prerisk + total_risk += n_risk + total_carrier += n_carrier + if n_risk: + total_loci += 1 + + tr = tr.row + columns = ["abbreviation", "title", "motif", "inheritance", + "cutoff_prerisk", "cutoff_risk"] + d = dict((c, tr[c]) for c in columns[1:]) + d["abbreviation"] = tred + d["n_prerisk"] = n_prerisk + d["n_risk"] = n_risk + d["n_carrier"] = n_carrier + d["allele_freq"] = af + summary = summary.append(d, ignore_index=True) + + print >> sys.stderr, "Outlier cases saved to `{}`".format(casesfw.name) + casesfw.close() + print >> sys.stderr, "Read count details saved to `{}`"\ + .format(detailsfw.name) + detailsfw.close() + + summary.to_csv(reportfile, sep="\t", index=False, float_format="%d") + print >> sys.stderr, "Summary report written to `{}` (# samples={})"\ + .format(reportfile, summary.shape[0]) + print >> sys.stderr, "Summary: n_prerisk={}, n_risk={}, n_carrier={}, n_affected_loci={}"\ + .format(total_prerisk, total_risk, total_carrier, total_loci) diff --git a/tredplot.py b/tredplot.py index 93e0b86..0a7dbb3 100755 --- a/tredplot.py +++ b/tredplot.py @@ -10,302 +10,9 @@ Related scripts for the HLI-STR (TREDPARSE) paper. """ -import os.path as op import sys -import json -import logging -import numpy as np - -from tredparse.jcvi import FancyArrow, normalize_axes, panel_labels, plt, \ - savefig, OptionParser, ActionDispatcher - - -# Huntington risk allele -infected_thr = 40 -ref_thr = 19 -SIMULATED_HAPLOID = r'Simulated haploid $\mathit{h}$' -SIMULATED_DIPLOID = r"Simulated diploid $\mathit{20/h}$" -lsg = "lightslategray" - - -def main(): - - actions = ( - # Plotting - ('likelihood', 'plot likelihood surface and marginals'), - ('likelihood2', 'plot likelihood surface and marginals for two settings'), - # Diagram - ('diagram', 'plot the predictive power of various evidences'), - ) - p = ActionDispatcher(actions) - p.dispatch(globals()) - - -def mask_upper_triangle(data): - mask = np.zeros_like(data) - mask[np.triu_indices_from(mask)] = True - data = np.ma.array(data, mask=mask) - return data - - -def ax_plot(ax, P_h, h_hat, CI_h, xlabel, ylabel, ticks=True): - max_P = max(P_h.values()) - a, b = CI_h - - ax.plot([h_hat, h_hat], [0, max_P], ":", color=lsg, lw=2) - ax.set_xlabel(r"$%s$" % xlabel) - ax.set_ylabel(ylabel) - - data = [] - for k in xrange(301): - v = P_h.get(str(k), 0) - data.append((k, v)) - data.sort() - x, y = zip(*data) - x = np.array(x) - curve, = ax.plot(x, y, "-", color=lsg, lw=2) - title = "Marginal distribution for $%s$" % xlabel - ax.set_title(title) - if not ticks: - ax.set_yticks([]) - - if a == b: - ax.plot([h_hat, h_hat], [0, max_P], "-", color=lsg, lw=2) - else: - ax.fill_between(x, [0] * len(x), y, where=(x >= a) & (x <= b), - color=lsg, alpha=.5) - ax.set_xlim(0, 300) - - ymin, ymax = ax.get_ylim() - if h_hat < 150: - ax.text(h_hat + 20, ymax * 4. / 5, r"$\hat{%s}=%d$" % (xlabel, h_hat), - color=lsg, va="center") - ax.text(h_hat + 20, ymax * 3. 
/ 5, "95$\%$ CI" + r"$=%s-%s$" % (a, b), - color=lsg, va="center") - else: - ax.text(h_hat - 30, ymax * 4. / 5, r"$\hat{%s}=%d$" % (xlabel, h_hat), - color=lsg, ha="right", va="center") - ax.text(h_hat - 30, ymax * 3. / 5, "95$\%$ CI" + r"$=%s-%s$" % (a, b), - color=lsg, ha="right", va="center") - - ymin, ymax = ax.get_ylim() - ax.set_ylim(ymin, ymax * 1.05) - - -def ax_imshow(ax, P_h1h2, cmap, label, h1_hat, h2_hat, samplekey, - r=4, draw_circle=True, ticks=True): - im = ax.imshow(P_h1h2, cmap=cmap, origin="lower") - - from mpl_toolkits.axes_grid1 import make_axes_locatable - divider = make_axes_locatable(ax) - cax = divider.append_axes("right", size="5%", pad=.05) - cb = plt.colorbar(im, cax) - cb.set_label(label) - if not ticks: - cb.set_ticks([]) - - if draw_circle: - circle = plt.Circle((h1_hat, h2_hat), r, ec='w', fill=False) - ax.add_artist(circle) - - annotation = "$\hat{h_1}=%d, \hat{h_2}=%d$" % (h1_hat, h2_hat) - ax.text(200, 100, annotation, color=lsg, ha="center", va="center") - - ax.set_xlabel(r"$h_1$") - ax.set_ylabel(r"$h_2$") - title = "Joint probability density for $h_1$ and $h_2$ ({})"\ - .format(samplekey) - ax.set_title(title) - - -def likelihood(args): - """ - %prog likelihood 100_20.json - - Plot the likelihood surface and marginal distributions. - """ - from matplotlib import gridspec - - p = OptionParser(likelihood.__doc__) - p.add_option("--tred", default="HD", - help="TRED name to extract") - opts, args, iopts = p.set_image_options(args, figsize="10x5", - style="white", cmap="coolwarm") - - if len(args) != 1: - sys.exit(not p.print_help()) - - jsonfile, = args - fig = plt.figure(figsize=(iopts.w, iopts.h)) - gs = gridspec.GridSpec(2, 2) - ax1 = fig.add_subplot(gs[:, 0]) - ax2 = fig.add_subplot(gs[0, 1]) - ax3 = fig.add_subplot(gs[1, 1]) - plt.tight_layout(pad=3) - pf = plot_panel(jsonfile, ax1, ax2, ax3, opts.cmap, tred=opts.tred) - if not pf: - logging.error("NO data found in P_h1h2. Plotting aborted.") - return - - root = fig.add_axes([0, 0, 1, 1]) - normalize_axes(root) - - image_name = "likelihood.{}.".format(pf) + iopts.format - savefig(image_name, dpi=iopts.dpi, iopts=iopts) - - -def likelihood2(args): - """ - %prog likelihood2 200_20.json 200_100.json - - Plot the likelihood surface and marginal distributions for two settings. - """ - from matplotlib import gridspec - - p = OptionParser(likelihood2.__doc__) - p.add_option("--tred", default="HD", - help="TRED name to extract") - opts, args, iopts = p.set_image_options(args, figsize="10x10", - style="white", cmap="coolwarm") - if len(args) != 2: - sys.exit(not p.print_help()) - - jsonfile1, jsonfile2 = args - fig = plt.figure(figsize=(iopts.w, iopts.h)) - gs = gridspec.GridSpec(9, 2) - ax1 = fig.add_subplot(gs[:4, 0]) - ax2 = fig.add_subplot(gs[:2, 1]) - ax3 = fig.add_subplot(gs[2:4, 1]) - ax4 = fig.add_subplot(gs[5:, 0]) - ax5 = fig.add_subplot(gs[5:7, 1]) - ax6 = fig.add_subplot(gs[7:, 1]) - plt.tight_layout(pad=2) - - plot_panel(jsonfile1, ax1, ax2, ax3, opts.cmap, tred=opts.tred) - plot_panel(jsonfile2, ax4, ax5, ax6, opts.cmap, tred=opts.tred) - - root = fig.add_axes([0, 0, 1, 1]) - pad = .02 - panel_labels(root, ((pad, 1 - pad, "A"), (pad, 4. / 9, "B"))) - normalize_axes(root) - - image_name = "likelihood2." 
+ iopts.format - savefig(image_name, dpi=iopts.dpi, iopts=iopts) - - -def plot_panel(jsonfile, ax1, ax2, ax3, cmap, tred='toy'): - j = json.load(open(jsonfile)) - calls = j["tredCalls"] - P_h1h2 = calls[tred + ".P_h1h2"] - if not P_h1h2: - return - - data = np.zeros((301, 301)) - data = mask_upper_triangle(data) - for k, v in P_h1h2.items(): - a, b = k.split(",") - a, b = int(a), int(b) - if a < b: - a, b = b, a - data[a, b] = v - - label = "Probability density" - h1_hat, h2_hat = calls[tred + ".1"], calls[tred + ".2"] - pf = op.basename(jsonfile).split(".")[0] - ax_imshow(ax1, data, cmap, label, h1_hat, h2_hat, pf, - draw_circle=False, ticks=False) - - CI = calls[tred + ".CI"] - CI_h1, CI_h2 = CI.split("|") - CI_h1 = [int(x) for x in CI_h1.split('-')] - CI_h2 = [int(x) for x in CI_h2.split('-')] - P_h1 = calls[tred + ".P_h1"] - P_h2 = calls[tred + ".P_h2"] - - ax_plot(ax2, P_h1, h1_hat, CI_h1, "h_1", label, ticks=False) - ax_plot(ax3, P_h2, h2_hat, CI_h2, "h_2", label, ticks=False) - - return pf - - -def diagram(args): - """ - %prog diagram - - Plot the predictive power of various evidences. - """ - p = OptionParser(diagram.__doc__) - opts, args, iopts = p.set_image_options(args, figsize="8x4", format="png") - - if len(args) != 0: - sys.exit(not p.print_help()) - - fig = plt.figure(1, (iopts.w, iopts.h)) - root = fig.add_axes([0, 0, 1, 1]) - - # Gauge on top, this is log-scale - yy = .7 - yinterval = .1 - height = .05 - yp = yy - yinterval - height - canvas = .95 - xstart = .025 - convert = lambda x: xstart + x * canvas / 600 - # Symbols - root.text(.5, .9, r"$L$: Read length, $F$: Flank size, $V$: Pair distance", ha="center") - root.text(.5, .85, r"ex. $L=150bp, F=9bp, V=500bp$", ha="center") - root.text(xstart + canvas, yy - height, "STR repeat length", ha="center", - color=lsg, size=10) - - # Mark the key events - pad = .02 - arrowlen = canvas * 1.05 - arrowprops = dict(length_includes_head=True, width=.01, fc=lsg, lw=0, - head_length=arrowlen * .12, head_width=.04) - p = FancyArrow(xstart, yy, arrowlen, 0, shape="right", **arrowprops) - root.add_patch(p) - - ppad = 30 - keyevents = (( 0, 0, -1, r"$0$"), - (150 - 18, 150 - 18 - ppad, 0, r"$L - 2F$"), - (150 - 9, 150 - 9, 1, r"$L - F$"), - (150, 150 + ppad, 2, r"$L$"), - (500 - 9, 500 - 9, 3, r"$V - F$"), - ) - for event, pos, i, label in keyevents: - _event = convert(event) - _pos = convert(pos) - root.plot((_event, _event), (yy - height / 4, yy + height / 4), - '-', color='k') - root.text(_pos, yy + pad, label, rotation=45, va="bottom", size=8) - if i < 0: - continue - ystart = yp - i * yinterval - root.plot((_event, _event), (ystart, yy - height / 4), ':', color=lsg) - - # Range on bottom. These are simple 4 rectangles, with the range indicating - # the predictive range. - CLOSED, OPEN = range(2) - ranges = ((0, 150 - 18, CLOSED, "Spanning reads"), - (9, 150 - 9, OPEN, "Partial reads"), - (150, 500 - 9, CLOSED, "Repeat reads"), - (0, 500 - 9, CLOSED, "Paired-end reads"), - ) - for start, end, starttag, label in ranges: - _start = convert(start) - _end = convert(end) - data = [[0., 1.], [0., 1.]] if starttag == OPEN else \ - [[1., 0.], [1., 0.]] - root.imshow(data, interpolation='bicubic', cmap=plt.cm.Greens, - extent=[_start, _end, yp, yp + height]) - root.text(_end + pad, yp + height / 2, label, va="center") - yp -= yinterval - - normalize_axes(root) - - image_name = "diagram." 
+ iopts.format - savefig(image_name, dpi=iopts.dpi, iopts=iopts) +from tredparse.tredplot import main if __name__ == '__main__': - main() + main(sys.argv[1:]) diff --git a/tredreport.py b/tredreport.py index e6eb8fb..d108c73 100755 --- a/tredreport.py +++ b/tredreport.py @@ -12,292 +12,9 @@ the given tsv file. """ -import argparse -import os.path as op -import json -import math import sys -import vcf -import pandas as pd - -from collections import Counter -from multiprocessing import Pool, cpu_count - -from tredparse import __version__ -from tredparse.meta import TREDsRepo -from tredparse.utils import DefaultHelpParser - - -def left_truncate_text(a, maxcol=30): - trim = lambda t: (t if not isinstance(t, basestring) \ - or len(t) <= maxcol else "..." + t[-(maxcol - 3):]) - return [trim(x) for x in list(a)] - - -def get_tred_summary(df, tred, repo, minPP=.5, casesfw=None, detailsfw=None): - pf1 = tred + ".1" - pf2 = tred + ".2" - tr = repo[tred] - row = tr.row - title = row["title"] - inheritance = row["inheritance"] - repeat = row["repeat"] - repeat_location = row["repeat_location"] - - cutoff_prerisk, cutoff_risk = tr.cutoff_prerisk, tr.cutoff_risk - label = tred + ".label" - pp = tred + ".PP" - prerisk = df[df[label] == "prerisk"] - risk = df[(df[label] == "risk") & (df[pp] > minPP)] - if tr.is_expansion: - carrier = df[(df[label] != "risk") & (df[pf2] >= cutoff_risk)] - else: - carrier = df[(df[label] != "risk") & (df[pf2] <= cutoff_risk) \ - & (df[pf2] > 0)] - - risk = risk.copy() - n_prerisk = prerisk.shape[0] - n_risk = risk.shape[0] - n_carrier = carrier.shape[0] - calls = tred + ".calls" - - core = ["SampleKey", "inferredGender", calls] - columns = core + [tred + ".FR", tred + ".PR", tred + ".RR", pp] - # Truncate the display of FR/PR - risk[tred + ".FR"] = left_truncate_text(risk[tred + ".FR"]) - risk[tred + ".PR"] = left_truncate_text(risk[tred + ".PR"]) - risk[tred + ".RR"] = left_truncate_text(risk[tred + ".RR"]) - - if detailsfw: - details_columns = core[:] - details_columns.extend([tred + ".FDP", tred + ".PDP", - tred + ".RDP", tred + ".PEDP"]) - for idx, row in risk[details_columns].iterrows(): - if tred == "AR": - break - - samplekey, sex, calls, fdp, pdp, rdp, pedp = row - fdp = int(fdp) - pdp = int(pdp) - rdp = int(rdp) - pedp = int(pedp) - atoms = [tred, tr.inheritance, samplekey, sex, calls, fdp, pdp, rdp, pedp] - print >> detailsfw, "\t".join(str(x) for x in atoms) - - pt = risk[columns] - if n_risk: - print >> casesfw, "[{}] - {}".format(tred, title) - print >> casesfw, "rep={}".format(repeat), "inherit={}".format(inheritance),\ - "cutoff={}".format(cutoff_risk), \ - "n_risk={}".format(n_risk), \ - "n_carrier={}".format(n_carrier), \ - "loc={}".format(repeat_location) - print >> casesfw, pt.to_string(index=False) - print >> casesfw - - # Allele frequency - cnt = Counter() - cnt.update(df[tred + ".1_"]) - cnt.update(df[tred + ".2_"]) - del cnt[-1] - - return tr, n_prerisk, n_risk, n_carrier, counts_to_af(cnt) - - -def counts_to_af(counts): - return "{" + ",".join("{}:{}".format(k, v) for k, v in \ - sorted(counts.items()) if not (k == '.' 
or math.isnan(k))) + "}" - - -def df_to_tsv(df, tsvfile, extra_columns=[], jsonformat=True, - ref="hg38"): - - df = df.fillna(-1) - dd = ["SampleKey"] - if jsonformat: - dd += ["inferredGender"] - - repo = TREDsRepo(ref) - for tred in repo.names: - tr = repo[tred] - if tred + ".1" not in df.columns: - continue - df[tred + ".1_"] = df[tred + ".1"].astype("int") - df[tred + ".2_"] = df[tred + ".2"].astype("int") - if jsonformat and tr.is_xlinked: - df.loc[(df["inferredGender"] == "Male"), tred + ".2_"] = "." - df[tred + ".calls"] = ["{}|{}".format(a, b) for (a, b) in - zip(df[tred + ".1_"], df[tred + ".2_"])] - - all_columns = ["calls", "label"] + extra_columns - columns = dd + sorted([x for x in df.columns if (x not in dd) and \ - any(x.endswith("." + z) for z in all_columns)]) - - tf = df.reindex_axis(columns, axis='columns') - tf.sort_values("SampleKey") - tf.to_csv(tsvfile, sep='\t', index=False) - print >> sys.stderr, "TSV output written to `{}` (# samples={})"\ - .format(tsvfile, tf.shape[0]) - - return df - - -def vcf_to_df_worker(vcffile): - samplekey = op.basename(vcffile).split(".")[0] - reader = vcf.Reader(open(vcffile, "rb")) - d = {'SampleKey': samplekey} - for rec in reader: - tr = rec.ID - sample = rec.samples[0] - a, b = sample["GB"].split("/") - d[tr + ".1"] = int(a) - d[tr + ".2"] = int(b) - for k in ["PP", "FR", "PR"]: - d[tr + "." + k] = sample[k] - d[tr + ".label"] = sample["LABEL"] - return d - - -def vcf_to_df(vcffiles, tsvfile, cpus): - """ - Compile a number of vcf files into tsv file for easier manipulation. - """ - df = pd.DataFrame() - p = Pool(processes=cpus) - results = [] - r = p.map_async(vcf_to_df_worker, vcffiles, - callback=results.append) - r.wait() - - for res in results: - df = df.append(res, ignore_index=True) - return df - - -def json_to_df_worker(jsonfile): - js = json.load(open(jsonfile)) - samplekey = js['samplekey'] - results = js['tredCalls'] - # We'll infer sample key from file names for now since the keys are not - # properly populated in v0.6.4 and v0.6.5 - samplekey = op.basename(jsonfile).split(".")[0] - d = {'SampleKey': samplekey} - d.update(results) - - return d - - -def json_to_df(jsonfiles, tsvfile, cpus): - """ - Compile a number of json files into tsv file for easier manipulation. 
- """ - df = pd.DataFrame() - p = Pool(processes=cpus) - results = [] - r = p.map_async(json_to_df_worker, jsonfiles, - callback=results.append) - r.wait() - - for res in results: - df = df.append(res, ignore_index=True) - return df - - -def main(): - p = DefaultHelpParser(description=__doc__, prog=op.basename(__file__), - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - p.add_argument("files", nargs="*") - p.add_argument('--ref', help='Reference genome version', - choices=("hg38", "hg38_nochr", "hg19", "hg19_nochr"), default='hg38') - p.add_argument('--tsv', default="out.tsv", - help="Path to the tsv file") - p.add_argument('--columns', - help="Columns to extract, use comma to separate") - p.add_argument('--minPP', default=.5, type=float, - help="Minimum Prob(pathological) to report cases") - p.add_argument('--cpus', default=cpu_count(), - help='Number of threads') - p.add_argument('--version', action='version', version="%(prog)s " + __version__) - args = p.parse_args() - - files = args.files - ref = args.ref - tsvfile = args.tsv - columns = args.columns.split(",") if args.columns else [] - - repo = TREDsRepo(ref) - alltreds = repo.names - if files: - nfiles = len(files) - cpus = min(nfiles, args.cpus) - jsonformat = files[0].endswith(".json") - suffix = "JSON" if jsonformat else "VCF" - print >> sys.stderr, "Using {} cpus to parse {} {} files"\ - .format(cpus, nfiles, suffix) - if jsonformat: - df = json_to_df(files, tsvfile, cpus) - else: - df = vcf_to_df(files, tsvfile, cpus) - df = df_to_tsv(df, tsvfile, extra_columns=columns, jsonformat=jsonformat, - ref=ref) - else: - if op.exists(tsvfile): - df = pd.read_csv(tsvfile, sep="\t") - else: - sys.exit(not p.print_help()) - - if df.empty: - print >> sys.stderr, "Dataframe empty - check input files" - sys.exit(1) - - reportfile = tsvfile + ".report.txt" - summary = pd.DataFrame() - total_prerisk = total_risk = total_carrier = total_loci = 0 - - # Outlier cases and associated read count details - cases = tsvfile + ".cases.txt" - casesfw = open(cases, "w") - details = tsvfile + ".details.txt" - detailsfw = open(details, "w") - header = "Locus,Inheritance,SampleKey,Sex,Calls,FullReads,PartialReads,"\ - "RepeatReads,PairedReads" - print >> detailsfw, "\t".join(header.split(',')) - - for tred in alltreds: - if tred + ".label" not in df.columns: - continue - tr, n_prerisk, n_risk, n_carrier, af = \ - get_tred_summary(df, tred, repo, minPP=args.minPP, - casesfw=casesfw, detailsfw=detailsfw) - total_prerisk += n_prerisk - total_risk += n_risk - total_carrier += n_carrier - if n_risk: - total_loci += 1 - - tr = tr.row - columns = ["abbreviation", "title", "motif", "inheritance", - "cutoff_prerisk", "cutoff_risk"] - d = dict((c, tr[c]) for c in columns[1:]) - d["abbreviation"] = tred - d["n_prerisk"] = n_prerisk - d["n_risk"] = n_risk - d["n_carrier"] = n_carrier - d["allele_freq"] = af - summary = summary.append(d, ignore_index=True) - - print >> sys.stderr, "Outlier cases saved to `{}`".format(casesfw.name) - casesfw.close() - print >> sys.stderr, "Read count details saved to `{}`"\ - .format(detailsfw.name) - detailsfw.close() - - summary.to_csv(reportfile, sep="\t", index=False, float_format="%d") - print >> sys.stderr, "Summary report written to `{}` (# samples={})"\ - .format(reportfile, summary.shape[0]) - print >> sys.stderr, "Summary: n_prerisk={}, n_risk={}, n_carrier={}, n_affected_loci={}"\ - .format(total_prerisk, total_risk, total_carrier, total_loci) +from tredparse.tredreport import main if __name__ == '__main__': - main() + 
main(sys.argv[1:])