diff --git a/MANIFEST.in b/MANIFEST.in index 99ff29cfa..44c74b012 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -22,5 +22,8 @@ exclude scripts/* include scripts/__init__.py include scripts/version.py include scripts/cgat_ruffus_profile.py +include cgatpipelines/Rtools/* +include cgatpipelines/experiment.R + # extensions diff --git a/cgatpipelines/Rtools/diffexpression.R b/cgatpipelines/Rtools/diffexpression.R index b69374312..e5c279b2d 100644 --- a/cgatpipelines/Rtools/diffexpression.R +++ b/cgatpipelines/Rtools/diffexpression.R @@ -22,11 +22,11 @@ suppressMessages(library(goseq)) source(file.path(Sys.getenv("R_ROOT"), "experiment.R")) -mart = useMart(biomart = "ENSEMBL_MART_ENSEMBL",dataset="hsapiens_gene_ensembl", host = "jul2018.archive.ensembl.org") +mart = useMart(biomart = "ENSEMBL_MART_ENSEMBL",dataset="hsapiens_gene_ensembl", host = "https://aug2020.archive.ensembl.org") getmart <- function(values){ data<- getBM( filters= "ensembl_gene_id", - attributes= c("ensembl_gene_id", "external_gene_name", "description","entrezgene", 'chromosome_name', + attributes= c("ensembl_gene_id", "external_gene_name", "description","entrezgene_id", 'chromosome_name', 'start_position', 'end_position'), values= values, mart= mart, @@ -35,7 +35,7 @@ getmart <- function(values){ return(data) } -start_plot <- function(section, height = 6, width = 6, type = "png", outdir="") { +start_plot <- function(section, height = 10, width = 10, type = "png", outdir="") { file = get_output_filename(paste0(outdir,"/",section, ".", type)) Cairo(file = file, type = type, @@ -78,7 +78,8 @@ plotTPMs <- function(dftemp, contrast_name){ ggplot(aes(x = contrast, y = value, color = contrast)) + geom_point(position = position_jitter(w = 0.15, h = 0)) + facet_wrap(~ var, scales = "free") + theme_bw() + - ylab("normalised counts") + xlab (contrast_name) + guides(color = "none") + ylab("normalised counts") + xlab (contrast_name) + guides(color = "none")+ + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) } @@ -114,7 +115,7 @@ run <- function(opt) { flog.info("... plotting MA") ## MA Plot start_plot("MAPlot", outdir=opt$outdir) - DESeq2::plotMA(resLFC, ylim = c(-3,3)) + DESeq2::plotMA(resLFC, ylim = c(ceiling(max(resLFC$log2FoldChange)),floor(min(resLFC$log2FoldChange)))) end_plot() flog.info("... saving DE data") @@ -178,6 +179,7 @@ run <- function(opt) { flog.info("... plotting user-defined genes") genelist <- unlist(strsplit(opt$userlist, ",")) + genelist <- genelist[genelist %in% rownames(dds)] dftemp <- makeTPMtable(genelist, counts(dds, normalized=TRUE), colData(dds), opt$contrast) start_plot("Userdefined", outdir=opt$outdir) print(plotTPMs(dftemp, opt$contrast)) diff --git a/cgatpipelines/Rtools/exploratory.R b/cgatpipelines/Rtools/exploratory.R index ce966434d..20aa2b7bc 100755 --- a/cgatpipelines/Rtools/exploratory.R +++ b/cgatpipelines/Rtools/exploratory.R @@ -33,11 +33,11 @@ source(file.path(Sys.getenv("R_ROOT"), "experiment.R")) # Dependencies: biomaRt package # Input: vector of ensembl gene ids # Output: data frame with description, symbol and entrez gene -mart = useMart(biomart = "ENSEMBL_MART_ENSEMBL",dataset="hsapiens_gene_ensembl", host = "jul2018.archive.ensembl.org") +mart = useMart(biomart = "ENSEMBL_MART_ENSEMBL",dataset="hsapiens_gene_ensembl", host = "https://aug2020.archive.ensembl.org") getmart <- function(values){ data<- getBM( filters= "ensembl_gene_id", - attributes= c("ensembl_gene_id", "external_gene_name", "description","entrezgene"), + attributes= c("ensembl_gene_id", "external_gene_name", "description","entrezgene_id"), values= values, mart= mart, useCache = FALSE) @@ -74,35 +74,15 @@ run <- function(opt) { dds = DESeqDataSetFromMatrix(experiment$counts, experiment$sample, design = formula(opt$model)) } futile.logger::flog.info(paste("reading Experiment object", paste(dim(counts(experiment)), collapse = ","))) - - ### SVA - ANALYSIS OF BIASES ### - futile.logger::flog.info(paste("Performing Surrogate Variable Analysis")) - dds <- estimateSizeFactors(dds) - dat <- counts(dds, normalized = TRUE) - idx <- rowMeans(dat) > 1 - dat <- dat[idx, ] - mod <- model.matrix(formula(opt$model), colData(dds)) - mod0 <- model.matrix(~ 1, colData(dds)) - svseq <- svaseq(dat, mod, mod0, n.sv = 2) - for(factor in opt$factors){ - start_plot(paste0('SVA for ', factor), outdir=opt$outdir) - par(mfrow = c(2, 1), mar = c(3,5,3,1)) - for (i in 1:2) { - stripchart(svseq$sv[, i] ~ colData(dds)[, factor], vertical = TRUE, main = paste0("SV", i)) - abline(h = 0) - } - end_plot() - } - + ### TRANSFORMATION OF DATA ### futile.logger::flog.info(paste("Transforming data")) - rld<- rlog(dds) + dds <- estimateSizeFactors(dds) vsd<- vst(dds) df <- bind_rows( as_tibble(log2(counts(dds, normalized=TRUE)[, 1:2]+1)) %>% mutate(transformation = "log2(x + 1)"), - as_tibble(assay(vsd)[, 1:2]) %>% mutate(transformation = "vst"), - as_tibble(assay(rld)[, 1:2]) %>% mutate(transformation = "rlog")) + as_tibble(assay(vsd)[, 1:2]) %>% mutate(transformation = "vst")) colnames(df)[1:2] <- c("x", "y") start_plot('Variance_Transformations', outdir=opt$outdir) print(ggplot(df, aes(x = x, y = y)) + geom_hex(bins = 80) + @@ -120,11 +100,12 @@ run <- function(opt) { if(dim(pca$x)[1]>7){ dim_pca = 8 } else { - dim_pca = dim(pca$x) + dim_pca = dim(pca$x)[1] } scores <- data.frame(variable.group, pca$x[,1:dim_pca]) start_plot(paste0('PCA_', factor), outdir=opt$outdir) - print(qplot(x=PC1, y=PC2, data=scores, colour=factor(variable.group)) + + print(ggplot(scores, aes(x=PC1, y=PC2, colour=factor(variable.group))) + + geom_point() + theme(legend.position="right") + labs(colour=factor, x=paste0("PC1 (", percentVar[1],"% of variance)"), y=paste0("PC2 (", percentVar[2],"% of variance)")) + @@ -133,7 +114,8 @@ run <- function(opt) { theme(text=element_text(family='serif'))) end_plot() start_plot(paste0('PCA_13_', factor), outdir=opt$outdir) - print(qplot(x=PC1, y=PC3, data=scores, colour=factor(variable.group)) + + print(ggplot(scores, aes(x=PC1, y=PC3, colour=factor(variable.group))) + + geom_point() + theme(legend.position="right") + labs(colour=factor, x=paste0("PC1 (", percentVar[1],"% of variance)"), y=paste0("PC3 (", percentVar[3],"% of variance)")) + @@ -146,7 +128,7 @@ run <- function(opt) { names(variable.group) <- opt$contrast scores <- data.frame(variable.group, pca$x[,1:dim_pca]) start_plot(paste0('PCA_grid'), outdir=opt$outdir) - if(dim_pca>7){ + if(dim_pca==8){ print(ggplot(scores, aes(x = .panel_x, y = .panel_y, fill = variable.group, colour = variable.group)) + geom_point(shape = 16, size = 0.5, position = 'auto') + geom_autodensity(alpha = 0.3, colour = NA, position = 'identity') + @@ -171,14 +153,18 @@ run <- function(opt) { df <- as.data.frame(colData(dds)[,opt$factors]) rownames(df) <- colData(dds)$track # Heatmap of Top 20 Expressed Genes - select <- order(rowMeans(counts(dds,normalized=TRUE)),decreasing=TRUE)[1:20] + selected <- order(rowMeans(counts(dds,normalized=TRUE)),decreasing=TRUE)[1:20] start_plot('Heatmap_topExpressed', outdir=opt$outdir) - pheatmap(assay(vsd)[select,], cluster_rows=FALSE, cluster_cols=FALSE, show_rownames=FALSE, annotation_col=df) + pheatmap(assay(vsd)[selected,], cluster_rows=FALSE, cluster_cols=FALSE, show_rownames=FALSE, annotation_col=df) end_plot() # Heatmap of Top 20 Variable Genes topVarGenes <- head(order(rowVars(assay(vsd)),decreasing=TRUE),20) + futile.logger::flog.info(paste("topVarGenes ",topVarGenes)) mat <- assay(vsd)[ topVarGenes, ] + futile.logger::flog.info(paste("mat ",rownames(mat))) temp <- getmart(rownames(mat)) + temp <- temp[!duplicated(temp$ensembl_gene_id), ] + futile.logger::flog.info(paste("temp ",temp)) row.names(temp) <- temp$ensembl_gene_id rownames(mat) <- temp[rownames(mat),"external_gene_name"] start_plot('Heatmap_topVariable', outdir=opt$outdir) @@ -187,7 +173,9 @@ run <- function(opt) { # Heatmap of Genes of interest if (!is.null(opt$genes_of_interest)) { print(rownames(assay(vsd))) - mat <- assay(vsd)[opt$genes_of_interest, ] + goi <- opt$genes_of_interest + goi <- goi[goi %in% rownames(assay(vsd))] + mat <- assay(vsd)[goi, ] mat <- mat - rowMeans(mat) temp <- getmart(rownames(mat)) row.names(temp) <- temp$ensembl_gene_id @@ -227,6 +215,7 @@ run <- function(opt) { ### EXPLORE BATCH EFFECTS ### futile.logger::flog.info(paste("Exploring Batch Effects")) + futile.logger::flog.info(dim_pca) if(dim_pca == 8){ for(factor in opt$factors){ factor_transformed <- vsd @@ -237,7 +226,8 @@ run <- function(opt) { percentVar <- round(100 * summary(pca)$importance[2,]) scores <- data.frame(variable.group, sample.group, pca$x[,1:2]) start_plot(paste0('PCA_', factor, '_removed'), outdir=opt$outdir) - print(qplot(x=PC1, y=PC2, data=scores, colour=factor(variable.group), shape=factor(sample.group)) + + print(ggplot(scores, aes(x=PC1, y=PC2, colour=factor(variable.group), shape=factor(sample.group))) + + geom_point() + theme(legend.position="right") + labs(colour=opt$contrast, shape=factor, x=paste0("PC1 (", percentVar[1],"% of variance)"), y=paste0("PC2 (", percentVar[2],"% of variance)")) + diff --git a/cgatpipelines/Rtools/filtercounts.R b/cgatpipelines/Rtools/filtercounts.R index 00ff92ef9..6f8d7ebd7 100755 --- a/cgatpipelines/Rtools/filtercounts.R +++ b/cgatpipelines/Rtools/filtercounts.R @@ -28,9 +28,10 @@ run <- function(opt) { ### READING DATA ### # Read in sampleData Table futile.logger::flog.info(paste("reading sampleData table from", normalizePath(opt$sampleData))) - sampleData <- read.table(opt$sampleData, header = TRUE) + sampleData <- read_tsv(opt$sampleData) sampleData <-sampleData[sampleData$include ==1, ] futile.logger::flog.info(paste("read sampleData ", paste(dim(sampleData), collapse = ","))) + futile.logger::flog.info(paste(sampleData)) rownames(sampleData) <- sampleData$track futile.logger::flog.info(paste("reading in data from ", opt$source)) @@ -81,7 +82,8 @@ run <- function(opt) { flattenedfile=opt$flattenedFile) } else if(opt$source == "counts_table"){ # Read in Data - raw <- read.table(file = gzfile(opt$counts_tsv), header=TRUE, row.name=1) + raw <- read.table(file = gzfile(opt$counts_tsv), header=TRUE, row.name=1, check.names = FALSE) + futile.logger::flog.info(paste("Colnames: ", colnames(raw))) experiment_tsv <- raw[,sampleData$track,drop=FALSE] if(opt$method == "deseq2"){ dataset = DESeqDataSetFromMatrix(experiment_tsv, sampleData, design = formula(opt$model)) @@ -107,6 +109,9 @@ run <- function(opt) { futile.logger::flog.info(paste("Counts before filtering ", paste(dim(counts(dataset)), collapse = ","))) keep <- rowSums(counts(dataset)) >= 10 dataset <- dataset[keep,] + #dataset <- dataset[Reduce('|', dataset[sapply(dataset, is.numeric)]),] + #dataset <- dataset[rowSums(dataset[,sapply(dataset, is.numeric)]==0)<6,] + #dataset <- subset(dataset, !rowSums(assay(dataset) == 0) > 6) counts_table <- counts(dataset) futile.logger::flog.info(paste("Counts after filtering ", paste(dim(counts(dataset)), collapse = ","))) } else if(opt$method == "dexseq"){ diff --git a/cgatpipelines/cgatflow.py b/cgatpipelines/cgatflow.py index 0bf22afe4..374a21180 100644 --- a/cgatpipelines/cgatflow.py +++ b/cgatpipelines/cgatflow.py @@ -21,7 +21,7 @@ import sys import re import glob -import imp +import importlib.util import cgatpipelines @@ -84,9 +84,10 @@ def main(argv=None): # remove 'cgatflow' from sys.argv del sys.argv[0] - (file, pathname, description) = imp.find_module(pipeline, paths) - - module = imp.load_module(pipeline, file, pathname, description) + spec = importlib.util.spec_from_file_location(pipeline, path+"/"+pipeline+".py") + module = importlib.util.module_from_spec(spec) + sys.modules[pipeline] = module + spec.loader.exec_module(module) module.main(sys.argv) diff --git a/cgatpipelines/tasks/bamstats.py b/cgatpipelines/tasks/bamstats.py index e67ca4916..720e03fa7 100644 --- a/cgatpipelines/tasks/bamstats.py +++ b/cgatpipelines/tasks/bamstats.py @@ -67,7 +67,7 @@ def buildPicardInsertSizeStats(infile, outfile, genome_file, Filename with genomic sequence. ''' job_memory = picardmem - picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals() + picard_opts = '-Xmx%(job_memory)s -XX:+UseG1GC' % locals() job_threads = 3 if BamTools.getNumReads(infile) == 0: @@ -159,7 +159,7 @@ def buildPicardAlignmentStats(infile, outfile, genome_file, ''' job_memory = picardmem - picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals() + picard_opts = '-Xmx%(job_memory)s -XX:+UseG1GC ' % locals() job_threads = 3 if BamTools.getNumReads(infile) == 0: @@ -191,7 +191,7 @@ def buildPicardDuplicationStats(infile, outfile, picardmem): ''' job_memory = picardmem - picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals() + picard_opts = '-Xmx%(job_memory)s -XX:+UseG1GC ' % locals() job_threads = 3 if BamTools.getNumReads(infile) == 0: @@ -244,7 +244,7 @@ def buildPicardDuplicateStats(infile, outfile, picardmem): Output filename with picard output. ''' job_memory = picardmem - picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals() + picard_opts = '-Xmx%(job_memory)s -XX:+UseG1GC ' % locals() job_threads = 3 if BamTools.getNumReads(infile) == 0: @@ -279,7 +279,7 @@ def buildPicardCoverageStats(infile, outfile, baits, regions, ''' job_memory = picardmem - picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals() + picard_opts = '-Xmx%(job_memory)s -XX:+UseG1GC ' % locals() job_threads = 3 if BamTools.getNumReads(infile) == 0: @@ -310,7 +310,7 @@ def buildPicardGCStats(infile, outfile, genome_file, picardmem): """ job_memory = picardmem - picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals() + picard_opts = '-Xmx%(job_memory)s -XX:+UseG1GC ' % locals() job_threads = 3 if BamTools.getNumReads(infile) == 0: @@ -1060,7 +1060,7 @@ def loadIdxstats(infiles, outfile): columns=(['region', 'mapped'])) # reformat the df - df = df.append(reformatted_df, ignore_index=True) + df = df._append(reformatted_df, ignore_index=True) df.set_index('region', inplace=True) df1 = df[['mapped']].T dfs.append(df1) @@ -1291,7 +1291,7 @@ def buildPicardRnaSeqMetrics(infiles, strand, outfile, picardmem): ''' job_memory = picardmem - picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals() + picard_opts = '-Xmx%(job_memory)s -XX:+UseG1GC ' % locals() job_threads = 3 infile, genome, rRNA_intervals = infiles diff --git a/cgatpipelines/tasks/diffexonexpression.R b/cgatpipelines/tasks/diffexonexpression.R new file mode 100644 index 000000000..8cbfe9a8b --- /dev/null +++ b/cgatpipelines/tasks/diffexonexpression.R @@ -0,0 +1,269 @@ +#' Running DEXSeq +#' +#' WARNING: This script is work-in-progress +#' +#' Example usage: +#' +#' Rscript PATH/TO/diffexonexpression.R --rds-filename=experiment.rds --model=~sample+exon+group:exon --reducedmodel=~sample+exon +#' +#' `experiment.rds` is a DEXSeq experiment object after filtering +#' +#' +#' -> todo: ideas + + + +suppressMessages(library(futile.logger)) +suppressMessages(library(getopt)) +suppressMessages(library(fgsea)) +suppressMessages(library(data.table)) +suppressMessages(library(sva)) +suppressMessages(library(biomaRt)) +suppressMessages(library(tidyverse)) +suppressMessages(library(RUVSeq)) +suppressMessages(library(Cairo)) +suppressMessages(library(goseq)) +suppressMessages(library(DEXSeq)) + +source(file.path(Sys.getenv("R_ROOT"), "experiment.R")) + + +mart = useMart(biomart = "ENSEMBL_MART_ENSEMBL",dataset="hsapiens_gene_ensembl", host = "jul2018.archive.ensembl.org") +getmart <- function(values){ + data<- getBM( + filters= "ensembl_gene_id", + attributes= c("ensembl_gene_id", "external_gene_name", "description","entrezgene"), + values= values, + mart= mart) + data$description <- gsub("\t", "", data$description) + return(data) +} + +start_plot <- function(section, outdir = "", height = 6, width = 6, type = "png") { + file = get_output_filename(paste0(outdir,"/",section, ".", type)) + Cairo(file = file, + type = type, + width = width, + height = height, + units="in", + dpi = 300, + bg = "white") + #opar <- par(lwd=0.5) +} + +end_plot <- function() { + dev.off() +} + + +# plotTPMs function +# Function: plots a table of TPMs from a data frame, ensuring appropriate formating for ggplot +# Input: data frame +# Output: plot object +plotTPMs <- function(dftemp, contrast_name){ + dftemp %>% + gather(key = "var", value="value", -contrast, -track) %>% + mutate(var = factor(var, levels=unique(var))) %>% + ggplot(aes(x = contrast, y = value, color = contrast)) + + geom_point(position = position_jitter(w = 0.15, h = 0)) + + facet_wrap(~ var, scales = "free") + theme_bw() + + ylab("normalised counts") + xlab (contrast_name) + guides(color = "none") +} + + +run <- function(opt) { + + futile.logger::flog.info(paste("reading Experiment object from", normalizePath(opt$rds_filename))) + experiment <- readRDS(opt$rds_filename) + if (class(experiment) != "DEXSeqDataSet"){ + stop("This script supports DEXSeqDataSet objects only.") + } + flog.info("Running DEXSeq") + dxd = estimateSizeFactors(experiment) + dxd = estimateDispersions(dxd) + dxd = testForDEU(dxd, reducedModel = formula(opt$reducedmodel), + fullModel = formula(opt$model)) + dxd = estimateExonFoldChanges( dxd, fitExpToVar=opt$contrast) + res = DEXSeqResults(dxd) + + flog.info("... plotting dispersion estimates") + ## Plot dispersion estimates + start_plot("Dispersion", opt$outdir) + plotDispEsts(dxd) + end_plot() + + flog.info("... plotting MA") + ## MA Plot + start_plot("MAPlot", opt$outdir) + plotMA(res, ylim = c(-3,3)) + end_plot() + + flog.info("... saving DE data") + ## Save DE data + #resSig <- subset(res, padj < opt$alpha) + #data <- getmart(as.character(map(res$transcripts, 1))) + #res$symbol<-data$external_gene_name[match(data$ensembl_transcript_id, as.character(map(res$transcripts, 1)))] + #res$desc<-data$description[match(data$ensembl_transcript_id, as.character(map(res$transcripts, 1)))] + resSig <- subset(res, padj < opt$alpha) + write.table(resSig, paste0(opt$outdir,"/","results.tsv"), sep = "\t") + write.table(res, paste0(opt$outdir,"/","results_full.tsv"), sep = "\t") + + flog.info("... plotting P histogram") + ## Plot P value Histogram + start_plot("PHistogram", opt$outdir) + hist(res$pvalue,breaks=50, col='skyblue', xlab="p-value", main="P-value Histogram") + end_plot() + + flog.info("... plotting downregulated genes") + ## Plot Top Downregulated Genes + genelist <- rownames(res[ order( res[,grepl("log2",colnames(res))] ), ][0:9,]) + for(gene in unique(substr(genelist,0,15))){ + start_plot(paste0("Downregulated_",gene), opt$outdir) + plotDEXSeq(res, gene, legend=TRUE, cex.axis=1.2, cex=1.3, lwd=2 ,fitExpToVar = opt$contrast) + end_plot() + } + + + flog.info("... plotting upregulated genes") + genelist <- rownames(res[ order( -res[,grepl("log2",colnames(res))] ), ][0:9,]) + for(gene in unique(substr(genelist,0,15))){ + start_plot(paste0("Upregulated_",gene), opt$outdir) + plotDEXSeq(res, gene, legend=TRUE, cex.axis=1.2, cex=1.3, lwd=2 ,fitExpToVar = opt$contrast) + end_plot() + } + + flog.info("... plotting significant genes") + genelist <- rownames(resSig[order(resSig$padj), ]) + for(gene in unique(substr(genelist,0,15))){ + start_plot(paste0("Significant_",gene), opt$outdir) + plotDEXSeq(res, gene, legend=TRUE, cex.axis=1.2, cex=1.3, lwd=2 ,fitExpToVar = opt$contrast) + end_plot() + } + + if(opt$perm >0){ + flog.info("... performing permutations") + designperm <- colData(dxd) + dxdperm = dxd + x = 0 + i = 1 + y= list() + while(i <= opt$perm) { + flog.info(paste0("...... Permutation ", i)) + designperm[, opt$contrast] = sample(designperm[, opt$contrast]) + if(sum(designperm$group == levels(designperm[, opt$contrast])[1]) != table(colData(dxd)[,opt$contrast])[1]){ + flog.info(paste0("......... Failed. Skipping design: ", paste(as.character(designperm$group),collapse = ","))) + next + } + colData(dxdperm) <- designperm + dxdperm = testForDEU(dxdperm) + dxdperm = estimateExonFoldChanges( dxdperm, fitExpToVar=opt$contrast) + resperm = DEXSeqResults(dxdperm) + x[i] = length(subset(resperm, padj < opt$alpha)$padj) + y[[i]] = dxdperm$group + i = i+1 + } + flog.info(paste0("Result: " , x)) + start_plot("Simulations", outdir=opt$outdir) + theme_set(theme_gray(base_size = 18)) + sims = qplot(x, + geom="histogram", + breaks=seq(0, 20, by = 1),fill=I("grey"), col=I("black"), + main = "Histogram of DE experiments\n with random group labels", + xlab = "Number of differentially expressed exons", + ylab = "Number of simulations") + + geom_vline(xintercept = length(rownames(subset(res, padj < opt$alpha)))) + + theme_classic() + theme(plot.title = element_text(hjust = 0.5, size=22)) + print(sims) + end_plot() + flog.info(paste0("... Permutation p value: ", + length(x[x > length(rownames(subset(res, padj < opt$alpha)))])/length(x))) + z <- list() + for(i in 0:length(x)){ + z[i] <- paste( unlist(y[i]), collapse=' ') + } + z <- unlist(z) + df <- data.frame(number=x, combination=z) + df + write_tsv(df,paste0(opt$outdir,"/","Simulations.tsv")) + } +} + +main <- function() { + + option_list <- list( + make_option( + "--rds-filename", + dest = "rds_filename", + type = "character", + default = "experiment.rds", + help = paste("filename with input data of counts") + ), + make_option( + "--model", + dest = "model", + type = "character", + default = "~ sample + exon + group:exon", + help = paste("model formula for GLM") + ), + make_option( + "--reducedmodel", + dest = "reducedmodel", + type = "character", + default = "~ sample + exon", + help = paste("model formula for GLM") + ), + make_option( + "--contrast", + dest = "contrast", + type = "character", + default = "group", + help = paste("contrast/factor to use for comparison") + ), + make_option( + "--refgroup", + dest = "refgroup", + type = "character", + default = "CTR", + help = paste("reference group, e.g. CTR") + ), + make_option( + "--alpha", + dest = "alpha", + type = "numeric", + default = 0.05, + help = paste("Adjusted P value threshold") + ), + make_option( + "--outdir", + dest = "outdir", + type = "character", + default = "experiment", + help = paste("Libraries for gsea") + ), + make_option( + "--permute", + dest = "perm", + type = "integer", + default = 0, + help = paste("number of permutations") + ), + make_option( + "--pathways", + dest = "pathways", + type = "character", + default = "", + help = paste("Libraries for gsea") + ) + ) + opt <- experiment_start(option_list = option_list, + description = description) + + if (!is.null(opt$pathways)) { + opt$pathways = unlist(strsplit(opt$pathways, ",")) + } + run(opt) + + experiment_stop() +} + +main() diff --git a/cgatpipelines/tasks/diffexpression.R b/cgatpipelines/tasks/diffexpression.R new file mode 100644 index 000000000..20fda35ff --- /dev/null +++ b/cgatpipelines/tasks/diffexpression.R @@ -0,0 +1,397 @@ +#' Differential expression analysis +#' +#' WARNING: This script is work-in-progress +#' +#' Example usage: +#' +#' Rscript PATH/TO/diffexpression.R --rds-filename=sce.rds --model=~group --coef=group_MUT_vs_CTR --refgroup=CTR +#' + + +suppressMessages(library(futile.logger)) +suppressMessages(library(getopt)) +suppressMessages(library(fgsea)) +suppressMessages(library(data.table)) +suppressMessages(library(sva)) +suppressMessages(library(biomaRt)) +suppressMessages(library(tidyverse)) +suppressMessages(library(RUVSeq)) +suppressMessages(library(Cairo)) +suppressMessages(library(goseq)) + +source(file.path(Sys.getenv("R_ROOT"), "experiment.R")) + + +mart = useMart(biomart = "ENSEMBL_MART_ENSEMBL",dataset="hsapiens_gene_ensembl", host = "jul2018.archive.ensembl.org") +getmart <- function(values){ + data<- getBM( + filters= "ensembl_gene_id", + attributes= c("ensembl_gene_id", "external_gene_name", "description","entrezgene", 'chromosome_name', + 'start_position', 'end_position'), + values= values, + mart= mart, + useCache = FALSE) + data$description <- gsub("\t", "", data$description) + return(data) +} + +start_plot <- function(section, height = 10, width = 10, type = "png", outdir="") { + file = get_output_filename(paste0(outdir,"/",section, ".", type)) + Cairo(file = file, + type = type, + width = width, + height = height, + units="in", + dpi = 300, + bg = "white") + #opar <- par(lwd=0.5) +} + +end_plot <- function() { + dev.off() +} + +# makeTPMtable +# Function: combines data from a genelist with abundance data to create a +# Input: 1. genelist vector in ENSEMBL format, 2. abundance matrix from tximport +# Output: data frame with gene symbol names +makeTPMtable <- function(genelist, abundance, design, contrast){ + genelist.df <- getmart(genelist) + genelist.df <- genelist.df[!duplicated(genelist.df[,1]),] + rownames(genelist.df) <- genelist.df$ensembl_gene_id + genelist.names <- genelist.df[genelist,]$external_gene_name + genelist.names[is.na(genelist.names)] <- genelist[is.na(genelist.names)] + dftemp <- as_tibble(t(abundance[genelist,]), rownames = "track") + dftemp <- dftemp %>% rename_at(vars(genelist), ~ genelist.names) + dftemp$contrast <- design[dftemp$track,][,contrast] + return(dftemp) +} + +# plotTPMs function +# Function: plots a table of TPMs from a data frame, ensuring appropriate formating for ggplot +# Input: data frame +# Output: plot object +plotTPMs <- function(dftemp, contrast_name){ + dftemp %>% + gather(key = "var", value="value", -contrast, -track) %>% + mutate(var = factor(var, levels=unique(var))) %>% + ggplot(aes(x = contrast, y = value, color = contrast)) + + geom_point(position = position_jitter(w = 0.15, h = 0)) + + facet_wrap(~ var, scales = "free") + theme_bw() + + ylab("normalised counts") + xlab (contrast_name) + guides(color = "none")+ + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +} + + +run <- function(opt) { + + futile.logger::flog.info(paste("reading Experiment object from", normalizePath(opt$rds_filename))) + experiment <- readRDS(opt$rds_filename) + futile.logger::flog.info(paste("Dataset", class(experiment))) + if (class(experiment) == "DESeqDataSet"){ + flog.info("Running DESeq2") + suppressMessages(library(DESeq2)) + colData(experiment)[,opt$contrast] <- relevel(colData(experiment)[,opt$contrast], ref = opt$refgroup) + design(experiment) <- formula(opt$model) + dds <- DESeq(experiment, betaPrior=FALSE) + futile.logger::flog.info(paste("coef", opt$coef)) + futile.logger::flog.info(paste("coef", resultsNames(dds))) + res <- results(dds, name=opt$coef) + resLFC <- lfcShrink(dds, coef=opt$coef, type=opt$shrinkage) + } + if (class(experiment) == "DGEList"){ + dds = DESeqDataSetFromMatrix(experiment$counts, experiment$sample, design = formula(opt$model)) + } + if (class(experiment) == "DEXSeqDataSet"){ + stop("please use diffexonexpression R script") + } + + flog.info("... plotting dispersion estimates") + ## Plot dispersion estimates + start_plot("Dispersion", outdir=opt$outdir) + plotDispEsts(dds) + end_plot() + + flog.info("... plotting MA") + ## MA Plot + start_plot("MAPlot", outdir=opt$outdir) + DESeq2::plotMA(resLFC, ylim = c(ceiling(max(resLFC$log2FoldChange)),floor(min(resLFC$log2FoldChange)))) + end_plot() + + flog.info("... saving DE data") + ## Save DE data + resSig <- subset(resLFC, padj < opt$alpha) + data <- getmart(rownames(resLFC)) + resLFC$symbol<-data$external_gene_name[match(rownames(resLFC), data$ensembl_gene_id)] + resLFC$desc<-data$description[match(rownames(resLFC), data$ensembl_gene_id)] + resLFC$chromosome<-data$chromosome_name[match(rownames(resLFC), data$ensembl_gene_id)] + resLFC$start<-data$start_position[match(rownames(resLFC), data$ensembl_gene_id)] + resSig <- subset(resLFC, padj < opt$alpha) + write.table(resSig, paste0(opt$outdir,"/","results.tsv"), sep = "\t") + write.table(resLFC, paste0(opt$outdir,"/","results_full.tsv"), sep = "\t") + resSig2 <- subset(res, padj < opt$alpha) + write.table(resSig2, paste0(opt$outdir,"/","results_noLFCshrinkage.tsv"), sep = "\t") + resdf <- data.frame(geneid=rownames(resLFC), pvalue=resLFC$pvalue*sign(resLFC$log2FoldChange)) + write.table(resdf, paste0(opt$outdir,"/","px_results_pvalue.gene.tsv"), sep = "\t", row.names = FALSE, quote = FALSE) + resdf <- data.frame(geneid=rownames(resLFC), l2fc=resLFC$log2FoldChange) + write.table(resdf, paste0(opt$outdir,"/","px_results_l2fc.gene.tsv"), sep = "\t", row.names = FALSE, quote = FALSE) + resdf <- data.frame(geneid=rownames(resLFC), l2fc=resLFC$log2FoldChange) + write.table(resdf, paste0(opt$outdir,"/","px_results_l2fc.gene.tsv"), sep = "\t", row.names = FALSE, quote = FALSE) + if(any(names(assays(experiment)) == "avgTxLength")){ + genelengths <- rowMeans(assays(experiment)$avgTxLength) + write.table(genelengths, paste0(opt$outdir,"/","gene_lengths.tsv"), sep = "\t", row.names = TRUE, quote = FALSE) + } + file = get_output_filename(paste0(opt$outdir,"/","results_table.rds")) + flog.info(paste("saving results data to", file)) + saveRDS(resLFC, file = file) + + + flog.info("... plotting P histogram") + ## Plot P value Histogram + start_plot("PHistogram", outdir=opt$outdir) + hist(res$pvalue,breaks=50, col='skyblue', xlab="p-value", main="P-value Histogram") + end_plot() + + flog.info("... plotting downregulated genes") + ## Plot Top Downregulated Genes + genelist <- rownames(resLFC[ order( resLFC[,grepl("log2",colnames(resLFC))] ), ][0:9,]) + dftemp <- makeTPMtable(genelist, counts(dds, normalized=TRUE), colData(dds), opt$contrast) + start_plot("Downregulated", outdir=opt$outdir) + print(plotTPMs(dftemp, opt$contrast)) + end_plot() + + flog.info("... plotting upregulated genes") + genelist <- rownames(resLFC[ order( -resLFC[,grepl("log2",colnames(resLFC))] ), ][0:9,]) + dftemp <- makeTPMtable(genelist, counts(dds, normalized=TRUE), colData(dds), opt$contrast) + start_plot("Upregulated", outdir=opt$outdir) + print(plotTPMs(dftemp, opt$contrast)) + dev.off() + + flog.info("... plotting significant genes") + resSig <- subset(resLFC, padj < 0.05) + genelist <- rownames(resSig[order(resSig$padj), ]) + if(length(genelist) > 9){ + genelist <- genelist[0:9]} + dftemp <- makeTPMtable(genelist, counts(dds, normalized=TRUE), colData(dds), opt$contrast) + start_plot("significant", outdir=opt$outdir) + print(plotTPMs(dftemp, opt$contrast)) + dev.off() + + flog.info("... plotting user-defined genes") + genelist <- unlist(strsplit(opt$userlist, ",")) + genelist <- genelist[genelist %in% rownames(dds)] + dftemp <- makeTPMtable(genelist, counts(dds, normalized=TRUE), colData(dds), opt$contrast) + start_plot("Userdefined", outdir=opt$outdir) + print(plotTPMs(dftemp, opt$contrast)) + dev.off() + + flog.info("... performing gene ontology (GO) enrichment analysis") + # Currently only supports data from salmon and kallisto + # Needs implementation of genelength matrix from featurecounts + if(any(names(assays(experiment)) == "avgTxLength")){ + res.nona <- subset(resLFC, (!is.na(padj & baseMean) & baseMean > 1)) + res.list <- list(as.integer(res.nona$padj < 0.05), + as.integer(res.nona$log2FoldChange >0 & res.nona$padj < 0.05), + as.integer(res.nona$log2FoldChange <0 & res.nona$padj < 0.05)) + res.names <- list("all", "up", "down") + for(i in 1:3) { + de.genes <- res.list[[i]] + names(de.genes) <- rownames(res.nona) + temp <- rowMeans(assays(experiment)$avgTxLength[match(names(de.genes), rownames(assays(experiment)$avgTxLength)),]) + pwf=nullp(de.genes,bias.data=temp) + all = goseq(pwf,"hg38","ensGene", method="Hypergeometric") + sigall <- all + names(sigall) <- c("category","pvalue","underrepresented_pvalue","numberDE", "numberTOT", "term", "ontology") + sigall$pvalue <- p.adjust(sigall$pvalue, method="BH") + sigall$percent <- sigall$numberDE/sigall$numberTOT + sigall <- sigall[which(sigall[,2] < 0.05),] + cats <- sigall$category + write.table(sigall[,c("category", "pvalue")], file=paste0(opt$outdir,"/GO_",res.names[[i]],".tsv"), quote=FALSE, sep='\t', row.names = FALSE) + write.table(sigall, paste0(opt$outdir,"/GO_annotated_",res.names[[i]],".tsv"), quote=FALSE, sep='\t', row.names = FALSE) + write.table(all, paste0(opt$outdir,"/GO_complete_",res.names[[i]],".tsv"), quote=FALSE, sep='\t', row.names = FALSE) + } + } + + + flog.info("... performing gene set enrichment analysis (GSEA)") + resrnk<-resLFC + data <- getmart(rownames(resrnk)) + resrnk$symbol <- data$external_gene_name[match(rownames(resrnk), data$ensembl_gene_id)] + resrnk$desc <-data$description[match(rownames(resrnk), data$ensembl_gene_id)] + resrnk$entrezgene <-data$entrezgene[match(rownames(resrnk), data$ensembl_gene_id)] + rnk.df <- as_tibble(resrnk[,c("entrezgene","log2FoldChange")]) %>% na.omit() + rnk <- rnk.df$log2FoldChange + names(rnk) <- rnk.df$entrezgene + rnk <- rnk[isUnique(names(rnk))] + if (!dir.exists(paste0(opt$outdir,"/gsea"))) { + dir.create(paste0(opt$outdir,"/gsea")) + } + + for(pathway in opt$pathways){ + pathways <- gmtPathways(pathway) + fgseaRes <- fgsea(pathways = pathways, + stats = rnk, + minSize=15, + maxSize=500, + nperm=10000) + fwrite(fgseaRes, file=paste0(opt$outdir,"/gsea/",sub("([^.]+)\\.[[:alnum:]]+$", "\\1", (basename(pathway))),'.tsv'), sep="\t", sep2=c("", " ", "")) + topPathwaysUp <- fgseaRes[ES > 0,][head(order(pval), n=10),]$pathway + png(paste0(opt$outdir,"/gsea/",'UP_',sub("([^.]+)\\.[[:alnum:]]+$", "\\1", (basename(pathway))),'.png'), + width =15, height = 3, units = 'in', res = 600) + plotGseaTable(pathways[topPathwaysUp], rnk, fgseaRes, + gseaParam = 0.5, colwidths = c(10,2,1,1,1)) + dev.off() + + + topPathwaysDown <- fgseaRes[ES < 0,][head(order(pval), n=10),]$pathway + plotGseaTable(pathways[topPathwaysDown], rnk, fgseaRes, + gseaParam = 0.5) + png(paste0(opt$outdir,"/gsea/",'DOWN_',sub("([^.]+)\\.[[:alnum:]]+$", "\\1", (basename(pathway))),'.png'), + width =15, height = 3, units = 'in', res = 600) + plotGseaTable(pathways[topPathwaysDown], rnk, fgseaRes, + gseaParam = 0.5, colwidths = c(10,2,1,1,1)) + dev.off() + } + + if(opt$perm >0){ + flog.info("... performing permutations") + designperm <- colData(dds) + ddsperm = dds + x = 0 + i = 1 + y= list() + while(i <= opt$perm) { + flog.info(paste0("...... Permutation ", i)) + # Code to only shuffle the group labels for groups in coef + tempdesign <- designperm[grep(paste(as.list(unlist(strsplit(opt$coef, "_"))) [c(2,4)],collapse = "|"),designperm[, opt$contrast]),] + designperm[grep(paste(as.list(unlist(strsplit(opt$coef, "_"))) [c(2,4)],collapse = "|"),designperm[, opt$contrast]),][,opt$contrast] = sample(tempdesign[,opt$contrast]) + if(sum(designperm$group == levels(designperm[, opt$contrast])[1]) != table(colData(dds)[,opt$contrast])[1]){ + flog.info(paste0("......... Failed. Skipping design: ", paste(as.character(designperm$group),collapse = ","))) + flog.info(paste0("......... Number of reference replicates is: ", sum(designperm$group == levels(designperm[, opt$contrast])[1]), ", but should be: ", table(colData(dds)[,opt$contrast])[1])) + next + } + colData(ddsperm) <- designperm + ddsperm <- DESeq(ddsperm) + resperm <- results(ddsperm) + x[i] = length(subset(resperm, padj < opt$alpha)$padj) + y[[i]] = ddsperm$group + i = i+1 + } + start_plot("Simulations", outdir=opt$outdir) + theme_set(theme_gray(base_size = 18)) + sims = qplot(x, + geom="histogram", + breaks=seq(0, 20, by = 1),fill=I("grey"), col=I("black"), + main = "Histogram of DE experiments\n with random group labels", + xlab = "Number of differentially expressed genes", + ylab = "Number of simulations") + + geom_vline(xintercept = length(rownames(subset(res, padj < opt$alpha)))) + + theme_classic() + theme(plot.title = element_text(hjust = 0.5, size=22)) + print(sims) + end_plot() + flog.info(paste0("... Permutation p value: ", + length(x[x > length(rownames(subset(res, padj < opt$alpha)))])/length(x))) + z <- list() + for(i in 0:length(x)){ + z[i] <- paste( unlist(y[i]), collapse=' ') + } + z <- unlist(z) + df <- data.frame(number=x, combination=z) + df + write_tsv(df,paste0(opt$outdir,"/","Simulations.tsv")) + } +} + +main <- function() { + + option_list <- list( + make_option( + "--rds-filename", + dest = "rds_filename", + type = "character", + default = "experiment.rds", + help = paste("filename with input data of counts") + ), + make_option( + "--model", + dest = "model", + type = "character", + default = "~group", + help = paste("model formula for GLM") + ), + make_option( + "--contrast", + dest = "contrast", + type = "character", + default = "group", + help = paste("contrast/factor to use for comparison") + ), + make_option( + "--coef", + dest = "coef", + type = "character", + default = "", + help = paste("Comparison of interest for DESeq2: e.g. group_MUT_vs_CTR, where MUT and CTR are in group") + ), + make_option( + "--refgroup", + dest = "refgroup", + type = "character", + default = "CTR", + help = paste("reference group, e.g. CTR") + ), + make_option( + "--alpha", + dest = "alpha", + type = "numeric", + default = 0.05, + help = paste("Adjusted P value threshold") + ), + make_option( + "--shrinkage", + dest = "shrinkage", + type = "character", + default = "apeglm", + help = paste("Method for LFC shrinkage. Options are apeglm (default), ashr, normal") + ), + make_option( + "--userlist", + dest = "userlist", + type = "character", + default = "ENSG00000205927,ENSG00000130675,ENSG00000016082,ENSG00000070748,ENSG00000064300,ENSG00000078018", + help = paste("User defined genes to plot") + ), + make_option( + "--outdir", + dest = "outdir", + type = "character", + default = "experiment", + help = paste("Directory for output") + ), + make_option( + "--permute", + dest = "perm", + type = "integer", + default = 0, + help = paste("number of permutations") + ), + make_option( + "--pathways", + dest = "pathways", + type = "character", + default = "", + help = paste("Libraries for gsea") + ) + ) + opt <- experiment_start(option_list = option_list, + description = description) + + if (!is.null(opt$pathways)) { + opt$pathways = unlist(strsplit(opt$pathways, ",")) + } + run(opt) + + experiment_stop() +} + +main() diff --git a/cgatpipelines/tasks/exome.py b/cgatpipelines/tasks/exome.py index 6a0cb662e..e9521ba2e 100644 --- a/cgatpipelines/tasks/exome.py +++ b/cgatpipelines/tasks/exome.py @@ -27,11 +27,6 @@ # Set PARAMS in calling module PARAMS = {} - -def getGATKOptions(): - return "-l mem_free=1.4G" - - def makeSoup(address): sock = urlopen(address) htmlSource = sock.read() @@ -41,23 +36,23 @@ def makeSoup(address): ############################################################################## -def GATKReadGroups(infile, outfile, genome, +def GATKReadGroups(infile, outfile, dictionary, library="unknown", platform="Illumina", - platform_unit="1", track="unknown"): + platform_unit="1", track="unknown", gatkmem="2G"): '''Reorders BAM according to reference fasta and adds read groups''' if track == 'unknown': track = P.snip(os.path.basename(infile), ".bam") tmpdir_gatk = P.get_temp_dir('.') - job_options = getGATKOptions() + job_memory = gatkmem job_threads = 3 statement = '''picard ReorderSam - INPUT=%(infile)s - OUTPUT=%(tmpdir_gatk)s/%(track)s.reordered.bam - REFERENCE=%(genome)s - ALLOW_INCOMPLETE_DICT_CONCORDANCE=true - VALIDATION_STRINGENCY=SILENT ;''' % locals() + -INPUT %(infile)s + -OUTPUT %(tmpdir_gatk)s/%(track)s.reordered.bam + -SEQUENCE_DICTIONARY %(dictionary)s + -ALLOW_INCOMPLETE_DICT_CONCORDANCE true + -VALIDATION_STRINGENCY SILENT ;''' % locals() statement += '''samtools index %(tmpdir_gatk)s/%(track)s.reordered.bam ; ''' % locals() @@ -80,55 +75,28 @@ def GATKReadGroups(infile, outfile, genome, ############################################################################## -def GATKIndelRealign(infile, outfile, genome, intervals, padding, threads=4): - '''Realigns BAMs around indels using GATK''' - - intervalfile = outfile.replace(".bam", ".intervals") - job_options = getGATKOptions() - job_threads = 3 - - statement = '''GenomeAnalysisTK - -T RealignerTargetCreator - -o %(intervalfile)s - --num_threads %(threads)s - -R %(genome)s - -L %(intervals)s - -ip %(padding)s - -I %(infile)s; ''' % locals() - - statement += '''GenomeAnalysisTK - -T IndelRealigner - -o %(outfile)s - -R %(genome)s - -I %(infile)s - -targetIntervals %(intervalfile)s;''' % locals() - P.run(statement) - -############################################################################## - - def GATKBaseRecal(infile, outfile, genome, intervals, padding, dbsnp, - solid_options=""): + options="", gatkmem="2G"): '''Recalibrates base quality scores using GATK''' track = P.snip(os.path.basename(infile), ".bam") tmpdir_gatk = P.get_temp_dir('.') - job_options = getGATKOptions() + job_memory = gatkmem job_threads = 3 - statement = '''GenomeAnalysisTK - -T BaseRecalibrator - --out %(tmpdir_gatk)s/%(track)s.recal.grp + statement = '''gatk + BaseRecalibrator + -O %(tmpdir_gatk)s/%(track)s.recal.grp -R %(genome)s -L %(intervals)s -ip %(padding)s -I %(infile)s - --knownSites %(dbsnp)s %(solid_options)s ; + --known-sites %(dbsnp)s %(options)s ; ''' % locals() - statement += '''GenomeAnalysisTK - -T PrintReads -o %(outfile)s - -BQSR %(tmpdir_gatk)s/%(track)s.recal.grp + statement += '''gatk + ApplyBQSR -O %(outfile)s + --bqsr-recal-file %(tmpdir_gatk)s/%(track)s.recal.grp -R %(genome)s -I %(infile)s ; ''' % locals() @@ -140,21 +108,19 @@ def GATKBaseRecal(infile, outfile, genome, intervals, padding, dbsnp, def haplotypeCaller(infile, outfile, genome, - dbsnp, intervals, padding, options): + dbsnp, intervals, padding, options, gatkmem = "2G"): '''Call SNVs and indels using GATK HaplotypeCaller in all members of a family together''' - job_options = getGATKOptions() + job_memory = gatkmem job_threads = 3 - statement = '''GenomeAnalysisTK - -T HaplotypeCaller + statement = '''gatk + HaplotypeCaller -ERC GVCF - -variant_index_type LINEAR - -variant_index_parameter 128000 - -o %(outfile)s + -O %(outfile)s -R %(genome)s -I %(infile)s - --dbsnp %(dbsnp)s + --dbsnp %(dbsnp)s -L %(intervals)s -ip %(padding)s %(options)s''' % locals() @@ -164,16 +130,61 @@ def haplotypeCaller(infile, outfile, genome, ############################################################################## -def genotypeGVCFs(inputfiles, outfile, genome, options): +def consolidateGVCFs(inputfiles, outfile, inputlen, dbmem="20G"): '''Joint genotyping of all samples together''' - job_options = getGATKOptions() + + runmem = dbmem + if not re.match(r' ', runmem): + runmem = re.sub(r'([KMGT])', r' \1', runmem) + number, unit = [string.strip() for string in runmem.split()] + job_threads = 2 + job_memory = f"{float(number)*1.2/job_threads:.1f}{unit}" + tmpdir = P.get_temp_dir('.') + + tmpfile = P.get_temp_filename(".", suffix=".list") + chrID = [ 'chr{}'.format(x) for x in list(range(1,23)) + ['X', 'Y'] ] + with open(tmpfile, 'w+') as f: + for contig in chrID: + f.write('%s\n' % contig) + + statement = '''gatk + --java-options "-Xmx%(dbmem)s -Xms%(dbmem)s" + GenomicsDBImport + -V %(inputfiles)s + -L %(tmpfile)s + --reader-threads %(job_threads)s + --tmp-dir %(tmpdir)s + --genomicsdb-workspace-path %(outfile)s + ''' % locals() + if inputlen > 50: + statement += ''' --batch-size 50''' + statement += ''' > %(outfile)s.log.tmp 2>&1; + rm -rf %(tmpdir)s ; + mv %(outfile)s.log.tmp %(outfile)s.log''' % locals() + P.run(statement) + + os.unlink(tmpfile) + + +############################################################################## + + +def genotypeGVCFs(infile, outfile, genome, options, gatkmem="2G"): + '''Joint genotyping of all samples together''' + job_memory = gatkmem job_threads = 3 + infile = P.snip(infile, ".log") + tmpdir = P.get_temp_dir('.') + logfile = P.snip(outfile,".vcf")+".log" - statement = '''GenomeAnalysisTK - -T GenotypeGVCFs - -o %(outfile)s + statement = '''gatk + GenotypeGVCFs + -O %(outfile)s -R %(genome)s - --variant %(inputfiles)s''' % locals() + -V gendb://%(infile)s + --tmp-dir %(tmpdir)s + %(options)s > %(logfile)s 2>&1; + rm -rf %(tmpdir)s''' % locals() P.run(statement) ############################################################################## @@ -245,113 +256,90 @@ def strelkaINDELCaller(infile_control, infile_tumor, outfile, genome, config, ############################################################################## -def variantAnnotator(vcffile, bamlist, outfile, genome, - dbsnp, annotations="", snpeff_file=""): - '''Annotate variant file using GATK VariantAnnotator''' - job_options = getGATKOptions() - job_threads = 3 - - if "--useAllAnnotations" in annotations: - anno = "--useAllAnnotations" - elif annotations: - anno = " -A " + " -A ".join(annotations) - else: - anno = "" - - statement = '''GenomeAnalysisTK -T VariantAnnotator - -R %(genome)s - -I %(bamlist)s - -A SnpEff - --snpEffFile %(snpeff_file)s - -o %(outfile)s - --variant %(vcffile)s - -L %(vcffile)s - --dbsnp %(dbsnp)s - %(anno)s''' % locals() - P.run(statement) - -############################################################################## - - def variantRecalibrator(infile, outfile, genome, mode, dbsnp=None, - kgenomes=None, hapmap=None, omni=None, mills=None): + kgenomes=None, hapmap=None, omni=None, mills=None, + axiom=None, gatkmem = "2G"): '''Create variant recalibration file''' - job_options = getGATKOptions() + job_memory = gatkmem job_threads = 3 track = P.snip(outfile, ".recal") if mode == 'SNP': - statement = '''GenomeAnalysisTK -T VariantRecalibrator + statement = '''gatk VariantRecalibrator -R %(genome)s - -input %(infile)s - -resource:hapmap,known=false,training=true,truth=true,prior=15.0 %(hapmap)s - -resource:omni,known=false,training=true,truth=true,prior=12.0 %(omni)s - -resource:dbsnp,known=true,training=false,truth=false,prior=2.0 %(dbsnp)s - -resource:1000G,known=false,training=true,truth=false,prior=10.0 %(kgenomes)s - -an QD -an SOR -an MQRankSum - -an ReadPosRankSum -an FS -an MQ - --maxGaussians 4 + -V %(infile)s + --trust-all-polymorphic + -tranche 100.0 -tranche 99.95 -tranche 99.9 -tranche 99.8 -tranche 99.6 -tranche 99.5 -tranche 99.4 -tranche 99.3 -tranche 99.0 -tranche 98.0 -tranche 97.0 -tranche 90.0 + -an QD -an MQRankSum -an ReadPosRankSum -an FS -an MQ -an SOR -an InbreedingCoeff -mode %(mode)s - -recalFile %(outfile)s - -tranchesFile %(track)s.tranches - -rscriptFile %(track)s.plots.R ''' % locals() + --max-gaussians 6 + --resource:hapmap,known=false,training=true,truth=true,prior=15 %(hapmap)s + --resource:omni,known=false,training=true,truth=true,prior=12 %(omni)s + --resource:1000G,known=false,training=true,truth=false,prior=10 %(kgenomes)s + --resource:dbsnp,known=true,training=false,truth=false,prior=7 %(dbsnp)s + -O %(track)s.recal + --tranches-file %(track)s.tranches + --rscript-file %(track)s.plots.R + > %(track)s.log 2>&1 ''' % locals() P.run(statement) elif mode == 'INDEL': - statement = '''GenomeAnalysisTK -T VariantRecalibrator + statement = '''gatk VariantRecalibrator -R %(genome)s - -input %(infile)s - -resource:mills,known=true,training=true,truth=true,prior=12.0 %(mills)s - -an QD -an MQRankSum - -an ReadPosRankSum -an FS -an MQ - --maxGaussians 4 - --minNumBadVariants 5000 + -V %(infile)s + --resource:mills,known=true,training=true,truth=true,prior=12 %(mills)s + --resource:axiomPoly,known=false,training=true,truth=false,prior=10 %(axiom)s + --resource:dbsnp,known=true,training=false,truth=false,prior=2 %(dbsnp)s + -an QD -an MQRankSum -an ReadPosRankSum -an FS -an MQ + --max-gaussians 4 -mode %(mode)s - -recalFile %(outfile)s - -tranchesFile %(track)s.tranches - -rscriptFile %(track)s.plots.R ''' % locals() + -O %(track)s.recal + --tranches-file %(track)s.tranches + --rscript-file %(track)s.plots.R + > %(track)s.log 2>&1 ''' % locals() P.run(statement) ############################################################################## -def applyVariantRecalibration(vcf, recal, tranches, outfile, genome, mode): +def applyVQSR(vcf, recal, tranches, outfile, genome, mode, gatkmem): '''Perform variant quality score recalibration using GATK ''' - job_options = getGATKOptions() + job_memory = gatkmem job_threads = 3 - statement = '''GenomeAnalysisTK -T ApplyRecalibration + statement = '''gatk ApplyVQSR -R %(genome)s - -input:VCF %(vcf)s - -recalFile %(recal)s - -tranchesFile %(tranches)s - --ts_filter_level 99.0 + -V %(vcf)s + --recal-file %(recal)s + --tranches-file %(tranches)s + --truth-sensitivity-filter-level 99.7 + --create-output-variant-index true -mode %(mode)s - -o %(outfile)s ''' % locals() + -O %(outfile)s + > %(outfile)s.log 2>&1 ''' % locals() P.run(statement) ############################################################################## -def vcfToTable(infile, outfile, genome, columns): +def vcfToTable(infile, outfile, genome, columns, gatkmem): '''Converts vcf to tab-delimited file''' - job_options = getGATKOptions() + job_memory = gatkmem job_threads = 3 - statement = '''GenomeAnalysisTK -T VariantsToTable + statement = '''gatk VariantsToTable -R %(genome)s -V %(infile)s - --showFiltered - --allowMissingData + --show-filtered %(columns)s - -o %(outfile)s''' % locals() + -O %(outfile)s''' % locals() P.run(statement) ############################################################################## def selectVariants(infile, outfile, genome, select): - '''Filter de novo variants based on provided jexl expression''' - statement = '''GenomeAnalysisTK -T SelectVariants + '''Filter de novo variants based on provided expression''' + statement = '''gatk SelectVariants -R %(genome)s --variant %(infile)s -select '%(select)s' @@ -421,22 +409,6 @@ def buildSelectStatementfromPed(filter_type, pedfile, template): ############################################################################## - -def guessSex(infile, outfile): - '''Guess the sex of a sample based on ratio of reads - per megabase of sequence on X and Y''' - statement = '''calc `samtools idxstats %(infile)s - | grep 'X' - | awk '{print $3/($2/1000000)}'` - /`samtools idxstats %(infile)s | grep 'Y' - | awk '{print $3/($2/1000000)}'` - | tr -d " " | tr "=" "\\t" | tr "/" "\\t" - > %(outfile)s''' - P.run(statement) - -############################################################################## - - def filterMutect(infile, outfile, logfile, control_id, tumour_id, min_t_alt, min_n_depth, @@ -909,313 +881,6 @@ def filterQuality(infile, qualstr, qualfilter, outfiles): out2.close() -@cluster_runnable -def FilterExacCols(infile, exac_suffs, exac_thresh): - ''' - Returns a set of line indices indicating lines where either of the alleles - called have a frequency of greater that exac_thresh in any of the - populations specified as exac_suffs. - Where no data is available an allele frequency of -1 is used. - - Exac provide data as AC_xxx and AN_xxx where AC is the allele count - - the number of times the allele has been called - - and AN is chromosome count - the number of - samples in which the allele could have been called - in population xxx. - AC / AN = allele frequecy. - - exac_suffs are any columns where an AC_xxx and AN_xxx column is provided - in the VCF, e.g. Adj will calculate allele frequency from the AC_Adj - and AN_Adj columns - - ''' - # read columns from the input VCF - exac_suffs = exac_suffs.split(",") - cols = iotools.open_file(infile).readline().strip().split("\t") - nD = dict() - afdict = dict() - for e in exac_suffs: - # find the columns with the appropriate information - # Allele count - AC_i = cols.index("AC_%s" % (e)) - # Allele Number - AN_i = cols.index("AN_%s" % (e)) - # Genotype - GT_i = cols.index('GT') - nlist = set() - n = 0 - AFS = [] - with iotools.open_file(infile) as input: - for line in input: - if n > 1: - line = line.strip().split("\t") - # At multi-allelic sites, comma delimited AC and AN values - # are provided - # "." and "NA" indicate no data here - this is represented - # as an AF of -1 - AC = line[AC_i].replace(".", "-1").replace( - "NA", "-1").split(",") - AN = line[AN_i].replace(".", "1").replace( - "NA", "1").split(",") - AC = np.array([float(a) for a in AC]) - AN = np.array([float(a) for a in AN]) - AF = AC / AN - AF2 = [af if af > 0 else 0 for af in AF] - AF = np.insert(AF, 0, (1 - sum(AF2))) - - # Chromosome count is usually the same for all minor - # alleles (but not always) - # If it is not the same the AC and AN lists should be the - # same length - # Otherwise AN will have length 1 - if len(AC) != len(AN): - AN = [AN] * len(AC) - - # Record the genotype called in this sample for this SNP - GT = line[GT_i] - GT = GT.replace(".", '0') - GT = GT.split("/") - GT[0], GT[1] = int(GT[0]), int(GT[1]) - - # If the variant is not in ExAC the ExAC columns show "." - # but the site - # may still have been called as multi allelic - # - use -1 for all frequencies - # in this case - if max(GT) > (len(AF) - 1): - AF = np.array([-1] * (max(GT) + 1)) - - AF1 = AF[GT[0]] - AF2 = AF[GT[1]] - AFS.append((AF1, AF2)) - # Remember where both allele frequencies are - # greater than exac_thresh - if AF1 >= exac_thresh and AF2 >= exac_thresh: - nlist.add(n) - else: - AFS.append(('NA', 'NA')) - n += 1 - afdict[e] = AFS - nD[e] = nlist - - ns = set.union(*list(nD.values())) - return afdict, ns - - -@cluster_runnable -def FilterFreqCols(infile, thresh, fcols): - ''' - Returns a set of line indices indicating lines where either of the alleles - called have a frequency of less than thresh in all of the columns specified - in fcols. - No information - assigned allele frequency of -1. - ''' - fcols = fcols.split(",") - # read the column headings from the variant table - cols = iotools.open_file(infile).readline().strip().split("\t") - # store allele frequency columns - AFdict = dict() - # store low frequency indices - nD = dict() - for col in fcols: - ind = cols.index(col) - GT_i = cols.index('GT') - n = 0 - nlist = set() - AFS = [] - with iotools.open_file(infile) as input: - for line in input: - if n > 1: - line = line.strip().split("\t") - GT = line[GT_i].replace(".", "0").split("/") - af = line[ind].split(",") - AF = [] - # where the allele frequency is not numeric - # "." or "NA" use -1 to indicate no data - for a in af: - try: - AF.append(float(a)) - except: - AF.append(float(-1)) - AF2 = [l if l > 0 else 0 for l in AF] - AF = np.array(AF) - AF = np.insert(AF, 0, 1 - sum(AF2)) - GT[0] = int(GT[0]) - GT[1] = int(GT[1]) - # If the variant is not in database the column shows "." - # but the site - # may still have been called as multi allelic - # - use -1 for all frequencies - # in this case - if max(GT[0], GT[1]) > (len(AF) - 1): - AF = [float(-1)] * (max(GT[0], GT[1]) + 1) - AF1 = AF[GT[0]] - AF2 = AF[GT[1]] - if AF1 >= thresh and AF2 >= thresh: - nlist.add(n) - AFS.append((AF1, AF2)) - else: - AFS.append(('NA', 'NA')) - n += 1 - AFdict[col] = AFS - nD[col] = nlist - - ns = set.union(*list(nD.values())) - return AFdict, ns - - -def WriteFreqFiltered(infile, exacdict, exacinds, otherdict, otherinds, - outfiles): - ''' - Writes the output of the frequency filtering steps to file, including - the new columns showing calculated allele frequency for the allele - in this specific sample. - ''' - x = 0 - out = iotools.open_file(outfiles[0], "w") - out2 = iotools.open_file(outfiles[1], "w") - - exaccols = list(exacdict.keys()) - othercols = list(otherdict.keys()) - - # column names for the new columns - exacnewcols = ["%s_calc" % c for c in exaccols] - othernewcols = ["%s_calc" % c for c in othercols] - - with iotools.open_file(infile) as infile: - for line in infile: - line = line.strip() - if x <= 1: - # write the column names - out.write("%s\t%s\t%s\n" % (line, "\t".join(exacnewcols), - "\t".join(othernewcols))) - else: - freqs = [] - for key in exaccols: - col = exacdict[key] - freqs.append(col[x]) - for key in othercols: - col = otherdict[key] - freqs.append(col[x]) - freqs = [str(freq) for freq in freqs] - - if x not in exacinds and x not in otherinds: - out.write("%s\t%s\n" % (line, "\t".join(freqs))) - else: - out2.write("%s\t%s\n" % (line, "\t".join(freqs))) - x += 1 - out.close() - out2.close() - - -@cluster_runnable -def filterRarity(infile, exac, freqs, thresh, outfiles): - ''' - Filter out variants which are common in any of the exac or other - population datasets as specified in the pipeline.yml. - ''' - exacdict, exacinds = FilterExacCols(infile, exac, thresh) - otherdict, otherinds = FilterFreqCols(infile, thresh, freqs) - WriteFreqFiltered(infile, exacdict, exacinds, otherdict, - otherinds, outfiles) - - -@cluster_runnable -def filterDamage(infile, damagestr, outfiles): - ''' - Filter variants which have not been assessed as damaging by any - of the specified tools. - Tools and thresholds can be specified in the pipeline.yml. - - Does not account for multiple alt alleles - if any ALT allele has - been assessed as damaging with any tool the variant is kept, - regardless of if this is the allele called in the sample. - - ''' - damaging = damagestr.split(",") - cols = iotools.open_file(infile).readline().strip().split("\t") - - D = dict() - # parses the "damage string" from the pipeline.yml - # this should be formatted as COLUMN|result1-result2-...,COLUMN|result1... - # where variants with any of these results in this column will - # be retained - for d in damaging: - d = d.split("|") - col = d[0] - res = d[1].split("-") - i = cols.index(col) - D[col] = ((res, i)) - - x = 0 - out = iotools.open_file(outfiles[0], "w") - out2 = iotools.open_file(outfiles[1], "w") - with iotools.open_file(infile) as input: - for line in input: - if x > 1: - # grep for specific strings within this column of this - # line of the input file - line = line.strip().split("\t") - isdamaging = 0 - for key in D: - res, i = D[key] - current = line[i] - for r in res: - if re.search(r, current): - isdamaging = 1 - if isdamaging == 1: - out.write("%s\n" % "\t".join(line)) - else: - out2.write("%s\n" % "\t".join(line)) - else: - out.write(line) - x += 1 - out.close() - out2.close() - - -@cluster_runnable -def filterFamily(infile, infile2, outfiles): - ''' - Filter variants according to the output of calculateFamily - - only variants shared by both members of a family will be kept. - ''' - cps1 = set() - cps2 = set() - - # make a list of variants in infile1 - with iotools.open_file(infile) as input: - for line in input: - line = line.strip().split("\t") - chrom = line[0] - pos = line[1] - cp = "%s_%s" % (chrom, pos) - cps1.add(cp) - - # make a list of variants in infile2 - with iotools.open_file(infile2) as input: - for line in input: - line = line.strip().split("\t") - chrom = line[0] - pos = line[1] - cp = "%s_%s" % (chrom, pos) - cps2.add(cp) - - # only variants in both are of interest - cps = cps1 & cps2 - - out = iotools.open_file(outfiles[0], "w") - out2 = iotools.open_file(outfiles[1], "w") - with iotools.open_file(infile) as input: - for line in input: - line = line.strip().split("\t") - if "%s_%s" % (line[0], line[1]) in cps: - out.write("%s\n" % "\t".join(line)) - else: - out2.write("%s\n" % "\t".join(line)) - out.close() - out2.close() - - @cluster_runnable def CleanVariantTables(genes, variants, cols, outfile): variants = pd.read_csv(variants, sep="\t") @@ -1276,3 +941,5 @@ def CleanVariantTables(genes, variants, cols, outfile): df.columns = ['gene', 'CHROM', 'POS'] variants = vp1.merge(df, 'left') variants.to_csv(outfile, sep="\t") + + diff --git a/cgatpipelines/tasks/exomeancestry.py b/cgatpipelines/tasks/exomeancestry.py deleted file mode 100644 index 7618bab10..000000000 --- a/cgatpipelines/tasks/exomeancestry.py +++ /dev/null @@ -1,487 +0,0 @@ -import pandas as pd -import numpy as np -import cgatcore.iotools as iotools -import random -import itertools -from cgatcore.pipeline import cluster_runnable -import json -import decimal -from sklearn.cluster import KMeans -import matplotlib.pyplot as plt -import matplotlib.cm as cm -import matplotlib.patches as mpatches - - -@cluster_runnable -def MakeSNPFreqDict(infiles, outfiles, rs): - ''' - Generates a random set of SNPs to use to characterise the ancestry - and relatedness of the samples. - - 1. Finds all SNPs which have a known genotype frequency for all 11 - hapmap ancestries - 2. Picks a random 50000 of these SNPs - 3. Stores a list of these SNPs as randomsnps.tsv - 4. Builds a dictionary of dictionaries of dictionaries where: - At Level 1 each key is a HapMap ancestry ID - each of these keys leads to another dictionary - level 2. - At Level 2 each key is a dbSNP SNP ID - each of these keys leads to another dictionary - level 3. - At Level 3 each key is a genotype: - the values of these keys are the genotype frequency of this - genotype at this SNP in this ancestry. - e.g. snpdict['ASW']['rs000000001']['CT'] --> - frequency of the CT genotype at the rs0000001 SNP in the - ASW population - 5. Stores this dictionary in json format as randomsnps.json - ''' - - # list all chromosomes - chroms = np.array([f.split("/")[-1].split("_")[2] for f in infiles]) - uchroms = set(chroms) - infiles = np.array(infiles) - snpdict = dict() - for chrom in uchroms: - # get all the files for this chromosome - c = np.where(chroms == chrom)[0] - thischrom = infiles[c] - snpidsets = [] - # make a set of all the snp ids of known frequency for - # this chromsome for each ancestry - for f in thischrom: - snpids = [] - anc = f.split("/")[-1].split("_")[3] - with iotools.open_file(f) as inp: - for line in inp: - line = line.strip().split(" ") - snpids.append(line[0]) - snpids = set(snpids) - snpidsets.append(snpids) - # find the snp ids where frequency is known in all ancestries - thischromsnps = set.intersection(*snpidsets) - snpdict[chrom] = thischromsnps - - vals = list(snpdict.values()) - # set of all the SNPs genotyped in all ancestries on all chroms - pooled = set.union(*vals) - # take a random sample of snps from this set - random.seed(rs) - sam = set(random.sample(pooled, 50000)) - - # make a new dictionary of just the sampled snps for each chrom - snpdictsam = dict() - for key in snpdict: - i = snpdict[key] & sam - snpdictsam[key] = i - - # generate all possible genotypes as list of strings - # (e.g. CC CT CG CA TT etc) - res = ["%s%s" % (g[0], g[1]) - for g in itertools.permutations("CGAT", 2)] - res += ['CC', 'GG', 'TT', 'AA'] - freqdict = dict() - ancs = set([l.split("/")[-1].split("_")[3] for l in infiles]) - - # build an empty dictionary of dictionaries of dictionaries - # to store genotype freqs for each ancestry - for anc in ancs: - freqdict.setdefault(anc, dict()) - for s in sam: - freqdict[anc].setdefault(s, dict()) - for r in res: - freqdict[anc][s][r] = 0.0 - - # read genotype freqs from the input file and store in the dictionary - for f in infiles: - bits = f.split("/")[-1].split("_") - anc = bits[3] - chrom = bits[2] - snpset = snpdictsam[chrom] - with iotools.open_file(f) as input: - for line in input: - line = line.strip().split(" ") - if line[0] in snpset: - G1 = "%s%s" % (line[10][0], line[10][-1]) - try: - F1 = float(line[11]) - G2 = "%s%s" % (line[13][0], line[13][-1]) - F2 = float(line[14]) - G3 = "%s%s" % (line[16][0], line[16][-1]) - F3 = float(line[17]) - except: - # because occasionally the GFs are not numeric - continue - freqdict[anc][line[0]][G1] = F1 - freqdict[anc][line[0]][G2] = F2 - freqdict[anc][line[0]][G3] = F3 - - # dump dictionary in a json file to resurrect later - outf = iotools.open_file(outfiles[0], "w") - json.dump(freqdict, outf) - outf.close() - - # store the sampled list of snps - out = iotools.open_file(outfiles[1], "w") - for snp in sam: - out.write("%s\n" % snp) - out.close() - - -@cluster_runnable -def GenotypeSNPs(infile, snplist, outfile): - ''' - Fetches the genotype from the variant tables for all samples - for SNPs in the hapmap sample from makeRandomSNPSet. - - Complex sites are ignored (as simple SNPs are sufficient for these - calculations). - These are: - Sites which failed QC (column 3 in the variant table is not PASS) - Sites with more than 2 alleles defined (column 6 in the variant table - contains more than one alternative allele) - SNPs with more than one ID - Indels - ''' - out = iotools.open_file(outfile, "w") - with iotools.open_file(infile) as inf: - for line in inf: - line = line.strip().split() - # if the variant passed QC - if line[4] == "PASS": - genotype = line[7] - # if the genotype looks normal e.g. 1/1 - if len(genotype) == 3: - # get the actual genotype (rather than the index) - if genotype[0] != ".": - ind1 = int(genotype[0]) - else: - ind1 = 0 - if genotype[2] != ".": - ind2 = int(genotype[2]) - else: - ind2 = 0 - A1 = line[5] - A2 = line[6].split(",") - AS = [A1] + A2 - - if len(AS) <= 2: - GT = "%s%s" % (AS[ind1], AS[ind2]) - refGT = "%s%s" % (A1, A1) - if len(GT) == 2: - if line[3][0:2] == "rs" and len( - line[3].split(";")) == 1: - snpid = line[3] - chrom = line[0] - pos = line[1] - if snpid in snplist: - out.write("%s\t%s\t%s\t%s\t%s\n" - % (snpid, chrom, pos, GT, - refGT)) - out.close() - - -@cluster_runnable -def CalculateAncestry(infile, calledsnps, snpdict, outfiles): - ''' - Takes the data stored in MakeRandomSNPSet and the genotype of each sample - at each site in calledsnps.tsv and tabulates the frequency of this - genotype in each of the HapMap ancestry categories. - The overall probability of each ancestry is then calculated as the - product of these frequencies. These can only be used in comparison to - each other - to show which of the 11 ancestries is most probable. - ''' - - # List the SNPs in the sample where a variant has been called - # Record the reference genotype at each of these SNPs - called = set() - refs = dict() - for line in iotools.open_file(calledsnps).readlines(): - line = line.strip().split("\t") - refs[line[0]] = line[1] - called.add(line[0]) - - # Record the genotype of the current sample at each SNP - currents = dict() - for line in iotools.open_file(infile).readlines(): - line = line.strip().split("\t") - currents[line[0]] = line[3] - - # Regenerate the genotype frequency dictionary made earlier - s = iotools.open_file(snpdict) - snpdict = json.load(s) - s.close() - ancs = list(snpdict.keys()) - - # Build a table of the frequency of the genotype in this sample - # in each of the hapmap ancestries - out = iotools.open_file(outfiles[0], "w") - out.write("snp\tgenotype\t%s\n" % "\t".join(ancs)) - for snp in called: - # where a variant hasn't been called in the current sample assume - # the reference genotype - if snp in currents: - geno = currents[snp] - else: - geno = refs[snp] - - res = [] - for anc in ancs: - D = snpdict[anc] - sub = D[snp] - freq = sub[geno] - res.append(freq) - - res = [str(r) for r in res] - out.write("%s\t%s\t%s\n" % (snp, geno, "\t".join(res))) - - out.close() - out = iotools.open_file(outfiles[1], "w") - - # read the output of the previous step into pandas - res = pd.read_csv(outfiles[0], sep="\t") - - # calculate the probability of each ancestry as the product of all the - # genotype frequencies - # SNPs where a genotype not recorded in hapmap has been called are - # skipped - # decimal package is used because the results are very small floats - res = res.replace(0, float('nan')) - for anc in set(ancs): - r = res[anc][np.invert(res[anc].isnull())] - r = [decimal.Decimal(d) for d in r] - out.write("%s\t%s\n" % (anc, np.product(r))) - out.close() - - -@cluster_runnable -def MakePEDFile(infiles, outfiles): - ''' - Generates the required input for the Plink and King software packages. - - PED file - columns are SNPs and rows are samples, each cell is the - genotype of the sample at this SNP - - MAP file - rows are SNPs in the same order as the columns in the ped - file, each row shows chromosome, snp id and position. - ''' - # Record the chromosome and position of each SNP - chromposdict = dict() - for line in iotools.open_file(infiles[-1]).readlines(): - line = line.strip().split("\t") - chromposdict[line[0]] = ((line[2], line[3])) - - # Generate the PED file - takes the genotype column of the table from - # the CalculateAncestry step and transposes it across a row - - out = iotools.open_file(outfiles[0], "w") - for inf in infiles[0:-1]: - inf = inf[0] - df = pd.read_csv(inf, sep="\t") - geno = df['genotype'] - genolist = [] - for g in geno: - genolist.append(" ".join(list(g))) - out.write("%s\t%s\n" % (inf.split("/")[-1], "\t".join(genolist))) - out.close() - - # Generate the MAP file - chromosome and position of each SNP in the same - # order as the PED file - mapf = iotools.open_file(outfiles[1], "w") - slist = df['snp'] - for snp in slist: - mapf.write("%s\t%s\t%s\n" % (chromposdict[snp][0], snp, - chromposdict[snp][1])) - mapf.close() - - -def CalculateFamily(infile, outfile): - ''' - Translates and filters the output from King. - Pairs of related samples are written to the output file along with the - degree of relatedness. Degrees are decided using thresholds from - the King documentation, here - http://people.virginia.edu/~wc9c/KING/manual.html - ''' - out = iotools.open_file(outfile, "w") - input = open(infile).readlines()[1:] - for line in input: - line = line.strip().split("\t") - score = float(line[14]) - # if there is a significant relatedness score - if score >= 0.0442: - ID1 = line[0] - ID2 = line[2] - # 2nd degree relatives - if score <= 0.177: - degree = 2 - # 1st degree relatives - elif score <= 0.354: - degree = 1 - # 0th degree relatives (ID twins or duplicate) - else: - degree = 0 - out.write("%s\t%s\t%s\n" % (ID1, ID2, degree)) - out.close() - - -@cluster_runnable -def CalculateSex(infiles, outfile): - ''' - Approximates the sex of the sample using the data in the variant table. - Basic estimate based on heterozygosity on the X chromosome - genotypes - are taken for all called variants on X passing QC and the percentage - of heterozygotes is taken. - This tends to produce two clear populations so Kmeans clustering - is used to split the data into two - male and female. Samples which are - unclear are marked in the output. - ''' - percs = [] - for f in infiles: - homs = 0 - tot = 0 - # Count the homozygous and total variants called on the - # X chromosome in each input file (after QC) - with iotools.open_file(f) as inf: - for line in inf: - line = line.strip().split("\t") - if line[0] == "chrX" and line[4] == "PASS": - geno = line[7] - s = set(list(geno)) - if len(s) == 2: - homs += 1 - tot += 1 - perc = float(homs) / float(tot) - percs.append(perc) - - # impose a grid on the data so clustering is possible - ests = np.array(percs) - ones = np.array([1] * len(ests)) - grid = np.column_stack((ones, ests)) - - # divide data into two clusters - means = KMeans(n_clusters=2) - fit1 = means.fit(grid) - res = fit1.predict(grid) - - # find the position of the cluster centres on the y axis - ccs = fit1.cluster_centers_ - cc1 = ccs[0, 1] - cc2 = ccs[1, 1] - ccs = [cc1, cc2] - - # pick out the scores from each cluster and calculate the standard dev - s1 = ests[res == 0] - s2 = ests[res == 1] - sds = [np.std(s1), np.std(s2)] - # males will be more homozygous - the bigger value represents males - if cc1 > cc2: - sexes = ["male", "female"] - else: - sexes = ["female", "male"] - - x = 0 - out = iotools.open_file(outfile, "w") - for f in infiles: - # for each input file take the group, the score, the standard deviation - # of this group, the distance from the cluster centre and the sex - group = res[x] - est = ests[x] - std = sds[group] - dist = est - ccs[group] - sex = sexes[group] - - # is the value with 3 SDs of the mean for this sex - if sex == "male": - if ((3 * std) - abs(dist) >= 0 or dist >= 0): - sig = "*" - else: - sig = "-" - else: - if ((3 * std) - abs(dist) >= 0 or dist <= 0): - sig = "*" - else: - sig = "-" - out.write("%s\t%f\t%f\t%s\t%s\n" % (f, est, dist, sexes[group], sig)) - x += 1 - out.close() - - # plot the output - blue dots male, red dots female - # lines represent 3 standard deviations from the cluster centre - p = plt.figure() - a = p.add_subplot('111') - - a.axis([0, len(infiles), - (min([min(s1), min(s2)]) - 0.1), (max([max(s1), max(s2)]) + 0.1)]) - - dist1 = np.linspace(0, len(infiles), len(s1)) - dist2 = np.linspace(0, len(infiles), len(s2)) - - a.plot(dist1, s1, 'b.') - a.plot(dist2, s2, 'r.') - - top = ccs[0] + (3 * sds[0]) - bottom = ccs[1] - (3 * sds[1]) - a.plot([0, len(infiles)], [top, top], 'b-') - a.plot([0, len(infiles)], [bottom, bottom], 'r-') - a.tick_params(axis='x', labelbottom='off', bottom='off') - a.yaxis.set_label_text('percentage homozygous X variants') - fig = outfile.replace(".tsv", ".png") - p.savefig(fig) - - -def PlotAncestry(infile): - ''' - Draws a plot showing the score (probabilty) for each individual - for their assigned ancestry and the second closest match. - x = individual - y = score - large diamonds represent the best match and small triangles the second - best match - ''' - ancestry = pd.read_csv(infile, sep="\t", header=None, dtype="str") - - # score using the index of the probability e.g. 5e-1000 as -1000 - ancestry[2] = [int(a[1]) * -1 for a in ancestry[2].str.split("-")] - ancestry[4] = [int(a[1]) * -1 for a in ancestry[4].str.split("-")] - - # sort by assigned ancestry - ancestry = ancestry.sort_values([1]) - ancs = list(set(ancestry[1].append(ancestry[3]))) - - # assign colours to points by ancestry - ancestry[5] = ancestry[1].apply(lambda x: ancs.index(x), 1) - ancestry[6] = ancestry[3].apply(lambda x: ancs.index(x), 1) - floats = np.linspace(0, 1, len(set(ancs))) - colours = cm.jet(floats) - - f = plt.figure(figsize=(30, 15)) - a = f.add_subplot('111') - ymin = min(ancestry[2]) - 100 - ymax = max(ancestry[4]) + 100 - - a.axis([-5, len(ancestry) + 5, ymin, ymax]) - - # plot best scoring ancestry - a.scatter(list(range(len(ancestry))), - ancestry[2], c=colours[ancestry[5].values], - s=500, marker='D', edgecolors='None') - - # plot second best scoring ancestry - a.scatter(list(range(len(ancestry))), - ancestry[4], c=colours[ancestry[6].values], - s=150, marker="^", edgecolors='None') - sancs = set(ancestry[1]) - i = 20 - - # draw lines between individuals assigned to different ancestries - for anc in sancs: - p = max(np.where(ancestry[1] == anc)[0]) + 0.5 - p2 = min(np.where(ancestry[1] == anc)[0]) + 0.5 - a.plot([p, p], - [ymin, ymax], 'k-') - a.text(p2, ymax - i, anc, ha='center') - i += 10 - - # legend - ps = [mpatches.Patch(color=col) for col in colours] - a.legend(handles=ps, labels=ancs, loc=3) - - f.savefig(infile.replace(".tsv", ".png")) diff --git a/cgatpipelines/tasks/exploratory.R b/cgatpipelines/tasks/exploratory.R new file mode 100755 index 000000000..d22da0cbd --- /dev/null +++ b/cgatpipelines/tasks/exploratory.R @@ -0,0 +1,302 @@ +#' Differential expression analysis Script +#' +#' +#' Example usage: +#' +#' cgatflow R exploratory --model=~ group --contrast=group --factor=mouse_id,collection_date,slice_depth,slice_number,pipette_visual,timepoint +#' +#' Primary Input: `filename.rds` - output from the readandfiltercounts.R file or DESeqExpermient object or DEGList object +#' Additional inputs: model and contrast as well as factors of interest +#' +#' Output: png images of: PCA, clustering, Surrogate Variable Analysis, Heatmaps +#' + +suppressMessages(library(getopt)) +suppressMessages(library(tidyverse)) +suppressMessages(library(data.table)) +suppressMessages(library(DESeq2)) +suppressMessages(library(edgeR)) +suppressMessages(library(rhdf5)) +suppressMessages(library(sva)) +suppressMessages(library(ggplot2)) +suppressMessages(library(biomaRt)) +suppressMessages(library(Cairo)) +suppressMessages(library(pheatmap)) +suppressMessages(library(RColorBrewer)) +suppressMessages(library(ggforce)) + +source(file.path(Sys.getenv("R_ROOT"), "experiment.R")) + + +# getmart +# Function: wrapper for getBM for ENSEMBL gene list +# Dependencies: biomaRt package +# Input: vector of ensembl gene ids +# Output: data frame with description, symbol and entrez gene +mart = useMart(biomart = "ENSEMBL_MART_ENSEMBL",dataset="hsapiens_gene_ensembl", host = "jul2018.archive.ensembl.org") +getmart <- function(values){ + data<- getBM( + filters= "ensembl_gene_id", + attributes= c("ensembl_gene_id", "external_gene_name", "description","entrezgene"), + values= values, + mart= mart, + useCache = FALSE) + data$description <- gsub("\t", "", data$description) + return(data) +} + + +start_plot <- function(section, height = 6, width = 6, type = "png", outdir="") { + file = get_output_filename(paste0(outdir,"/",section, ".", type)) + Cairo(file = file, + type = type, + width = width, + height = height, + units="in", + dpi = 300, + bg = "white") + #opar <- par(lwd=0.5) +} + +end_plot <- function() { + dev.off() +} + + +run <- function(opt) { + + ### READING DATA ### + futile.logger::flog.info(paste("reading Experiment object from", normalizePath(opt$rds_filename))) + experiment <- readRDS(opt$rds_filename) + if (class(experiment) == "DESeqDataSet"){ + dds <- experiment + } else if (class(experiment) == "DGEList"){ + dds = DESeqDataSetFromMatrix(experiment$counts, experiment$sample, design = formula(opt$model)) + } + futile.logger::flog.info(paste("reading Experiment object", paste(dim(counts(experiment)), collapse = ","))) + + ### TRANSFORMATION OF DATA ### + futile.logger::flog.info(paste("Transforming data")) + rld<- rlog(dds) + vsd<- vst(dds) + df <- bind_rows( + as_tibble(log2(counts(dds, normalized=TRUE)[, 1:2]+1)) %>% + mutate(transformation = "log2(x + 1)"), + as_tibble(assay(vsd)[, 1:2]) %>% mutate(transformation = "vst"), + as_tibble(assay(rld)[, 1:2]) %>% mutate(transformation = "rlog")) + colnames(df)[1:2] <- c("x", "y") + start_plot('Variance_Transformations', outdir=opt$outdir) + print(ggplot(df, aes(x = x, y = y)) + geom_hex(bins = 80) + + coord_fixed() + facet_grid( . ~ transformation)) + end_plot() + + ### PRINCIPAL COMPONENT ANALYSIS ### + futile.logger::flog.info(paste("Performing Principal Component Analysis")) + pca = prcomp(t(assay(vsd))) + for(factor in opt$factors){ + variable.group <- colData(dds)[, factor] + names(variable.group) <- factor + percentVar <- round(100 * summary(pca)$importance[2,]) + futile.logger::flog.info(paste("number of PCA dimensions", dim(pca$x))) + if(dim(pca$x)[1]>7){ + dim_pca = 8 + } else { + dim_pca = dim(pca$x) + } + scores <- data.frame(variable.group, pca$x[,1:dim_pca]) + start_plot(paste0('PCA_', factor), outdir=opt$outdir) + print(ggplot(scores,aes(x=PC1, y=PC2)) + + geom_point(colour=factor(variable.group)) + + theme(legend.position="right") + + labs(colour=factor, x=paste0("PC1 (", percentVar[1],"% of variance)"), + y=paste0("PC2 (", percentVar[2],"% of variance)")) + + ggtitle("Principal Component Analysis") + theme_grey(base_size = 15) + + theme(plot.title = element_text(lineheight=1, face="bold")) + geom_point(size=2) + + theme(text=element_text(family='serif'))) + end_plot() + start_plot(paste0('PCA_13_', factor), outdir=opt$outdir) + print(ggplot(scores,aes(x=PC1, y=PC3)) + + geom_point(colour=factor(variable.group)) + + theme(legend.position="right") + + labs(colour=factor, x=paste0("PC1 (", percentVar[1],"% of variance)"), + y=paste0("PC3 (", percentVar[3],"% of variance)")) + + ggtitle("Principal Component Analysis") + theme_grey(base_size = 15) + + theme(plot.title = element_text(lineheight=1, face="bold")) + geom_point(size=2) + + theme(text=element_text(family='serif'))) + end_plot() + } + variable.group <- colData(dds)[, opt$contrast] + names(variable.group) <- opt$contrast + scores <- data.frame(variable.group, pca$x[,1:dim_pca]) + start_plot(paste0('PCA_grid'), outdir=opt$outdir) + if(dim_pca==8){ + print(ggplot(scores, aes(x = .panel_x, y = .panel_y, fill = variable.group, colour = variable.group)) + + geom_point(shape = 16, size = 0.5, position = 'auto') + + geom_autodensity(alpha = 0.3, colour = NA, position = 'identity') + + facet_matrix(vars(PC1:PC8), layer.diag = 2)) + }else{ + print(ggplot(scores, aes(x = .panel_x, y = .panel_y, fill = variable.group, colour = variable.group)) + + geom_point(shape = 16, size = 0.5, position = 'auto') + + geom_autodensity(alpha = 0.3, colour = NA, position = 'identity') + + facet_matrix(vars(everything()), layer.diag = 2)) + } + end_plot() + + loadings <- pca$rotation[,1:dim_pca] + loadings <- data.frame(loadings[order(loadings[,1]), ]) + data <- getmart(rownames(loadings)) + loadings$symbol<-data$mgi_symbol[match(rownames(loadings), data$ensembl_gene_id)] + loadings$description <- data$description[match(rownames(loadings), data$ensembl_gene_id)] + write.table(loadings, file=paste0(opt$outdir,"/",'PCA_loadings.tsv'), quote=FALSE, sep='\t', row.names = TRUE, col.names = NA) + + ### HEATMAPS ### + futile.logger::flog.info(paste("Performing Heatmap")) + df <- as.data.frame(colData(dds)[,opt$factors]) + rownames(df) <- colData(dds)$track + # Heatmap of Top 20 Expressed Genes + select <- order(rowMeans(counts(dds,normalized=TRUE)),decreasing=TRUE)[1:20] + start_plot('Heatmap_topExpressed', outdir=opt$outdir) + pheatmap(assay(vsd)[select,], cluster_rows=FALSE, cluster_cols=FALSE, show_rownames=FALSE, annotation_col=df) + end_plot() + # Heatmap of Top 20 Variable Genes + topVarGenes <- head(order(rowVars(assay(vsd)),decreasing=TRUE),20) + mat <- assay(vsd)[ topVarGenes, ] + temp <- getmart(rownames(mat)) + row.names(temp) <- temp$ensembl_gene_id + rownames(mat) <- temp[rownames(mat),"external_gene_name"] + start_plot('Heatmap_topVariable', outdir=opt$outdir) + pheatmap(mat, annotation_col=df, cluster_rows=FALSE,fontsize_row = 6) + end_plot() + # Heatmap of Genes of interest + if (!is.null(opt$genes_of_interest)) { + print(rownames(assay(vsd))) + goi <- opt$genes_of_interest + goi <- goi[goi %in% rownames(assay(vsd))] + mat <- assay(vsd)[goi, ] + mat <- mat - rowMeans(mat) + temp <- getmart(rownames(mat)) + row.names(temp) <- temp$ensembl_gene_id + rownames(mat) <- temp[rownames(mat),"external_gene_name"] + start_plot('Heatmap_ofInterest', outdir=opt$outdir) + pheatmap(mat, annotation_col=df, scale = "row", cluster_cols = FALSE) + end_plot() + } + + ### CLUSTERING ### + futile.logger::flog.info(paste("Clustering")) + # for all genes + sampleDists <- dist(t(assay(vsd))) + sampleDistMatrix <- as.matrix( sampleDists ) + rownames(sampleDistMatrix) <- paste(colData(vsd)[,opt$contrast], vsd$track, sep = " - " ) + colnames(sampleDistMatrix) <- NULL + colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255) + start_plot('Heatmap_all', outdir=opt$outdir) + pheatmap(sampleDistMatrix, + clustering_distance_rows = sampleDists, + clustering_distance_cols = sampleDists, + col = colors) + end_plot() + # for top 500 genes + top500 <- head(assay(vsd)[ order(rowMeans(assay(vsd)), decreasing=TRUE ), ], n=500) + distsRL500 <- dist(t(top500)) + sampleDistMatrix500 <- as.matrix( distsRL500 ) + rownames(sampleDistMatrix500) <- paste(colData(vsd)[,opt$contrast], vsd$track, sep = " - " ) + colnames(sampleDistMatrix500) <- NULL + colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255) + start_plot('Heatmap_top500', outdir=opt$outdir) + pheatmap(sampleDistMatrix500, + clustering_distance_rows = distsRL500, + clustering_distance_cols = distsRL500, + col = colors) + end_plot() + + ### EXPLORE BATCH EFFECTS ### + futile.logger::flog.info(paste("Exploring Batch Effects")) + futile.logger::flog.info(dim_pca) + if(dim_pca == 8){ + for(factor in opt$factors){ + factor_transformed <- vsd + assay(factor_transformed) <- limma::removeBatchEffect(assay(factor_transformed), colData(factor_transformed)[,factor]) + pca = prcomp(t(assay(factor_transformed))) + sample.group <- as_factor(colData(dds)[, factor]) + variable.group <- colData(dds)[, opt$contrast] + percentVar <- round(100 * summary(pca)$importance[2,]) + scores <- data.frame(variable.group, sample.group, pca$x[,1:2]) + start_plot(paste0('PCA_', factor, '_removed'), outdir=opt$outdir) + print(ggplot(scores,aes(x=PC1, y=PC2)) + + geom_point(colour=factor(variable.group), shape=factor(sample.group)) + + theme(legend.position="right") + + labs(colour=opt$contrast, shape=factor, x=paste0("PC1 (", percentVar[1],"% of variance)"), + y=paste0("PC2 (", percentVar[2],"% of variance)")) + + ggtitle(paste0("Principal Component Analysis\n after batch correction for ", factor)) + + theme_grey(base_size = 15) + + theme(plot.title = element_text(lineheight=1, face="bold")) + geom_point(size=2) + + theme(text=element_text(family='serif'))) + end_plot() + } + } + else { + futile.logger::flog.info(paste("... not done - too few degrees of freedom")) + } +} + +main <- function() { + option_list <- list( + make_option( + "--rds-filename", + dest = "rds_filename", + type = "character", + default = "experiment.rds", + help = paste("filename with input data of counts") + ), + make_option( + "--model", + dest = "model", + type = "character", + default = "~ group", + help = paste("formula for multivariate model") + ), + make_option( + "--contrast", + dest = "contrast", + type = "character", + default = "group", + help = paste("formula for multivariate model") + ), + make_option( + "--factors", + dest = "factors", + type = "character", + default = "", + help = paste("formula for multivariate model") + ), + make_option( + "--genes_of_interest", + dest = "genes_of_interest", + type = "character", + default = NULL, + help = paste("genes of interest for plotting") + ), + make_option( + c("-o","--outdir"), + dest = "outdir", + type = "character", + default = "", + help = paste("genes of interest for plotting") + ) + ) + opt <- experiment_start(option_list = option_list, + description = description) + + if (!is.null(opt$factors)) { + opt$factors = unlist(strsplit(opt$factors, ",")) + } + if (!is.null(opt$genes_of_interest)) { + opt$genes_of_interest = unlist(strsplit(opt$genes_of_interest, ",")) + } + run(opt) + + experiment_stop() +} + +main() diff --git a/cgatpipelines/tasks/filtercounts.R b/cgatpipelines/tasks/filtercounts.R new file mode 100755 index 000000000..6f8d7ebd7 --- /dev/null +++ b/cgatpipelines/tasks/filtercounts.R @@ -0,0 +1,250 @@ +#' Basic filtering analysis Script +#' +#' +#' Example usage: +#' +#' Rscript PATH/TO/filtercounts.R +#' +#' input: directory containing read count files or tsv file containing reads +#' additional input variables: method used to generate file, model +#' output: `experiment_out.rds` is an experiment object after filtering +#' + +suppressMessages(library(futile.logger)) +suppressMessages(library(getopt)) +suppressMessages(library(tidyverse)) +suppressMessages(library(data.table)) +suppressMessages(library(DESeq2)) +suppressMessages(library(edgeR)) +suppressMessages(library(csaw)) +suppressMessages(library(rhdf5)) +suppressMessages(library(tximport)) +suppressMessages(library(DEXSeq)) + +source(file.path(Sys.getenv("R_ROOT"), "experiment.R")) + +run <- function(opt) { + + ### READING DATA ### + # Read in sampleData Table + futile.logger::flog.info(paste("reading sampleData table from", normalizePath(opt$sampleData))) + sampleData <- read_tsv(opt$sampleData) + sampleData <-sampleData[sampleData$include ==1, ] + futile.logger::flog.info(paste("read sampleData ", paste(dim(sampleData), collapse = ","))) + futile.logger::flog.info(paste(sampleData)) + rownames(sampleData) <- sampleData$track + + futile.logger::flog.info(paste("reading in data from ", opt$source)) + if(opt$source %in% c("salmon", "kallisto")){ + # Read in Transcript to Gene Map + tx2gene <- read_tsv(opt$tx2gene) + colnames(tx2gene) <- c("ensembl_transcript_id", "ensembl_gene_id") + if(opt$tx2gene_regex != "None"){ + tx2gene <- filter(tx2gene, !grepl(opt$tx2gene_regex,ensembl_transcript_id)) + } + # Read in Data + futile.logger::flog.info(opt$counts_dir) + futile.logger::flog.info(sampleData$track) + files <- file.path(opt$counts_dir, sampleData$track, "quant.sf") + names(files) <- sampleData$track + txi <- tximport(files, type = opt$source, tx2gene = tx2gene) + if(opt$method == "deseq2"){ + dataset <- DESeqDataSetFromTximport(txi, colData = sampleData, design = formula(opt$model)) + } else if(opt$method == "edger"){ + cts <- txi$counts + normMat <- txi$length + normMat <- normMat/exp(rowMeans(log(normMat))) + normCts <- cts/normMat + normMat <- sweep(normMat, 2, eff.lib, "*") + normMat <- log(normMat) + dataset <- DGEList(counts=cts, + samples=sampleData) + dataset <- scaleOffset(dataset,normMat) + } else if(opt$method == "Sleuth"){ + stop("Sleuth method not yet implemented. Sorry.") + + } else{ + stop("Method not defined. Allowable methods are \"DESeq2\", \"EdgeR\" or \"Sleuth\"") + } + } + + if(opt$source == "dexseq"){ + # Read in Data + files <- file.path(opt$counts_dir, paste0(sampleData$track, ".txt")) + names(files) <- sampleData$track + if(opt$method != "dexseq"){ + stop("DEXSeq input is handled by diffexonexpression. Please correct the method argument.") + } + dataset = DEXSeqDataSetFromHTSeq( + files, + sampleData=sampleData, + design=formula(opt$model), + flattenedfile=opt$flattenedFile) + } else if(opt$source == "counts_table"){ + # Read in Data + raw <- read.table(file = gzfile(opt$counts_tsv), header=TRUE, row.name=1, check.names = FALSE) + futile.logger::flog.info(paste("Colnames: ", colnames(raw))) + experiment_tsv <- raw[,sampleData$track,drop=FALSE] + if(opt$method == "deseq2"){ + dataset = DESeqDataSetFromMatrix(experiment_tsv, sampleData, design = formula(opt$model)) + } else if(opt$method == "edger"){ + dataset <- DGEList(counts=experiment_tsv, samples=sampleData) + } else if(opt$method == "sleuth"){ + stop("Sleuth method not yet implemented. Sorry.") + } else{ + stop("Method not defined. Allowable methods are \"DESeq2\", \"EdgeR\" or \"Sleuth\"") + } + } + + ### FILTERING ### + if(opt$filter == TRUE) { + futile.logger::flog.info(paste("filtering data ", opt$source)) + if(opt$method == "edger"){ + futile.logger::flog.info(paste("Counts before filtering ", paste(dim(dataset$counts), collapse = ","))) + keep <- filterByExpr(dataset) + dataset <- dataset[keep, , keep.lib.sizes=FALSE] + counts_table <- dataset$counts + futile.logger::flog.info(paste("Counts after filtering ", paste(dim(dataset$counts), collapse = ","))) + } else if(opt$method == "deseq2"){ + futile.logger::flog.info(paste("Counts before filtering ", paste(dim(counts(dataset)), collapse = ","))) + keep <- rowSums(counts(dataset)) >= 10 + dataset <- dataset[keep,] + #dataset <- dataset[Reduce('|', dataset[sapply(dataset, is.numeric)]),] + #dataset <- dataset[rowSums(dataset[,sapply(dataset, is.numeric)]==0)<6,] + #dataset <- subset(dataset, !rowSums(assay(dataset) == 0) > 6) + counts_table <- counts(dataset) + futile.logger::flog.info(paste("Counts after filtering ", paste(dim(counts(dataset)), collapse = ","))) + } else if(opt$method == "dexseq"){ + futile.logger::flog.info(paste("Filtering for DEXSeq not implemented ")) + counts_table <- counts(dataset) + } + } + else { + futile.logger::flog.info(paste("No filtering on dataset performed.", opt$source)) + } + + ### SAVING DATA ### + file = get_output_filename(paste0(opt$outdir,"/experiment_out.rds")) + flog.info(paste("saving experiment data to", file)) + saveRDS(dataset, file = file) + + ## Set up gene lengths for RPKM + flog.info("outputting counts data") + write.table(counts(dataset), + file = paste0(opt$outdir,"/Counts_Data.tsv"), + sep = "\t", + quote = FALSE, + row.names = TRUE, + col.names = NA) + if(opt$source %in% c("salmon", "kallisto")){ + write.table(txi$abundance, + file = paste0(opt$outdir,"/tpm.tsv"), + sep = "\t", + quote = FALSE, + row.names = TRUE, + col.names = NA) + } +} + + +main <- function() { + option_list <- list( + make_option( + "--counts-dir", + dest="counts_dir", + type="character", + help=paste("directory containing expression estimates", + "from salmon/kallisto/DEXSeq.") + ), + make_option( + "--counts-tsv", + dest="counts_tsv", + type="character", + help=paste("file containing counts generated", + "by e.g. featurecounts.") + ), + make_option( + c("-d", "--sampleData"), + dest="sampleData", + type="character", + default = "", + help=paste("input file with experimental design/sample info") + ), + make_option( + c("--outdir"), + dest="outdir", + type="character", + default = "results.dir", + help=paste("output directory") + ), + make_option( + "--model", + dest = "model", + type = "character", + default = "~group", + help = paste("formula for multivariate model") + ), + make_option( + c("-s", "--source"), + dest="source", + type="character", + default="salmon", + help=paste("Source of data. Possible options are ", + "\"salmon\", \"kallisto\", \"counts_table\", \"dexseq\"") + ), + make_option( + c("--tx2gene"), + dest="tx2gene", + type="character", + default="transcript2geneMap.tsv", + help=paste("Path to transcript to gene tsv.") + ), + make_option( + c("--tx2gene_regex"), + dest="tx2gene_regex", + type="character", + default="None", + help=paste("Regex/Prefix for removal of certain features from ", + "experiment (e.g. removal of spike-ins)") + ), + make_option( + "--method", + dest="method", + type="character", + default="deseq2", + help=paste("differential expression method to apply ") + ), + make_option( + "--filter", + dest="filter", + type="logical", + default=TRUE, + help=paste("adopt filtering strategy. ", + "For EDGER, the default strategy is applied. ", + "For DESeq2 basic rowsum filtering < 10 is applied.") + ), + make_option( + "--dexseq-flattened-file", + dest="flattenedFile", + type="character", + help=paste("directory containing flat gtf for dexseq. DEXSeq ", + "expects this to be generated by the", + "DEXSeq_prepare_annotations.py script") + ) + ) + + opt <- experiment_start(option_list = option_list, + description = description) + if (!is.null(opt$method)) { + opt$method = str_to_lower(opt$method) + } + + if (!is.null(opt$source)) { + opt$source = str_to_lower(opt$source) + } + run(opt) + + experiment_stop() +} + +main() diff --git a/cgatpipelines/tasks/gtfsubset.py b/cgatpipelines/tasks/gtfsubset.py index 62c467fc8..cd9c1c0fd 100644 --- a/cgatpipelines/tasks/gtfsubset.py +++ b/cgatpipelines/tasks/gtfsubset.py @@ -14,6 +14,7 @@ import cgat.GTF as GTF import cgatcore.pipeline as P import cgatcore.database as Database +from sqlalchemy import text class SubsetGTF(): @@ -142,10 +143,12 @@ def getRepeatDataFromUCSC(dbhandle, expression given. ''' - cc = dbhandle.execute("SHOW TABLES LIKE '%%rmsk'") - tables = [x[0] for x in cc.fetchall()] - if len(tables) == 0: - raise ValueError("could not find any `rmsk` tables") + with dbhandle.connect() as conn: + query = "SHOW TABLES LIKE '%%rmsk'" + cc = conn.execute(text(query)) + tables = [x[0] for x in cc.fetchall()] + if len(tables) == 0: + raise ValueError("could not find any `rmsk` tables") # now collect repeats tmpfile = P.get_temp_file(".") @@ -166,9 +169,10 @@ def getRepeatDataFromUCSC(dbhandle, sql = sql % locals() E.debug("executing sql statement: %s" % sql) - cc = dbhandle.execute(sql) - for data in cc.fetchall(): - tmpfile.write("\t".join(map(str, data)) + "\n") + with dbhandle.connect() as conn: + cc = conn.execute(text(sql)) + for data in cc.fetchall(): + tmpfile.write("\t".join(map(str, data)) + "\n") tmpfile.close() diff --git a/cgatpipelines/tasks/mappingqc.py b/cgatpipelines/tasks/mappingqc.py deleted file mode 100644 index a9cb44845..000000000 --- a/cgatpipelines/tasks/mappingqc.py +++ /dev/null @@ -1,807 +0,0 @@ -""" -mappingqc.py - Tasks for QC'ing mapping -=============================================== - -Reference ---------- - -""" - -import cgatcore.experiment as E -import os -import re -import pandas -import cgatcore.iotools as iotools -import cgat.BamTools.bamtools as BamTools -from cgatcore import pipeline as P - -PICARD_MEMORY = "9G" - - -def getNumReadsFromReadsFile(infile): - '''get number of reads from a .nreads file.''' - with iotools.open_file(infile) as inf: - line = inf.readline() - if not line.startswith("nreads"): - raise ValueError( - "parsing error in file '%s': " - "expected first line to start with 'nreads'") - nreads = int(line[:-1].split("\t")[1]) - return nreads - - -def buildPicardInsertSizeStats(infile, outfile, genome_file): - '''run Picard:CollectInsertSizeMetrics - - Collect insert size statistics. - - Arguments - --------- - infile : string - Input filename in :term:`BAM` format. - outfile : string - Output filename with picard output. - genome_file : string - Filename with genomic sequence. - ''' - - job_memory = PICARD_MEMORY - picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals() - job_threads = 3 - - if BamTools.getNumReads(infile) == 0: - E.warn("no reads in %s - no metrics" % infile) - iotools.touch_file(outfile) - return - - statement = '''picard %(picard_opts)s CollectInsertSizeMetrics - INPUT=%(infile)s - REFERENCE_SEQUENCE=%(genome_file)s - ASSUME_SORTED=true - OUTPUT=%(outfile)s - VALIDATION_STRINGENCY=SILENT - >& %(outfile)s''' - - P.run(statement) - - -def buildPicardAlignmentStats(infile, outfile, genome_file): - '''run picard:CollectMultipleMetrics - - Arguments - --------- - infile : string - Input filename in :term:`BAM` format. - outfile : string - Output filename with picard output. - genome_file : string - Filename with genomic sequence. - ''' - - job_memory = PICARD_MEMORY - picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals() - job_threads = 3 - tmp_bam = P.get_temp_filename(".") - - if BamTools.getNumReads(infile) == 0: - E.warn("no reads in %s - no metrics" % infile) - iotools.touch_file(outfile) - return - - # Picard seems to have problem if quality information is missing - # or there is no sequence/quality information within the bam file. - # Thus, add it explicitly. - statement = '''cat %(infile)s - | cgat bam2bam - -v 0 - --method=set-sequence - --output-sam - --log=%(outfile)s.bam2bam.log - > %(tmp_bam)s && - picard %(picard_opts)s CollectMultipleMetrics - INPUT=%(tmp_bam)s - REFERENCE_SEQUENCE=%(genome_file)s - ASSUME_SORTED=true - OUTPUT=%(outfile)s - VALIDATION_STRINGENCY=SILENT - >& %(outfile)s''' - - P.run(statement) - - os.unlink(tmp_bam) - -def buildPicardDuplicationStats(infile, outfile): - '''run picard:MarkDuplicates - - Record duplicate metrics using Picard, the marked records - are discarded. - - Arguments - --------- - infile : string - Input filename in :term:`BAM` format. - outfile : string - Output filename with picard output. - ''' - - job_memory = PICARD_MEMORY - picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals() - job_threads = 3 - - if BamTools.getNumReads(infile) == 0: - E.warn("no reads in %s - no metrics" % infile) - iotools.touch_file(outfile) - return - - # currently, MarkDuplicates cannot handle split alignments from gsnap - # these can be identified by the custom XT tag. - if ".gsnap.bam" in infile: - tmpf = P.get_temp_file(".") - tmpfile_name = tmpf.name - statement = '''samtools view -h %(infile)s - | awk "!/\\tXT:/" - | samtools view /dev/stdin -S -b > %(tmpfile_name)s; - ''' % locals() - data_source = tmpfile_name - else: - statement = "" - data_source = infile - - statement += '''picard %(picard_opts)s MarkDuplicates - INPUT=%(data_source)s - ASSUME_SORTED=true - METRICS_FILE=%(outfile)s - OUTPUT=/dev/null - VALIDATION_STRINGENCY=SILENT - ''' - P.run(statement) - - if ".gsnap.bam" in infile: - os.unlink(tmpfile_name) - - -def buildPicardDuplicateStats(infile, outfile): - '''run picard:MarkDuplicates - - Record duplicate metrics using Picard and keep the dedupped .bam - file. - - Pair duplication is properly handled, including inter-chromosomal - cases. SE data is also handled. These stats also contain a - histogram that estimates the return from additional sequecing. No - marked bam files are retained (/dev/null...) Note that picards - counts reads but they are in fact alignments. - - Arguments - --------- - infile : string - Input filename in :term:`BAM` format. - outfile : string - Output filename with picard output. - - ''' - job_memory = PICARD_MEMORY - picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals() - job_threads = 3 - - if BamTools.getNumReads(infile) == 0: - E.warn("no reads in %s - no metrics" % infile) - iotools.touch_file(outfile) - return - - statement = '''picard %(picard_opts)s MarkDuplicates - INPUT=%(infile)s - ASSUME_SORTED=true - METRICS_FILE=%(outfile)s.duplicate_metrics - OUTPUT=%(outfile)s - VALIDATION_STRINGENCY=SILENT; - ''' - statement += '''samtools index %(outfile)s ;''' - P.run(statement) - - -def buildPicardCoverageStats(infile, outfile, baits, regions): - '''run picard:CollectHSMetrics - - Generate coverage statistics for regions of interest from a bed - file using Picard. - - Arguments - --------- - infile : string - Input filename in :term:`BAM` format. - outfile : string - Output filename with picard output. - baits : :term:`bed` formatted file of bait regions - regions : :term:`bed` formatted file of target regions - - ''' - - job_memory = PICARD_MEMORY - picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals() - job_threads = 3 - - if BamTools.getNumReads(infile) == 0: - E.warn("no reads in %s - no metrics" % infile) - iotools.touch_file(outfile) - return - - statement = '''picard %(picard_opts)s CollectHsMetrics - BAIT_INTERVALS=%(baits)s - TARGET_INTERVALS=%(regions)s - INPUT=%(infile)s - OUTPUT=%(outfile)s - VALIDATION_STRINGENCY=LENIENT''' % locals() - P.run(statement) - - -def buildPicardGCStats(infile, outfile, genome_file): - """picard:CollectGCBiasMetrics - - Collect GC bias metrics. - - Arguments - --------- - infile : string - Input filename in :term:`BAM` format. - outfile : string - Output filename with picard output. - genome_file : string - Filename with genomic sequence. - - """ - - job_memory = PICARD_MEMORY - picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals() - job_threads = 3 - - if BamTools.getNumReads(infile) == 0: - E.warn("no reads in %s - no metrics" % infile) - iotools.touch_file(outfile) - return - - statement = '''picard %(picard_opts) CollectGcBiasMetrics - INPUT=%(infile)s - REFERENCE_SEQUENCE=%(genome_file)s - OUTPUT=%(outfile)s - VALIDATION_STRINGENCY=SILENT - CHART_OUTPUT=%(outfile)s.pdf - SUMMARY_OUTPUT=%(outfile)s.summary - >& %(outfile)s''' - - P.run(statement) - - -def loadPicardMetrics(infiles, outfile, suffix, - pipeline_suffix=".picard_stats", - tablename=None): - '''load picard metrics. - - Arguments - --------- - infiles : string - Filenames of files with picard metric information. Each file - corresponds to a different track. - outfile : string - Logfile. - suffix : string - Suffix to append to table name. - pipeline_suffix : string - Suffix to remove from track name. - tablename : string - Tablename to use. If unset, the table name will be derived - from `outfile` and suffix as ``to_table(outfile) + "_" + - suffix``. - - ''' - - if not tablename: - tablename = "%s_%s" % (P.to_table(outfile), suffix) - - outf = P.get_temp_file(".") - - filenames = ["%s.%s" % (x, suffix) for x in infiles] - - first = True - for filename in filenames: - track = P.snip(os.path.basename(filename), "%s.%s" % - (pipeline_suffix, suffix)) - - if not os.path.exists(filename): - E.warn("File %s missing" % filename) - continue - - lines = iotools.open_file(filename, "r").readlines() - - # extract metrics part - rx_start = re.compile("## METRICS CLASS") - for n, line in enumerate(lines): - if rx_start.search(line): - lines = lines[n + 1:] - break - - for n, line in enumerate(lines): - if not line.strip(): - lines = lines[:n] - break - - if len(lines) == 0: - E.warn("no lines in %s: %s" % (track, filename)) - continue - - if first: - outf.write("%s\t%s" % ("track", lines[0])) - fields = lines[0][:-1].split("\t") - else: - f = lines[0][:-1].split("\t") - if f != fields: - raise ValueError( - "file %s has different fields: expected %s, got %s" % - (filename, fields, f)) - - first = False - for i in range(1, len(lines)): - outf.write("%s\t%s" % (track, lines[i])) - - outf.close() - - P.load(outf.name, - outfile, - tablename=tablename, - options="--add-index=track --allow-empty-file") - - os.unlink(outf.name) - - -def loadPicardHistogram(infiles, outfile, suffix, column, - pipeline_suffix=".picard_stats", tablename=False): - '''extract a histogram from a picard output file and load - it into database. - - Arguments - --------- - infiles : string - Filenames of files with picard metric information. Each file - corresponds to a different track. - outfile : string - Logfile. - suffix : string - Suffix to append to table name. - column : string - Column name to take from the histogram. - pipeline_suffix : string - Suffix to remove from track name. - tablename : string - Tablename to use. If unset, the table name will be derived - from `outfile` and suffix as ``to_table(outfile) + "_" + - suffix``. - ''' - - if not tablename: - tablename = "%s_%s" % (P.to_table(outfile), suffix) - tablename = tablename.replace("_metrics", "_histogram") - - # some files might be missing - xfiles = [x for x in infiles if os.path.exists("%s.%s" % (x, suffix))] - - if len(xfiles) == 0: - E.warn("no files for %s" % tablename) - return - - header = ",".join([os.path.basename(x) for x in xfiles]) - filenames = " ".join(["%s.%s" % (x, suffix) for x in xfiles]) - - # there might be a variable number of columns in the tables - # only take the first ignoring the rest - - load_statement = P.build_load_statement( - tablename, - options="--add-index=track " - " --header-names=%s,%s" - " --allow-empty-file" - " --replace-header" % (column, header)) - - statement = """cgat combine_tables - --regex-start="## HISTOGRAM" - --missing-value=0 - --take=2 - %(filenames)s - | %(load_statement)s - >> %(outfile)s - """ - - to_cluster = False - - P.run(statement) - - -def loadPicardAlignmentStats(infiles, outfile): - '''load all output from Picard's CollectMultipleMetrics into database. - - Loads tables into database with prefix derived from outfile: - * [outfile]_alignment_summary_metric - * [outfile]_insert_size_metrics - * [outfile]_quality_by_cycle_metrics - * [outfile]_quality_distribution_metrics - * [outfile]_insert_size_metrics - - Arguments - --------- - infiles : string - Filenames of files with picard metric information. Each file - corresponds to a different track. - outfile : string - Logfile. The table name will be derived from `outfile`. - - ''' - - loadPicardMetrics(infiles, outfile, "alignment_summary_metrics") - - # insert size metrics only available for paired-ended data - loadPicardMetrics(infiles, outfile, "insert_size_metrics") - - histograms = (("quality_by_cycle_metrics", "cycle"), - ("quality_distribution_metrics", "quality"), - ("insert_size_metrics", "insert_size")) - - for suffix, column in histograms: - loadPicardHistogram(infiles, outfile, suffix, column) - - -def loadPicardDuplicationStats(infiles, outfiles): - '''load picard duplicate filtering stats into database. - - Loads two tables into the database - * picard_duplication_metrics - * picard_complexity_histogram - - Arguments - --------- - infiles : string - Filenames of files with picard metric information. Each file - corresponds to a different track. - outfile : string - Logfile. The table name will be derived from `outfile`. - ''' - # SNS: added to enable naming consistency - - outfile_metrics, outfile_histogram = outfiles - - suffix = "picard_duplication_metrics" - - # the loading functions expect "infile_name.pipeline_suffix" as the infile - # names. - infile_names = [x[:-len("." + suffix)] for x in infiles] - - loadPicardMetrics(infile_names, outfile_metrics, suffix, "", - tablename="picard_duplication_metrics") - - infiles_with_histograms = [] - - # The complexity histogram is only present for PE data, so we must check - # because by design the pipeline does not track endedness - for infile in infile_names: - with_hist = False - with open(".".join([infile, suffix]), "r") as open_infile: - for line in open_infile: - if line.startswith("## HISTOGRAM"): - infiles_with_histograms.append(infile) - break - - if len(infiles_with_histograms) > 0: - loadPicardHistogram(infiles_with_histograms, - outfile_histogram, - suffix, - "coverage_multiple", - "", - tablename="picard_complexity_histogram") - else: - with open(outfile_histogram, "w") as ofh: - ofh.write("No histograms detected, no data loaded.") - - -def loadPicardDuplicateStats(infiles, outfile, pipeline_suffix=".bam"): - '''load picard duplicate filtering stats. - - Arguments - --------- - infiles : string - Filenames of files with picard metric information. Each file - corresponds to a different track. - outfile : string - Logfile. The table name will be derived from `outfile`. - pipeline_suffix : string - Suffix appended to pipeline output file, will be removed to - define track. - ''' - - loadPicardMetrics( - infiles, outfile, "duplicate_metrics", - pipeline_suffix=pipeline_suffix) - loadPicardHistogram(infiles, - outfile, - "duplicate_metrics", - "duplicates", - pipeline_suffix=pipeline_suffix) - - -def loadPicardCoverageStats(infiles, outfile): - '''import coverage statistics into database. - - Arguments - --------- - infiles : string - Filenames of files with picard metric information. Each file - corresponds to a different track. - outfile : string - Logfile. The table name will be derived from `outfile`. - ''' - - outf = P.get_temp_file(".") - first = True - for f in infiles: - track = P.snip(os.path.basename(f), ".cov") - lines = [x for x in open(f, "r").readlines() - if not x.startswith("#") and x.strip()] - if first: - outf.write("%s\t%s" % ("track", lines[0])) - first = False - outf.write("%s\t%s" % (track, lines[1])) - outf.close() - P.load(outf.name, - outfile, - options="--ignore-empty --add-index=track") - os.unlink(outf.name) - - -def buildBAMStats(infile, outfile): - '''Count number of reads mapped, duplicates, etc. using - bam2stats.py - - Arguments - --------- - infile : string - Input file in :term:`BAM` format - outfile : string - Output file in :term:`tsv` format. - - ''' - - statement = '''cgat bam2stats - --force-output - --output-filename-pattern=%(outfile)s.%%s - < %(infile)s - > %(outfile)s''' - P.run(statement) - - -def loadBAMStats(infiles, outfile): - '''load output of :func:`buildBAMStats` into database. - - Arguments - --------- - infiles : string - Input files, output from :func:`buildBAMStats`. - outfile : string - Logfile. The table name will be derived from `outfile`. - ''' - - header = ",".join([P.snip(os.path.basename(x), ".readstats") - for x in infiles]) - filenames = " ".join(["<( cut -f 1,2 < %s)" % x for x in infiles]) - tablename = P.to_table(outfile) - - load_statement = P.build_load_statement( - tablename, - options="--add-index=track " - " --allow-empty-file") - - E.info("loading bam stats - summary") - statement = """cgat combine_tables - --header-names=%(header)s - --missing-value=0 - --ignore-empty - %(filenames)s - | perl -p -e "s/bin/track/" - | cgat table2table --transpose - | %(load_statement)s - > %(outfile)s""" - - to_cluster = False - - P.run(statement) - - for suffix in ("nm", "nh"): - E.info("loading bam stats - %s" % suffix) - filenames = " ".join(["%s.%s" % (x, suffix) for x in infiles]) - - load_statement = P.build_load_statement( - "%s_%s" % (tablename, suffix), - options="--allow-empty-file") - - statement = """cgat combine_tables - --header-names=%(header)s - --skip-titles - --missing-value=0 - --ignore-empty - %(filenames)s - | perl -p -e "s/bin/%(suffix)s/" - | %(load_statement)s - >> %(outfile)s """ - - to_cluster = False - - P.run(statement) - - # load mapping qualities, there are two columns per row - # 'all_reads' and 'filtered_reads' - # Here, only filtered_reads are used (--take=3) - for suffix in ("mapq",): - E.info("loading bam stats - %s" % suffix) - filenames = " ".join(["%s.%s" % (x, suffix) for x in infiles]) - - load_statement = P.build_load_statement( - "%s_%s" % (tablename, suffix), - options=" --allow-empty-file") - - statement = """cgat combine_tables - --header-names=%(header)s - --skip-titles - --missing-value=0 - --ignore-empty - --take=3 - %(filenames)s - | perl -p -e "s/bin/%(suffix)s/" - | %(load_statement)s - >> %(outfile)s """ - - to_cluster = False - - P.run(statement) - - -def buildPicardRnaSeqMetrics(infiles, strand, outfile): - '''run picard:RNASeqMetrics - - - - Arguments - --------- - infiles : string - Input filename in :term:`BAM` format. - Genome file in refflat format - (http://genome.ucsc.edu/goldenPath/gbdDescriptionsOld.html#RefFlat) - outfile : string - Output filename with picard output. - - ''' - job_memory = PICARD_MEMORY - picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals() - job_threads = 3 - infile, genome = infiles - - if BamTools.getNumReads(infile) == 0: - E.warn("no reads in %s - no metrics" % infile) - iotools.touch_file(outfile) - return - - statement = '''picard %(picard_opts)s CollectRnaSeqMetrics - REF_FLAT=%(genome)s - INPUT=%(infile)s - ASSUME_SORTED=true - OUTPUT=%(outfile)s - STRAND=%(strand)s - VALIDATION_STRINGENCY=SILENT - ''' - P.run(statement) - - -def loadPicardRnaSeqMetrics(infiles, outfiles): - '''load picard rna stats into database. - - Loads tables into the database - * picard_rna_metrics - * picard_rna_histogram - - Arguments - --------- - infile : string - Filenames of files with picard metric information. Each file - corresponds to a different track. - outfiles : string - Logfile. The table names will be derived from `outfile`. - ''' - - outfile_metrics, outfile_histogram = outfiles - - suffix = "picard_rna_metrics" - - # the loading functions expect "infile_name.pipeline_suffix" as the infile - # names. - infile_names = [x[:-len("." + suffix)] for x in infiles] - - loadPicardMetrics(infile_names, outfile_metrics, suffix, "", - tablename="picard_rna_metrics") - - infiles_with_histograms = [] - - # Checking if histogram is present (?is this necessary) - for infile in infile_names: - with_hist = False - with open(".".join([infile, suffix]), "r") as open_infile: - for line in open_infile: - if line.startswith("## HISTOGRAM"): - infiles_with_histograms.append(infile) - break - - if len(infiles_with_histograms) > 0: - loadPicardHistogram(infiles_with_histograms, - outfile_histogram, - suffix, - "coverage_multiple", - "", - tablename="picard_rna_histogram") - else: - with open(outfile_histogram, "w") as ofh: - ofh.write("No histograms detected, no data loaded.") - - -def loadIdxstats(infiles, outfile): - '''take list of file paths to samtools idxstats output files - and merge to create single dataframe containing mapped reads per - contig for each track. This dataframe is then loaded into - database. - - Loads tables into the database - * idxstats_reads_per_chromosome - - Arguments - --------- - infiles : list - list where each element is a string of the filename containing samtools - idxstats output. Filename format is expected to be 'sample.idxstats' - outfile : string - Logfile. The table name will be derived from `outfile`. - ''' - - outf = P.get_temp_file(".") - dfs = [] - for f in infiles: - track = P.snip(f, ".idxstats").split('/')[-1] - - if not os.path.exists(f): - E.warn("File %s missing" % f) - continue - - # reformat idx stats - df = pandas.read_csv(f, sep='\t', header=None) - df.columns = ['region', 'length', 'mapped', 'unmapped'] - - # calc total reads mapped & unmappedpep - total_reads = df.unmapped.sum() + df.mapped.sum() - total_mapped_reads = df.mapped.sum() - - reformatted_df = pandas.DataFrame([['total_mapped_reads', total_mapped_reads], - ['total_reads', total_reads], - ['track', track]], columns=(['region', 'mapped'])) - - # reformat the df - df = df.append(reformatted_df, ignore_index=True) - df.set_index('region', inplace=True) - df1 = df[['mapped']].T - # set track as index - df1.set_index('track', inplace=True) - dfs.append(df1) - - # merge dataframes into single table - master_df = pandas.concat(dfs) - master_df.drop('*', axis=1, inplace=True) - # transform dataframe to avoid reaching column limit - master_df = master_df.T - master_df.to_csv(outf, sep='\t', index=True) - outf.close() - - P.load(outf.name, - outfile, - options="--ignore-empty --add-index=track") - os.unlink(outf.name) diff --git a/cgatpipelines/tasks/script_template.R b/cgatpipelines/tasks/script_template.R new file mode 100644 index 000000000..84ee66d8d --- /dev/null +++ b/cgatpipelines/tasks/script_template.R @@ -0,0 +1,66 @@ +#' filtering single cell data based on QC metrics +#' +#' WARNING: This script is work-in-progress +#' +#' Example usage: +#' +#' cgat sc-counts2counts --counts-filename=featurecounts.tsv --phenotypes-filename=phenodata.tsv --factor=group,mouse_id,collection_date,slice_depth,slice_number,pipette_visual,timepoint > filtered_counts.tsv +#' +#' `feature_counts.tsv` is a table (tab-separated) of ngenes x ncells, +#' that is the genes are in rows and the columns are cells. +#' +#' `phenodata.tsv` is a table (tab-separated) of ncells x nfeatures, +#' that is rows are cells and features are in columns. The table should contain +#' a column called `sample_id` that will match the columns in the table +#' `feature_counts.tsv`. +#' +#' Features can then be selected in the `--factor` option to be +#' plotted. +#' +#' -> todo: parameterize detection of ERCC (pattern?) +#' -> todo: parameterize definition of mitochondrial genes - currently hardcoded for mouse. + +## conda dependencies: bioconductor-scater r-cairo + +suppressMessages(library(futile.logger)) +suppressMessages(library(getopt)) + +source(file.path(Sys.getenv("R_ROOT"), "experiment.R")) + + +run <- function(opt) { + + ## do something + +} + +main <- function() { + + option_list <- list( + make_option( + "--counts-filename", + dest = "counts_filename", + type = "character", + default = "featurecounts.tsv", + help = paste("filename with input data of counts") + ), + make_option( + "--factor", + dest = "factors", + type = "character", + default = "group,collection_date", + help = paste("factors to colour QC plots by.") + ) + ) + opt <- experiment_start(option_list = option_list, + description = description) + + if (!is.null(opt$factors)) { + opt$factors = unlist(strsplit(opt$factors, ",")) + } + run(opt) + + experiment_stop() +} + +main() diff --git a/cgatpipelines/tasks/venn.R b/cgatpipelines/tasks/venn.R new file mode 100644 index 000000000..16f3ca19e --- /dev/null +++ b/cgatpipelines/tasks/venn.R @@ -0,0 +1,132 @@ +#' Wrapper for VennDiagram with basic customization and command line functionality +#' +#' WARNING: This script is work-in-progress +#' +#' Example usage: +#' +#' cgat sc-counts2counts --counts-filename=featurecounts.tsv --phenotypes-filename=phenodata.tsv --factor=group,mouse_id,collection_date,slice_depth,slice_number,pipette_visual,timepoint > filtered_counts.tsv +#' +#' `feature_counts.tsv` is a table (tab-separated) of ngenes x ncells, +#' that is the genes are in rows and the columns are cells. +#' +#' `phenodata.tsv` is a table (tab-separated) of ncells x nfeatures, +#' that is rows are cells and features are in columns. The table should contain +#' a column called `sample_id` that will match the columns in the table +#' `feature_counts.tsv`. +#' +#' Features can then be selected in the `--factor` option to be +#' plotted. +#' +#' -> todo: parameterize detection of ERCC (pattern?) +#' -> todo: parameterize definition of mitochondrial genes - currently hardcoded for mouse. + +## conda dependencies: bioconductor-scater r-cairo + +suppressMessages(library(futile.logger)) +suppressMessages(library(getopt)) +suppressMessages(library(VennDiagram)) + + +source(file.path(Sys.getenv("R_ROOT"), "experiment.R")) + + +run <- function(opt) { + + dataframes <- lapply(opt$filenames, read.table) + names(dataframes) <- opt$names + x1 <- lapply(dataframes, function(x) subset(x, padj < opt$pvalue)) + x <- lapply(x1, function(x) subset(x, abs(log2FoldChange) >= opt$lfc)) + if(opt$sign == "positive"){ + x <- lapply(x, function(y) subset(y, log2FoldChange >0))} + if(opt$sign == "negative"){ + x <- lapply(x, function(y) subset(y, log2FoldChange <0))} + x <- lapply(x, rownames) + flog.info(str(x)) + lapply(calculate.overlap(x), write, "venn.txt", append=TRUE, ncolumns=1000) + + + venn.diagram(x, + filename = 'venn_diagramm.png', + output = TRUE , + imagetype="png" , + height = 480 , + width = 480 , + resolution = 300, + compression = "lzw", + lwd = 2, + lty = 'blank', + fill = opt$colours, + cex = 0.8, + fontface = "bold", + fontfamily = "sans", + cat.cex = 0.4, + cat.fontface = "bold", + cat.default.pos = "outer", + cat.fontfamily = "sans" + ) +} + +main <- function() { + + option_list <- list( + make_option( + "--filenames", + dest = "filenames", + type = "character", + default = "file1.tsv,file2.tsv", + help = paste("filenames with results data") + ), + make_option( + "--colours", + dest = "colours", + type = "character", + default = "blue,red", + help = paste("Venn Diagram Colours") + ), + make_option( + "--names", + dest = "names", + type = "character", + default = "A,B", + help = paste("Names") + ), + make_option( + "--pvalue", + dest = "pvalue", + type = "numeric", + default = 0.05, + help = paste("p value threshold") + ), + make_option( + "--lfc", + dest = "lfc", + type = "numeric", + default = 0, + help = paste("LFC value threshold") + ), + make_option( + "--sign", + dest = "sign", + type = "character", + default = "", + help = paste("optional - possibilities are positive/negative") + ) + ) + opt <- experiment_start(option_list = option_list, + description = description) + + if (!is.null(opt$filenames)) { + opt$filenames = unlist(strsplit(opt$filenames, ",")) + } + if (!is.null(opt$colours)) { + opt$colours = unlist(strsplit(opt$colours, ",")) + } + if (!is.null(opt$names)) { + opt$names = unlist(strsplit(opt$names, ",")) + } + run(opt) + + experiment_stop() +} + +main() diff --git a/cgatpipelines/tools/pipeline_bamstats.py b/cgatpipelines/tools/pipeline_bamstats.py index 5df60e88f..f2d12771a 100644 --- a/cgatpipelines/tools/pipeline_bamstats.py +++ b/cgatpipelines/tools/pipeline_bamstats.py @@ -291,22 +291,6 @@ def buildRrnaIntervals(infile, outfile): ######################################################################### -@follows(mkdir("StrandSpec.dir")) -@transform("*.bam", - suffix(".bam"), - r"StrandSpec.dir/\1.strand") -def strandSpecificity(infile, outfile): - '''This function will determine the strand specificity of your library - from the bam file''' - - statement = ( - "cgat bam2libtype " - "--max-iterations 10000 " - "< {infile} " - "> {outfile}".format(**locals())) - return P.run(statement) - - @follows(mkdir("BamFiles.dir")) @transform("*.bam", regex("(.*).bam$"), @@ -356,6 +340,26 @@ def buildPicardDuplicationStats(infile, outfile): PICARD_MEMORY) +@follows(mkdir("RSeQC.dir")) +@follows(countReads) +@transform(intBam, + regex("BamFiles.dir/(.*).bam$"), + r"RSeQC.dir/\1.read_distribution") +def runRSeQC(infile, outfile): + + bed_file = "/ceph/project/talbotlab/jscaber/iPS-C9TrapYan/bamstats-fullgenome/hg38_GENCODE.v38.bed" + + job_memory = "6G" + + statement = ''' + read_distribution.py + -i %(infile)s + -r %(bed_file)s + > %(outfile)s + ''' + + P.run(statement) + @follows(mkdir("BamStats.dir")) @follows(countReads) @transform(intBam, @@ -804,14 +808,6 @@ def loadTranscriptProfile(infiles, outfile): bamstats.loadTranscriptProfile(infiles, outfile) -@P.add_doc(bamstats.loadStrandSpecificity) -@jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") -@follows(loadTranscriptProfile) -@merge(strandSpecificity, "strand_spec.load") -def loadStrandSpecificity(infiles, outfile): - ''' merge strand specificity data into a single table''' - bamstats.loadStrandSpecificity(infiles, outfile) - @merge((loadBAMStats, loadPicardStats, loadContextStats), "view_mapping.load") def createViewMapping(infile, outfile): @@ -856,8 +852,7 @@ def views(): loadIdxStats, loadExonValidation, loadPicardRnaSeqMetrics, - loadTranscriptProfile, - loadStrandSpecificity) + loadTranscriptProfile) def full(): '''a dummy task to run all tasks in the pipeline''' pass diff --git a/cgatpipelines/tools/pipeline_exome.py b/cgatpipelines/tools/pipeline_exome.py index 30e3adf3a..1c2c06bdc 100644 --- a/cgatpipelines/tools/pipeline_exome.py +++ b/cgatpipelines/tools/pipeline_exome.py @@ -18,19 +18,18 @@ 3. Local realignment and BQSR in GATK and deduplication in Picard 4. Calculate the ratio of reads on the X and Y chromosomes to assert sex 5. Variant calling in families (SNVs & indels) using GATK HaplotypeCaller - 6. HapMap genotyping in individuals using GATK HaplotypeCaller - 7. Comparison of Hapmap genotypes to assess relatedness (VCFtools) - 8. Variant annotation using SNPeff, GATK VariantAnnotator, and SnpSift - 9. Variant quality score recalibration (GATK) - 10. Flags variants within genes of interest (such as known disease genes) + 6. Comparison of join VCF genotypes to assess relatedness (somalier) + 7. Variant annotation using SNPeff, GATK VariantAnnotator, and SnpSift + 8. Variant quality score recalibration (GATK) + 9. Flags variants within genes of interest (such as known disease genes) (GATK) (optional) - 11. Filters potential de novo variants - 12. Filters potential de novo variants using lower stringency criteria - 13. Filters potential dominant mutations - 14. Filters potential homozygous recessive mutations - 15. Filters potential compound heterozygous mutations - 16. Generates summary statistics for unfiltered vcf file - 17. Generates report + 10. Filters potential de novo variants + 11. Filters potential de novo variants using lower stringency criteria + 12. Filters potential dominant mutations + 13. Filters potential homozygous recessive mutations + 14. Filters potential compound heterozygous mutations + 15. Generates summary statistics for unfiltered vcf file + 16. Generates report .. note:: @@ -152,6 +151,7 @@ * samtools >= 1.1 * GATK >= 2.7 * snpEff >= 4.0 +* somalier * Gemini >= ? * VCFtools >= 0.1.8a @@ -163,8 +163,8 @@ # load modules from ruffus import transform, mkdir, follows, merge, regex, suffix, \ jobs_limit, files, collate, add_inputs, formatter, \ - active_if -from ruffus.combinatorics import permutations + active_if, originate, subdivide +from ruffus.combinatorics import permutations, product import sys import os import csv @@ -177,9 +177,8 @@ import cgatcore.iotools as iotools from cgatcore import pipeline as P import cgatpipelines.tasks.mapping as mapping -import cgatpipelines.tasks.mappingqc as mappingqc +import cgatpipelines.tasks.bamstats as bamstats import cgatpipelines.tasks.exome as exome -import cgatpipelines.tasks.exomeancestry as exomeancestry ############################################################################### ############################################################################### @@ -196,9 +195,14 @@ matches = glob.glob("*.fastq.1.gz") + glob.glob( "*.fastq.gz") + glob.glob("*.sra") + glob.glob("*.csfasta.gz") +PICARD_MEMORY = PARAMS["picard_memory"] +GATK_MEMORY = PARAMS["gatk_memory"] +ANNOTATION_MEMORY = PARAMS["annotation_memory"] + +PANELS = glob.glob("*.panel.tsv") + +FILTERS = glob.glob ("*.filter.tsv") -def getGATKOptions(): - return "-l mem_free=1.4G" ############################################################################### ############################################################################### @@ -207,12 +211,10 @@ def getGATKOptions(): # The following functions are designed to upload meta-data to the csvdb # These haven't been fully implemented yet - @jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") @files(PARAMS["roi_bed"], "roi.load") def loadROI(infile, outfile): '''Import regions of interest bed file into SQLite.''' - scriptsdir = PARAMS["general_scriptsdir"] header = "chr,start,stop,feature" tablename = P.to_table(outfile) statement = '''cat %(infile)s @@ -224,14 +226,10 @@ def loadROI(infile, outfile): > %(outfile)s ''' P.run(statement) -############################################################################### - - @jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") @files(PARAMS["roi_to_gene"], "roi2gene.load") def loadROI2Gene(infile, outfile): '''Import genes mapping to regions of interest bed file into SQLite.''' - scriptsdir = PARAMS["general_scriptsdir"] tablename = P.to_table(outfile) statement = '''cat %(infile)s | cgat csv2db %(csv2db_options)s @@ -241,14 +239,10 @@ def loadROI2Gene(infile, outfile): > %(outfile)s ''' P.run(statement) -############################################################################### - - @jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") @files(PARAMS["samples"], "samples.load") def loadSamples(infile, outfile): '''Import sample information into SQLite.''' - scriptsdir = PARAMS["general_scriptsdir"] tablename = P.to_table(outfile) statement = '''cat %(infile)s | cgat csv2db %(csv2db_options)s @@ -258,11 +252,60 @@ def loadSamples(infile, outfile): > %(outfile)s ''' P.run(statement) +@originate("genome.dict") +def createSequenceDictionary(outfile): + '''creates a sequence dictionary required for running + gatk/picard tools''' + genome_file = os.path.abspath( + os.path.join(PARAMS["genome_dir"], PARAMS["genome"] + ".fa")) + + statement = '''picard CreateSequenceDictionary + -R %(genome_file)s -O %(outfile)s''' + P.run(statement) + +@originate("ensembl.dict") +def createEnsemblDictionary(outfile): + '''creates a dictionary converting ensembl gene_id + to symbol/gene_name''' + geneset = PARAMS['geneset'] + + statement = '''zcat %(geneset)s| + cgat gtf2tsv --attributes-as-columns|grep -v ^#| + awk -F '\\t' '{print $9, $16}'|uniq > %(outfile)s''' + + P.run(statement) + +@mkdir("panels") +@subdivide([x for x in PANELS], + regex("(\S+).panel.tsv"), + add_inputs(createEnsemblDictionary), + r"panels/\1.bed") +def createPanelBed(infiles, outfile): + + geneset = PARAMS['geneset'] + panel_path,dict_path = infiles + ensembl_dict = pd.read_csv(dict_path, sep=" ") + with open(panel_path) as f: + panellist = [line.strip() for line in f] + panel = ensembl_dict[ensembl_dict["gene_name"].isin(panellist)] + panelgenes = " -e ".join(panel['gene_id'].to_list()) + + + statement = '''zcat %(geneset)s + |grep -e %(panelgenes)s|cgat gtf2gtf --method=sort| + cgat gff2bed --is-gtf --log %(outfile)s.log | sort -k1,1 -k2,2n| + cgat bed2bed --method=merge --merge-by-name + --log %(outfile)s.log > %(outfile)s''' + + P.run(statement) + ############################################################################### ############################################################################### ############################################################################### -# Alignment to a reference genome +# Pre-Processing for Variant Discovery +############################################################################### +# Alignment & post-alignment QC & GATK preparation @follows(mkdir("bam")) @transform(INPUT_FORMATS, REGEX_FORMATS, r"bam/\1.bam") @@ -274,109 +317,71 @@ def mapReads(infiles, outfile): statement = m.build((infiles,), outfile) P.run(statement, job_memory="8G", job_threads=PARAMS["bwa_threads"]) -############################################################################### -############################################################################### -############################################################################### -# Post-alignment QC - @transform(mapReads, regex(r"bam/(\S+).bam"), r"bam/\1.picard_stats") def PicardAlignStats(infile, outfile): '''Run Picard CollectMultipleMetrics on each BAM file''' - genome = PARAMS["bwa_index_dir"] + "/" + PARAMS["genome"] + ".fa" - mappingqc.buildPicardAlignmentStats(infile, outfile, genome) - -############################################################################### + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" + bamstats.buildPicardAlignmentStats(infile, outfile, genome, PICARD_MEMORY) @jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") @merge(PicardAlignStats, "picard_stats.load") def loadPicardAlignStats(infiles, outfile): '''Merge Picard alignment stats into single table and load into SQLite.''' - mappingqc.loadPicardAlignmentStats(infiles, outfile) - -############################################################################### -############################################################################### -############################################################################### -# GATK + bamstats.loadPicardAlignmentStats(infiles, outfile) @follows(loadPicardAlignStats, mkdir("gatk")) -@transform(mapReads, regex(r"bam/(\S+).bam"), r"gatk/\1.readgroups.bam") -def GATKReadGroups(infile, outfile): +@transform(mapReads, regex(r"bam/(\S+).bam"), + add_inputs(createSequenceDictionary), + r"gatk/\1.readgroups.bam") +def GATKReadGroups(infiles, outfile): '''Reorders BAM according to reference fasta and adds read groups using GATK''' - '''Reorders BAM according to reference fasta and add read groups using - SAMtools, realigns around indels and recalibrates base quality - scores using GATK - - ''' + infile, dictionary = infiles track = re.sub(r'-\w+-\w+\.bam', '', os.path.basename(infile)) - tmpdir_gatk = P.get_temp_dir('.') - job_options = getGATKOptions() job_threads = PARAMS["gatk_threads"] library = PARAMS["readgroup_library"] platform = PARAMS["readgroup_platform"] platform_unit = PARAMS["readgroup_platform_unit"] - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" - exome.GATKReadGroups(infile, outfile, genome, + exome.GATKReadGroups(infile, outfile, dictionary, library, platform, - platform_unit, track) + platform_unit, track, + GATK_MEMORY) iotools.zap_file(infile) -############################################################################### -############################################################################### + ############################################################################### # Remove duplicates, realign and recalibrate lane-by-lane - @transform(GATKReadGroups, regex(r"gatk/(\S+).readgroups.bam"), r"gatk/\1.dedup.bam") def RemoveDuplicatesLane(infile, outfile): '''Merge Picard duplicate stats into single table and load into SQLite.''' - mappingqc.buildPicardDuplicateStats(infile, outfile) + bamstats.buildPicardDuplicateStats(infile, outfile, PICARD_MEMORY) iotools.zap_file(infile) -############################################################################### - @jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") @merge(RemoveDuplicatesLane, "picard_duplicate_stats_lane.load") def loadPicardDuplicateStatsLane(infiles, outfile): '''Merge Picard duplicate stats into single table and load into SQLite.''' - mappingqc.loadPicardDuplicateStats(infiles, outfile) - -############################################################################### + bamstats.loadPicardDuplicateStats(infiles, outfile) @transform(RemoveDuplicatesLane, regex(r"gatk/(\S+).dedup.bam"), - r"gatk/\1.realigned2.bam") -def GATKIndelRealignLane(infile, outfile): - '''realigns around indels using GATK''' - threads = PARAMS["gatk_threads"] - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" - intervals = PARAMS["roi_intervals"] - padding = PARAMS["roi_padding"] - exome.GATKIndelRealign(infile, outfile, genome, intervals, padding, - threads) - iotools.zap_file(infile) - -############################################################################### - - -@transform(GATKIndelRealignLane, - regex(r"gatk/(\S+).realigned2.bam"), r"gatk/\1.bqsr.bam") def GATKBaseRecal(infile, outfile): '''recalibrates base quality scores using GATK''' intrack = P.snip(os.path.basename(infile), ".bam") outtrack = P.snip(os.path.basename(outfile), ".bam") dbsnp = PARAMS["gatk_dbsnp"] - solid_options = PARAMS["gatk_solid_options"] - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" + options = PARAMS["gatk_baserecalibrator_options"] + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" intervals = PARAMS["roi_intervals"] padding = PARAMS["roi_padding"] if PARAMS["targetted"]: @@ -384,15 +389,14 @@ def GATKBaseRecal(infile, outfile): shutil.copyfile(intrack + ".bai", outtrack + ".bai") else: exome.GATKBaseRecal(infile, outfile, genome, intervals, - padding, dbsnp, solid_options) + padding, dbsnp, options, + GATK_MEMORY) iotools.zap_file(infile) -############################################################################### -############################################################################### + ############################################################################### # Merge BAMs across different lanes for the same sample - @collate(GATKBaseRecal, regex(r"gatk/(.*).bqsr.bam"), r"gatk/\1.merged.bam") @@ -413,12 +417,10 @@ def mergeBAMs(infiles, outfile): for inputfile in infiles: iotools.zap_file(inputfile) -############################################################################### -############################################################################### + ############################################################################### # Remove duplicates sample-by-sample - @transform(mergeBAMs, regex(r"gatk/(\S+).merged.bam"), add_inputs(r"gatk/\1.merged.bam.count"), @@ -428,582 +430,429 @@ def RemoveDuplicatesSample(infiles, outfile): infile, countfile = infiles countf = open(countfile, "r") if countf.read() != '1': - mappingqc.buildPicardDuplicateStats(infile, outfile) + bamstats.buildPicardDuplicateStats(infile, outfile, PICARD_MEMORY) else: shutil.copyfile(infile, outfile) shutil.copyfile(infile + ".bai", outfile + ".bai") iotools.zap_file(infile) -############################################################################### - - @jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") @merge(RemoveDuplicatesSample, "picard_duplicate_stats_sample.load") def loadPicardDuplicateStatsSample(infiles, outfile): '''Merge Picard duplicate stats into single table and load into SQLite.''' - mappingqc.loadPicardDuplicateStats(infiles, outfile) - -############################################################################### -############################################################################### -############################################################################### -# Realign sample-by-sample + bamstats.loadPicardDuplicateStats(infiles, outfile) -@transform(RemoveDuplicatesSample, - regex(r"gatk/(\S+).dedup2.bam"), - add_inputs(r"gatk/\1.merged.bam.count"), - r"gatk/\1.realigned.bam") -def GATKIndelRealignSample(infiles, outfile): - '''realigns around indels using GATK''' - infile, countfile = infiles - threads = PARAMS["gatk_threads"] - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" - intervals = PARAMS["roi_intervals"] - padding = PARAMS["roi_padding"] - countf = open(countfile, "r") - if countf.read() > '1': - exome.GATKIndelRealign(infiles[0], outfile, genome, intervals, - padding, threads) - else: - shutil.copyfile(infile, outfile) - shutil.copyfile(infile + ".bai", outfile + ".bai") -# iotools.zap_file(infile) - -############################################################################### -############################################################################### ############################################################################### # Coverage of targetted area -@transform(GATKIndelRealignSample, - regex(r"gatk/(\S+).realigned.bam"), +@transform(RemoveDuplicatesSample, + regex(r"gatk/(\S+).dedup2.bam"), r"gatk/\1.cov") def buildCoverageStats(infile, outfile): '''Generate coverage statistics for regions of interest from a bed file using Picard''' baits = PARAMS["roi_baits"] regions = PARAMS["roi_regions"] - mappingqc.buildPicardCoverageStats(infile, outfile, - baits, regions) - -############################################################################### + bamstats.buildPicardCoverageStats(infile, outfile, + baits, regions, PICARD_MEMORY) @jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") @merge(buildCoverageStats, "coverage_stats.load") def loadCoverageStats(infiles, outfile): '''Import coverage statistics into SQLite''' - mappingqc.loadPicardCoverageStats(infiles, outfile) - -############################################################################### -############################################################################### -############################################################################### -# Guess sex - + bamstats.loadPicardCoverageStats(infiles, outfile) -@follows(mkdir("xy_ratio")) -@transform(GATKIndelRealignSample, - regex(r"gatk/(\S+).realigned.bam"), - r"xy_ratio/\1.sex") -def calcXYratio(infile, outfile): - '''Guess the sex of a sample based on ratio of reads - per megabase of sequence on X and Y''' - exome.guessSex(infile, outfile) - -############################################################################### -@merge(calcXYratio, "xy_ratio/xy_ratio.tsv") -def mergeXYRatio(infiles, outfile): - '''merge XY ratios from all samples and load into database''' - inlist = " ".join(infiles) - statement = '''cgat combine_tables - --add-file-prefix --regex-filename="xy_ratio/(\S+).sex" - --no-titles --missing-value=0 --ignore-empty - -L %(outfile)s.log -v 6 - --cat=Track %(inlist)s - > %(outfile)s''' - P.run(statement) - -############################################################################### - - -@jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") -@transform(mergeXYRatio, regex(r"xy_ratio/xy_ratio.tsv"), - r"xy_ratio/xy_ratio.load") -def loadXYRatio(infile, outfile): - '''load into database''' - P.load(infile, outfile, "--header-names=Track,X,Y,XY_ratio") - ############################################################################### ############################################################################### ############################################################################### -# Variant Calling +# GATK germline cohort calling @follows(mkdir("variants")) -@transform(GATKIndelRealignSample, regex(r"gatk/(\S+).realigned.bam"), +@transform(RemoveDuplicatesSample, regex(r"gatk/(\S+).dedup2.bam"), r"variants/\1.haplotypeCaller.g.vcf") def haplotypeCaller(infile, outfile): '''Call SNVs and indels using GATK HaplotypeCaller in individuals''' - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" - job_options = getGATKOptions() + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" job_threads = PARAMS["gatk_threads"] dbsnp = PARAMS["gatk_dbsnp"] intervals = PARAMS["roi_intervals"] padding = PARAMS["roi_padding"] options = PARAMS["gatk_hc_options"] exome.haplotypeCaller(infile, outfile, genome, dbsnp, - intervals, padding, options) + intervals, padding, options, GATK_MEMORY) -############################################################################### - - -@merge(haplotypeCaller, "variants/all_samples.vcf") -def genotypeGVCFs(infiles, outfile): - '''Joint genotyping of all samples together''' - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" - inputfiles = " --variant ".join(infiles) - options = PARAMS["gatk_genotype_options"] - exome.genotypeGVCFs(inputfiles, outfile, genome, options) - -############################################################################### -############################################################################### -############################################################################### -# HapMap Genotypes - - -@follows(mkdir("hapmap")) -@files(PARAMS["hapmap_vcf"], "hapmap/hapmap_exome.bed") -def SelectExonicHapmapVariants(infile, outfile): - '''Select variants from HapMap project that are located within - exome target regions. Assumes bgzipped & tabix indexed Hapmap VCF file.''' - bed = PARAMS["roi_regions"] - statement = '''tabix -B %(infile)s %(bed)s | - awk '{OFS="\\t" if (!/^#/){print $1,$2-1,$2}}' - > %(outfile)s''' % locals() - P.run(statement) - -############################################################################### - - -@follows(SelectExonicHapmapVariants) -@transform(GATKIndelRealignSample, - regex(r"gatk/(\S+).realigned.bam"), - add_inputs("hapmap/hapmap_exome.bed"), - r"hapmap/\1.hapmap.vcf") -def HapMapGenotype(infiles, outfile): - '''Genotype HapMap SNPs using HaplotypeCaller in each individual''' - infile, intervals = infiles - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" - dbsnp = PARAMS["gatk_dbsnp"] - padding = PARAMS["hapmap_padding"] - options = PARAMS["hapmap_hc_options"] - exome.haplotypeCaller(infile, outfile, genome, dbsnp, - intervals, padding, options) - -############################################################################### +@merge(haplotypeCaller, "variants/genomicsdb.log") +def consolidateGVCFs(infiles, outfile): + '''generates a GenomicsDB workspace from all GVCF files, this is an + easy-access database of all samples developed by the Intel-Broad team.''' + inputlen = len(infiles) + inputfiles = " -V ".join(infiles) + + + DB_MEMORY = PARAMS["gatk_dbmem"] -@transform(HapMapGenotype, regex(r"hapmap/(\S+).hapmap.vcf"), - r"hapmap/\1.hapmap.vcf.gz") -def indexVCFs(infile, outfile): - '''Genotype HapMap SNPs using HaplotypeCaller in each individual''' - statement = '''bgzip -c %(infile)s > %(outfile)s && - tabix -p vcf %(outfile)s; ''' - P.run(statement) + exome.consolidateGVCFs(inputfiles, outfile, inputlen, DB_MEMORY) -############################################################################### -############################################################################### -############################################################################### -# Compare Hapmap genotypes to assess relatedness - - -@follows(indexVCFs, mkdir("hapmap/vcfcompare")) -@permutations("hapmap/*.hapmap.vcf.gz", - formatter("hapmap/(\S+).hapmap.vcf.gz$"), - 2, - r"hapmap/vcfcompare/{basename[0][0]}_vs_{basename[1][0]}.vcfcompare") -def vcfCompare(infiles, outfile): - '''Compare HapMap genotypes from each pair of calls ''' - sample1, sample2 = infiles - name1 = P.snip(os.path.basename(sample1), ".hapmap.vcf.gz") - name2 = P.snip(os.path.basename(sample2), ".hapmap.vcf.gz") - statement = '''vcf-compare -g -m %(name1)s:%(name2)s - %(sample1)s %(sample2)s > %(outfile)s''' - P.run(statement) -############################################################################### - - -@transform(vcfCompare, - regex(r"hapmap/vcfcompare/(\S+).vcfcompare"), - r"hapmap/vcfcompare/\1.ndr") -def parseVcfCompare(infile, outfile): - '''Extract non-reference discordance rate for each comparison''' - statement = '''cat %(infile)s - | grep "Non-reference Discordance Rate (NDR):" - | cut -f 3 - > %(outfile)s''' - P.run(statement) - -############################################################################### - - -@merge(parseVcfCompare, "hapmap/vcfcompare/ndr.tsv") -def mergeNDR(infiles, outfile): - '''Merge NDR values for all files''' - outf = open(outfile, "w") - outf.write("sample1\tsample2\tNDR\n") - for infile in infiles: - inbase = P.snip(os.path.basename(infile), ".ndr") - sample1, sample2 = inbase.split("_vs_") - inf = open(infile, "r") - ndr = inf.readline().strip() - result = "\t".join([sample1, sample2, ndr]) - outf.write(result + "\n") - inf.close() - outf.close() - -############################################################################### - - -@jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") -@transform(mergeNDR, regex(r"hapmap/vcfcompare/ndr.tsv"), - r"hapmap/vcfcompare/ndr.load") -def loadNDR(infile, outfile): - '''Load NDR into database''' - P.load(infile, outfile) - -############################################################################### -############################################################################### -############################################################################### -# Variant Annotation - - -@transform(genotypeGVCFs, - regex(r"variants/all_samples.vcf"), - r"variants/all_samples.snpeff.vcf") -def annotateVariantsSNPeff(infile, outfile): - '''Annotate variants using SNPeff''' - job_memory = PARAMS["annotation_memory"] - job_threads = PARAMS["annotation_threads"] - snpeff_genome = PARAMS["annotation_snpeff_genome"] - config = PARAMS["annotation_snpeff_config"] - statement = ( - "snpEff -Xmx%(job_memory)s " - "eff " - "-c %(config)s " - "-v %(snpeff_genome)s " - "-o gatk %(infile)s > %(outfile)s" % locals()) - P.run(statement, job_memory=PARAMS["annotation_memory"]) - -############################################################################### - - -@transform(annotateVariantsSNPeff, - regex(r"variants/all_samples.snpeff.vcf"), - r"variants/all_samples.snpeff.table") -def vcfToTableSnpEff(infile, outfile): - '''Converts vcf to tab-delimited file''' - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" - columns = PARAMS["annotation_snpeff_to_table"] - exome.vcfToTable(infile, outfile, genome, columns) - - -@jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") -@transform(vcfToTableSnpEff, regex(r"variants/(\S+).table"), - r"variants/\1.table.load") -def loadTableSnpEff(infile, outfile): - '''Load VCF annotations into database''' - P.load(infile, outfile, options="--retry --ignore-empty") - - -############################################################################### -# GATK Variant Annotator - - -@merge(GATKIndelRealignSample, "gatk/all_samples.list") -def listOfBAMs(infiles, outfile): - '''generates a file containing a list of BAMs for use in VQSR''' - with iotools.open_file(outfile, "w") as outf: - for infile in infiles: - outf.write(infile + '\n') - -############################################################################### +@transform(consolidateGVCFs, regex(r"variants/genomicsdb.log"), + "variants/all_samples.vcf") +def genotypeGVCFs(infile, outfile): + '''Joint genotyping of all samples together''' + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" + options = PARAMS["gatk_genotype_options"] + + exome.genotypeGVCFs(infile, outfile, genome, options) -@follows(annotateVariantsSNPeff) @transform(genotypeGVCFs, regex(r"variants/all_samples.vcf"), - add_inputs(listOfBAMs, r"variants/all_samples.snpeff.vcf"), - r"variants/all_samples.annotated.vcf") -def variantAnnotator(infiles, outfile): - '''Annotate variant file using GATK VariantAnnotator''' - vcffile, bamlist, snpeff_file = infiles - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" - dbsnp = PARAMS["gatk_dbsnp"] - annotations = PARAMS["gatk_variant_annotations"] - exome.variantAnnotator(vcffile, bamlist, outfile, genome, - dbsnp, annotations, snpeff_file) + r"variants/all_samples_excesshet.vcf.gz") +def filterExcessHets(infile, outfile): + ''' + Hard filter on Excess Heterozygotes + Should not make a difference on small cohorts unless related + 54.69 corresponds to a Z score of -4.5 + ''' + job_memory = GATK_MEMORY + statement = ''' + gatk --java-options "-Xmx%(job_memory)s -Xms%(job_memory)s" + VariantFiltration + -V %(infile)s + --filter-expression "ExcessHet > 54.69" + --filter-name ExcessHet + -O %(outfile)s + > %(outfile)s.log 2>&1 + ''' + P.run(statement) ############################################################################### # SNP Recalibration +@transform(filterExcessHets, + regex(r"variants/all_samples_excesshet.vcf.gz"), + r"variants/all_samples_sitesonly.vcf.gz") +def sitesOnlyVcf(infile,outfile): + ''' + Create sites-only VCF with MakeSitesOnlyVcf + ''' + job_memory = GATK_MEMORY + statement = '''gatk MakeSitesOnlyVcf + -I %(infile)s + -O %(outfile)s + > %(outfile)s.log 2>&1 ''' + P.run(statement) + -@transform(variantAnnotator, - regex(r"variants/all_samples.annotated.vcf"), +@transform(sitesOnlyVcf, + regex(r"variants/all_samples_sitesonly.vcf.gz"), r"variants/all_samples.snp_vqsr.recal") def variantRecalibratorSnps(infile, outfile): '''Create variant recalibration file''' - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" dbsnp = PARAMS["gatk_dbsnp"] - job_options = getGATKOptions() job_threads = PARAMS["gatk_threads"] - track = P.snip(outfile, ".recal") kgenomes = PARAMS["gatk_kgenomes"] hapmap = PARAMS["gatk_hapmap"] omni = PARAMS["gatk_omni"] mode = 'SNP' exome.variantRecalibrator(infile, outfile, genome, mode, dbsnp, - kgenomes, hapmap, omni) - -############################################################################### + kgenomes, hapmap, omni, GATK_MEMORY) @follows(variantRecalibratorSnps) -@transform(variantAnnotator, - regex(r"variants/all_samples.annotated.vcf"), +@transform(filterExcessHets, + regex(r"variants/all_samples_excesshet.vcf.gz"), add_inputs(r"variants/all_samples.snp_vqsr.recal", r"variants/all_samples.snp_vqsr.tranches"), - r"variants/all_samples.snp_vqsr.vcf") -def applyVariantRecalibrationSnps(infiles, outfile): + r"variants/all_samples.snp_vqsr.vcf.gz") +def applyVQSRSnps(infiles, outfile): '''Perform variant quality score recalibration using GATK ''' vcf, recal, tranches = infiles - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" mode = 'SNP' - exome.applyVariantRecalibration(vcf, recal, tranches, - outfile, genome, mode) + exome.applyVQSR(vcf, recal, tranches, + outfile, genome, mode, GATK_MEMORY) ############################################################################### # Indel recalibration -@transform(applyVariantRecalibrationSnps, - regex(r"variants/all_samples.snp_vqsr.vcf"), - r"variants/all_samples.vqsr.recal") +@transform(sitesOnlyVcf, + regex(r"variants/all_samples_sitesonly.vcf.gz"), + r"variants/all_samples.indel_vqsr.recal") def variantRecalibratorIndels(infile, outfile): '''Create variant recalibration file''' - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" - job_options = getGATKOptions() + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" job_threads = PARAMS["gatk_threads"] - track = P.snip(outfile, ".recal") mills = PARAMS["gatk_mills"] dbsnp = PARAMS["gatk_dbsnp"] + axiom = PARAMS["gatk_axiom"] mode = 'INDEL' exome.variantRecalibrator(infile, outfile, genome, mode, - mills=mills) - -############################################################################### + mills=mills, axiom=axiom, dbsnp=dbsnp, gatkmem=GATK_MEMORY) @follows(variantRecalibratorIndels) -@transform(applyVariantRecalibrationSnps, - regex(r"variants/all_samples.snp_vqsr.vcf"), - add_inputs(r"variants/all_samples.vqsr.recal", - r"variants/all_samples.vqsr.tranches"), - r"variants/all_samples.vqsr.vcf") -def applyVariantRecalibrationIndels(infiles, outfile): +@transform(applyVQSRSnps, + regex(r"variants/all_samples.snp_vqsr.vcf.gz"), + add_inputs(r"variants/all_samples.indel_vqsr.recal", + r"variants/all_samples.indel_vqsr.tranches"), + r"variants/all_samples.vqsr.vcf.gz") +def applyVQSRIndels(infiles, outfile): '''Perform variant quality score recalibration using GATK ''' vcf, recal, tranches = infiles - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" mode = 'INDEL' - exome.applyVariantRecalibration(vcf, recal, tranches, - outfile, genome, mode) - + exome.applyVQSR(vcf, recal, tranches, + outfile, genome, mode, GATK_MEMORY) ############################################################################### -# SnpSift - -@transform(applyVariantRecalibrationIndels, - regex(r"variants/all_samples.vqsr.vcf"), - r"variants/all_samples_dbnsfp.snpsift.vcf") -def annotateVariantsDBNSFP(infile, outfile): - '''Add annotations using SNPsift''' - job_options = "-l mem_free=6G" - job_threads = PARAMS["annotation_threads"] - dbNSFP = PARAMS["annotation_dbnsfp"] - dbN_annotators = PARAMS["annotation_dbnsfpannotators"] - if len(dbN_annotators) == 0: - annostring = "" - else: - annostring = "-f %s" % dbN_annotators +# coverage over candidate genes - statement = """SnpSift.sh dbnsfp -db %(dbNSFP)s -v %(infile)s - %(annostring)s > - %(outfile)s;""" - P.run(statement) +@merge(RemoveDuplicatesSample, "gatk/all_samples.list") +def listOfBAMs(infiles, outfile): + '''generates a file containing a list of BAMs''' + with iotools.open_file(outfile, "w") as outf: + for infile in infiles: + outf.write(infile + '\n') +@active_if(PARAMS["coverage_calculate"] == 1) +@transform(listOfBAMs, regex(r"gatk/all_samples.list"), r"candidate.sample_interval_summary") +def candidateCoverage(infile, outfile): + '''Calculate coverage over exons of candidate genes''' + all_exons = PARAMS["coverage_all_exons"] + candidates = PARAMS["coverage_candidates"] + candidates = candidates.replace(",", " -e ") + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" + threshold = PARAMS["coverage_threshold"] + statement = '''zcat %(all_exons)s | grep -e %(candidates)s + | awk '{print $1 ":" $4 "-" $5}' - | sed 's/chr//' - > + candidate.interval_list ; ''' % locals() + statement += '''zcat %(all_exons)s | grep -e %(candidates)s + | awk '{print $16}' - | sed 's/"//g;s/;//g' - > + candidate_gene_names.txt ;''' % locals() + statement += '''GenomeAnalysisTK -T DepthOfCoverage -R %(genome)s + -o candidate -I %(infile)s -ct %(threshold)s -L candidate.interval_list ;''' % locals() + P.run(statement) -@transform(annotateVariantsDBNSFP, - regex(r"variants/all_samples_dbnsfp.snpsift.vcf"), - r"variants/all_samples_clinvar.snpsift.vcf") -def annotateVariantsClinvar(infile, outfile): - '''Add annotations using SNPsift''' - job_options = "-l mem_free=6G" - job_threads = PARAMS["annotation_threads"] - clinvar = PARAMS["annotation_clinvar"] - intout = outfile.replace("samples", "samples_clinvar") - statement = """SnpSift.sh annotate %(clinvar)s - %(infile)s > %(outfile)s;""" +@active_if(PARAMS["coverage_calculate"] == 1) +@transform(candidateCoverage, regex(r"candidate.sample_interval_summary"), r"candidate_coverage_plot.pdf") +def candidateCoveragePlots(infile, outfile): + '''Produce plots of coverage''' + rscript = PARAMS["coverage_rscript"] + threshold = PARAMS["coverage_threshold"] + statement = '''Rscript %(rscript)s %(infile)s candidate_gene_names.txt %(threshold)s %(outfile)s ;''' P.run(statement) -@transform(annotateVariantsClinvar, - regex(r"variants/all_samples_clinvar.snpsift.vcf"), - r"variants/all_samples_exac.snpsift.vcf") -def annotateVariantsExAC(infile, outfile): - '''Add annotations using SNPsift''' - job_options = "-l mem_free=6G" - job_threads = PARAMS["annotation_threads"] - exac = PARAMS["annotation_exac"] - statement = """SnpSift.sh annotate - %(exac)s - %(infile)s > %(outfile)s;""" - P.run(statement) +############################################################################### +############################################################################### +############################################################################### +# Variant Annotation -@transform(annotateVariantsExAC, - regex(r"variants/all_samples_exac.snpsift.vcf"), - r"variants/all_samples_gwasc.snpsift.vcf") -def annotateVariantsGWASC(infile, outfile): - '''Add annotations using SNPsift''' - job_options = "-l mem_free=6G" - job_threads = PARAMS["annotation_threads"] - gwas_catalog = PARAMS["annotation_gwas_catalog"] - statement = """SnpSift.sh gwasCat -db %(gwas_catalog)s - %(infile)s > %(outfile)s;""" - P.run(statement) +############################################################################### +# Somalier - assessing relatedness of samples +@transform(applyVQSRIndels, + regex(r"variants/all_samples.vqsr.vcf.gz"), + r"variants/somalier/somalier.log") +def extractSomalier(infile,outfile): + '''extract list of polymorphic sites using Somalier''' -@transform(annotateVariantsGWASC, - regex(r"variants/all_samples_gwasc.snpsift.vcf"), - r"variants/all_samples_phastcons.snpsift.vcf") -def annotateVariantsPhastcons(infile, outfile): - '''Add annotations using SNPsift''' - job_options = "-l mem_free=6G" - job_threads = PARAMS["annotation_threads"] - genomeind = "%s/%s.fa.fai" % ( - PARAMS['general_genome_dir'], - PARAMS['general_genome']) - phastcons = PARAMS["annotation_phastcons"] - intout = outfile.replace("samples", "samples_phastc") - statement = """ln -sf %(genomeind)s %(phastcons)s/genome.fai && - SnpSift.sh phastCons %(phastcons)s %(infile)s > - %(outfile)s;""" + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" + somalier = PARAMS['annotation_somalier_path'] + sites = PARAMS['annotation_somalier_sites'] + outdir = os.path.dirname(outfile) + statement = '''%(somalier)s extract + -d %(outdir)s + --sites %(sites)s + -f %(genome)s + %(infile)s + > %(outfile)s 2>&1 + ''' P.run(statement) -@transform(annotateVariantsPhastcons, - regex(r"variants/all_samples_phastcons.snpsift.vcf"), - r"variants/all_samples_1000G.snpsift.vcf") -def annotateVariants1000G(infile, outfile): - '''Add annotations using SNPsift''' - job_options = "-l mem_free=6G" - job_threads = PARAMS["annotation_threads"] +@transform(extractSomalier, + regex(r"variants/somalier/somalier.log"), + r"variants/somalier/relate.log") +def relateSomalier(infile,outfile): + '''Calculate relatedness based on 1000G project''' - vcfs = [] - for f in os.listdir(PARAMS["annotation_tgdir"]): - if f.endswith(".vcf.gz"): - if PARAMS['test'] == 1: - if "chr14" in f: - vcfs.append("%s/%s" % (PARAMS['annotation_tgdir'], f)) - else: - vcfs.append("%s/%s" % (PARAMS['annotation_tgdir'], f)) - - T = P.get_temp_filename(".") - shutil.copy(infile, T) - tempin = T - tempout = P.get_temp_filename(".") - - for vcf in vcfs: - statement = """SnpSift.sh annotate - %(vcf)s - %(tempin)s > %(tempout)s && - mv %(tempout)s %(tempin)s""" - P.run(statement) + somalier = PARAMS['annotation_somalier_path'] + outdir = os.path.dirname(outfile) + statement ='''%(somalier)s relate + %(outdir)s/*.somalier + > %(outfile)s 2>&1''' + P.run(statement) - shutil.move(tempin, outfile) +@transform(extractSomalier, + regex(r"variants/somalier/somalier.log"), + r"variants/somalier/ancestry.log") +def ancestrySomalier(infile,outfile): + '''Calculate Ancestry based on 1000G project''' + + somalier = PARAMS['annotation_somalier_path'] + labels = PARAMS['annotation_somalier_labels'] + reference = PARAMS['annotation_somalier_1kg'] + outdir = os.path.dirname(outfile) + statement ='''%(somalier)s ancestry + --labels %(labels)s %(reference)s/*.somalier ++ + %(outdir)s/*.somalier + > %(outfile)s 2>&1''' + P.run(statement) -@transform(annotateVariants1000G, - regex(r"variants/all_samples_1000G.snpsift.vcf"), - r"variants/all_samples_dbsnp.snpsift.vcf") -def annotateVariantsDBSNP(infile, outfile): - '''Add annotations using SNPsift''' - job_options = "-l mem_free=6G" - job_threads = PARAMS["annotation_threads"] - dbsnp = PARAMS["annotation_dbsnp"] - statement = """SnpSift.sh annotate - %(dbsnp)s - %(infile)s > %(outfile)s;""" +############################################################################### +# Expansionhunter - P.run(statement) +@follows(mkdir("expansionhunter")) +@transform(RemoveDuplicatesSample, regex(r"gatk/(\S+).dedup2.bam"), + r"expansionhunter/\1.vcf") +def expansionHunter(infile, outfile): + + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" + catalog = PARAMS["expansionhunter"] + outfile = P.snip(outfile) -@follows(annotateVariantsDBSNP) -def annotateVariantsSNPsift(): - pass + statement = '''ExpansionHunter + --reads %(infile)s + --reference %(genome)s + --variant-catalog %(catalog)s + --output-prefix %(outfile)s + > %(outfile)s.log 2>&1 + ''' + P.run(statement, job_condaenv="expansionhunter") +############################################################################### +# VEP -@transform(annotateVariantsDBSNP, - regex(r"variants/all_samples_dbsnp.snpsift.vcf"), - r"variants/all_samples.vep.vcf") +@transform(applyVQSRIndels, + regex(r"variants/all_samples.vqsr.vcf.gz"), + r"variants/all_samples.vep.vcf.gz") def annotateVariantsVEP(infile, outfile): - ''' - Adds annotations as specified in the pipeline.yml using Ensembl - variant effect predictor (VEP). - ''' + + #Adds annotations as specified in the pipeline.yml using Ensembl + #variant effect predictor (VEP). + # infile - VCF # outfile - VCF with vep annotations - job_options = "-l mem_free=6G" - job_threads = 4 + job_memory = PARAMS["annotation_memory"] + job_threads = PARAMS["annotation_threads"] - VEP = PARAMS["annotation_vepannotators"].split(",") - vep_annotators = PARAMS["annotation_vepannotators"] - vep_path = PARAMS["annotation_veppath"] + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" + vep_options = PARAMS["annotation_vepoptions"] vep_cache = PARAMS["annotation_vepcache"] vep_species = PARAMS["annotation_vepspecies"] vep_assembly = PARAMS["annotation_vepassembly"] - if len(vep_annotators) != 0: - annostring = vep_annotators - statement = '''perl %(vep_path)s/variant_effect_predictor.pl - --cache --dir %(vep_cache)s --vcf - --species %(vep_species)s - --fork 2 - --assembly %(vep_assembly)s --input_file %(infile)s - --output_file %(outfile)s --force_overwrite - %(annostring)s --offline;''' - P.run(statement) + spliceAI_snv = PARAMS["annotation_spliceAI_snv"] + spliceAI_indel = PARAMS["annotation_spliceAI_indel"] + dbNSFP_path = PARAMS["annotation_dbnsfp_path"] + dbNSFP_columns = PARAMS["annotation_dbnsfp_columns"] + if dbNSFP_columns is None: + dbNSFP_columns = "ALL" + + statement = '''vep + -i %(infile)s + --cache --dir %(vep_cache)s --vcf + --species %(vep_species)s + --fork %(job_threads)s + --compress_output bgzip + --assembly %(vep_assembly)s + --plugin SameCodon + -o %(outfile)s --force_overwrite''' + if dbNSFP_path is not None: + statement +=''' + --plugin dbNSFP,%(dbNSFP_path)s,%(dbNSFP_columns)s''' + if spliceAI_snv is not None: + statement +=''' + --plugin SpliceAI,snv=%(spliceAI_snv)s,indel=%(spliceAI_indel)s''' + if vep_options is not None: + statement += " %(vep_options)s" + statement += ''' > %(outfile)s.log 2>&1; + tabix -p vcf %(outfile)s''' + P.run(statement) + + + +@subdivide([x for x in FILTERS], + regex("(\S+).filter.tsv"), + add_inputs(annotateVariantsVEP), + r"variants/all_samples.\1.vcf.gz") +def filterVEP(infiles, outfile): + ''' + Filter variants using vep-filter. + More docs + ''' + + filter_file,infile = infiles + with open(filter_file) as f: + filters = f.read() + outfile = P.snip(outfile, ".gz") + + statement = '''filter_vep + --gz + -i %(infile)s + -o %(outfile)s + --only_matched + --force_overwrite + -f "%(filters)s"; + > %(outfile)s.log 2>&1; + bgzip %(outfile)s; + tabix -p vcf %(outfile)s.gz + ''' + P.run(statement) + + +@product(filterVEP, formatter("variants/all_samples.(?P.*).vcf.gz$"), + createPanelBed, formatter("panels/(?P.*).bed$"), + "results/{FILTER[0][0]}/{PANEL[1][0]}.vcf.gz") +def filterPanel(infiles, outfile): + ''' + Filter variants using vep-filter. + More docs + ''' + vcf, panelbed = infiles + outfile = P.snip(outfile, ".gz") + + statement = '''gatk + SelectVariants + -V %(vcf)s + -O %(outfile)s + -L %(panelbed)s + > %(outfile)s.log 2>&1; + bgzip %(outfile)s; + tabix -p vcf %(outfile)s.gz + ''' + P.run(statement) -@follows(mkdir("variant_tables")) -@transform(GATKIndelRealignSample, regex(r"gatk/(.*).realigned.bam"), - add_inputs(annotateVariantsVEP), r"variant_tables/\1.tsv") +@product(RemoveDuplicatesSample, formatter("gatk/(?P.*).dedup2.bam$"), + filterPanel, formatter("results/(?P.*)/(?P.*).vcf.gz$"), + "results/{FILTER[1][0]}/{PANEL[1][0]}/{TRACK[0][0]}.tsv") def makeAnnotationsTables(infiles, outfile): ''' - Converts the multi sample vcf generated with Haplotype caller into + Converts the multi sample vcf into a single table for each sample contain only positions called as variants in that sample and all columns from the vcf. ''' bamname = infiles[0] inputvcf = infiles[1] TF = P.get_temp_filename(".") - samplename = bamname.replace(".realigned.bam", + samplename = bamname.replace(".dedup2.bam", ".bam").replace("gatk/", "") statement = '''bcftools view -h %(inputvcf)s | @@ -1021,7 +870,8 @@ def makeAnnotationsTables(infiles, outfile): cols2.append(line.split("\t")[0]) l1 = "\t".join(cols2) l2 = "\t".join(colds) - out = open(outfile, "w") + os.unlink(TF) + out = open(TF, "w") out.write('''CHROM\tPOS\tQUAL\tID\tFILTER\tREF1\tALT\tGT\t%s\n\ chromosome\tposition\tquality\tid\tfilter\tref\talt\t\ genotype\t%s\n''' % (l1, l2)) @@ -1029,239 +879,81 @@ def makeAnnotationsTables(infiles, outfile): cstring = "\\t".join(cols) cstring = "%CHROM\\t%POS\\t%QUAL\\t%ID\\t%FILTER\\t%REF\\t\ %ALT\\t[%GT]\\t" + cstring - if PARAMS['test'] == 1: - statement = '''bcftools query - -f '%(cstring)s\\n' - -i 'FILTER=="PASS" && GT!="0/0" && GT!="./."' - %(inputvcf)s >> %(outfile)s''' - else: - statement = '''bcftools query -s %(samplename)s - -f '%(cstring)s\\n' - -i 'FILTER=="PASS" && GT!="0/0" && GT!="./."' - %(inputvcf)s >> %(outfile)s''' - P.run(statement) - - -############################################################################### -# Ancestry Functions # -HAPMAP = "%s/*txt.gz" % PARAMS['hapmap_loc'] - - -@merge(HAPMAP, ["snpdict.json", - "randomsnps.tsv"]) -def makeRandomSNPSet(infiles, outfiles): - ''' - Generates a random set of SNPs to use to characterise the ancestry - and relatedness of the samples. - - 1. Finds all SNPs which have a known genotype frequency for all 11 - hapmap ancestries - 2. Picks a random 50000 of these SNPs - 3. Stores a list of these SNPs as randomsnps.tsv - 4. Builds a dictionary of dictionaries of dictionaries where: - At Level 1 each key is a HapMap ancestry ID - each of these keys leads to another dictionary - level 2. - At Level 2 each key is a dbSNP SNP ID - each of these keys leads to another dictionary - level 3. - At Level 3 each key is a genotype: - the values of these keys are the genotype frequency of this - genotype at this SNP in this ancestry. - e.g. snpdict['ASW']['rs000000001']['CT'] --> - frequency of the CT genotype at the rs0000001 SNP in the - ASW population - 5. Stores this dictionary in json format as randomsnps.json - ''' - rs = PARAMS['general_randomseed'] - exomeancestry.MakeSNPFreqDict(infiles, outfiles, rs, submit=True) - - -@follows(mkdir("sample_genotypes.dir")) -@transform(makeAnnotationsTables, regex("variant_tables/(.*).tsv"), - add_inputs(makeRandomSNPSet), r"sample_genotypes.dir/\1.tsv") -def getSampleGenotypes(infiles, outfile): - ''' - Fetches the genotype from the variant tables for all samples - for SNPs in the hapmap sample from makeRandomSNPSet. - - Complex sites are ignored (as simple SNPs are sufficient for these - calculations). - These are: - Sites which failed QC (column 3 in the variant table is not PASS) - Sites with more than 2 alleles defined (column 6 in the variant table - contains more than one alternative allele) - SNPs with more than one ID - Indels - ''' - snps = set([line.strip() - for line in iotools.open_file(infiles[1][1]).readlines()]) - exomeancestry.GenotypeSNPs(infiles[0], snps, outfile, submit=True) - - -@merge(getSampleGenotypes, "calledsnps.tsv") -def concatenateSNPs(infiles, outfile): - ''' - Lists all the SNPs in the sample from makeRandomSNPSet where a variant has - been called in any of the input files. - Stores the reference genotype, chromosome and position to use later. - ''' - snps = set() - for f in infiles: - with iotools.open_file(f) as inp: - for line in inp: - line = line.strip().split("\t") - snps.add("%s\t%s\t%s\t%s" % (line[0], line[4], - line[1], line[2])) - out = iotools.open_file(outfile, "w") - for snp in snps: - out.write("%s\n" % (snp)) - out.close() - - -@follows(mkdir("sample_freqs")) -@transform(getSampleGenotypes, regex("sample_genotypes.dir/(.*).tsv"), - add_inputs(concatenateSNPs, makeRandomSNPSet), - [r"sample_freqs/\1.tsv", r"sample_freqs/\1_anc.tsv"]) -def calculateAncestry(infiles, outfiles): - ''' - Takes the data stored in MakeRandomSNPSet and the genotype of each sample - at each site in calledsnps.tsv and tabulates the frequency of this - genotype in each of the HapMap ancestry categories. - The overall probability of each ancestry is then calculated as the - product of these frequencies. These can only be used in comparison to - each other - to show which of the 11 ancestries is most probable. - ''' - calledsnps = infiles[1] - snpdict = infiles[2][0] - exomeancestry.CalculateAncestry(infiles[0], calledsnps, snpdict, - outfiles, submit=True) - - -@merge(calculateAncestry, "ancestry_estimate.tsv") -def mergeAncestry(infiles, outfile): - ''' - Merges the output of the ancestry calculations for each sample into a - single table. - Draws a plot showing the score (probabilty) for each individual - for their assigned ancestry and the second closest match. - x = individual - y = score - large diamonds represent the best match and small triangles the second - best match - ''' - out = iotools.open_file(outfile, "w") - for f in infiles: - infile = f[1] - scores = [] - ancs = [] - for line in iotools.open_file(infile).readlines(): - line = line.strip().split("\t") - anc = line[0] - score = decimal.Decimal(line[1]) - ancs.append(anc) - scores.append(score) - z = list(zip(ancs, scores)) - s = sorted(z, key=lambda x: x[1])[::-1] - out.write("%s\t%s\t%s\t%s\t%s\n" % (f[0], - s[0][0], s[0][1], - s[1][0], s[1][1])) - out.close() - exomeancestry.PlotAncestry(outfile) - - -@active_if(len(matches) > 1) -@merge((calculateAncestry, concatenateSNPs), - ["all_samples.ped", "all_samples.map"]) -def makePed(infiles, outfiles): - ''' - Generates the required input for the Plink and King software packages. - - PED file - columns are SNPs and rows are samples, each cell is the - genotype of the sample at this SNP - - MAP file - rows are SNPs in the same order as the columns in the ped - file, each row shows chromosome, snp id and position. - ''' - exomeancestry.MakePEDFile(infiles, outfiles) - - -@active_if(len(matches) > 1) -@transform(makePed, suffix(".ped"), ".ibs0") -def runPlinkandKing(infiles, outfile): - ''' - Uses King software to calculate relatedness of pairs of samples as - described here - http://people.virginia.edu/~wc9c/KING/manual.html - Plink is used just to reformat the PED files into the format required by - King. - ''' - pref = infiles[0].split(".")[0] - k = PARAMS['king_path'] - p = PARAMS['king_plink'] - statement = """ - %(p)s/plink --file %(pref)s --make-bed --no-fid --noweb --map3 - --no-parents --no-sex --no-pheno && - %(k)s/king -b plink.bed --binary --prefix %(pref)s && - %(k)s/king -b %(pref)s.bgeno --kinship --ibs --prefix %(pref)s""" + statement = '''bcftools query -s %(samplename)s + -f '%(cstring)s\\n' + -i 'FILTER=="PASS" && GT!="0/0" && GT!="./."' + %(inputvcf)s >> %(TF)s''' P.run(statement) - -@active_if(len(matches) > 1) -@merge(runPlinkandKing, "family_estimate.tsv") -def calculateFamily(infile, outfile): - ''' - Translates and filters the output from King. - Pairs of related samples are written to the output file along with the - degree of relatedness. Degrees are decided using thresholds from - the King documentation, here - http://people.virginia.edu/~wc9c/KING/manual.html - ''' - exomeancestry.CalculateFamily(infile, outfile) + maxInt = sys.maxsize + while True: + # decrease the maxInt value by factor 10 + # as long as the OverflowError occurs. + try: + csv.field_size_limit(maxInt) + break + except OverflowError: + maxInt = int(maxInt/10) + + df = pd.read_table(TF, sep = '\t', engine='python', quoting=3) + CSQ_index = df['CSQ'][0].split("Format:_")[-1].rstrip('\"') + df = df.drop(0) + + if len(df)==0: + df.drop('CSQ',axis=1,inplace=True) + temp_df = pd.DataFrame(columns=CSQ_index.split("|")) + else: + df['CSQ'] = df.CSQ.str.split(",") + df = df.explode('CSQ').reset_index(drop=True) + temp_df = df.CSQ.str.split("|", expand = True) + temp_df.columns = CSQ_index.split("|") + df.drop('CSQ',axis=1,inplace=True) + df = df.join(temp_df, rsuffix="_VEP") + if "SYMBOL" in df.columns: + col = df.pop("SYMBOL") + df.insert(0, "SYMBOL", col) + if "HGVSp_VEP" in df.columns: + col = df.pop("HGVSp_VEP") + df.insert(0, "HGVSp_VEP", col) + if len(df)!=0 and "CANONICAL" in df.columns and "PolyPhen" in df.columns: + # Select canonical variant, then most deleterious + df_final = df.sort_values(by = ['CANONICAL', 'PolyPhen'], ascending = [False, False]).drop_duplicates(["CHROM", "POS"]) + df_final.to_csv(outfile, sep="\t") + outfile_full = P.snip(outfile, ".tsv") + ".tsv_full" + df.to_csv(outfile_full, sep="\t") + else: + df.to_csv(outfile, sep="\t") + os.unlink(TF) -@active_if(len(matches) > 1) -@merge(makeAnnotationsTables, "sex_estimate.tsv") -def calculateSex(infiles, outfile): - ''' - Approximates the sex of the sample using the data in the variant table. - Basic estimate based on heterozygosity on the X chromosome - genotypes - are taken for all called variants on X passing QC and the percentage - of heterozygotes is taken. - This tends to produce two clear populations so Kmeans clustering - is used to split the data into two - male and female. Samples which are - unclear are marked in the output. - ''' - exomeancestry.CalculateSex(infiles, outfile, submit=True) +@collate(makeAnnotationsTables, formatter("results/(?P.*)/(?P.*)/.*.tsv$"), + "results/{FILTER[0]}/{PANEL[0]}_merged.tsv") +def mergeAnnotationsTables(infiles, outfile): -@merge((mergeAncestry, calculateFamily, calculateSex), "summary.tsv") -def summarise(infiles, outfile): - ''' - Builds a summary table for ancestry, family and sex. - ''' - ancestry = pd.read_csv(infiles[0], sep="\t", header=None) - if len(matches) > 1: - family = pd.read_csv(infiles[1], sep="\t", header=None) - sex = pd.read_csv(infiles[2], sep="\t", header=None) - ancestry[0] = [a[-1] for a in ancestry[0].str.split("/")] - family[0] = [a[-1] for a in family[0].str.split("/")] - sex[0] = [a[-1] for a in sex[0].str.split("/")] - sex = sex.drop([1, 2], 1) - sex.columns = ['id', 'sex', 'sex_significance'] - family.columns = ['id', 'related_to', 'degree_relatedness'] - family['degree_relatedness'] = family['degree_relatedness'].astype(int) - ancestry = ancestry.drop([2, 3, 4], 1) - ancestry.columns = ['id', 'ancestry'] - summary = ancestry.merge(sex).merge(family, 'left') - summary = summary.fillna("NA") - summary.to_csv(outfile, sep="\t") + result = False + for infile in infiles: + lines = 0 + with open(infile) as f: + for line in f: + lines = lines + 1 + if lines > 1: + result = True + break + + path = os.path.dirname(infiles[1]) + infiles = " ".join(infiles) + + if result == True: + statement = ''' cgat tables2table %(infiles)s + --cat patient_id + --regex-filename '%(path)s/(\S+).tsv' + | awk '!/^#/{print > "%(outfile)s" ;next} 1' + > %(outfile)s.log 2>&1''' else: - ancestry[0] = [a[-1] for a in ancestry[0].str.split("/")] - ancestry = ancestry.drop([2, 3, 4], 1) - ancestry.columns = ['id', 'ancestry'] - ancestry = ancestry.fillna("NA") - ancestry.to_csv(outfile, sep="\t") - + statement = '''touch %(outfile)s''' + P.run(statement) -@follows(summarise) -def ancestry(): - pass ############################################################################### @@ -1280,144 +972,25 @@ def qualityFilterVariants(infile, outfiles): ''' qualstring = PARAMS['filtering_quality'] qualfilter = PARAMS['filtering_quality_ft'] - exome.filterQuality(infile, qualstring, qualfilter, - outfiles, submit=True) - - -@follows(mkdir("variant_tables_rare")) -@transform(qualityFilterVariants, regex("variant_tables_highqual/(.*).tsv"), - [r'variant_tables_rare/\1.tsv', - r'variant_tables_rare/\1_failed.tsv']) -def rarityFilterVariants(infiles, outfiles): - ''' - Filter out variants which are common in any of the exac or other - population datasets as specified in the pipeline.yml. - ''' - infile = infiles[0] - thresh = PARAMS['filtering_rarethresh'] - freqs = PARAMS['filtering_freqs'] - exac = PARAMS['filtering_exac'] - exome.filterRarity(infile, exac, freqs, thresh, outfiles, - submit=True) - - -@follows(mkdir("variant_tables_damaging")) -@transform(rarityFilterVariants, regex("variant_tables_rare/(.*).tsv"), - [r'variant_tables_damaging/\1.tsv', - r'variant_tables_damaging/\1_failed.tsv']) -def damageFilterVariants(infiles, outfiles): - ''' - Filter variants which have not been assessed as damaging by any - of the specified tools. - Tools and thresholds can be specified in the pipeline.yml. - - Does not account for multiple alt alleles - if any ALT allele has - been assessed as damaging with any tool the variant is kept, - regardless of if this is the allele called in the sample. - - ''' - infile = infiles[0] - exome.filterDamage(infile, PARAMS['filtering_damage'], outfiles, - submit=True) - - -@follows(mkdir("variant_tables_family")) -@transform(damageFilterVariants, regex("variant_tables_damaging/(.*).tsv"), - add_inputs(calculateFamily), - [r'variant_tables_family/\1.tsv', - r'variant_tables_family/\1_failed.tsv']) -def familyFilterVariants(infiles, outfiles): - ''' - Filter variants according to the output of calculateFamily - - only variants shared by both members of a family will be kept. - ''' - if len(matches) > 1: - infile = infiles[0][0] - - infilenam = infile.split("/")[-1] - infilestem = "/".join(infile.split("/")[:-1]) - - # figure out who is related to who - families = [line.strip().split("\t")[:2] - for line in iotools.open_file(infiles[1]).readlines()] - infam = [line[0] for line in families] + [line[1] for line in families] - - # no relatives - copy the input file to the output file and generate - # a blank "failed" file - if infilenam not in infam or PARAMS['filtering_family'] == 0: - shutil.copy(infile, outfiles[0]) - o = iotools.open_file(outfiles[1], "w") - o.close() - else: - for line in families: - if infilenam in line: - i = line.index(infilenam) - if i == 0: - infile2 = "%s/%s" % (infilestem, line[1]) - else: - infile2 = "%s/%s" % (infilestem, line[0]) - exome.filterFamily(infile, infile2, outfiles) + outfile1 = outfiles[0] + outfile2 = outfiles[1] + + if qualstring is not None: + exome.filterQuality(infile, qualstring, qualfilter, + outfiles, submit=True) else: - infile = infiles[0][0] - shutil.copy(infile, outfiles[0]) - out = iotools.open_file(outfiles[1], "w") - out.close() + statement = '''cp %(infile)s %(outfile1)s && + touch %(outfile2)s''' + P.run(statement) -@follows(familyFilterVariants) +@follows(filterVEP) def filter(): pass - -@follows(mkdir("genelists")) -@transform(familyFilterVariants, regex("variant_tables_family/(.*).tsv"), - [r'genelists/\1.tsv', - r'genelists/\1.bed']) -def makeGeneLists(infiles, outfiles): - infile = infiles[0] - outfile = outfiles[1] - genes = PARAMS['general_geneset'] - - # four %s because they need to be escaped in generating the statement - # then again when submitting the P.run(statement) - statement = '''awk 'NR > 2 {printf("%%%%s\\t%%%%s\\t%%%%s\\n",\ - $1, $2, $2 + 1)}'\ - %(infile)s |\ - bedtools intersect -wo -a stdin -b %(genes)s > %(outfile)s''' % locals() - P.run(statement) - - geneids = set() - with iotools.open_file(outfile) as inp: - for line in inp: - line = line.strip().split("\t") - details = line[11].split(";") - for detail in details: - r = re.search('gene_id', detail) - if r: - geneid = detail.split(" ")[-1] - geneids.add(geneid.replace("\"", "")) - out = iotools.open_file(outfiles[0], "w") - for geneid in geneids: - out.write("%s\n" % geneid) - out.close() - - -@follows(mkdir('final_variant_tables')) -@collate((makeGeneLists, familyFilterVariants), regex('(.*)/(.*).tsv'), - r'final_variant_tables/\2.tsv') -def finalVariantTables(infiles, outfile): - genes = infiles[0] - variants = infiles[1][0] - cols = PARAMS['filtering_columns'].split(",") - exome.CleanVariantTables(genes, variants, cols, outfile, - submit=True) - -############################################################################### -############################################################################### ############################################################################### # Genes of interest - @active_if(PARAMS["annotation_add_genes_of_interest"] == 1) @transform((annotateVariantsVEP), regex(r"variants/all_samples.snpsift.vcf"), @@ -1429,7 +1002,7 @@ def findGenes(infile, outfile): geneList = P.as_list(PARAMS["annotation_genes_of_interest"]) expression = '\'||SNPEFF_GENE_NAME==\''.join(geneList) statement = '''GenomeAnalysisTK -T VariantFiltration - -R %%(genome_dir)s/%%(gatkgenome)s.fa + -R %%(genome_dir)s/%%(genome)s.fa --variant %(infile)s --filterExpression "SNPEFF_GENE_NAME=='%(expression)s'" --filterName "GENE_OF_INTEREST" -o %(outfile)s''' % locals() @@ -1451,9 +1024,9 @@ def findGenes(infile, outfile): r"variants/all_samples.snpsift.table") def vcfToTable(infile, outfile): '''Converts vcf to tab-delimited file''' - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" columns = PARAMS["gatk_vcf_to_table"] - exome.vcfToTable(infile, outfile, genome, columns) + exome.vcfToTable(infile, outfile, genome, columns, GATK_MEMORY) @jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") @@ -1463,528 +1036,6 @@ def loadVariantAnnotation(infile, outfile): '''Load VCF annotations into database''' P.load(infile, outfile, options="--retry --ignore-empty") -############################################################################## -############################################################################### -############################################################################### -# Confirm parentage (do novo trios only) - - -@follows(loadVariantAnnotation) -@transform("*Trio*.ped", - regex(r"(\S*Trio\S+).ped"), - add_inputs(r"variants/all_samples.snpsift.vcf"), - r"variants/\1.parentage") -def confirmParentage(infiles, outfile): - '''Filter variants according to autosomal recessive disease model''' - pedfile, infile = infiles - pedigree = csv.DictReader(open(pedfile), delimiter='\t', fieldnames=[ - 'family', 'sample', 'father', 'mother', 'sex', 'status']) - trio = P.snip(os.path.basename(pedfile), ".ped") - trio = trio.replace(".", "_").replace("-", "_") - database = PARAMS["database_name"] - proband = None - mother = None - father = None - for row in pedigree: - if row['status'] == '2': - proband = row['sample'].replace(".", "_").replace("-", "_") - mother = row['mother'].replace(".", "_").replace("-", "_") - father = row['father'].replace(".", "_").replace("-", "_") - E.info("proband:" + proband) - E.info("mother:" + mother) - E.info("father:" + father) - - query = '''SELECT '%(trio)s' as family, - hom_nonref_trio/(hom_nonref_parents+0.0) as hom_nonref_conc, - child_mat_het_nonref/(maternal_hom_nonref+0.0) as maternal_conc, - child_pat_het_nonref/(paternal_hom_nonref+0.0) as paternal_conc, - * - FROM - (SELECT count(*) as hom_nonref_trio - FROM %(trio)s_genes_table - where CHROM NOT IN ('X', 'Y') - AND %(mother)s_PL LIKE '%%%%,0' - AND cast(%(mother)s_DP as INTEGER) > 10 - AND %(father)s_PL LIKE '%%%%,0' - AND cast(%(father)s_DP as INTEGER) > 10 - AND %(proband)s_PL LIKE '%%%%,0' - AND cast(%(proband)s_DP as INTEGER) > 10), - (SELECT count(*) as hom_nonref_parents - FROM %(trio)s_genes_table - where CHROM NOT IN ('X', 'Y') - AND %(mother)s_PL LIKE '%%%%,0' - AND cast(%(mother)s_DP as INTEGER) > 10 - AND %(father)s_PL LIKE '%%%%,0' - AND cast(%(father)s_DP as INTEGER) > 10 - AND cast(%(proband)s_DP as INTEGER) > 10), - (SELECT count(*) as maternal_hom_nonref - FROM %(trio)s_genes_table - where CHROM NOT IN ('X', 'Y') - AND %(mother)s_PL LIKE '%%%%,0' - AND cast(%(mother)s_DP as INTEGER) > 10 - AND %(father)s_PL LIKE '0,%%%%' - AND cast(%(father)s_DP as INTEGER) > 10 - AND cast(%(proband)s_DP as INTEGER) > 10), - (SELECT count(*) as child_mat_het_nonref - FROM %(trio)s_genes_table - where CHROM NOT IN ('X', 'Y') - AND %(mother)s_PL LIKE '%%%%,0' - AND cast(%(mother)s_DP as INTEGER) > 10 - AND %(father)s_PL LIKE '0,%%%%' - AND cast(%(father)s_DP as INTEGER) > 10 - AND %(proband)s_PL LIKE '%%%%,0,%%%%' - AND cast(%(proband)s_DP as INTEGER) > 10), - (SELECT count(*) as paternal_hom_nonref - FROM %(trio)s_genes_table - where CHROM NOT IN ('X', 'Y') - AND %(father)s_PL LIKE '%%%%,0' - AND cast(%(father)s_DP as INTEGER) > 10 - AND %(mother)s_PL LIKE '0,%%%%' - AND cast(%(mother)s_DP as INTEGER) > 10 - AND cast(%(proband)s_DP as INTEGER) > 10), - (SELECT count(*) as child_pat_het_nonref - FROM %(trio)s_genes_table - where CHROM NOT IN ('X', 'Y') - AND %(father)s_PL LIKE '%%%%,0' - AND cast(%(father)s_DP as INTEGER) > 10 - AND %(mother)s_PL LIKE '0,%%%%' - AND cast(%(mother)s_DP as INTEGER) > 10 - AND %(proband)s_PL LIKE '%%%%,0,%%%%' - AND cast(%(proband)s_DP as INTEGER) > 10)''' % locals() - statement = '''sqlite3 %(database)s "%(query)s" > %(outfile)s - 2> %(outfile)s.log''' % locals() - P.run(statement) - -############################################################################### -############################################################################### -############################################################################### -# De novos - - -@follows(loadVariantAnnotation) -@transform("*Trio*.ped", - regex(r"(\S*Trio\S+).ped"), - add_inputs(r"variants/all_samples.snpsift.vcf"), - r"variants/\1.filtered.vcf") -def deNovoVariants(infiles, outfile): - '''Filter de novo variants based on provided jexl expression''' - job_options = getGATKOptions() - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" - pedfile, infile = infiles - pedigree = csv.DictReader( - iotools.open_file(pedfile), delimiter='\t', fieldnames=[ - 'family', 'sample', 'father', 'mother', 'sex', 'status']) - for row in pedigree: - if row['status'] == '2': - father = row['father'] - mother = row['mother'] - child = row['sample'] - select = '''vc.getGenotype("%(father)s").getDP()>=10&&vc.getGenotype("%(mother)s").getDP()>=10&&vc.getGenotype("%(child)s").getPL().0>20&&vc.getGenotype("%(child)s").getPL().1==0&&vc.getGenotype("%(child)s").getPL().2>0&&vc.getGenotype("%(father)s").getPL().0==0&&vc.getGenotype("%(father)s").getPL().1>20&&vc.getGenotype("%(father)s").getPL().2>20&&vc.getGenotype("%(mother)s").getPL().0==0&&vc.getGenotype("%(mother)s").getPL().1>20&&vc.getGenotype("%(mother)s").getPL().2>20&&vc.getGenotype("%(child)s").getAD().1>=3&&((vc.getGenotype("%(child)s").getAD().1)/(vc.getGenotype("%(child)s").getDP().floatValue()))>=0.25&&(vc.getGenotype("%(father)s").getAD().1==0||(vc.getGenotype("%(father)s").getAD().1>0&&((vc.getGenotype("%(father)s").getAD().1)/(vc.getGenotype("%(father)s").getDP().floatValue()))<0.05))&&(vc.getGenotype("%(mother)s").getAD().1==0||(vc.getGenotype("%(mother)s").getAD().1>0&&((vc.getGenotype("%(mother)s").getAD().1)/(vc.getGenotype("%(mother)s").getDP().floatValue()))<0.05))&&(SNPEFF_IMPACT=="HIGH"||SNPEFF_IMPACT=="MODERATE")''' % locals() - exome.selectVariants(infile, outfile, genome, select) - - -@transform(deNovoVariants, - regex(r"variants/(\S+).filtered.vcf"), - r"variants/\1.filtered.table") -def tabulateDeNovos(infile, outfile): - '''Tabulate de novo variants''' - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" - columns = PARAMS["gatk_vcf_to_table"] - exome.vcfToTable(infile, outfile, genome, columns) - - -@jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") -@transform(tabulateDeNovos, - regex(r"variants/(\S+).filtered.table"), - r"variants/\1.filtered.table.load") -def loadDeNovos(infile, outfile): - '''Load de novos into database''' - P.load(infile, outfile, - options="--retry --ignore-empty --allow-empty-file") - -############################################################################### - - -@follows(loadVariantAnnotation) -@transform("*Trio*.ped", - regex(r"(\S*Trio\S+).ped"), - add_inputs(r"variants/all_samples.snpsift.vcf"), - r"variants/\1.denovos.vcf") -def lowerStringencyDeNovos(infiles, outfile): - '''Filter lower stringency de novo variants based on provided jexl - expression''' - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" - pedfile, infile = infiles - pedigree = csv.DictReader( - iotools.open_file(pedfile), delimiter='\t', fieldnames=[ - 'family', 'sample', 'father', 'mother', 'sex', 'status']) - for row in pedigree: - if row['status'] == '2': - father = row['father'] - mother = row['mother'] - child = row['sample'] - select = '''vc.getGenotype("%(child)s").getPL().1==0&&vc.getGenotype("%(father)s").getPL().0==0&&vc.getGenotype("%(mother)s").getPL().0==0&&(SNPEFF_IMPACT=="HIGH"||SNPEFF_IMPACT=="MODERATE")''' % locals( - ) - exome.selectVariants(infile, outfile, genome, select) - - -@transform(lowerStringencyDeNovos, - regex(r"variants/(\S+).denovos.vcf"), - r"variants/\1.denovos.table") -def tabulateLowerStringencyDeNovos(infile, outfile): - '''Tabulate lower stringency de novo variants''' - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" - columns = PARAMS["gatk_vcf_to_table"] - exome.vcfToTable(infile, outfile, genome, columns) - - -@jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") -@transform(tabulateLowerStringencyDeNovos, - regex(r"variants/(\S+).denovos.table"), - r"variants/\1.denovos.table.load") -def loadLowerStringencyDeNovos(infile, outfile): - '''Load lower stringency de novos into database''' - P.load(infile, outfile, - options="--retry --ignore-empty --allow-empty-file") - -############################################################################### -############################################################################### -############################################################################### -# Dominant - - -@follows(loadVariantAnnotation) -@transform("*Multiplex*.ped", - regex(r"(\S*Multiplex\S+).ped"), - add_inputs(r"variants/all_samples.snpsift.vcf"), - r"variants/\1.dominant.vcf") -def dominantVariants(infiles, outfile): - '''Filter variants according to autosomal dominant disease model''' - pedfile, infile = infiles - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" - pedigree = csv.DictReader(open(pedfile), delimiter='\t', - fieldnames=['family', 'sample', 'father', - 'mother', 'sex', 'status']) - affecteds = [] - unaffecteds = [] - for row in pedigree: - if row['status'] == '2': - affecteds += [row['sample']] - if row['status'] == '1': - unaffecteds += [row['sample']] - affecteds_exp = '").getPL().1==0&&vc.getGenotype("'.join(affecteds) - affecteds_exp2 = '").getAD().1>=1&&vc.getGenotype("'.join(affecteds) - if len(unaffecteds) == 0: - unaffecteds_exp = '' - else: - unaffecteds_exp = '&&vc.getGenotype("' + \ - ('").isHomRef()&&vc.getGenotype("'.join(unaffecteds)) + \ - '").isHomRef()' - # for some weird reason the 1000G filter doesn't work on these particular - # files - will add later when I've figured out what's wrong - # currently 1000G filter is performed at the report stage (not in csvdb) - select = '''vc.getGenotype("%(affecteds_exp)s").getPL().1==0&&vc.getGenotype("%(affecteds_exp2)s").getAD().1>=1%(unaffecteds_exp)s&&(SNPEFF_IMPACT=="HIGH"||SNPEFF_IMPACT=="MODERATE")''' % locals() - exome.selectVariants(infile, outfile, genome, select) - -############################################################################### - - -@transform(dominantVariants, - regex(r"variants/(\S+).dominant.vcf"), - r"variants/\1.dominant.table") -def tabulateDoms(infile, outfile): - '''Tabulate dominant disease candidate variants''' - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" - columns = PARAMS["gatk_vcf_to_table"] - exome.vcfToTable(infile, outfile, genome, columns) - -############################################################################### - - -@jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") -@transform(tabulateDoms, regex(r"variants/(\S+).dominant.table"), - r"variants/\1.dominant.table.load") -def loadDoms(infile, outfile): - '''Load dominant disease candidates into database''' - P.load(infile, outfile, - options="--retry --ignore-empty --allow-empty-file") - -############################################################################### -############################################################################### -############################################################################### -# Recessive - - -@follows(loadVariantAnnotation) -@transform("*.ped", - regex( - r"(\S*Trio\S+|\S*Multiplex\S+).ped"), - add_inputs(r"variants/all_samples.snpsift.vcf"), - r"variants/\1.recessive.vcf") -def recessiveVariants(infiles, outfile): - '''Filter variants according to autosomal recessive disease model''' - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" - pedfile, infile = infiles - pedigree = csv.DictReader(open(pedfile), delimiter='\t', - fieldnames=['family', 'sample', 'father', - 'mother', 'sex', 'status']) - affecteds = [] - parents = [] - unaffecteds = [] - for row in pedigree: - if row['status'] == '2': - affecteds += [row['sample']] - if row['father'] != '0': - parents += [row['father']] - if row['mother'] != '0': - parents += [row['mother']] - elif row['status'] == '1' and row['sample'] not in parents: - unaffecteds += [row['sample']] - affecteds_exp = '").getPL().2==0&&vc.getGenotype("'.join(affecteds) - if len(unaffecteds) == 0: - unaffecteds_exp = '' - else: - unaffecteds_exp = '&&vc.getGenotype("' + \ - ('").getPL().2!=0&&vc.getGenotype("'.join(unaffecteds)) + \ - '").getPL().2!=0' - if len(parents) == 0: - parents_exp = '' - else: - parents_exp = '&&vc.getGenotype("' + \ - ('").getPL().1==0&&vc.getGenotype("'.join(parents)) + \ - '").getPL().1==0' - # need a way of specifying that other unaffecteds eg. sibs can't be - # homozygous for alt allele - select = '''vc.getGenotype("%(affecteds_exp)s").getPL().2==0%(unaffecteds_exp)s%(parents_exp)s&&(SNPEFF_IMPACT=="HIGH"||SNPEFF_IMPACT=="MODERATE")''' % locals( - ) - exome.selectVariants(infile, outfile, genome, select) - -############################################################################### - - -@transform(recessiveVariants, - regex(r"variants/(\S+).recessive.vcf"), - r"variants/\1.recessive.table") -def tabulateRecs(infile, outfile): - '''Tabulate potential homozygous recessive disease variants''' - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" - columns = PARAMS["gatk_vcf_to_table"] - exome.vcfToTable(infile, outfile, genome, columns) - -############################################################################### - - -@jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") -@transform(tabulateRecs, regex(r"variants/(\S+).recessive.table"), - r"variants/\1.recessive.table.load") -def loadRecs(infile, outfile): - '''Load homozygous recessive disease candidates into database''' - P.load(infile, outfile, - options="--retry --ignore-empty --allow-empty-file") - -############################################################################### -############################################################################### -############################################################################### -# X-linked - - -@follows(loadVariantAnnotation) -@transform("*.ped", regex(r"(\S*Trio\S+|\S*Multiplex\S+).ped"), - add_inputs(r"variants/all_samples.snpsift.vcf"), - r"variants/\1.xlinked.vcf") -def xlinkedVariants(infiles, outfile): - '''Find maternally inherited X chromosome variants in male patients''' - track = P.snip(os.path.basename(outfile), ".vcf") - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" - pedfile, infile = infiles - pedigree = csv.DictReader(open(pedfile), delimiter='\t', - fieldnames=['family', 'sample', 'father', - 'mother', 'sex', 'status']) - affecteds = [] - mothers = [] - male_unaffecteds = [] - female_unaffecteds = [] - for row in pedigree: - if row['status'] == '2' and row['mother'] != '0': - if row['mother'] not in mothers: - mothers += [row['mother']] - affecteds += [row['sample']] - elif row['status'] == '1' and row['sex'] == '1': - male_unaffecteds += [row['sample']] - elif row['status'] == '1' and row['sex'] != '1' and row['sample'] not in mothers: - female_unaffecteds += [row['sample']] - if len(affecteds) == 0: - affecteds_exp = '' - else: - affecteds_exp = '").isHomRef()&&!vc.getGenotype("'.join(affecteds) -# affecteds_exp = '&&!vc.getGenotype("' + \ -# ('").isHomRef()&&!vc.getGenotype("'.join(affecteds)) + \ -# '").isHomRef()' - if len(mothers) == 0: - mothers_exp = '' - else: - mothers_exp = '&&vc.getGenotype("' + \ - ('").getPL().1==0&&vc.getGenotype("'.join(mothers)) + \ - '").getPL().1==0' - if len(male_unaffecteds) == 0: - male_unaffecteds_exp = '' - else: - male_unaffecteds_exp = '&&vc.getGenotype("' + \ - ('").isHomRef()&&vc.getGenotype("'.join( - male_unaffecteds)) + \ - '").isHomRef()' - if len(female_unaffecteds) == 0: - female_unaffecteds_exp = '' - else: - female_unaffecteds_exp = '&&vc.getGenotype("' + \ - ('").getPL().2!=0&&vc.getGenotype("'.join( - female_unaffecteds)) + \ - '").getPL().2!=0' - select = '''CHROM=="X"&&!vc.getGenotype("%(affecteds_exp)s").isHomRef()%(mothers_exp)s%(male_unaffecteds_exp)s%(female_unaffecteds_exp)s&&(SNPEFF_IMPACT=="HIGH"||SNPEFF_IMPACT=="MODERATE")''' % locals() - exome.selectVariants(infile, outfile, genome, select) - -############################################################################### - - -@transform(xlinkedVariants, - regex(r"variants/(\S+).xlinked.vcf"), - r"variants/\1.xlinked.table") -def tabulateXs(infile, outfile): - '''Tabulate potential X-linked disease variants''' - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" - columns = PARAMS["gatk_vcf_to_table"] - exome.vcfToTable(infile, outfile, genome, columns) - -############################################################################### - - -@jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") -@transform(tabulateXs, regex(r"variants/(\S+).xlinked.table"), - r"variants/\1.xlinked.table.load") -def loadXs(infile, outfile): - '''Load X-linked disease candidates into database''' - P.load(infile, outfile, - options="--retry --ignore-empty --allow-empty-file") - -############################################################################### -############################################################################### -############################################################################### -# Compound hets - - -# why does this not work from snpsift VCF file? DS -# @transform(annotateVariantsSNPsift, -# regex(r"variants/(\S*Trio\S+|\S*Multiplex\S+).haplotypeCaller.snpsift.vcf"), -@transform(annotateVariantsSNPeff, - regex( - r"variants/all_samples.snpeff.vcf"), - add_inputs(r"all_samples.ped", r"gatk/all_samples.list"), - r"variants/all_samples.phased.vcf") -def phasing(infiles, outfile): - '''phase variants with GATK''' - infile, pedfile, bamlist = infiles - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" - statement = '''GenomeAnalysisTK -T PhaseByTransmission - -R %(genome)s - -V %(infile)s - -ped %(pedfile)s - -mvf %(infile)s.mvf - -o %(outfile)s ;''' % locals() - P.run(statement) - -############################################################################### - - -@transform(phasing, regex(r"variants/all_samples.phased.vcf"), - add_inputs(r"gatk/all_samples.list"), - r"variants/all_samples.rbp.vcf") -def readbackedphasing(infiles, outfile): - '''phase variants with ReadBackedPhasing''' - job_memory = "32G" - infile, bamlist = infiles - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" - statement = '''GenomeAnalysisTK -T ReadBackedPhasing -nt 4 - -R %(genome)s - -I %(bamlist)s - -V %(infile)s - -o %(outfile)s; ''' % locals() - P.run(statement) - -############################################################################### - - -@follows(readbackedphasing) -@transform("*.ped", - regex(r"(\S*Multiplex\S+|\S*Trio\S+).ped"), - add_inputs(r"no_multiallelic_all_samples.rbp.vcf", - r"all_samples.ped"), - r"variants/\1.compound_hets.table") -def compoundHets(infiles, outfile): - '''Identify potentially pathogenic compound heterozygous variants - (phasing with GATK followed by compound het calling using Gemini''' - family, infile, pedfile = infiles - family_id = P.snip(os.path.basename(family), ".ped") - statement = '''gemini load -v %(infile)s - -p %(pedfile)s -t snpEff %(family_id)s.db && - gemini comp_hets - --families %(family_id)s - --columns "chrom, start, end, ref, alt, codon_change, gene, qual, depth" - --filter - "(impact_severity = 'HIGH' OR impact_severity = 'MED') - AND (in_esp = 0 OR aaf_esp_all < 0.01) - AND (in_1kg = 0 OR aaf_1kg_all < 0.01)" - %(family_id)s.db > %(outfile)s;''' - # rm -f %(family_id)s.db''' - P.run(statement) - -############################################################################### - - -@jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") -@transform(compoundHets, regex(r"variants/(\S+).compound_hets.table"), - r"variants/\1.compound_hets.table.load") -def loadCompoundHets(infile, outfile): - '''Load compound heterozygous variants into database''' - P.load(infile, outfile, - options="--retry --ignore-empty --allow-empty-file") - -############################################################################### -############################################################################### -############################################################################### -# coverage over candidate genes - - -@active_if(PARAMS["coverage_calculate"] == 1) -@transform(listOfBAMs, regex(r"gatk/all_samples.list"), r"candidate.sample_interval_summary") -def candidateCoverage(infile, outfile): - '''Calculate coverage over exons of candidate genes''' - all_exons = PARAMS["coverage_all_exons"] - candidates = PARAMS["coverage_candidates"] - candidates = candidates.replace(",", " -e ") - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" - threshold = PARAMS["coverage_threshold"] - statement = '''zcat %(all_exons)s | grep -e %(candidates)s - | awk '{print $1 ":" $4 "-" $5}' - | sed 's/chr//' - > - candidate.interval_list ; ''' % locals() - statement += '''zcat %(all_exons)s | grep -e %(candidates)s - | awk '{print $16}' - | sed 's/"//g;s/;//g' - > - candidate_gene_names.txt ;''' % locals() - statement += '''GenomeAnalysisTK -T DepthOfCoverage -R %(genome)s - -o candidate -I %(infile)s -ct %(threshold)s -L candidate.interval_list ;''' % locals() - P.run(statement) - -############################################################################### - - -@active_if(PARAMS["coverage_calculate"] == 1) -@transform(candidateCoverage, regex(r"candidate.sample_interval_summary"), r"candidate_coverage_plot.pdf") -def candidateCoveragePlots(infile, outfile): - '''Produce plots of coverage''' - rscript = PARAMS["coverage_rscript"] - threshold = PARAMS["coverage_threshold"] - statement = '''Rscript %(rscript)s %(infile)s candidate_gene_names.txt %(threshold)s %(outfile)s ;''' - P.run(statement) - ############################################################################### ############################################################################### ############################################################################### @@ -2006,7 +1057,6 @@ def buildVCFstats(infile, outfile): @merge(buildVCFstats, "vcf_stats.load") def loadVCFstats(infiles, outfile): '''Import variant statistics into SQLite''' - scriptsdir = PARAMS["general_scriptsdir"] filenames = " ".join(infiles) tablename = P.to_table(outfile) E.info("Loading vcf stats...") @@ -2032,9 +1082,7 @@ def loadVCFstats(infiles, outfile): # Targets -@follows(loadVariantAnnotation, - finalVariantTables, - ancestry) +@follows(loadVariantAnnotation) def testFromVariantRecal(): pass @@ -2054,70 +1102,24 @@ def mapping_tasks(): @follows(GATKBaseRecal, loadPicardDuplicateStatsLane, - GATKIndelRealignSample, + RemoveDuplicatesSample, loadCoverageStats) def gatk(): pass -@follows(loadXYRatio, - indexVCFs, - loadNDR) -def sampleFeatures(): - pass - - @follows(haplotypeCaller, genotypeGVCFs) def callVariants(): pass -@follows(loadTableSnpEff, - listOfBAMs, - loadVariantAnnotation, - finalVariantTables) +@follows(listOfBAMs, + loadVariantAnnotation) def annotation(): pass -@follows(loadDeNovos) -def denovo(): - pass - - -@follows(loadLowerStringencyDeNovos) -def denovo2(): - pass - - -@follows(loadDoms) -def dominant(): - pass - - -@follows(loadRecs) -def recessive(): - pass - - -@follows(loadXs) -def xlinked(): - pass - - -@follows(loadCompoundHets) -def compoundHet(): - pass - - -@follows(denovo, - dominant, - recessive, - compoundHet) -def filtering(): - pass - @follows(buildVCFstats, loadVCFstats) @@ -2127,11 +1129,8 @@ def vcfstats(): @follows(mapping_tasks, gatk, - # sampleFeatures, callVariants, annotation, - filtering, - ancestry, makeAnnotationsTables) def full(): pass diff --git a/cgatpipelines/tools/pipeline_exome/pipeline.yml b/cgatpipelines/tools/pipeline_exome/pipeline.yml index 1d8158e1c..ec86a4889 100644 --- a/cgatpipelines/tools/pipeline_exome/pipeline.yml +++ b/cgatpipelines/tools/pipeline_exome/pipeline.yml @@ -4,16 +4,11 @@ ## Exome pipeline parameters ########################################################## - - -# Script directory -scriptsdir: /ifs/devel/katherineb/cgat/scripts - # the genome to use -genome: hg19 +genome: hg38 # location of indexed genome for SAMtools -genome_dir: /ifs/mirror/genomes/gatk +genome_dir: /path/to/genome/dir ######################################################## # directory where exported data is located @@ -25,20 +20,19 @@ samples: '' web_dir: ../web # If you are not running exome data but targetted sequencing on a smaller region then change to 1 +# Effect of this option is to skip base quality recalibration step targetted: 0 -#pedfile=/ifs/projects/proj024/analysis/illumina_fam8-16/fastq_decon_o5_filt_qc/mapping_human_g1k/samples.ped - -# seed for random number generation -randomseed: 1000 - # gtf file containing exon annotations -geneset: /ifs/mirror/annotations/hg19_ensembl75_hierarchical/ensembl.dir/geneset_all.gtf.gz +geneset: /path/to/ensembl.dir/geneset_all.gtf.gz + +#picard memory +picard_memory: 9G bwa: # location od BWA indexed genome - index_dir: /ifs/mirror/genomes/gatk + index_dir: /path/to/genome # threads threads: 8 @@ -50,7 +44,7 @@ bwa: mem_options: '' # BWA end-pairing parameters - sampe_options: '' + sample_options: '' readgroup: @@ -62,110 +56,118 @@ readgroup: sample: blood -dedup: - - method: picard - gatk: threads: 8 - dbsnp: /ifs/mirror/genomes/gatk/dbsnp_138.hg19.vcf - - hapmap: /ifs/mirror/genomes/gatk/hapmap_3.3.hg19.sites.vcf + memory: 2G - omni: /ifs/mirror/genomes/gatk/1000G_omni2.5.hg19.sites.vcf + # memory required for merging GVCF files + # PLEASE NOTE: this is the memory allocated to picard + # 1.2x this memory will be requested from the cluster to provide sufficent memory for the database tool. + dbmem: 20G - kgenomes: /ifs/mirror/genomes/gatk/1000G_phase1.snps.high_confidence.hg19.sites.vcf + # path to dbsnp VCF (provided by GATK) + dbsnp: /path/to/dbsnp.vcf - mills: /ifs/mirror/genomes/gatk/Mills_and_1000G_gold_standard.indels.hg19.sites.vcf + # path to hapmap VCF (provided by GATK) + hapmap: /path/to/hapmap.vcf - solid_options: '' + # path to OMNI VCF (privided by GATK) + omni: /path/to/1000G_omni - hc_options: -A StrandOddsRatio -A DepthPerSampleHC + # path to 1000G VDF (provided by GATK) + kgenomes: /path/to/1000G_phase1.snps.high_confidence - genotype_options: '' + mills: /path/to/Mills_and_1000G_gold_standard - variant_annotations: [AlleleBalance,AlleleBalanceBySample,AlleleCountBySample,BaseCounts,BaseQualityRankSumTest,ChromosomeCounts,ClippingRankSumTest,Coverage,DepthPerAlleleBySample,FisherStrand,GCContent,GenotypeSummaries,HardyWeinberg,HomopolymerRun,LikelihoodRankSumTest,LowMQ,MappingQualityRankSumTest,MappingQualityZero,MappingQualityZeroBySample,NBaseCount,QualByDepth,RMSMappingQuality,ReadPosRankSumTest,SampleList,SpanningDeletions,StrandBiasBySample,StrandOddsRatio,TandemRepeatAnnotator,VariantType] + axiom: /path/to/axiom - vcf_to_table: -F CHROM -F POS -F ID -F REF -F ALT -F QUAL -F FILTER -F AC -F AF -F AN -F BaseQRankSum -F DB -F DP -F Dels -F FS -F HaplotypeScore -F MLEAC -F MLEAF -F MQ -F MQ0 -F MQRankSum -F QD -F ReadPosRankSum -F SB -F SNPEFF_EFFECT -F SNPEFF_IMPACT -F SNPEFF_FUNCTIONAL_CLASS -F SNPEFF_CODON_CHANGE -F SNPEFF_AMINO_ACID_CHANGE -F SNPEFF_GENE_NAME -F SNPEFF_GENE_BIOTYPE -F SNPEFF_TRANSCRIPT_ID -F SNPEFF_EXON_ID -F dbNSFP_GERP++_RS -F dbNSFP_GERP++_NR -F dbNSFP_Ensembl_transcriptid -F dbNSFP_Uniprot_acc -F dbNSFP_Interpro_domain -F dbNSFP_SIFT_score -F dbNSFP_Polyphen2_HVAR_pred -F dbNSFP_29way_logOdds -F dbNSFP_1000Gp1_AF -F dbNSFP_1000Gp1_AFR_AF -F dbNSFP_1000Gp1_EUR_AF -F dbNSFP_1000Gp1_AMR_AF -F dbNSFP_1000Gp1_ASN_AF -F dbNSFP_ESP6500_AA_AF -F dbNSFP_ESP6500_EA_AF -F RSPOS -F SSR -F SAO -F VP -F VC -F PM -F TPA -F PMC -F MUT -F VLD -F OTHERKG -F PH3 -F CDA -F MTP -F OM -F CAF -F COMMON -GF GT -GF AD -GF GQ -GF PL -GF PQ -GF TP -GF AB -GF DP + # Additional options for GATK baserecalibrator + baserecalibrator_options: '' -hapmap: - - loc: /ifs/mirror/hapmap/genotype_frequencies + # Additional options for GATK HaplotypeCaller + hc_options: -A StrandOddsRatio -A DepthPerSampleHC - # bgzip compressede and tabix indexed VCF file for Hapmap - vcf: /ifs/mirror/genomes/gatk/hapmap_3.3.hg19.sites.vcf + # Additional options for GATK GenotypeGVCFs + genotype_options: '' - padding: 0 + vcf_to_table: -F CHROM -F POS -F ID -F REF -F ALT -F QUAL -F FILTER -F AC -F AF -F AN -F BaseQRankSum -F DB -F DP -F Dels -F FS -F HaplotypeScore -F MLEAC -F MLEAF -F MQ -F MQ0 -F MQRankSum -F QD -F ReadPosRankSum -F SB -F SNPEFF_EFFECT -F SNPEFF_IMPACT -F SNPEFF_FUNCTIONAL_CLASS -F SNPEFF_CODON_CHANGE -F SNPEFF_AMINO_ACID_CHANGE -F SNPEFF_GENE_NAME -F SNPEFF_GENE_BIOTYPE -F SNPEFF_TRANSCRIPT_ID -F SNPEFF_EXON_ID -F dbNSFP_GERP++_RS -F dbNSFP_GERP++_NR -F dbNSFP_Ensembl_transcriptid -F dbNSFP_Uniprot_acc -F dbNSFP_Interpro_domain -F dbNSFP_SIFT_score -F dbNSFP_Polyphen2_HVAR_pred -F dbNSFP_29way_logOdds -F dbNSFP_1000Gp1_AF -F dbNSFP_1000Gp1_AFR_AF -F dbNSFP_1000Gp1_EUR_AF -F dbNSFP_1000Gp1_AMR_AF -F dbNSFP_1000Gp1_ASN_AF -F dbNSFP_ESP6500_AA_AF -F dbNSFP_ESP6500_EA_AF -F RSPOS -F SSR -F SAO -F VP -F VC -F PM -F TPA -F PMC -F MUT -F VLD -F OTHERKG -F PH3 -F CDA -F MTP -F OM -F CAF -F COMMON -GF GT -GF AD -GF GQ -GF PL -GF PQ -GF TP -GF AB -GF DP - hc_options: --emitRefConfidence GVCF +# annotation is done using Ensembl! VEP annotation: memory: 6G - threads: 3 - - #Config file specifies that the genome must be downloaded into your home directory - snpeff_config: /ifs/apps/bio/snpEff-4.3/snpEff.config - - snpeff_genome: hg19 + threads: 4 + + # path to dbNSFP database (aggregation database for annotations) + # leave blank if not required + dbnsfp_path: /path/to/dbNFSP.txt.gz + + # dbNSFP columns to include + # If left empty will fetch ALL annotations. + # Examples: Uniprot_acc, Interpro_domain, + # SIFT_pred, Polyphen2_HDIV_pred, Polyphen2_HVAR_pred, + # LRT_pred, MutationTaster_pred, GERP++_NR, GERP++_RS, + # phastCons100way_vertebrate, 1000Gp1_AF, 1000Gp1_AFR_AF + # ESP6500_AA_AF, ESP6500_EA_AF, MutationTaster_pred, + # MutationAssessor_pred, FATHMM_pred, PROVEAN_pred, + # CADD_phred, MetaSVM_pred, 1000Gp3_AC, 1000Gp3_AF, + # 1000Gp3_AFR_AC, 1000Gp3_AFR_AF, 1000Gp3_EUR_AC, + # ESP6500_EA_AC, ESP6500_EA_AF, ExAC_AC, ExAC_AF, + # ExAC_Adj_AC, ExAC_Adj_AF, ExAC_AFR_AC, ExAC_AFR_AF + dbnsfp_columns: - # path to dbNSFP database - dbnsfp: /ifs/mirror/snpsift/data_2016/dbNFSP.txt.gz + # path to VEP scripts + veppath: /path/to/ensembl-vep-80/variant_effect_predictor - #dbNSFP columns to include - dbnsfpannotators: [SIFT_pred,Polyphen2_HDIV_pred,Polyphen2_HVAR_pred,MutationTaster_pred] + # location of cached annotations + # cached annotations should have the same version number + # as the VEP executable + vepcache: /path/to/VEP - # path to clinvar vcf - clinvar: /ifs/mirror/clinvar/clinvar_20160531.vcf.gz + # species name + vepspecies: homo_sapiens - # path to ExAC vcf - exac: /ifs/mirror/ExAC/sites/ExAC.r0.3.1.sites.vep.vcf.gz + # assembly - needs to correspond to data in cache directory + vepassembly: GRCh38 - # path to dbsnp vcf - dbsnp: /ifs/mirror/dbsnp/All_20160527.vcf.gz + # vep parameters - see vep docs here + # http://www.ensembl.org/info/docs/tools/vep/script/vep_options.html + # format as for command line e.g. as --sift b --regulatory --domains + # Option -e covers almost all annotations + vepoptions: -e - # path to chromosome by chromosome 1000g vcfs - tgdir: /ifs/mirror/1000_genomes/phase3 + # Path to Somalier package (currently not available as conda package) + # Website: https://github.com/brentp/somalier + somalier_path: /path/to/somalier - # path to gwas catalog files - gwas_catalog: /ifs/mirror/gwascatalog/gwas_catalog_v1.0-downloaded_2015-07-14.txt + # Path to Somalier VCF specifying sites used for determination of relatedness + # Provided by Somalier Project + somalier_sites: /path/to/sites.hg38.vcf.gz - # path to "phastcons100way" files - phastcons: /ifs/mirror/phastcons + # Path to 1000G data folder (provided by Somalier project) + somalier_1kg: /path/to/1kg-somalier - snpeff_to_table: -F CHROM -F POS -F ID -F REF -F ALT -F EFF + # Path to Somalier Ancestry Labels for 1000G (provided by Somalier project) + somalier_labels: /path/to/ancestry-labels-1kg.tsv add_genes_of_interest: 0 genes_of_interest: '' - # vep parameters - see vep docs here - # http://www.ensembl.org/info/docs/tools/vep/script/vep_options.html - - # options to pass to vep - this string is appended directly - # to the command when vep is called so should be formatted - # e.g. as - # --sift b --regulatory --domains - vepannotators: --sift b --variant_class --regulatory --domains - - # path to VEP scripts - veppath: /ifs/apps/bio/ensembl-vep-80/variant_effect_predictor - - # location of cached annotations - vepcache: /ifs/mirror/VEP - - # species name - vepspecies: homo_sapiens - - # assembly - needs to correspond to data in cache directory - vepassembly: GRCh37 - # filtering of vcfs to select variants of interest +# filtering of vcfs to select variants of interest +# filtering is done using filtering: + # Quality Filtering can be done either by VQSR, a GATK machine-learning + # algorithm, or by hard-filtering QC scores based on prespecified conditions + quality: [VQSR OR hard] + + # # quality filters, seperated by commas, as column'symbol'score where # column is the column to filter # symbol is '>', '<', '>=', '<=' (including quote marks) @@ -179,55 +181,27 @@ filtering: # any = keep variants filling any of the quality criteria specified above quality_ft: all - rarethresh: 0.01 - - # columns containing comma delimiated allele frequencies to filter by - freqs: [EUR_AF,EAS_AF,AFR_AF,AMR_AF,SAS_AF,ESP_AF_POPMAX,ESP_AF_GLOBAL] - - #populations to use when filtering Exac allele frequencies - #xxx for any columns where an AC_xxx and AN_xxx column is provided in the - #Exac VCF, e.g. Adj will calculate allele frequency from - # the AC_Adj and AN_Adj columns - # currently available Adj (all populations), POPMAX (highest af in any population), - # specific populations e.g. EUR, AFR and others - exac: [Adj,POPMAX] - - # columns containing information about predicted damage from this mutation - # and strings to search for in these columns - # columns containing the strings are kept - # specified as column|string-string-...,column|string... - # e.g. SNPEFF_IMPACT|HIGH-MODERATE,CSQ|HIGH would look for - # "HIGH" or "MODERATE" in the SNPEFF_IMPACT column and - # "HIGH" in the CSQ column - damage: [CSQ|HIGH,dbNSFP_MutationTaster_pred|A-D,dbNSFP_Polyphen2_HDIV_pred|D-P,dbNSFP_Polyphen2_HVAR_pred|D-P,dbNSFP_SIFT_pred|D,SNPEFF_IMPACT|HIGH] - - # if family is 1, keep only variants shared by any family members identified in - # the calculateFamily step - family: 1 + # VEP filtering + # Provide filtering command for filter_vep. + # Can be multiple commands separated + vep: # columns to keep in the final variant tables (apart from CHROM,POS,ID,REF1,ALT,GT) columns: [GQ,DP,FS,SOR,POPMAX_calc,Adj_calc,EUR_AF_calc,EAS_AF_calc,AFR_AF_calc,SAS_AF_calc,ESP_AF_POPMAX_calc,ESP_AF_GLOBAL_calc,CSQ,dbNSFP_MutationTaster_pred,dbNSFP_Polyphen2_HDIV_pred,dbNSFP_Polyphen2_HVAR_pred,dbNSFP_SIFT_pred,SNPEFF_IMPACT] -king: - #path to king - path: /ifs/apps/bio/king - - #path to plink - plink: /ifs/apps/bio/plink-1.07/plink-1.07-x86_64 - coverage: # to activate calculating coverage of candidate genes, specified below. - calculate: 0 + calculate: 1 # supply gtf - all_exons: /ifs/mirror/annotations/hg19_ensembl75/geneset_coding_exons.gtf.gz + all_exons: /path/to/geneset_coding_exons.gtf.gz # comma-separated list of canonical ensembl identifiers for candidate genes of interest candidates: '' # path to r script for generating the coverage plots - rscript: /ifs/projects/proj024/analysis/GRIN1_coverage/coverage.r + rscript: /path/to/coverage.r # the minimum read depth for counting a base as covered threshold: 15 @@ -237,16 +211,16 @@ roi: bed: '' # Regions of interest - roi: /ifs/projects/proj048/agilent/S04380110_Regions.bed + roi: /path/to/agilent/S04380110_Regions.bed - intervals: /ifs/projects/proj048/agilent/S04380110_Regions.interval_list + intervals: /path/to/agilent/S04380110_Regions.interval_list # File mapping regions of interest to genes - to_gene: /ifs/projects/proj048/agilent/regions_to_genes.txt + to_gene: /path/to/agilent/regions_to_genes.txt - baits: /ifs/projects/proj048/agilent/S04380110_Probes_5col_samheader_ucsc.bed + baits: /path/to/agilent/S04380110_Probes_5col_samheader_ucsc.bed - regions: /ifs/projects/proj048/agilent/S04380110_Regions_5col_samheader_ucsc.bed + regions: /path/to/agilent/S04380110_Regions_5col_samheader_ucsc.bed padding: 100 diff --git a/cgatpipelines/tools/pipeline_exome_cancer.py b/cgatpipelines/tools/pipeline_exome_cancer.py index 3b4dab741..a3edb709c 100644 --- a/cgatpipelines/tools/pipeline_exome_cancer.py +++ b/cgatpipelines/tools/pipeline_exome_cancer.py @@ -155,7 +155,7 @@ import sqlite3 import cgatcore.iotools as iotools import cgatpipelines.tasks.mapping as mapping -import cgatpipelines.tasks.mappingqc as mappingqc +import cgatpipelines.tasks.bamstats as bamstats from cgatcore import pipeline as P import re import cgatpipelines.tasks.exome as exome @@ -187,9 +187,11 @@ def connect(): PARAMS = P.PARAMS +PICARD_MEMORY = PARAMS["picard_memory"] + mapping.PARAMS = PARAMS -mappingqc.PARAMS = PARAMS exome.PARAMS = PARAMS + ######################################################################### @@ -251,7 +253,7 @@ def mapReads(infile, outfile): @merge(mapReads, "picard_duplicate_stats.load") def loadPicardDuplicateStats(infiles, outfile): '''Merge Picard duplicate stats into single table and load into SQLite.''' - mappingqc.loadPicardDuplicateStats(infiles, outfile) + bamstats.loadPicardDuplicateStats(infiles, outfile) ######################################################################### @@ -263,7 +265,7 @@ def loadPicardDuplicateStats(infiles, outfile): @merge("bam/*.picard_stats", "picard_stats.load") def loadPicardAlignStats(infiles, outfile): '''Merge Picard alignment stats into single table and load into SQLite.''' - mappingqc.loadPicardAlignmentStats(infiles, outfile) + bamstats.loadPicardAlignmentStats(infiles, outfile) ######################################################################### @@ -292,8 +294,8 @@ def buildCoverageStats(infile, outfile): ''' P.run(statement) - mappingqc.buildPicardCoverageStats( - infile, outfile, modified_baits, modified_baits) + bamstats.buildPicardCoverageStats( + infile, outfile, modified_baits, modified_baits, PICARD_MEMORY) iotools.zap_file(modified_baits) @@ -301,7 +303,7 @@ def buildCoverageStats(infile, outfile): @follows(buildCoverageStats) @merge(buildCoverageStats, "coverage_stats.load") def loadCoverageStats(infiles, outfile): - mappingqc.loadPicardCoverageStats(infiles, outfile) + bamstats.loadPicardCoverageStats(infiles, outfile) ######################################################################### ######################################################################### @@ -463,16 +465,16 @@ def runPicardOnRealigned(infile, outfile): genome = "%s/%s.fa" % (PARAMS["bwa_index_dir"], PARAMS["genome"]) - mappingqc.buildPicardAlignmentStats(infile, outfile, genome) - mappingqc.buildPicardAlignmentStats(infile_tumor, - outfile_tumor, genome) + bamstats.buildPicardAlignmentStats(infile, outfile, genome, PICARD_MEMORY) + bamstats.buildPicardAlignmentStats(infile_tumor, + outfile_tumor, genome, PICARD_MEMORY) @follows(runPicardOnRealigned) @merge("bam/*.realigned.picard_stats", "realigned_picard_stats.load") def loadPicardRealigenedAlignStats(infiles, outfile): '''Merge Picard alignment stats into single table and load into SQLite.''' - mappingqc.loadPicardAlignmentStats(infiles, outfile) + bamstats.loadPicardAlignmentStats(infiles, outfile) ######################################################################### ######################################################################### diff --git a/cgatpipelines/tools/pipeline_exome_cancer/pipeline.yml b/cgatpipelines/tools/pipeline_exome_cancer/pipeline.yml index 5ea8949bd..ce0761375 100644 --- a/cgatpipelines/tools/pipeline_exome_cancer/pipeline.yml +++ b/cgatpipelines/tools/pipeline_exome_cancer/pipeline.yml @@ -9,6 +9,8 @@ # the genome to use genome: human_g1k_v37 +picard_memory: 9G + web_dir: ../web report_engine: cgatreport diff --git a/cgatpipelines/tools/pipeline_genesets.py b/cgatpipelines/tools/pipeline_genesets.py index 20553b932..aaae674f6 100644 --- a/cgatpipelines/tools/pipeline_genesets.py +++ b/cgatpipelines/tools/pipeline_genesets.py @@ -638,7 +638,7 @@ def buildCdsTranscript(infile, outfile): @transform(buildUCSCGeneSet, - regex("*.gtf.gz"), + regex(".*.gtf.gz"), PARAMS['interface_geneset_exons_gtf']) def buildExonTranscript(infile, outfile): ''' diff --git a/cgatpipelines/tools/pipeline_longread.py b/cgatpipelines/tools/pipeline_longread.py new file mode 100644 index 000000000..89f54af05 --- /dev/null +++ b/cgatpipelines/tools/pipeline_longread.py @@ -0,0 +1,180 @@ +"""=========================== +Pipeline template +=========================== + +.. Replace the documentation below with your own description of the + pipeline's purpose + +Overview +======== + +This pipeline computes the word frequencies in the configuration +files :file:``pipeline.yml` and :file:`conf.py`. + +Usage +===== + +See :ref:`PipelineSettingUp` and :ref:`PipelineRunning` on general +information how to use cgat pipelines. + +Configuration +------------- + +The pipeline requires a configured :file:`pipeline.yml` file. +cgatReport report requires a :file:`conf.py` and optionally a +:file:`cgatreport.yml` file (see :ref:`PipelineReporting`). + +Default configuration files can be generated by executing: + + python /pipeline_@template@.py config + +Input files +----------- + +None required except the pipeline configuration files. + +Requirements +------------ + +The pipeline requires the results from +:doc:`pipeline_annotations`. Set the configuration variable +:py:data:`annotations_database` and :py:data:`annotations_dir`. + +Pipeline output +=============== + +.. Describe output files of the pipeline here + +Glossary +======== + +.. glossary:: + + +Code +==== + +""" +from ruffus import * +import sys +import os +import glob +import cgatcore.experiment as E +from cgatcore import pipeline as P + + +# load options from the config file +PARAMS = P.get_parameters( + ["%s/pipeline.yml" % os.path.splitext(__file__)[0], + "../pipeline.yml", + "pipeline.yml"]) + +NPUT_FORMATS = ("*.fastq.1.gz", "*.fastq.gz", "*.sra", "*.csfasta.gz") +REGEX_FORMATS = regex(r"(\S+).(fastq.1.gz|fastq.gz|sra|csfasta.gz)") + +matches = glob.glob("*.fastq.1.gz") + glob.glob( + "*.fastq.gz") + glob.glob("*.sra") + glob.glob("*.csfasta.gz") + +PICARD_MEMORY = PARAMS["picard_memory"] +GATK_MEMORY = PARAMS["gatk_memory"] +ANNOTATION_MEMORY = PARAMS["annotation_memory"] + +PANELS = glob.glob("*.panel.tsv") + +FILTERS = glob.glob ("*.filter.tsv") + + +# --------------------------------------------------- +# Specific pipeline tasks +@mkdir("results.dir") +@transform(("/ceph/project/talbotlab/jscaber/nanopore/"), + regex("(.*)"), + r"/ceph/project/talbotlab/jscaber/nanopore/results.dir") +def nextflowEpi2me(infolder, outfile): + '''count the number of words in the pipeline configuration files.''' + + genome_file = os.path.abspath( + os.path.join(PARAMS["genome_dir"], PARAMS["genome"] + ".fa")) + target_regions = PARAMS["targets"] + + # the command line statement we want to execute + statement = '''nextflow run epi2me-labs/wf-cas9 + --fastq %(infolder)s + --ref_genome %(genome_file)s + --targets %(target_regions)s + --out_dir %(outfile)s + --full_report -profile singularity + + ''' + P.run(statement) + + +@mkdir("last.dir") +@transform(("nextflow/fastq_pass/fastq_pass_ontarget.fastq"), + regex("(.*)"), + r"last.dir/fastq_pass_on_target.par") +def lastTrain(infile, outfile): + '''count the number of words in the pipeline configuration files.''' + + genome_file = os.path.abspath( + os.path.join(PARAMS["genome_dir"], PARAMS["genome"] + ".fa")) + last_index = PARAMS["LAST_index"] + + # the command line statement we want to execute + statement = '''last-train + -P10 -Q0 %(last_index)s %(infile)s > %(outfile)s + ''' + P.run(statement) + +@mkdir("last.dir") +@transform(lastTrain, + regex("(.*).par"), + add_inputs(r"nextflow/fastq_pass/fastq_pass_ontarget.fastq"), + r"\1.maf") +def lastAl(infiles, outfile): + '''count the number of words in the pipeline configuration files.''' + + infile1,infile2 = infiles + genome_file = os.path.abspath( + os.path.join(PARAMS["genome_dir"], PARAMS["genome"] + ".fa")) + last_index = PARAMS["LAST_index"] + + # the command line statement we want to execute + statement = '''lastal -P10 --split -p + %(infile1)s %(last_index)s %(infile2)s > %(outfile)s + ''' + P.run(statement) + + +@transform(lastAl, + regex("(.*).maf"), + r"\1.txt") +def tandemGenotypes(infile, outfile): + '''count the number of words in the pipeline configuration files.''' + + genome_file = os.path.abspath( + os.path.join(PARAMS["genome_dir"], PARAMS["genome"] + ".fa")) + last_index = PARAMS["LAST_index"] + repeat_mask = PARAMS["repeat_mask"] + + + # the command line statement we want to execute + statement = '''tandem-genotypes -g target_regions.bed %(repeat_mask)s %(infile)s > %(outfile)s + ''' + P.run(statement) + +# --------------------------------------------------- +# Generic pipeline tasks +@follows(nextflowEpi2me) +def full(): + pass + + +def main(argv=None): + if argv is None: + argv = sys.argv + P.main(argv) + + +if __name__ == "__main__": + sys.exit(P.main(sys.argv)) diff --git a/cgatpipelines/tools/pipeline_mapping.py b/cgatpipelines/tools/pipeline_mapping.py index 607ccd661..3d6445674 100644 --- a/cgatpipelines/tools/pipeline_mapping.py +++ b/cgatpipelines/tools/pipeline_mapping.py @@ -225,7 +225,6 @@ import cgat.BamTools.bamtools as BamTools import cgatpipelines.tasks.geneset as geneset import cgatpipelines.tasks.mapping as mapping -import cgatpipelines.tasks.mappingqc as mappingqc # Pipeline configuration P.get_parameters( @@ -247,7 +246,6 @@ restrict_interface=True)) geneset.PARAMS = PARAMS -mappingqc.PARAMS = PARAMS # Helper functions mapping tracks to conditions, etc # determine the location of the input files (reads). @@ -1982,7 +1980,7 @@ def mergeBAMFiles(infiles, outfile): infiles = " ".join(infiles) statement = ''' - samtools merge %(outfile)s %(infiles)s >& %(outfile)s.log && + samtools merge -f %(outfile)s %(infiles)s >& %(outfile)s.log && samtools index %(outfile)s ''' P.run(statement) diff --git a/cgatpipelines/tools/pipeline_peakcalling.py b/cgatpipelines/tools/pipeline_peakcalling.py index 423fd9afe..442dd2f1e 100644 --- a/cgatpipelines/tools/pipeline_peakcalling.py +++ b/cgatpipelines/tools/pipeline_peakcalling.py @@ -248,7 +248,7 @@ import cgatcore.experiment as E import cgatcore.iotools as iotools from cgatcore import pipeline as P -import cgatpipelines.tasks.mappingqc as mappingqc +import cgatpipelines.tasks.bamstats as bamstats import cgatpipelines.tasks.peakcalling as peakcalling import cgat.BamTools.bamtools as Bamtools import cgatcore.database as DB @@ -522,7 +522,7 @@ def loadIdxstats(infiles, outfile): idxstats output. Filename format is expected to be 'sample.idxstats' outfile : string Logfile. The table name will be derived from `outfile`.''' - mappingqc.loadIdxstats(infiles, outfile) + bamstats.loadIdxstats(infiles, outfile) @transform((filterChipBAMs, filterInputBAMs), @@ -533,9 +533,10 @@ def buildPicardStats(infiles, outfile): infile = infiles[0] reffile = os.path.join(PARAMS["genome_dir"], PARAMS["genome"] + ".fa") - mappingqc.buildPicardAlignmentStats(infile, - outfile, - reffile) + bamstats.buildPicardAlignmentStats(infile, + outfile, + reffile, + picardmem="9G") @jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") @@ -543,7 +544,7 @@ def buildPicardStats(infiles, outfile): def loadPicardStats(infiles, outfile): '''merge alignment stats into single tables. ''' - mappingqc.loadPicardAlignmentStats(infiles, outfile) + bamstats.loadPicardAlignmentStats(infiles, outfile) @follows(loadFilteringStats, diff --git a/cgatpipelines/tools/pipeline_rnaseqdiffexpression.py b/cgatpipelines/tools/pipeline_rnaseqdiffexpression.py index 552775a82..a9e80c85e 100644 --- a/cgatpipelines/tools/pipeline_rnaseqdiffexpression.py +++ b/cgatpipelines/tools/pipeline_rnaseqdiffexpression.py @@ -539,7 +539,7 @@ def buildSalmonIndex(infile, outfile): path to output file ''' - job_memory = "30G" + job_memory = "40G" threads = 12 if PARAMS["salmon_threads"] is not None: threads = PARAMS["salmon_threads"] @@ -1037,6 +1037,7 @@ def count(): def filterDESeq2(infiles, outfile, design_name, quantifier_name): ''' Load counts into RDS object and filter''' + job_memory = '20G' counts, design = infiles transcripts, genes = counts quantifier_name = quantifier_name.lower() diff --git a/cgatpipelines/tools/pipeline_rnaseqqc.py b/cgatpipelines/tools/pipeline_rnaseqqc.py index 3b9220396..6cced4a81 100644 --- a/cgatpipelines/tools/pipeline_rnaseqqc.py +++ b/cgatpipelines/tools/pipeline_rnaseqqc.py @@ -193,7 +193,7 @@ import cgatcore.iotools as iotools import cgatpipelines.tasks.mapping as mapping import cgatpipelines.tasks.windows as windows -import cgatpipelines.tasks.mappingqc as mappingqc +import cgatpipelines.tasks.bamstats as bamstats import cgatpipelines.tasks.rnaseq as rnaseq from cgatcore import pipeline as P @@ -634,12 +634,12 @@ def buildBAMStats(infile, outfile): P.run(statement) -@P.add_doc(mappingqc.loadBAMStats) +@P.add_doc(bamstats.loadBAMStats) @jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") @merge(buildBAMStats, "bam_stats.load") def loadBAMStats(infiles, outfile): ''' load bam statistics into bam_stats table ''' - mappingqc.loadBAMStats(infiles, outfile) + bamstats.loadBAMStats(infiles, outfile) @P.add_doc(windows.summarizeTagsWithinContext) @@ -842,7 +842,7 @@ def buildRefFlat(infile, outfile): os.unlink(tmpflat) -@P.add_doc(mappingqc.buildPicardRnaSeqMetrics) +@P.add_doc(bamstats.buildPicardRnaSeqMetrics) @transform(mapReadsWithHisat, suffix(".bam"), add_inputs(buildRefFlat), @@ -857,16 +857,16 @@ def buildPicardRnaSeqMetrics(infiles, outfile): else: strand = "NONE" - mappingqc.buildPicardRnaSeqMetrics(infiles, strand, outfile) + bamstats.buildPicardRnaSeqMetrics(infiles, strand, outfile, "9G") -@P.add_doc(mappingqc.loadPicardRnaSeqMetrics) +@P.add_doc(bamstats.loadPicardRnaSeqMetrics) @jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") @merge(buildPicardRnaSeqMetrics, ["picard_rna_metrics.load", "picard_rna_histogram.load"]) def loadPicardRnaSeqMetrics(infiles, outfiles): '''merge alignment stats into single tables.''' - mappingqc.loadPicardRnaSeqMetrics(infiles, outfiles) + bamstats.loadPicardRnaSeqMetrics(infiles, outfiles) ################################################################### diff --git a/cgatpipelines/tools/pipeline_splicing.py b/cgatpipelines/tools/pipeline_splicing.py index 1e9a9e0fb..9bee80f59 100644 --- a/cgatpipelines/tools/pipeline_splicing.py +++ b/cgatpipelines/tools/pipeline_splicing.py @@ -789,6 +789,9 @@ def collateMATS(infiles, outfile): MATS_fdr : string :term:`PARAMS`. User specified threshold for result counting + MATS_psi: string + :term: `PARAMS`. User specified PSI threshold for result counting + outfile: string summary file containing number of results below FDR threshold ''' @@ -811,20 +814,51 @@ def collateMATS(infiles, outfile): for event in ["SE", "A5SS", "A3SS", "MXE", "RI"]: temp = pd.read_csv("%s/%s.MATS.JC.txt" % (indir, event), sep='\t') - total.append(int(len(temp[(temp['FDR'] < - float(PARAMS['MATS_fdr'])) & (abs(temp['IncLevelDifference']) > 0.1)]))) - up.append(int(len(temp[(temp['FDR'] < - float(PARAMS['MATS_fdr'])) & (temp['IncLevelDifference'] > 0.1)]))) - down.append(int(len(temp[(temp['FDR'] < - float(PARAMS['MATS_fdr'])) & (temp['IncLevelDifference'] < -0.1)]))) + # calculate mean coverage for particular junction for each variable + temp['SJC_SUM_1'] = [sum(x)/len(x) for x in temp.SJC_SAMPLE_1.str.split(',').apply(lambda x: [float(i) for i in x])] + temp['IJC_SUM_1'] = [sum(x)/len(x) for x in temp.IJC_SAMPLE_1.str.split(',').apply(lambda x: [float(i) for i in x])] + temp['SJC_SUM_2'] = [sum(x)/len(x) for x in temp.SJC_SAMPLE_2.str.split(',').apply(lambda x: [float(i) for i in x])] + temp['IJC_SUM_2'] = [sum(x)/len(x) for x in temp.IJC_SAMPLE_2.str.split(',').apply(lambda x: [float(i) for i in x])] + temp['1_MAX'] = temp[['SJC_SUM_1','IJC_SUM_1']].max(axis=1) + temp['2_MAX'] = temp[['SJC_SUM_2','IJC_SUM_2']].max(axis=1) + # three conditions to be met: + # 1. FDR thrshold + # 2. Fixed PSI threshold + # 3. minimum coverage in both conditions + total.append(int(len(temp[(temp['FDR'] < float(PARAMS['MATS_fdr'])) & + (abs(temp['IncLevelDifference']) > PARAMS['MATS_psi']) & + (temp['1_MAX'] >= PARAMS['MATS_coverage']) & + (temp['2_MAX'] >= PARAMS['MATS_coverage'])]))) + up.append(int(len(temp[(temp['FDR'] < float(PARAMS['MATS_fdr'])) & + (temp['IncLevelDifference'] > PARAMS['MATS_psi']) & + (temp['1_MAX'] >= PARAMS['MATS_coverage']) & + (temp['2_MAX'] >= PARAMS['MATS_coverage'])]))) + down.append(int(len(temp[(temp['FDR'] < float(PARAMS['MATS_fdr'])) & + (temp['IncLevelDifference'] < -PARAMS['MATS_psi']) & + (temp['1_MAX'] >= PARAMS['MATS_coverage']) & + (temp['2_MAX'] >= PARAMS['MATS_coverage'])]))) + temp = pd.read_csv("%s/%s.novelJunc.JC.tsv" % (indir, event), sep='\t') - total_newJunc.append(int(len(temp[(temp['FDR'] < - float(PARAMS['MATS_fdr'])) & (abs(temp['IncLevelDifference']) > 0.1)]))) - up_newJunc.append(int(len(temp[(temp['FDR'] < - float(PARAMS['MATS_fdr'])) & (temp['IncLevelDifference'] > 0.1)]))) - down_newJunc.append(int(len(temp[(temp['FDR'] < - float(PARAMS['MATS_fdr'])) & (temp['IncLevelDifference'] < -0.1)]))) + temp['SJC_SUM_1'] = [sum(x)/len(x) for x in temp.SJC_SAMPLE_1.str.split(',').apply(lambda x: [float(i) for i in x])] + temp['IJC_SUM_1'] = [sum(x)/len(x) for x in temp.IJC_SAMPLE_1.str.split(',').apply(lambda x: [float(i) for i in x])] + temp['SJC_SUM_2'] = [sum(x)/len(x) for x in temp.SJC_SAMPLE_2.str.split(',').apply(lambda x: [float(i) for i in x])] + temp['IJC_SUM_2'] = [sum(x)/len(x) for x in temp.IJC_SAMPLE_2.str.split(',').apply(lambda x: [float(i) for i in x])] + temp['1_MAX'] = temp[['SJC_SUM_1','IJC_SUM_1']].max(axis=1) + temp['2_MAX'] = temp[['SJC_SUM_2','IJC_SUM_2']].max(axis=1) + total_newJunc.append(int(len(temp[(temp['FDR'] < float(PARAMS['MATS_fdr'])) & + (abs(temp['IncLevelDifference']) > PARAMS['MATS_psi']) & + (temp['1_MAX'] >= PARAMS['MATS_coverage']) & + (temp['2_MAX'] >= PARAMS['MATS_coverage'])]))) + up_newJunc.append(int(len(temp[(temp['FDR'] < float(PARAMS['MATS_fdr'])) & + (temp['IncLevelDifference'] > PARAMS['MATS_psi']) & + (temp['1_MAX'] >= PARAMS['MATS_coverage']) & + (temp['2_MAX'] >= PARAMS['MATS_coverage'])]))) + down_newJunc.append(int(len(temp[(temp['FDR'] < float(PARAMS['MATS_fdr'])) & + (temp['IncLevelDifference'] < -PARAMS['MATS_psi']) & + (temp['1_MAX'] >= PARAMS['MATS_coverage']) & + (temp['2_MAX'] >= PARAMS['MATS_coverage'])]))) + #experimental feature - deactivated #temp = pd.read_csv("%s/%s.novelSS.JC.tsv" % # (indir, event), sep='\t') @@ -949,8 +983,17 @@ def runPermuteMATS(infiles, outfile, design): for event in ["SE", "A5SS", "A3SS", "MXE", "RI"]: temp = pd.read_csv("%s/%s.MATS.JC.txt" % (os.path.dirname(outfile), event), sep='\t') - collate.append(str(len(temp[(temp['FDR'] < - float(PARAMS['MATS_fdr'])) & (abs(temp['IncLevelDifference']) > 0.1)]))) + + temp['SJC_SUM_1'] = [sum(x)/len(x) for x in temp.SJC_SAMPLE_1.str.split(',').apply(lambda x: [float(i) for i in x])] + temp['IJC_SUM_1'] = [sum(x)/len(x) for x in temp.IJC_SAMPLE_1.str.split(',').apply(lambda x: [float(i) for i in x])] + temp['SJC_SUM_2'] = [sum(x)/len(x) for x in temp.SJC_SAMPLE_2.str.split(',').apply(lambda x: [float(i) for i in x])] + temp['IJC_SUM_2'] = [sum(x)/len(x) for x in temp.IJC_SAMPLE_2.str.split(',').apply(lambda x: [float(i) for i in x])] + temp['1_MAX'] = temp[['SJC_SUM_1','IJC_SUM_1']].max(axis=1) + temp['2_MAX'] = temp[['SJC_SUM_2','IJC_SUM_2']].max(axis=1) + collate.append(str(len(temp[(temp['FDR'] < float(PARAMS['MATS_fdr'])) & + (abs(temp['IncLevelDifference']) > PARAMS['MATS_psi']) & + (temp['1_MAX'] >= PARAMS['MATS_coverage']) & + (temp['2_MAX'] >= PARAMS['MATS_coverage'])]))) with open(outfile, "w") as f: f.write("Group1\tGroup2\tSE\tA5SS\tA3SS\tMXE\tRI\n") f.write('\t'.join(collate)) diff --git a/cgatpipelines/tools/pipeline_splicing/pipeline.yml b/cgatpipelines/tools/pipeline_splicing/pipeline.yml index 505e2995b..6bf7975fc 100644 --- a/cgatpipelines/tools/pipeline_splicing/pipeline.yml +++ b/cgatpipelines/tools/pipeline_splicing/pipeline.yml @@ -44,17 +44,28 @@ annotations: dir: '' - ################################################################ - # - # MATS options - # - ################################################################ +################################################################ +# +# MATS options +# +################################################################ MATS: # Cutoff splicing difference # The cutoff used in the null hypothesis test for differential splicing. # The default is 0.0001 for 0.01% difference. Valid: 0 <= cutoff < 1 cutoff: '' + # PSI threshold + # This allows for setting a fixed minimum PSI change (e.g. 0.1) for filtering + # results. This can be used with a more lenient cstat, to increase the number + # of results. The threshold is not hypothesis-tested + psi: '' + + # Coverage threshold + # This allows for setting a fixed minimum number of reads that support this + # alternative splicing + coverage: '' + # Strandednes # fr-firststrand, fr-secondstrand or fr-unstranded libtype: '' @@ -71,11 +82,11 @@ MATS: - ################################################################ - # - # DEXSeq options - # - ################################################################ +################################################################ +# +# DEXSeq options +# + ################################################################ DEXSeq: # Strand specific library prep diff --git a/cgatpipelines/tools/pipeline_windows.py b/cgatpipelines/tools/pipeline_windows.py index 108f44e6f..a09ba435f 100644 --- a/cgatpipelines/tools/pipeline_windows.py +++ b/cgatpipelines/tools/pipeline_windows.py @@ -120,7 +120,7 @@ from cgatcore import pipeline as P import cgatpipelines.tasks.windows as windows import cgatpipelines.tasks.tracks as tracks -import cgatpipelines.tasks.mappingqc as mappingqc +import cgatpipelines.tasks.bamstats as bamstats ######################################################################### @@ -210,13 +210,12 @@ def loadTagCounts(infiles, outfile): suffix=".tsv") -# @P.add_doc(mappingqc.loadPicardDuplicateStats) @merge(prepareTags, "picard_duplicates.load") def loadPicardDuplicateStats(infiles, outfile): '''Merge Picard duplicate stats into single table and load into SQLite table picard_duplicates. ''' - mappingqc.loadPicardDuplicateStats( + bamstats.loadPicardDuplicateStats( infiles, outfile, pipeline_suffix=".bed.gz") diff --git a/conda/environments/cgat-flow.yml b/conda/environments/cgat-flow.yml index 2224fcb59..e465dfba5 100644 --- a/conda/environments/cgat-flow.yml +++ b/conda/environments/cgat-flow.yml @@ -14,7 +14,6 @@ channels: dependencies: # python dependencies - python >= 3.6 -- rpy2 <= 2.9.4 - tzlocal # exome.py diff --git a/setup.py b/setup.py index a1c95e7de..ec544f9ab 100644 --- a/setup.py +++ b/setup.py @@ -70,7 +70,7 @@ raise SystemExit("""CGAT requires Python 3 or later.""") cgat_packages = find_packages() -cgat_package_dirs = {'cgatpipelines': 'cgatpipelines'} +cgat_package_dirs = {'cgatpipelines': 'cgatpipelines', 'Rtools': 'Rtools'} # Classifiers classifiers = """