diff --git a/MANIFEST.in b/MANIFEST.in
index 99ff29cfa..44c74b012 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -22,5 +22,8 @@ exclude scripts/*
 include scripts/__init__.py
 include scripts/version.py
 include scripts/cgat_ruffus_profile.py
+include cgatpipelines/Rtools/*
+include cgatpipelines/experiment.R
+
 
 # extensions
diff --git a/cgatpipelines/Rtools/diffexpression.R b/cgatpipelines/Rtools/diffexpression.R
index b69374312..e5c279b2d 100644
--- a/cgatpipelines/Rtools/diffexpression.R
+++ b/cgatpipelines/Rtools/diffexpression.R
@@ -22,11 +22,11 @@ suppressMessages(library(goseq))
 source(file.path(Sys.getenv("R_ROOT"), "experiment.R"))
 
 
-mart = useMart(biomart = "ENSEMBL_MART_ENSEMBL",dataset="hsapiens_gene_ensembl", host = "jul2018.archive.ensembl.org")
+mart = useMart(biomart = "ENSEMBL_MART_ENSEMBL",dataset="hsapiens_gene_ensembl", host = "https://aug2020.archive.ensembl.org")
 getmart <- function(values){
   data<- getBM(
     filters= "ensembl_gene_id", 
-    attributes= c("ensembl_gene_id", "external_gene_name", "description","entrezgene", 'chromosome_name',
+    attributes= c("ensembl_gene_id", "external_gene_name", "description","entrezgene_id", 'chromosome_name',
                    'start_position', 'end_position'),
     values= values,
     mart= mart,
@@ -35,7 +35,7 @@ getmart <- function(values){
   return(data)
 }
 
-start_plot <- function(section, height = 6, width = 6, type = "png", outdir="") {
+start_plot <- function(section, height = 10, width = 10, type = "png", outdir="") {
     file = get_output_filename(paste0(outdir,"/",section, ".", type))
     Cairo(file = file,
           type = type,
@@ -78,7 +78,8 @@ plotTPMs <- function(dftemp, contrast_name){
     ggplot(aes(x = contrast, y = value, color = contrast)) +
     geom_point(position = position_jitter(w = 0.15, h = 0)) +
     facet_wrap(~ var, scales = "free") + theme_bw() +
-    ylab("normalised counts") + xlab (contrast_name) + guides(color = "none")
+    ylab("normalised counts") + xlab (contrast_name) + guides(color = "none")+
+    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
 }
 
 
@@ -114,7 +115,7 @@ run <- function(opt) {
   flog.info("... plotting MA")
   ## MA Plot
   start_plot("MAPlot", outdir=opt$outdir)
-    DESeq2::plotMA(resLFC, ylim = c(-3,3))
+    DESeq2::plotMA(resLFC, ylim = c(ceiling(max(resLFC$log2FoldChange)),floor(min(resLFC$log2FoldChange))))
   end_plot()
   
   flog.info("... saving DE data")
@@ -178,6 +179,7 @@ run <- function(opt) {
 
   flog.info("... plotting user-defined genes")
   genelist <- unlist(strsplit(opt$userlist, ","))
+  genelist <- genelist[genelist %in% rownames(dds)]
   dftemp <- makeTPMtable(genelist, counts(dds, normalized=TRUE), colData(dds), opt$contrast)
   start_plot("Userdefined", outdir=opt$outdir)
     print(plotTPMs(dftemp, opt$contrast))
diff --git a/cgatpipelines/Rtools/exploratory.R b/cgatpipelines/Rtools/exploratory.R
index ce966434d..20aa2b7bc 100755
--- a/cgatpipelines/Rtools/exploratory.R
+++ b/cgatpipelines/Rtools/exploratory.R
@@ -33,11 +33,11 @@ source(file.path(Sys.getenv("R_ROOT"), "experiment.R"))
 # Dependencies: biomaRt package
 # Input: vector of ensembl gene ids
 # Output: data frame with description, symbol and entrez gene
-mart = useMart(biomart = "ENSEMBL_MART_ENSEMBL",dataset="hsapiens_gene_ensembl", host = "jul2018.archive.ensembl.org")
+mart = useMart(biomart = "ENSEMBL_MART_ENSEMBL",dataset="hsapiens_gene_ensembl", host = "https://aug2020.archive.ensembl.org")
 getmart <- function(values){
   data<- getBM(
     filters= "ensembl_gene_id", 
-    attributes= c("ensembl_gene_id", "external_gene_name", "description","entrezgene"),
+    attributes= c("ensembl_gene_id", "external_gene_name", "description","entrezgene_id"),
     values= values,
     mart= mart,
     useCache = FALSE)
@@ -74,35 +74,15 @@ run <- function(opt) {
     dds = DESeqDataSetFromMatrix(experiment$counts, experiment$sample, design = formula(opt$model))
   }
   futile.logger::flog.info(paste("reading  Experiment object", paste(dim(counts(experiment)), collapse = ",")))
-  
-  ### SVA - ANALYSIS OF BIASES ###
-  futile.logger::flog.info(paste("Performing Surrogate Variable Analysis"))
-  dds <- estimateSizeFactors(dds)
-  dat  <- counts(dds, normalized = TRUE)
-  idx  <- rowMeans(dat) > 1
-  dat  <- dat[idx, ]
-  mod  <- model.matrix(formula(opt$model), colData(dds))
-  mod0 <- model.matrix(~ 1, colData(dds))
-  svseq <- svaseq(dat, mod, mod0, n.sv = 2)
-  for(factor in opt$factors){
-    start_plot(paste0('SVA for ', factor), outdir=opt$outdir)
-    par(mfrow = c(2, 1), mar = c(3,5,3,1))
-    for (i in 1:2) {
-      stripchart(svseq$sv[, i] ~ colData(dds)[, factor], vertical = TRUE, main = paste0("SV", i))
-      abline(h = 0)
-    }
-    end_plot()
-  }
-  
+
   ### TRANSFORMATION OF DATA ###
   futile.logger::flog.info(paste("Transforming data"))
-  rld<- rlog(dds)
+  dds <- estimateSizeFactors(dds)
   vsd<- vst(dds)
   df <- bind_rows(
     as_tibble(log2(counts(dds, normalized=TRUE)[, 1:2]+1)) %>%
       mutate(transformation = "log2(x + 1)"),
-    as_tibble(assay(vsd)[, 1:2]) %>% mutate(transformation = "vst"),
-    as_tibble(assay(rld)[, 1:2]) %>% mutate(transformation = "rlog"))
+    as_tibble(assay(vsd)[, 1:2]) %>% mutate(transformation = "vst"))
   colnames(df)[1:2] <- c("x", "y")
   start_plot('Variance_Transformations', outdir=opt$outdir)
     print(ggplot(df, aes(x = x, y = y)) + geom_hex(bins = 80) +
@@ -120,11 +100,12 @@ run <- function(opt) {
     if(dim(pca$x)[1]>7){
       dim_pca = 8
     } else {
-      dim_pca = dim(pca$x)
+      dim_pca = dim(pca$x)[1]
     }
     scores <- data.frame(variable.group, pca$x[,1:dim_pca])
     start_plot(paste0('PCA_', factor), outdir=opt$outdir)
-      print(qplot(x=PC1, y=PC2, data=scores, colour=factor(variable.group)) +
+      print(ggplot(scores, aes(x=PC1, y=PC2, colour=factor(variable.group))) +
+	      geom_point() +
         theme(legend.position="right") +  
         labs(colour=factor, x=paste0("PC1 (", percentVar[1],"% of variance)"),
              y=paste0("PC2 (", percentVar[2],"% of variance)")) + 
@@ -133,7 +114,8 @@ run <- function(opt) {
         theme(text=element_text(family='serif')))
     end_plot()
     start_plot(paste0('PCA_13_', factor), outdir=opt$outdir)
-          print(qplot(x=PC1, y=PC3, data=scores, colour=factor(variable.group)) +
+          print(ggplot(scores, aes(x=PC1, y=PC3, colour=factor(variable.group))) +
+          geom_point() +
         theme(legend.position="right") +  
         labs(colour=factor, x=paste0("PC1 (", percentVar[1],"% of variance)"),
              y=paste0("PC3 (", percentVar[3],"% of variance)")) + 
@@ -146,7 +128,7 @@ run <- function(opt) {
   names(variable.group) <- opt$contrast
   scores <- data.frame(variable.group, pca$x[,1:dim_pca])
   start_plot(paste0('PCA_grid'), outdir=opt$outdir)
-  if(dim_pca>7){
+  if(dim_pca==8){
     print(ggplot(scores, aes(x = .panel_x, y = .panel_y, fill = variable.group, colour = variable.group)) + 
       geom_point(shape = 16, size = 0.5, position = 'auto') + 
       geom_autodensity(alpha = 0.3, colour = NA, position = 'identity') + 
@@ -171,14 +153,18 @@ run <- function(opt) {
   df <- as.data.frame(colData(dds)[,opt$factors])
   rownames(df) <- colData(dds)$track
   # Heatmap of Top 20 Expressed Genes
-  select <- order(rowMeans(counts(dds,normalized=TRUE)),decreasing=TRUE)[1:20]
+  selected <- order(rowMeans(counts(dds,normalized=TRUE)),decreasing=TRUE)[1:20]
   start_plot('Heatmap_topExpressed', outdir=opt$outdir)
-    pheatmap(assay(vsd)[select,], cluster_rows=FALSE, cluster_cols=FALSE, show_rownames=FALSE, annotation_col=df)
+    pheatmap(assay(vsd)[selected,], cluster_rows=FALSE, cluster_cols=FALSE, show_rownames=FALSE, annotation_col=df)
   end_plot()
   # Heatmap of Top 20 Variable Genes
   topVarGenes <- head(order(rowVars(assay(vsd)),decreasing=TRUE),20)
+  futile.logger::flog.info(paste("topVarGenes            ",topVarGenes))
   mat <- assay(vsd)[ topVarGenes, ]
+  futile.logger::flog.info(paste("mat           ",rownames(mat)))
   temp <- getmart(rownames(mat))
+  temp <- temp[!duplicated(temp$ensembl_gene_id), ]
+  futile.logger::flog.info(paste("temp            ",temp))
   row.names(temp) <- temp$ensembl_gene_id
   rownames(mat) <- temp[rownames(mat),"external_gene_name"]
   start_plot('Heatmap_topVariable', outdir=opt$outdir)
@@ -187,7 +173,9 @@ run <- function(opt) {
   # Heatmap of Genes of interest
   if (!is.null(opt$genes_of_interest)) {
     print(rownames(assay(vsd)))
-    mat <- assay(vsd)[opt$genes_of_interest, ]
+    goi <- opt$genes_of_interest
+    goi <- goi[goi %in% rownames(assay(vsd))]
+    mat <- assay(vsd)[goi, ]
     mat <- mat - rowMeans(mat)
     temp <- getmart(rownames(mat))
     row.names(temp) <- temp$ensembl_gene_id
@@ -227,6 +215,7 @@ run <- function(opt) {
   
   ### EXPLORE BATCH EFFECTS ###
   futile.logger::flog.info(paste("Exploring Batch Effects"))
+  futile.logger::flog.info(dim_pca)
   if(dim_pca == 8){
     for(factor in opt$factors){
       factor_transformed <- vsd
@@ -237,7 +226,8 @@ run <- function(opt) {
       percentVar <- round(100 * summary(pca)$importance[2,])
       scores <- data.frame(variable.group, sample.group, pca$x[,1:2])
       start_plot(paste0('PCA_', factor, '_removed'), outdir=opt$outdir)
-      print(qplot(x=PC1, y=PC2, data=scores, colour=factor(variable.group), shape=factor(sample.group)) +
+      print(ggplot(scores, aes(x=PC1, y=PC2, colour=factor(variable.group), shape=factor(sample.group))) + 
+            geom_point() +
             theme(legend.position="right") +  
             labs(colour=opt$contrast, shape=factor, x=paste0("PC1 (", percentVar[1],"% of variance)"),
                  y=paste0("PC2 (", percentVar[2],"% of variance)")) + 
diff --git a/cgatpipelines/Rtools/filtercounts.R b/cgatpipelines/Rtools/filtercounts.R
index 00ff92ef9..6f8d7ebd7 100755
--- a/cgatpipelines/Rtools/filtercounts.R
+++ b/cgatpipelines/Rtools/filtercounts.R
@@ -28,9 +28,10 @@ run <- function(opt) {
   ### READING DATA ###
   # Read in sampleData Table
   futile.logger::flog.info(paste("reading sampleData table from", normalizePath(opt$sampleData)))
-  sampleData <- read.table(opt$sampleData, header = TRUE)
+  sampleData <- read_tsv(opt$sampleData)
   sampleData <-sampleData[sampleData$include ==1, ]
   futile.logger::flog.info(paste("read sampleData ", paste(dim(sampleData), collapse = ",")))
+  futile.logger::flog.info(paste(sampleData))
   rownames(sampleData) <- sampleData$track
 
   futile.logger::flog.info(paste("reading in data from ", opt$source))
@@ -81,7 +82,8 @@ run <- function(opt) {
       flattenedfile=opt$flattenedFile)
   } else if(opt$source == "counts_table"){
     # Read in Data
-    raw <- read.table(file = gzfile(opt$counts_tsv), header=TRUE, row.name=1)
+    raw <- read.table(file = gzfile(opt$counts_tsv), header=TRUE, row.name=1, check.names = FALSE)
+    futile.logger::flog.info(paste("Colnames: ", colnames(raw)))
     experiment_tsv <- raw[,sampleData$track,drop=FALSE]
     if(opt$method == "deseq2"){
       dataset = DESeqDataSetFromMatrix(experiment_tsv, sampleData, design = formula(opt$model))
@@ -107,6 +109,9 @@ run <- function(opt) {
       futile.logger::flog.info(paste("Counts before filtering ", paste(dim(counts(dataset)), collapse = ",")))
       keep <- rowSums(counts(dataset)) >= 10
       dataset <- dataset[keep,]
+      #dataset <- dataset[Reduce('|', dataset[sapply(dataset, is.numeric)]),]
+      #dataset <- dataset[rowSums(dataset[,sapply(dataset, is.numeric)]==0)<6,]
+      #dataset <- subset(dataset, !rowSums(assay(dataset) == 0) > 6)
       counts_table <- counts(dataset)
       futile.logger::flog.info(paste("Counts after filtering ", paste(dim(counts(dataset)), collapse = ",")))
     } else if(opt$method == "dexseq"){
diff --git a/cgatpipelines/cgatflow.py b/cgatpipelines/cgatflow.py
index 0bf22afe4..374a21180 100644
--- a/cgatpipelines/cgatflow.py
+++ b/cgatpipelines/cgatflow.py
@@ -21,7 +21,7 @@
 import sys
 import re
 import glob
-import imp
+import importlib.util
 import cgatpipelines
 
 
@@ -84,9 +84,10 @@ def main(argv=None):
     # remove 'cgatflow' from sys.argv
     del sys.argv[0]
 
-    (file, pathname, description) = imp.find_module(pipeline, paths)
-
-    module = imp.load_module(pipeline, file, pathname, description)
+    spec = importlib.util.spec_from_file_location(pipeline, path+"/"+pipeline+".py")
+    module = importlib.util.module_from_spec(spec)
+    sys.modules[pipeline] = module
+    spec.loader.exec_module(module)
 
     module.main(sys.argv)
 
diff --git a/cgatpipelines/tasks/bamstats.py b/cgatpipelines/tasks/bamstats.py
index e67ca4916..720e03fa7 100644
--- a/cgatpipelines/tasks/bamstats.py
+++ b/cgatpipelines/tasks/bamstats.py
@@ -67,7 +67,7 @@ def buildPicardInsertSizeStats(infile, outfile, genome_file,
         Filename with genomic sequence.
     '''
     job_memory = picardmem
-    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
+    picard_opts = '-Xmx%(job_memory)s -XX:+UseG1GC' % locals()
     job_threads = 3
 
     if BamTools.getNumReads(infile) == 0:
@@ -159,7 +159,7 @@ def buildPicardAlignmentStats(infile, outfile, genome_file,
     '''
 
     job_memory = picardmem
-    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
+    picard_opts = '-Xmx%(job_memory)s -XX:+UseG1GC ' % locals()
     job_threads = 3
 
     if BamTools.getNumReads(infile) == 0:
@@ -191,7 +191,7 @@ def buildPicardDuplicationStats(infile, outfile, picardmem):
     '''
 
     job_memory = picardmem
-    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
+    picard_opts = '-Xmx%(job_memory)s -XX:+UseG1GC ' % locals()
     job_threads = 3
 
     if BamTools.getNumReads(infile) == 0:
@@ -244,7 +244,7 @@ def buildPicardDuplicateStats(infile, outfile, picardmem):
         Output filename with picard output.
     '''
     job_memory = picardmem
-    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
+    picard_opts = '-Xmx%(job_memory)s -XX:+UseG1GC ' % locals()
     job_threads = 3
 
     if BamTools.getNumReads(infile) == 0:
@@ -279,7 +279,7 @@ def buildPicardCoverageStats(infile, outfile, baits, regions,
     '''
 
     job_memory = picardmem
-    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
+    picard_opts = '-Xmx%(job_memory)s -XX:+UseG1GC ' % locals()
     job_threads = 3
 
     if BamTools.getNumReads(infile) == 0:
@@ -310,7 +310,7 @@ def buildPicardGCStats(infile, outfile, genome_file, picardmem):
     """
 
     job_memory = picardmem
-    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
+    picard_opts = '-Xmx%(job_memory)s -XX:+UseG1GC ' % locals()
     job_threads = 3
 
     if BamTools.getNumReads(infile) == 0:
@@ -1060,7 +1060,7 @@ def loadIdxstats(infiles, outfile):
                                       columns=(['region', 'mapped']))
 
         # reformat the df
-        df = df.append(reformatted_df, ignore_index=True)
+        df = df._append(reformatted_df, ignore_index=True)
         df.set_index('region', inplace=True)
         df1 = df[['mapped']].T
         dfs.append(df1)
@@ -1291,7 +1291,7 @@ def buildPicardRnaSeqMetrics(infiles, strand, outfile, picardmem):
 
     '''
     job_memory = picardmem
-    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
+    picard_opts = '-Xmx%(job_memory)s -XX:+UseG1GC ' % locals()
     job_threads = 3
     infile, genome, rRNA_intervals = infiles
 
diff --git a/cgatpipelines/tasks/diffexonexpression.R b/cgatpipelines/tasks/diffexonexpression.R
new file mode 100644
index 000000000..8cbfe9a8b
--- /dev/null
+++ b/cgatpipelines/tasks/diffexonexpression.R
@@ -0,0 +1,269 @@
+#' Running DEXSeq
+#'
+#' WARNING: This script is work-in-progress
+#' 
+#' Example usage:
+#' 
+#' Rscript PATH/TO/diffexonexpression.R --rds-filename=experiment.rds --model=~sample+exon+group:exon --reducedmodel=~sample+exon
+#'
+#' `experiment.rds` is a DEXSeq experiment object after filtering
+#'
+#'
+#' -> todo: ideas
+
+
+
+suppressMessages(library(futile.logger))
+suppressMessages(library(getopt))
+suppressMessages(library(fgsea))
+suppressMessages(library(data.table))
+suppressMessages(library(sva))	
+suppressMessages(library(biomaRt))
+suppressMessages(library(tidyverse))
+suppressMessages(library(RUVSeq))
+suppressMessages(library(Cairo))
+suppressMessages(library(goseq))
+suppressMessages(library(DEXSeq))
+
+source(file.path(Sys.getenv("R_ROOT"), "experiment.R"))
+
+
+mart = useMart(biomart = "ENSEMBL_MART_ENSEMBL",dataset="hsapiens_gene_ensembl", host = "jul2018.archive.ensembl.org")
+getmart <- function(values){
+  data<- getBM(
+    filters= "ensembl_gene_id", 
+    attributes= c("ensembl_gene_id", "external_gene_name", "description","entrezgene"),
+    values= values,
+    mart= mart)
+  data$description <- gsub("\t", "", data$description)
+  return(data)
+}
+
+start_plot <- function(section, outdir = "", height = 6, width = 6, type = "png") {
+  file = get_output_filename(paste0(outdir,"/",section, ".", type))
+  Cairo(file = file,
+        type = type,
+        width = width,
+        height = height,
+        units="in",
+        dpi = 300,
+        bg = "white")
+  #opar <- par(lwd=0.5)
+}
+
+end_plot <- function() {
+  dev.off()
+}
+
+
+# plotTPMs function
+# Function: plots a table of TPMs from a data frame, ensuring appropriate formating for ggplot
+# Input: data frame
+# Output: plot object
+plotTPMs <- function(dftemp, contrast_name){
+  dftemp %>% 
+    gather(key = "var", value="value", -contrast, -track) %>% 
+    mutate(var = factor(var, levels=unique(var))) %>%
+    ggplot(aes(x = contrast, y = value, color = contrast)) +
+    geom_point(position = position_jitter(w = 0.15, h = 0)) +
+    facet_wrap(~ var, scales = "free") + theme_bw() +
+    ylab("normalised counts") + xlab (contrast_name) + guides(color = "none")
+}
+
+
+run <- function(opt) {
+  
+  futile.logger::flog.info(paste("reading Experiment object from", normalizePath(opt$rds_filename)))
+  experiment <- readRDS(opt$rds_filename)
+  if (class(experiment) != "DEXSeqDataSet"){
+    stop("This script supports DEXSeqDataSet objects only.")
+  }
+  flog.info("Running DEXSeq")
+  dxd = estimateSizeFactors(experiment)
+  dxd = estimateDispersions(dxd)
+  dxd = testForDEU(dxd, reducedModel = formula(opt$reducedmodel),
+                   fullModel =  formula(opt$model))
+  dxd = estimateExonFoldChanges( dxd, fitExpToVar=opt$contrast)
+  res = DEXSeqResults(dxd)
+  
+  flog.info("... plotting dispersion estimates")
+  ## Plot dispersion estimates
+  start_plot("Dispersion", opt$outdir)
+  plotDispEsts(dxd)
+  end_plot()
+  
+  flog.info("... plotting MA")
+  ## MA Plot
+  start_plot("MAPlot", opt$outdir)
+  plotMA(res, ylim = c(-3,3))
+  end_plot()
+  
+  flog.info("... saving DE data")
+  ## Save DE data
+  #resSig <- subset(res, padj < opt$alpha)
+  #data <- getmart(as.character(map(res$transcripts, 1)))
+  #res$symbol<-data$external_gene_name[match(data$ensembl_transcript_id, as.character(map(res$transcripts, 1)))]
+  #res$desc<-data$description[match(data$ensembl_transcript_id, as.character(map(res$transcripts, 1)))]
+  resSig <- subset(res, padj < opt$alpha)
+  write.table(resSig, paste0(opt$outdir,"/","results.tsv"), sep = "\t")
+  write.table(res, paste0(opt$outdir,"/","results_full.tsv"), sep = "\t")
+  
+  flog.info("... plotting P histogram")
+  ## Plot P value Histogram
+  start_plot("PHistogram", opt$outdir)
+  hist(res$pvalue,breaks=50, col='skyblue', xlab="p-value", main="P-value Histogram")
+  end_plot()
+  
+  flog.info("... plotting downregulated genes")    
+  ## Plot Top Downregulated Genes
+  genelist <- rownames(res[ order( res[,grepl("log2",colnames(res))] ), ][0:9,])
+  for(gene in unique(substr(genelist,0,15))){
+    start_plot(paste0("Downregulated_",gene), opt$outdir)
+      plotDEXSeq(res, gene, legend=TRUE, cex.axis=1.2, cex=1.3, lwd=2 ,fitExpToVar = opt$contrast)
+    end_plot()
+  }
+
+  
+  flog.info("... plotting upregulated genes")
+  genelist <- rownames(res[ order( -res[,grepl("log2",colnames(res))] ), ][0:9,])
+  for(gene in unique(substr(genelist,0,15))){
+    start_plot(paste0("Upregulated_",gene), opt$outdir)
+    plotDEXSeq(res, gene, legend=TRUE, cex.axis=1.2, cex=1.3, lwd=2 ,fitExpToVar = opt$contrast)
+    end_plot()
+  }
+
+  flog.info("... plotting significant genes")
+  genelist <- rownames(resSig[order(resSig$padj), ])
+  for(gene in unique(substr(genelist,0,15))){
+    start_plot(paste0("Significant_",gene), opt$outdir)
+    plotDEXSeq(res, gene, legend=TRUE, cex.axis=1.2, cex=1.3, lwd=2 ,fitExpToVar = opt$contrast)
+    end_plot()
+  }
+
+  if(opt$perm >0){
+    flog.info("... performing permutations")
+    designperm <- colData(dxd)
+    dxdperm = dxd
+    x = 0
+    i = 1
+    y= list()
+    while(i <= opt$perm) {
+      flog.info(paste0("...... Permutation ", i))
+      designperm[, opt$contrast] = sample(designperm[, opt$contrast])
+      if(sum(designperm$group == levels(designperm[, opt$contrast])[1]) != table(colData(dxd)[,opt$contrast])[1]){
+        flog.info(paste0("......... Failed. Skipping design: ", paste(as.character(designperm$group),collapse = ",")))
+        next
+      }
+      colData(dxdperm) <- designperm
+      dxdperm = testForDEU(dxdperm)
+      dxdperm = estimateExonFoldChanges( dxdperm, fitExpToVar=opt$contrast)
+      resperm = DEXSeqResults(dxdperm)
+      x[i] = length(subset(resperm, padj < opt$alpha)$padj)
+      y[[i]] = dxdperm$group
+      i = i+1
+    }
+    flog.info(paste0("Result:   "  , x))
+    start_plot("Simulations", outdir=opt$outdir)
+    theme_set(theme_gray(base_size = 18))
+    sims = qplot(x,
+                 geom="histogram",
+                 breaks=seq(0, 20, by = 1),fill=I("grey"), col=I("black"),
+                 main = "Histogram of DE experiments\n with random group labels", 
+                 xlab = "Number of differentially expressed exons",
+                 ylab = "Number of simulations") +
+      geom_vline(xintercept = length(rownames(subset(res, padj < opt$alpha)))) +
+      theme_classic() + theme(plot.title = element_text(hjust = 0.5, size=22))
+    print(sims)
+    end_plot()
+    flog.info(paste0("... Permutation p value: ",
+                     length(x[x > length(rownames(subset(res, padj < opt$alpha)))])/length(x)))
+    z <- list()
+    for(i in 0:length(x)){
+      z[i] <- paste( unlist(y[i]), collapse=' ')
+    }
+    z <- unlist(z)
+    df <- data.frame(number=x, combination=z)
+    df
+    write_tsv(df,paste0(opt$outdir,"/","Simulations.tsv"))
+  }
+}
+
+main <- function() {
+  
+  option_list <- list(
+    make_option(
+      "--rds-filename",
+      dest = "rds_filename",
+      type = "character",
+      default = "experiment.rds",
+      help = paste("filename with input data of counts")
+    ),
+    make_option(
+      "--model",
+      dest = "model",
+      type = "character",
+      default = "~ sample + exon + group:exon",
+      help = paste("model formula for GLM")
+    ),
+    make_option(
+      "--reducedmodel",
+      dest = "reducedmodel",
+      type = "character",
+      default = "~ sample + exon",
+      help = paste("model formula for GLM")
+    ),
+    make_option(
+      "--contrast",
+      dest = "contrast",
+      type = "character",
+      default = "group",
+      help = paste("contrast/factor to use for comparison")
+    ),
+    make_option(
+      "--refgroup",
+      dest = "refgroup",
+      type = "character",
+      default = "CTR",
+      help = paste("reference group, e.g. CTR")
+    ),
+    make_option(
+      "--alpha",
+      dest = "alpha",
+      type = "numeric",
+      default = 0.05,
+      help = paste("Adjusted P value threshold")
+    ),
+   make_option(
+        "--outdir",
+        dest = "outdir",
+        type = "character",
+        default = "experiment",
+        help = paste("Libraries for gsea")
+    ),
+    make_option(
+      "--permute",
+      dest = "perm",
+      type = "integer",
+      default = 0,
+      help = paste("number of permutations")
+    ),
+    make_option(
+      "--pathways",
+      dest = "pathways",
+      type = "character",
+      default = "",
+      help = paste("Libraries for gsea")
+    )
+  )
+  opt <- experiment_start(option_list = option_list,
+                          description = description)
+  
+  if (!is.null(opt$pathways)) {
+    opt$pathways = unlist(strsplit(opt$pathways, ","))
+  }
+  run(opt)
+  
+  experiment_stop()
+}
+
+main()
diff --git a/cgatpipelines/tasks/diffexpression.R b/cgatpipelines/tasks/diffexpression.R
new file mode 100644
index 000000000..20fda35ff
--- /dev/null
+++ b/cgatpipelines/tasks/diffexpression.R
@@ -0,0 +1,397 @@
+#' Differential expression analysis
+#'
+#' WARNING: This script is work-in-progress
+#' 
+#' Example usage:
+#' 
+#' Rscript PATH/TO/diffexpression.R --rds-filename=sce.rds --model=~group --coef=group_MUT_vs_CTR --refgroup=CTR
+#'
+
+
+suppressMessages(library(futile.logger))
+suppressMessages(library(getopt))
+suppressMessages(library(fgsea))
+suppressMessages(library(data.table))
+suppressMessages(library(sva))	
+suppressMessages(library(biomaRt))
+suppressMessages(library(tidyverse))
+suppressMessages(library(RUVSeq))
+suppressMessages(library(Cairo))
+suppressMessages(library(goseq))
+
+source(file.path(Sys.getenv("R_ROOT"), "experiment.R"))
+
+
+mart = useMart(biomart = "ENSEMBL_MART_ENSEMBL",dataset="hsapiens_gene_ensembl", host = "jul2018.archive.ensembl.org")
+getmart <- function(values){
+  data<- getBM(
+    filters= "ensembl_gene_id", 
+    attributes= c("ensembl_gene_id", "external_gene_name", "description","entrezgene", 'chromosome_name',
+                   'start_position', 'end_position'),
+    values= values,
+    mart= mart,
+    useCache = FALSE)
+  data$description <- gsub("\t", "", data$description)
+  return(data)
+}
+
+start_plot <- function(section, height = 10, width = 10, type = "png", outdir="") {
+    file = get_output_filename(paste0(outdir,"/",section, ".", type))
+    Cairo(file = file,
+          type = type,
+          width = width,
+          height = height,
+          units="in",
+          dpi = 300,
+          bg = "white")
+    #opar <- par(lwd=0.5)
+}
+
+end_plot <- function() {
+    dev.off()
+}
+
+# makeTPMtable
+# Function: combines data from a genelist with abundance data to create a 
+# Input: 1. genelist vector in ENSEMBL format, 2. abundance matrix from tximport
+# Output: data frame with gene symbol names
+makeTPMtable  <- function(genelist, abundance, design, contrast){
+  genelist.df <- getmart(genelist)
+  genelist.df <- genelist.df[!duplicated(genelist.df[,1]),]
+  rownames(genelist.df) <- genelist.df$ensembl_gene_id
+  genelist.names <- genelist.df[genelist,]$external_gene_name
+  genelist.names[is.na(genelist.names)] <- genelist[is.na(genelist.names)]
+  dftemp <- as_tibble(t(abundance[genelist,]), rownames = "track")
+  dftemp <- dftemp %>% rename_at(vars(genelist), ~ genelist.names)
+  dftemp$contrast <- design[dftemp$track,][,contrast]
+  return(dftemp)
+}
+
+# plotTPMs function
+# Function: plots a table of TPMs from a data frame, ensuring appropriate formating for ggplot
+# Input: data frame
+# Output: plot object
+plotTPMs <- function(dftemp, contrast_name){
+  dftemp %>% 
+    gather(key = "var", value="value", -contrast, -track) %>% 
+    mutate(var = factor(var, levels=unique(var))) %>%
+    ggplot(aes(x = contrast, y = value, color = contrast)) +
+    geom_point(position = position_jitter(w = 0.15, h = 0)) +
+    facet_wrap(~ var, scales = "free") + theme_bw() +
+    ylab("normalised counts") + xlab (contrast_name) + guides(color = "none")+
+    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
+}
+
+
+run <- function(opt) {
+
+  futile.logger::flog.info(paste("reading Experiment object from", normalizePath(opt$rds_filename)))
+  experiment <- readRDS(opt$rds_filename)
+  futile.logger::flog.info(paste("Dataset", class(experiment)))
+  if (class(experiment) == "DESeqDataSet"){
+    flog.info("Running DESeq2")
+    suppressMessages(library(DESeq2))
+    colData(experiment)[,opt$contrast] <- relevel(colData(experiment)[,opt$contrast], ref = opt$refgroup)
+    design(experiment) <- formula(opt$model)
+    dds <- DESeq(experiment, betaPrior=FALSE)
+    futile.logger::flog.info(paste("coef", opt$coef))
+    futile.logger::flog.info(paste("coef", resultsNames(dds)))
+    res <- results(dds, name=opt$coef)
+    resLFC <- lfcShrink(dds, coef=opt$coef, type=opt$shrinkage)
+  } 
+  if (class(experiment) == "DGEList"){
+    dds = DESeqDataSetFromMatrix(experiment$counts, experiment$sample, design = formula(opt$model))
+  } 
+  if (class(experiment) == "DEXSeqDataSet"){
+    stop("please use diffexonexpression R script")
+  }
+
+  flog.info("... plotting dispersion estimates")
+  ## Plot dispersion estimates
+  start_plot("Dispersion", outdir=opt$outdir)
+    plotDispEsts(dds)
+  end_plot()
+  
+  flog.info("... plotting MA")
+  ## MA Plot
+  start_plot("MAPlot", outdir=opt$outdir)
+    DESeq2::plotMA(resLFC, ylim = c(ceiling(max(resLFC$log2FoldChange)),floor(min(resLFC$log2FoldChange))))
+  end_plot()
+  
+  flog.info("... saving DE data")
+  ## Save DE data
+  resSig <- subset(resLFC, padj < opt$alpha)
+  data <- getmart(rownames(resLFC))
+  resLFC$symbol<-data$external_gene_name[match(rownames(resLFC), data$ensembl_gene_id)]
+  resLFC$desc<-data$description[match(rownames(resLFC), data$ensembl_gene_id)]
+  resLFC$chromosome<-data$chromosome_name[match(rownames(resLFC), data$ensembl_gene_id)]
+  resLFC$start<-data$start_position[match(rownames(resLFC), data$ensembl_gene_id)]
+  resSig <- subset(resLFC, padj < opt$alpha)
+  write.table(resSig, paste0(opt$outdir,"/","results.tsv"), sep = "\t")
+  write.table(resLFC, paste0(opt$outdir,"/","results_full.tsv"), sep = "\t")
+  resSig2 <- subset(res, padj < opt$alpha)
+  write.table(resSig2, paste0(opt$outdir,"/","results_noLFCshrinkage.tsv"), sep = "\t")
+  resdf <- data.frame(geneid=rownames(resLFC), pvalue=resLFC$pvalue*sign(resLFC$log2FoldChange))
+  write.table(resdf, paste0(opt$outdir,"/","px_results_pvalue.gene.tsv"), sep = "\t", row.names = FALSE, quote = FALSE)
+  resdf <- data.frame(geneid=rownames(resLFC), l2fc=resLFC$log2FoldChange)
+  write.table(resdf, paste0(opt$outdir,"/","px_results_l2fc.gene.tsv"), sep = "\t", row.names = FALSE, quote = FALSE)
+  resdf <- data.frame(geneid=rownames(resLFC), l2fc=resLFC$log2FoldChange)
+  write.table(resdf, paste0(opt$outdir,"/","px_results_l2fc.gene.tsv"), sep = "\t", row.names = FALSE, quote = FALSE)
+  if(any(names(assays(experiment)) == "avgTxLength")){
+    genelengths <- rowMeans(assays(experiment)$avgTxLength)
+    write.table(genelengths, paste0(opt$outdir,"/","gene_lengths.tsv"), sep = "\t", row.names = TRUE, quote = FALSE)
+  }
+  file = get_output_filename(paste0(opt$outdir,"/","results_table.rds"))
+  flog.info(paste("saving results data to", file))
+  saveRDS(resLFC, file = file)
+
+
+  flog.info("... plotting P histogram")
+  ## Plot P value Histogram
+  start_plot("PHistogram", outdir=opt$outdir)
+    hist(res$pvalue,breaks=50, col='skyblue', xlab="p-value", main="P-value Histogram")
+  end_plot()
+
+  flog.info("... plotting downregulated genes")    
+  ## Plot Top Downregulated Genes
+  genelist <- rownames(resLFC[ order( resLFC[,grepl("log2",colnames(resLFC))] ), ][0:9,])
+  dftemp <- makeTPMtable(genelist, counts(dds, normalized=TRUE), colData(dds), opt$contrast)
+  start_plot("Downregulated", outdir=opt$outdir)
+  print(plotTPMs(dftemp, opt$contrast))
+  end_plot()
+  
+  flog.info("... plotting upregulated genes")
+  genelist <- rownames(resLFC[ order( -resLFC[,grepl("log2",colnames(resLFC))] ), ][0:9,])
+  dftemp <- makeTPMtable(genelist, counts(dds, normalized=TRUE), colData(dds), opt$contrast)
+  start_plot("Upregulated", outdir=opt$outdir)
+    print(plotTPMs(dftemp, opt$contrast))
+  dev.off()
+  
+  flog.info("... plotting significant genes")
+  resSig <- subset(resLFC, padj < 0.05)
+  genelist <- rownames(resSig[order(resSig$padj), ])
+  if(length(genelist) > 9){
+      genelist <- genelist[0:9]}
+  dftemp <- makeTPMtable(genelist, counts(dds, normalized=TRUE), colData(dds), opt$contrast)
+  start_plot("significant", outdir=opt$outdir)
+    print(plotTPMs(dftemp, opt$contrast))
+  dev.off()
+
+  flog.info("... plotting user-defined genes")
+  genelist <- unlist(strsplit(opt$userlist, ","))
+  genelist <- genelist[genelist %in% rownames(dds)]
+  dftemp <- makeTPMtable(genelist, counts(dds, normalized=TRUE), colData(dds), opt$contrast)
+  start_plot("Userdefined", outdir=opt$outdir)
+    print(plotTPMs(dftemp, opt$contrast))
+  dev.off()
+  
+  flog.info("... performing gene ontology (GO) enrichment analysis")
+  # Currently only supports data from salmon and kallisto
+  # Needs implementation of genelength matrix from featurecounts
+  if(any(names(assays(experiment)) == "avgTxLength")){
+      res.nona <- subset(resLFC, (!is.na(padj & baseMean) & baseMean > 1))
+      res.list <- list(as.integer(res.nona$padj < 0.05), 
+      	       	  as.integer(res.nona$log2FoldChange >0 & res.nona$padj < 0.05), 
+                  as.integer(res.nona$log2FoldChange <0 & res.nona$padj < 0.05))
+      res.names <- list("all", "up", "down")
+      for(i in 1:3) {
+      	    de.genes <- res.list[[i]]
+            names(de.genes) <- rownames(res.nona)
+            temp <- rowMeans(assays(experiment)$avgTxLength[match(names(de.genes), rownames(assays(experiment)$avgTxLength)),])
+      	    pwf=nullp(de.genes,bias.data=temp)
+      	    all = goseq(pwf,"hg38","ensGene", method="Hypergeometric")
+      	    sigall <- all
+      	    names(sigall) <- c("category","pvalue","underrepresented_pvalue","numberDE", "numberTOT", "term", "ontology")
+      	    sigall$pvalue <- p.adjust(sigall$pvalue, method="BH")
+      	    sigall$percent <- sigall$numberDE/sigall$numberTOT
+      	    sigall <- sigall[which(sigall[,2] < 0.05),]
+      	    cats <- sigall$category
+      	    write.table(sigall[,c("category", "pvalue")], file=paste0(opt$outdir,"/GO_",res.names[[i]],".tsv"), quote=FALSE, sep='\t', row.names = FALSE)
+      	    write.table(sigall, paste0(opt$outdir,"/GO_annotated_",res.names[[i]],".tsv"), quote=FALSE, sep='\t', row.names = FALSE)
+      	    write.table(all, paste0(opt$outdir,"/GO_complete_",res.names[[i]],".tsv"), quote=FALSE, sep='\t', row.names = FALSE)
+      }
+   }
+  
+  
+  flog.info("... performing gene set enrichment analysis (GSEA)")
+  resrnk<-resLFC
+  data <- getmart(rownames(resrnk))
+  resrnk$symbol <- data$external_gene_name[match(rownames(resrnk), data$ensembl_gene_id)]
+  resrnk$desc <-data$description[match(rownames(resrnk), data$ensembl_gene_id)]
+  resrnk$entrezgene <-data$entrezgene[match(rownames(resrnk), data$ensembl_gene_id)]
+  rnk.df <- as_tibble(resrnk[,c("entrezgene","log2FoldChange")]) %>% na.omit()
+  rnk <- rnk.df$log2FoldChange
+  names(rnk) <- rnk.df$entrezgene
+  rnk <- rnk[isUnique(names(rnk))]
+  if (!dir.exists(paste0(opt$outdir,"/gsea"))) {
+    dir.create(paste0(opt$outdir,"/gsea"))
+  }
+
+  for(pathway in opt$pathways){
+      pathways <- gmtPathways(pathway)
+      fgseaRes <- fgsea(pathways = pathways, 
+                        stats = rnk,
+                        minSize=15,
+                        maxSize=500,
+                        nperm=10000)
+      fwrite(fgseaRes, file=paste0(opt$outdir,"/gsea/",sub("([^.]+)\\.[[:alnum:]]+$", "\\1", (basename(pathway))),'.tsv'), sep="\t", sep2=c("", " ", ""))
+      topPathwaysUp <- fgseaRes[ES > 0,][head(order(pval), n=10),]$pathway
+      png(paste0(opt$outdir,"/gsea/",'UP_',sub("([^.]+)\\.[[:alnum:]]+$", "\\1", (basename(pathway))),'.png'),
+          width =15, height = 3, units = 'in', res = 600)
+      plotGseaTable(pathways[topPathwaysUp], rnk, fgseaRes, 
+                    gseaParam = 0.5, colwidths = c(10,2,1,1,1))
+      dev.off()
+  
+  
+      topPathwaysDown <- fgseaRes[ES < 0,][head(order(pval), n=10),]$pathway
+      plotGseaTable(pathways[topPathwaysDown], rnk, fgseaRes, 
+      gseaParam = 0.5)
+      png(paste0(opt$outdir,"/gsea/",'DOWN_',sub("([^.]+)\\.[[:alnum:]]+$", "\\1", (basename(pathway))),'.png'),
+          width =15, height = 3, units = 'in', res = 600)
+      plotGseaTable(pathways[topPathwaysDown], rnk, fgseaRes, 
+                    gseaParam = 0.5, colwidths = c(10,2,1,1,1))
+      dev.off()
+  }
+  
+  if(opt$perm >0){
+    flog.info("... performing permutations")
+    designperm <- colData(dds)
+    ddsperm = dds
+    x = 0
+    i = 1
+    y= list()
+    while(i <= opt$perm) {
+      flog.info(paste0("...... Permutation ", i))
+      # Code to only shuffle the group labels for groups in coef
+      tempdesign <- designperm[grep(paste(as.list(unlist(strsplit(opt$coef, "_"))) [c(2,4)],collapse = "|"),designperm[, opt$contrast]),]
+      designperm[grep(paste(as.list(unlist(strsplit(opt$coef, "_"))) [c(2,4)],collapse = "|"),designperm[, opt$contrast]),][,opt$contrast] = sample(tempdesign[,opt$contrast])
+      if(sum(designperm$group == levels(designperm[, opt$contrast])[1]) != table(colData(dds)[,opt$contrast])[1]){
+        flog.info(paste0("......... Failed. Skipping design: ", paste(as.character(designperm$group),collapse = ",")))
+        flog.info(paste0("......... Number of reference replicates is: ", sum(designperm$group == levels(designperm[, opt$contrast])[1]), ", but should be: ", table(colData(dds)[,opt$contrast])[1]))
+        next
+      }
+      colData(ddsperm) <- designperm
+      ddsperm <- DESeq(ddsperm)
+      resperm <- results(ddsperm)
+      x[i] = length(subset(resperm, padj < opt$alpha)$padj)
+      y[[i]] = ddsperm$group
+      i = i+1
+    }
+    start_plot("Simulations", outdir=opt$outdir)
+    theme_set(theme_gray(base_size = 18))
+    sims = qplot(x,
+                 geom="histogram",
+                 breaks=seq(0, 20, by = 1),fill=I("grey"), col=I("black"),
+                 main = "Histogram of DE experiments\n with random group labels", 
+                 xlab = "Number of differentially expressed genes",
+                 ylab = "Number of simulations") +
+      geom_vline(xintercept = length(rownames(subset(res, padj < opt$alpha)))) +
+      theme_classic() + theme(plot.title = element_text(hjust = 0.5, size=22))
+    print(sims)
+    end_plot()
+    flog.info(paste0("... Permutation p value: ",
+                     length(x[x > length(rownames(subset(res, padj < opt$alpha)))])/length(x)))
+    z <- list()
+    for(i in 0:length(x)){
+      z[i] <- paste( unlist(y[i]), collapse=' ')
+    }
+    z <- unlist(z)
+    df <- data.frame(number=x, combination=z)
+    df
+    write_tsv(df,paste0(opt$outdir,"/","Simulations.tsv"))
+  }
+}
+
+main <- function() {
+
+    option_list <- list(
+        make_option(
+            "--rds-filename",
+            dest = "rds_filename",
+            type = "character",
+            default = "experiment.rds",
+            help = paste("filename with input data of counts")
+        ),
+        make_option(
+            "--model",
+            dest = "model",
+            type = "character",
+            default = "~group",
+            help = paste("model formula for GLM")
+        ),
+        make_option(
+          "--contrast",
+          dest = "contrast",
+          type = "character",
+          default = "group",
+          help = paste("contrast/factor to use for comparison")
+        ),
+        make_option(
+          "--coef",
+          dest = "coef",
+          type = "character",
+          default = "",
+          help = paste("Comparison of interest for DESeq2: e.g. group_MUT_vs_CTR, where MUT and CTR are in group")
+        ),	
+        make_option(
+          "--refgroup",
+          dest = "refgroup",
+          type = "character",
+          default = "CTR",
+          help = paste("reference group, e.g. CTR")
+        ),
+        make_option(
+            "--alpha",
+            dest = "alpha",
+            type = "numeric",
+            default = 0.05,
+            help = paste("Adjusted P value threshold")
+        ),
+        make_option(
+            "--shrinkage",
+            dest = "shrinkage",
+            type = "character",
+            default = "apeglm",
+            help = paste("Method for LFC shrinkage. Options are apeglm (default), ashr, normal")
+        ),
+         make_option(
+            "--userlist",
+            dest = "userlist",
+            type = "character",
+            default = "ENSG00000205927,ENSG00000130675,ENSG00000016082,ENSG00000070748,ENSG00000064300,ENSG00000078018",
+            help = paste("User defined genes to plot")
+        ),
+       make_option(
+            "--outdir",
+            dest = "outdir",
+            type = "character",
+            default = "experiment",
+            help = paste("Directory for output")
+        ),
+        make_option(
+            "--permute",
+            dest = "perm",
+            type = "integer",
+            default = 0,
+            help = paste("number of permutations")
+        ),
+        make_option(
+            "--pathways",
+            dest = "pathways",
+            type = "character",
+            default = "",
+            help = paste("Libraries for gsea")
+        )
+    )
+    opt <- experiment_start(option_list = option_list,
+                            description = description)
+
+    if (!is.null(opt$pathways)) {
+        opt$pathways = unlist(strsplit(opt$pathways, ","))
+    }
+    run(opt)
+    
+    experiment_stop()
+}
+
+main()
diff --git a/cgatpipelines/tasks/exome.py b/cgatpipelines/tasks/exome.py
index 6a0cb662e..e9521ba2e 100644
--- a/cgatpipelines/tasks/exome.py
+++ b/cgatpipelines/tasks/exome.py
@@ -27,11 +27,6 @@
 # Set PARAMS in calling module
 PARAMS = {}
 
-
-def getGATKOptions():
-    return "-l mem_free=1.4G"
-
-
 def makeSoup(address):
     sock = urlopen(address)
     htmlSource = sock.read()
@@ -41,23 +36,23 @@ def makeSoup(address):
 ##############################################################################
 
 
-def GATKReadGroups(infile, outfile, genome,
+def GATKReadGroups(infile, outfile, dictionary,
                    library="unknown", platform="Illumina",
-                   platform_unit="1", track="unknown"):
+                   platform_unit="1", track="unknown", gatkmem="2G"):
     '''Reorders BAM according to reference fasta and adds read groups'''
 
     if track == 'unknown':
         track = P.snip(os.path.basename(infile), ".bam")
     tmpdir_gatk = P.get_temp_dir('.')
-    job_options = getGATKOptions()
+    job_memory = gatkmem
     job_threads = 3
 
     statement = '''picard ReorderSam
-                    INPUT=%(infile)s
-                    OUTPUT=%(tmpdir_gatk)s/%(track)s.reordered.bam
-                    REFERENCE=%(genome)s
-                    ALLOW_INCOMPLETE_DICT_CONCORDANCE=true
-                    VALIDATION_STRINGENCY=SILENT ;''' % locals()
+                    -INPUT %(infile)s
+                    -OUTPUT %(tmpdir_gatk)s/%(track)s.reordered.bam
+                    -SEQUENCE_DICTIONARY %(dictionary)s
+                    -ALLOW_INCOMPLETE_DICT_CONCORDANCE true
+                    -VALIDATION_STRINGENCY SILENT ;''' % locals()
 
     statement += '''samtools index %(tmpdir_gatk)s/%(track)s.reordered.bam ;
                  ''' % locals()
@@ -80,55 +75,28 @@ def GATKReadGroups(infile, outfile, genome,
 ##############################################################################
 
 
-def GATKIndelRealign(infile, outfile, genome, intervals, padding, threads=4):
-    '''Realigns BAMs around indels using GATK'''
-
-    intervalfile = outfile.replace(".bam", ".intervals")
-    job_options = getGATKOptions()
-    job_threads = 3
-
-    statement = '''GenomeAnalysisTK
-                    -T RealignerTargetCreator
-                    -o %(intervalfile)s
-                    --num_threads %(threads)s
-                    -R %(genome)s
-                    -L %(intervals)s
-                    -ip %(padding)s
-                    -I %(infile)s; ''' % locals()
-
-    statement += '''GenomeAnalysisTK
-                    -T IndelRealigner
-                    -o %(outfile)s
-                    -R %(genome)s
-                    -I %(infile)s
-                    -targetIntervals %(intervalfile)s;''' % locals()
-    P.run(statement)
-
-##############################################################################
-
-
 def GATKBaseRecal(infile, outfile, genome, intervals, padding, dbsnp,
-                  solid_options=""):
+                  options="", gatkmem="2G"):
     '''Recalibrates base quality scores using GATK'''
 
     track = P.snip(os.path.basename(infile), ".bam")
     tmpdir_gatk = P.get_temp_dir('.')
-    job_options = getGATKOptions()
+    job_memory = gatkmem
     job_threads = 3
 
-    statement = '''GenomeAnalysisTK
-                    -T BaseRecalibrator
-                    --out %(tmpdir_gatk)s/%(track)s.recal.grp
+    statement = '''gatk
+                    BaseRecalibrator
+                    -O %(tmpdir_gatk)s/%(track)s.recal.grp
                     -R %(genome)s
                     -L %(intervals)s
                     -ip %(padding)s
                     -I %(infile)s
-                    --knownSites %(dbsnp)s %(solid_options)s ;
+                    --known-sites %(dbsnp)s %(options)s ;
                     ''' % locals()
 
-    statement += '''GenomeAnalysisTK
-                    -T PrintReads -o %(outfile)s
-                    -BQSR %(tmpdir_gatk)s/%(track)s.recal.grp
+    statement += '''gatk
+                    ApplyBQSR -O %(outfile)s
+                    --bqsr-recal-file %(tmpdir_gatk)s/%(track)s.recal.grp
                     -R %(genome)s
                     -I %(infile)s ;
                     ''' % locals()
@@ -140,21 +108,19 @@ def GATKBaseRecal(infile, outfile, genome, intervals, padding, dbsnp,
 
 
 def haplotypeCaller(infile, outfile, genome,
-                    dbsnp, intervals, padding, options):
+                    dbsnp, intervals, padding, options, gatkmem = "2G"):
     '''Call SNVs and indels using GATK HaplotypeCaller in all members of a
     family together'''
-    job_options = getGATKOptions()
+    job_memory = gatkmem
     job_threads = 3
 
-    statement = '''GenomeAnalysisTK
-                    -T HaplotypeCaller
+    statement = '''gatk
+                    HaplotypeCaller
                     -ERC GVCF
-                    -variant_index_type LINEAR
-                    -variant_index_parameter 128000
-                    -o %(outfile)s
+                    -O %(outfile)s
                     -R %(genome)s
                     -I %(infile)s
-                    --dbsnp %(dbsnp)s
+                    --dbsnp %(dbsnp)s 
                     -L %(intervals)s
                     -ip %(padding)s
                     %(options)s''' % locals()
@@ -164,16 +130,61 @@ def haplotypeCaller(infile, outfile, genome,
 ##############################################################################
 
 
-def genotypeGVCFs(inputfiles, outfile, genome, options):
+def consolidateGVCFs(inputfiles, outfile, inputlen, dbmem="20G"):
     '''Joint genotyping of all samples together'''
-    job_options = getGATKOptions()
+
+    runmem = dbmem
+    if not re.match(r' ', runmem):
+       runmem  = re.sub(r'([KMGT])', r' \1', runmem)
+    number, unit = [string.strip() for string in runmem.split()]
+    job_threads = 2
+    job_memory = f"{float(number)*1.2/job_threads:.1f}{unit}"
+    tmpdir = P.get_temp_dir('.')
+
+    tmpfile = P.get_temp_filename(".", suffix=".list")
+    chrID = [ 'chr{}'.format(x) for x in list(range(1,23)) + ['X', 'Y'] ]
+    with open(tmpfile, 'w+') as f:
+        for contig in chrID:
+            f.write('%s\n' % contig)
+
+    statement = '''gatk
+                    --java-options "-Xmx%(dbmem)s -Xms%(dbmem)s"
+                    GenomicsDBImport
+                    -V %(inputfiles)s
+                    -L %(tmpfile)s
+                    --reader-threads %(job_threads)s
+                    --tmp-dir %(tmpdir)s
+                    --genomicsdb-workspace-path %(outfile)s
+                    ''' % locals()
+    if inputlen > 50:
+        statement += ''' --batch-size 50'''
+    statement += ''' > %(outfile)s.log.tmp 2>&1;
+                     rm -rf %(tmpdir)s ;
+                     mv %(outfile)s.log.tmp %(outfile)s.log''' % locals()
+    P.run(statement)
+
+    os.unlink(tmpfile)
+
+
+##############################################################################
+
+
+def genotypeGVCFs(infile, outfile, genome, options, gatkmem="2G"):
+    '''Joint genotyping of all samples together'''
+    job_memory = gatkmem
     job_threads = 3
+    infile = P.snip(infile, ".log")
+    tmpdir = P.get_temp_dir('.')
+    logfile = P.snip(outfile,".vcf")+".log"
 
-    statement = '''GenomeAnalysisTK
-                    -T GenotypeGVCFs
-                    -o %(outfile)s
+    statement = '''gatk
+                    GenotypeGVCFs
+                    -O %(outfile)s
                     -R %(genome)s
-                    --variant %(inputfiles)s''' % locals()
+                    -V gendb://%(infile)s
+                    --tmp-dir %(tmpdir)s
+                    %(options)s > %(logfile)s 2>&1;
+                    rm -rf %(tmpdir)s''' % locals()
     P.run(statement)
 
 ##############################################################################
@@ -245,113 +256,90 @@ def strelkaINDELCaller(infile_control, infile_tumor, outfile, genome, config,
 ##############################################################################
 
 
-def variantAnnotator(vcffile, bamlist, outfile, genome,
-                     dbsnp, annotations="", snpeff_file=""):
-    '''Annotate variant file using GATK VariantAnnotator'''
-    job_options = getGATKOptions()
-    job_threads = 3
-
-    if "--useAllAnnotations" in annotations:
-        anno = "--useAllAnnotations"
-    elif annotations:
-        anno = " -A " + " -A ".join(annotations)
-    else:
-        anno = ""
-
-    statement = '''GenomeAnalysisTK -T VariantAnnotator
-                    -R %(genome)s
-                    -I %(bamlist)s
-                    -A SnpEff
-                    --snpEffFile %(snpeff_file)s
-                    -o %(outfile)s
-                    --variant %(vcffile)s
-                    -L %(vcffile)s
-                    --dbsnp %(dbsnp)s
-                    %(anno)s''' % locals()
-    P.run(statement)
-
-##############################################################################
-
-
 def variantRecalibrator(infile, outfile, genome, mode, dbsnp=None,
-                        kgenomes=None, hapmap=None, omni=None, mills=None):
+                        kgenomes=None, hapmap=None, omni=None, mills=None, 
+                        axiom=None, gatkmem = "2G"):
     '''Create variant recalibration file'''
-    job_options = getGATKOptions()
+    job_memory = gatkmem
     job_threads = 3
 
     track = P.snip(outfile, ".recal")
     if mode == 'SNP':
-        statement = '''GenomeAnalysisTK -T VariantRecalibrator
+        statement = '''gatk VariantRecalibrator
         -R %(genome)s
-        -input %(infile)s
-        -resource:hapmap,known=false,training=true,truth=true,prior=15.0 %(hapmap)s
-        -resource:omni,known=false,training=true,truth=true,prior=12.0 %(omni)s
-        -resource:dbsnp,known=true,training=false,truth=false,prior=2.0 %(dbsnp)s
-        -resource:1000G,known=false,training=true,truth=false,prior=10.0 %(kgenomes)s
-        -an QD -an SOR -an MQRankSum
-        -an ReadPosRankSum -an FS -an MQ
-        --maxGaussians 4
+        -V %(infile)s
+        --trust-all-polymorphic
+        -tranche 100.0 -tranche 99.95 -tranche 99.9 -tranche 99.8 -tranche 99.6 -tranche 99.5 -tranche 99.4 -tranche 99.3 -tranche 99.0 -tranche 98.0 -tranche 97.0 -tranche 90.0
+        -an QD -an MQRankSum -an ReadPosRankSum -an FS -an MQ -an SOR -an InbreedingCoeff
         -mode %(mode)s
-        -recalFile %(outfile)s
-        -tranchesFile %(track)s.tranches
-        -rscriptFile %(track)s.plots.R ''' % locals()
+        --max-gaussians 6
+        --resource:hapmap,known=false,training=true,truth=true,prior=15 %(hapmap)s
+        --resource:omni,known=false,training=true,truth=true,prior=12 %(omni)s
+        --resource:1000G,known=false,training=true,truth=false,prior=10 %(kgenomes)s
+        --resource:dbsnp,known=true,training=false,truth=false,prior=7 %(dbsnp)s
+        -O %(track)s.recal
+        --tranches-file %(track)s.tranches
+        --rscript-file %(track)s.plots.R
+        > %(track)s.log 2>&1 ''' % locals()
         P.run(statement)
     elif mode == 'INDEL':
-        statement = '''GenomeAnalysisTK -T VariantRecalibrator
+        statement = '''gatk VariantRecalibrator
         -R %(genome)s
-        -input %(infile)s
-        -resource:mills,known=true,training=true,truth=true,prior=12.0 %(mills)s
-        -an QD -an MQRankSum
-        -an ReadPosRankSum -an FS -an MQ
-        --maxGaussians 4
-        --minNumBadVariants 5000
+        -V %(infile)s
+        --resource:mills,known=true,training=true,truth=true,prior=12 %(mills)s
+        --resource:axiomPoly,known=false,training=true,truth=false,prior=10 %(axiom)s
+        --resource:dbsnp,known=true,training=false,truth=false,prior=2 %(dbsnp)s
+        -an QD -an MQRankSum -an ReadPosRankSum -an FS -an MQ
+        --max-gaussians 4
         -mode %(mode)s
-        -recalFile %(outfile)s
-        -tranchesFile %(track)s.tranches
-        -rscriptFile %(track)s.plots.R ''' % locals()
+        -O %(track)s.recal
+        --tranches-file %(track)s.tranches
+        --rscript-file %(track)s.plots.R 
+        > %(track)s.log 2>&1 ''' % locals()
         P.run(statement)
 
 ##############################################################################
 
 
-def applyVariantRecalibration(vcf, recal, tranches, outfile, genome, mode):
+def applyVQSR(vcf, recal, tranches, outfile, genome, mode, gatkmem):
     '''Perform variant quality score recalibration using GATK '''
-    job_options = getGATKOptions()
+    job_memory = gatkmem
     job_threads = 3
 
-    statement = '''GenomeAnalysisTK -T ApplyRecalibration
+    statement = '''gatk ApplyVQSR
     -R %(genome)s
-    -input:VCF %(vcf)s
-    -recalFile %(recal)s
-    -tranchesFile %(tranches)s
-    --ts_filter_level 99.0
+    -V %(vcf)s
+    --recal-file %(recal)s
+    --tranches-file %(tranches)s
+    --truth-sensitivity-filter-level 99.7
+    --create-output-variant-index true
     -mode %(mode)s
-    -o %(outfile)s ''' % locals()
+    -O %(outfile)s
+    > %(outfile)s.log 2>&1  ''' % locals()
     P.run(statement)
 
 ##############################################################################
 
 
-def vcfToTable(infile, outfile, genome, columns):
+def vcfToTable(infile, outfile, genome, columns, gatkmem):
     '''Converts vcf to tab-delimited file'''
-    job_options = getGATKOptions()
+    job_memory = gatkmem
     job_threads = 3
 
-    statement = '''GenomeAnalysisTK -T VariantsToTable
+    statement = '''gatk VariantsToTable
                    -R %(genome)s
                    -V %(infile)s
-                   --showFiltered
-                   --allowMissingData
+                   --show-filtered
                    %(columns)s
-                   -o %(outfile)s''' % locals()
+                   -O %(outfile)s''' % locals()
     P.run(statement)
 
 ##############################################################################
 
 
 def selectVariants(infile, outfile, genome, select):
-    '''Filter de novo variants based on provided jexl expression'''
-    statement = '''GenomeAnalysisTK -T SelectVariants
+    '''Filter de novo variants based on provided expression'''
+    statement = '''gatk SelectVariants
                     -R %(genome)s
                     --variant %(infile)s
                     -select '%(select)s'
@@ -421,22 +409,6 @@ def buildSelectStatementfromPed(filter_type, pedfile, template):
 
 ##############################################################################
 
-
-def guessSex(infile, outfile):
-    '''Guess the sex of a sample based on ratio of reads
-    per megabase of sequence on X and Y'''
-    statement = '''calc `samtools idxstats %(infile)s
-                    | grep 'X'
-                    | awk '{print $3/($2/1000000)}'`
-                    /`samtools idxstats %(infile)s | grep 'Y'
-                    | awk '{print $3/($2/1000000)}'`
-                    | tr -d " " | tr "=" "\\t" | tr "/" "\\t"
-                    > %(outfile)s'''
-    P.run(statement)
-
-##############################################################################
-
-
 def filterMutect(infile, outfile, logfile,
                  control_id, tumour_id,
                  min_t_alt, min_n_depth,
@@ -909,313 +881,6 @@ def filterQuality(infile, qualstr, qualfilter, outfiles):
     out2.close()
 
 
-@cluster_runnable
-def FilterExacCols(infile, exac_suffs, exac_thresh):
-    '''
-    Returns a set of line indices indicating lines where either of the alleles
-    called have a frequency of greater that exac_thresh in any of the
-    populations specified as exac_suffs.
-    Where no data is available an allele frequency of -1 is used.
-
-    Exac provide data as AC_xxx and AN_xxx where AC is the allele count
-    - the number of times the allele has been called
-    - and AN is chromosome count - the number of
-    samples in which the allele could have been called - in population xxx.
-    AC / AN = allele frequecy.
-
-    exac_suffs are any columns where an AC_xxx and AN_xxx column is provided
-    in the VCF, e.g. Adj will calculate allele frequency from the AC_Adj
-    and AN_Adj columns
-
-    '''
-    # read columns from the input VCF
-    exac_suffs = exac_suffs.split(",")
-    cols = iotools.open_file(infile).readline().strip().split("\t")
-    nD = dict()
-    afdict = dict()
-    for e in exac_suffs:
-        # find the columns with the appropriate information
-        # Allele count
-        AC_i = cols.index("AC_%s" % (e))
-        # Allele Number
-        AN_i = cols.index("AN_%s" % (e))
-        # Genotype
-        GT_i = cols.index('GT')
-        nlist = set()
-        n = 0
-        AFS = []
-        with iotools.open_file(infile) as input:
-            for line in input:
-                if n > 1:
-                    line = line.strip().split("\t")
-                    # At multi-allelic sites, comma delimited AC and AN values
-                    # are provided
-                    # "." and "NA" indicate no data here - this is represented
-                    # as an AF of -1
-                    AC = line[AC_i].replace(".", "-1").replace(
-                        "NA", "-1").split(",")
-                    AN = line[AN_i].replace(".", "1").replace(
-                        "NA", "1").split(",")
-                    AC = np.array([float(a) for a in AC])
-                    AN = np.array([float(a) for a in AN])
-                    AF = AC / AN
-                    AF2 = [af if af > 0 else 0 for af in AF]
-                    AF = np.insert(AF, 0, (1 - sum(AF2)))
-
-                    # Chromosome count is usually the same for all minor
-                    # alleles (but not always)
-                    # If it is not the same the AC and AN lists should be the
-                    # same length
-                    # Otherwise AN will have length 1
-                    if len(AC) != len(AN):
-                        AN = [AN] * len(AC)
-
-                    # Record the genotype called in this sample for this SNP
-                    GT = line[GT_i]
-                    GT = GT.replace(".", '0')
-                    GT = GT.split("/")
-                    GT[0], GT[1] = int(GT[0]), int(GT[1])
-
-                    # If the variant is not in ExAC the ExAC columns show "."
-                    # but the site
-                    # may still have been called as multi allelic
-                    # - use -1 for all frequencies
-                    # in this case
-                    if max(GT) > (len(AF) - 1):
-                        AF = np.array([-1] * (max(GT) + 1))
-
-                    AF1 = AF[GT[0]]
-                    AF2 = AF[GT[1]]
-                    AFS.append((AF1, AF2))
-                    # Remember where both allele frequencies are
-                    # greater than exac_thresh
-                    if AF1 >= exac_thresh and AF2 >= exac_thresh:
-                        nlist.add(n)
-                else:
-                    AFS.append(('NA', 'NA'))
-                n += 1
-        afdict[e] = AFS
-        nD[e] = nlist
-
-    ns = set.union(*list(nD.values()))
-    return afdict, ns
-
-
-@cluster_runnable
-def FilterFreqCols(infile, thresh, fcols):
-    '''
-    Returns a set of line indices indicating lines where either of the alleles
-    called have a frequency of less than thresh in all of the columns specified
-    in fcols.
-    No information - assigned allele frequency of -1.
-    '''
-    fcols = fcols.split(",")
-    # read the column headings from the variant table
-    cols = iotools.open_file(infile).readline().strip().split("\t")
-    # store allele frequency columns
-    AFdict = dict()
-    # store low frequency indices
-    nD = dict()
-    for col in fcols:
-        ind = cols.index(col)
-        GT_i = cols.index('GT')
-        n = 0
-        nlist = set()
-        AFS = []
-        with iotools.open_file(infile) as input:
-            for line in input:
-                if n > 1:
-                    line = line.strip().split("\t")
-                    GT = line[GT_i].replace(".", "0").split("/")
-                    af = line[ind].split(",")
-                    AF = []
-                    # where the allele frequency is not numeric
-                    # "." or "NA" use -1 to indicate no data
-                    for a in af:
-                        try:
-                            AF.append(float(a))
-                        except:
-                            AF.append(float(-1))
-                    AF2 = [l if l > 0 else 0 for l in AF]
-                    AF = np.array(AF)
-                    AF = np.insert(AF, 0, 1 - sum(AF2))
-                    GT[0] = int(GT[0])
-                    GT[1] = int(GT[1])
-                    # If the variant is not in database the column shows "."
-                    # but the site
-                    # may still have been called as multi allelic
-                    # - use -1 for all frequencies
-                    # in this case
-                    if max(GT[0], GT[1]) > (len(AF) - 1):
-                        AF = [float(-1)] * (max(GT[0], GT[1]) + 1)
-                    AF1 = AF[GT[0]]
-                    AF2 = AF[GT[1]]
-                    if AF1 >= thresh and AF2 >= thresh:
-                        nlist.add(n)
-                    AFS.append((AF1, AF2))
-                else:
-                    AFS.append(('NA', 'NA'))
-                n += 1
-        AFdict[col] = AFS
-        nD[col] = nlist
-
-    ns = set.union(*list(nD.values()))
-    return AFdict, ns
-
-
-def WriteFreqFiltered(infile, exacdict, exacinds, otherdict, otherinds,
-                      outfiles):
-    '''
-    Writes the output of the frequency filtering steps to file, including
-    the new columns showing calculated allele frequency for the allele
-    in this specific sample.
-    '''
-    x = 0
-    out = iotools.open_file(outfiles[0], "w")
-    out2 = iotools.open_file(outfiles[1], "w")
-
-    exaccols = list(exacdict.keys())
-    othercols = list(otherdict.keys())
-
-    # column names for the new columns
-    exacnewcols = ["%s_calc" % c for c in exaccols]
-    othernewcols = ["%s_calc" % c for c in othercols]
-
-    with iotools.open_file(infile) as infile:
-        for line in infile:
-            line = line.strip()
-            if x <= 1:
-                # write the column names
-                out.write("%s\t%s\t%s\n" % (line, "\t".join(exacnewcols),
-                                            "\t".join(othernewcols)))
-            else:
-                freqs = []
-                for key in exaccols:
-                    col = exacdict[key]
-                    freqs.append(col[x])
-                for key in othercols:
-                    col = otherdict[key]
-                    freqs.append(col[x])
-                freqs = [str(freq) for freq in freqs]
-
-                if x not in exacinds and x not in otherinds:
-                    out.write("%s\t%s\n" % (line, "\t".join(freqs)))
-                else:
-                    out2.write("%s\t%s\n" % (line, "\t".join(freqs)))
-            x += 1
-    out.close()
-    out2.close()
-
-
-@cluster_runnable
-def filterRarity(infile, exac, freqs, thresh, outfiles):
-    '''
-    Filter out variants which are common in any of the exac or other
-    population datasets as specified in the pipeline.yml.
-    '''
-    exacdict, exacinds = FilterExacCols(infile, exac, thresh)
-    otherdict, otherinds = FilterFreqCols(infile, thresh, freqs)
-    WriteFreqFiltered(infile, exacdict, exacinds, otherdict,
-                      otherinds, outfiles)
-
-
-@cluster_runnable
-def filterDamage(infile, damagestr, outfiles):
-    '''
-    Filter variants which have not been assessed as damaging by any
-    of the specified tools.
-    Tools and thresholds can be specified in the pipeline.yml.
-
-    Does not account for multiple alt alleles - if any ALT allele has
-    been assessed as damaging with any tool the variant is kept,
-    regardless of if this is the allele called in the sample.
-
-    '''
-    damaging = damagestr.split(",")
-    cols = iotools.open_file(infile).readline().strip().split("\t")
-
-    D = dict()
-    # parses the "damage string" from the pipeline.yml
-    # this should be formatted as COLUMN|result1-result2-...,COLUMN|result1...
-    # where variants with any of these results in this column will
-    # be retained
-    for d in damaging:
-        d = d.split("|")
-        col = d[0]
-        res = d[1].split("-")
-        i = cols.index(col)
-        D[col] = ((res, i))
-
-    x = 0
-    out = iotools.open_file(outfiles[0], "w")
-    out2 = iotools.open_file(outfiles[1], "w")
-    with iotools.open_file(infile) as input:
-        for line in input:
-            if x > 1:
-                # grep for specific strings within this column of this
-                # line of the input file
-                line = line.strip().split("\t")
-                isdamaging = 0
-                for key in D:
-                    res, i = D[key]
-                    current = line[i]
-                    for r in res:
-                        if re.search(r, current):
-                            isdamaging = 1
-                if isdamaging == 1:
-                    out.write("%s\n" % "\t".join(line))
-                else:
-                    out2.write("%s\n" % "\t".join(line))
-            else:
-                out.write(line)
-            x += 1
-    out.close()
-    out2.close()
-
-
-@cluster_runnable
-def filterFamily(infile, infile2, outfiles):
-    '''
-    Filter variants according to the output of calculateFamily -
-    only variants shared by both members of a family will be kept.
-    '''
-    cps1 = set()
-    cps2 = set()
-
-    # make a list of variants in infile1
-    with iotools.open_file(infile) as input:
-        for line in input:
-            line = line.strip().split("\t")
-            chrom = line[0]
-            pos = line[1]
-            cp = "%s_%s" % (chrom, pos)
-            cps1.add(cp)
-
-    # make a list of variants in infile2
-    with iotools.open_file(infile2) as input:
-        for line in input:
-            line = line.strip().split("\t")
-            chrom = line[0]
-            pos = line[1]
-            cp = "%s_%s" % (chrom, pos)
-            cps2.add(cp)
-
-    # only variants in both are of interest
-    cps = cps1 & cps2
-
-    out = iotools.open_file(outfiles[0], "w")
-    out2 = iotools.open_file(outfiles[1], "w")
-    with iotools.open_file(infile) as input:
-        for line in input:
-            line = line.strip().split("\t")
-            if "%s_%s" % (line[0], line[1]) in cps:
-                out.write("%s\n" % "\t".join(line))
-            else:
-                out2.write("%s\n" % "\t".join(line))
-    out.close()
-    out2.close()
-
-
 @cluster_runnable
 def CleanVariantTables(genes, variants, cols, outfile):
     variants = pd.read_csv(variants, sep="\t")
@@ -1276,3 +941,5 @@ def CleanVariantTables(genes, variants, cols, outfile):
     df.columns = ['gene', 'CHROM', 'POS']
     variants = vp1.merge(df, 'left')
     variants.to_csv(outfile, sep="\t")
+
+    
diff --git a/cgatpipelines/tasks/exomeancestry.py b/cgatpipelines/tasks/exomeancestry.py
deleted file mode 100644
index 7618bab10..000000000
--- a/cgatpipelines/tasks/exomeancestry.py
+++ /dev/null
@@ -1,487 +0,0 @@
-import pandas as pd
-import numpy as np
-import cgatcore.iotools as iotools
-import random
-import itertools
-from cgatcore.pipeline import cluster_runnable
-import json
-import decimal
-from sklearn.cluster import KMeans
-import matplotlib.pyplot as plt
-import matplotlib.cm as cm
-import matplotlib.patches as mpatches
-
-
-@cluster_runnable
-def MakeSNPFreqDict(infiles, outfiles, rs):
-    '''
-    Generates a random set of SNPs to use to characterise the ancestry
-    and relatedness of the samples.
-
-    1. Finds all SNPs which have a known genotype frequency for all 11
-       hapmap ancestries
-    2. Picks a random 50000 of these SNPs
-    3. Stores a list of these SNPs as randomsnps.tsv
-    4. Builds a dictionary of dictionaries of dictionaries where:
-           At Level 1 each key is a HapMap ancestry ID
-               each of these keys leads to another dictionary - level 2.
-           At Level 2 each key is a dbSNP SNP ID
-               each of these keys leads to another dictionary - level 3.
-           At Level 3 each key is a genotype:
-               the values of these keys are the genotype frequency of this
-               genotype at this SNP in this ancestry.
-       e.g. snpdict['ASW']['rs000000001']['CT'] -->
-            frequency of the CT genotype at the rs0000001 SNP in the
-            ASW population
-    5.  Stores this dictionary in json format as randomsnps.json
-    '''
-
-    # list all chromosomes
-    chroms = np.array([f.split("/")[-1].split("_")[2] for f in infiles])
-    uchroms = set(chroms)
-    infiles = np.array(infiles)
-    snpdict = dict()
-    for chrom in uchroms:
-        # get all the files for this chromosome
-        c = np.where(chroms == chrom)[0]
-        thischrom = infiles[c]
-        snpidsets = []
-        # make a set of all the snp ids of known frequency for
-        # this chromsome for each ancestry
-        for f in thischrom:
-            snpids = []
-            anc = f.split("/")[-1].split("_")[3]
-            with iotools.open_file(f) as inp:
-                for line in inp:
-                    line = line.strip().split(" ")
-                    snpids.append(line[0])
-            snpids = set(snpids)
-            snpidsets.append(snpids)
-        # find the snp ids where frequency is known in all ancestries
-        thischromsnps = set.intersection(*snpidsets)
-        snpdict[chrom] = thischromsnps
-
-    vals = list(snpdict.values())
-    # set of all the SNPs genotyped in all ancestries on all chroms
-    pooled = set.union(*vals)
-    # take a random sample of snps from this set
-    random.seed(rs)
-    sam = set(random.sample(pooled, 50000))
-
-    # make a new dictionary of just the sampled snps for each chrom
-    snpdictsam = dict()
-    for key in snpdict:
-        i = snpdict[key] & sam
-        snpdictsam[key] = i
-
-    # generate all possible genotypes as list of strings
-    # (e.g. CC CT CG CA TT etc)
-    res = ["%s%s" % (g[0], g[1])
-           for g in itertools.permutations("CGAT", 2)]
-    res += ['CC', 'GG', 'TT', 'AA']
-    freqdict = dict()
-    ancs = set([l.split("/")[-1].split("_")[3] for l in infiles])
-
-    # build an empty dictionary of dictionaries of dictionaries
-    # to store genotype freqs for each ancestry
-    for anc in ancs:
-        freqdict.setdefault(anc, dict())
-        for s in sam:
-            freqdict[anc].setdefault(s, dict())
-            for r in res:
-                freqdict[anc][s][r] = 0.0
-
-    # read genotype freqs from the input file and store in the dictionary
-    for f in infiles:
-        bits = f.split("/")[-1].split("_")
-        anc = bits[3]
-        chrom = bits[2]
-        snpset = snpdictsam[chrom]
-        with iotools.open_file(f) as input:
-            for line in input:
-                line = line.strip().split(" ")
-                if line[0] in snpset:
-                    G1 = "%s%s" % (line[10][0], line[10][-1])
-                    try:
-                        F1 = float(line[11])
-                        G2 = "%s%s" % (line[13][0], line[13][-1])
-                        F2 = float(line[14])
-                        G3 = "%s%s" % (line[16][0], line[16][-1])
-                        F3 = float(line[17])
-                    except:
-                        # because occasionally the GFs are not numeric
-                        continue
-                    freqdict[anc][line[0]][G1] = F1
-                    freqdict[anc][line[0]][G2] = F2
-                    freqdict[anc][line[0]][G3] = F3
-
-    # dump dictionary in a json file to resurrect later
-    outf = iotools.open_file(outfiles[0], "w")
-    json.dump(freqdict, outf)
-    outf.close()
-
-    # store the sampled list of snps
-    out = iotools.open_file(outfiles[1], "w")
-    for snp in sam:
-        out.write("%s\n" % snp)
-    out.close()
-
-
-@cluster_runnable
-def GenotypeSNPs(infile, snplist, outfile):
-    '''
-    Fetches the genotype from the variant tables for all samples
-    for SNPs in the hapmap sample from makeRandomSNPSet.
-
-    Complex sites are ignored (as simple SNPs are sufficient for these
-    calculations).
-    These are:
-        Sites which failed QC (column 3 in the variant table is not PASS)
-        Sites with more than 2 alleles defined (column 6 in the variant table
-        contains more than one alternative allele)
-        SNPs with more than one ID
-        Indels
-    '''
-    out = iotools.open_file(outfile, "w")
-    with iotools.open_file(infile) as inf:
-        for line in inf:
-            line = line.strip().split()
-            # if the variant passed QC
-            if line[4] == "PASS":
-                genotype = line[7]
-                # if the genotype looks normal e.g. 1/1
-                if len(genotype) == 3:
-                    # get the actual genotype (rather than the index)
-                    if genotype[0] != ".":
-                        ind1 = int(genotype[0])
-                    else:
-                        ind1 = 0
-                    if genotype[2] != ".":
-                        ind2 = int(genotype[2])
-                    else:
-                        ind2 = 0
-                    A1 = line[5]
-                    A2 = line[6].split(",")
-                    AS = [A1] + A2
-
-                    if len(AS) <= 2:
-                        GT = "%s%s" % (AS[ind1], AS[ind2])
-                        refGT = "%s%s" % (A1, A1)
-                        if len(GT) == 2:
-                            if line[3][0:2] == "rs" and len(
-                                    line[3].split(";")) == 1:
-                                snpid = line[3]
-                                chrom = line[0]
-                                pos = line[1]
-                                if snpid in snplist:
-                                    out.write("%s\t%s\t%s\t%s\t%s\n"
-                                              % (snpid, chrom, pos, GT,
-                                                 refGT))
-    out.close()
-
-
-@cluster_runnable
-def CalculateAncestry(infile, calledsnps, snpdict, outfiles):
-    '''
-    Takes the data stored in MakeRandomSNPSet and the genotype of each sample
-    at each site in calledsnps.tsv and tabulates the frequency of this
-    genotype in each of the HapMap ancestry categories.
-    The overall probability of each ancestry is then calculated as the
-    product of these frequencies. These can only be used in comparison to
-    each other - to show which of the 11 ancestries is most probable.
-    '''
-
-    # List the SNPs in the sample where a variant has been called
-    # Record the reference genotype at each of these SNPs
-    called = set()
-    refs = dict()
-    for line in iotools.open_file(calledsnps).readlines():
-        line = line.strip().split("\t")
-        refs[line[0]] = line[1]
-        called.add(line[0])
-
-    # Record the genotype of the current sample at each SNP
-    currents = dict()
-    for line in iotools.open_file(infile).readlines():
-        line = line.strip().split("\t")
-        currents[line[0]] = line[3]
-
-    # Regenerate the genotype frequency dictionary made earlier
-    s = iotools.open_file(snpdict)
-    snpdict = json.load(s)
-    s.close()
-    ancs = list(snpdict.keys())
-
-    # Build a table of the frequency of the genotype in this sample
-    # in each of the hapmap ancestries
-    out = iotools.open_file(outfiles[0], "w")
-    out.write("snp\tgenotype\t%s\n" % "\t".join(ancs))
-    for snp in called:
-        # where a variant hasn't been called in the current sample assume
-        # the reference genotype
-        if snp in currents:
-            geno = currents[snp]
-        else:
-            geno = refs[snp]
-
-        res = []
-        for anc in ancs:
-            D = snpdict[anc]
-            sub = D[snp]
-            freq = sub[geno]
-            res.append(freq)
-
-        res = [str(r) for r in res]
-        out.write("%s\t%s\t%s\n" % (snp, geno, "\t".join(res)))
-
-    out.close()
-    out = iotools.open_file(outfiles[1], "w")
-
-    # read the output of the previous step into pandas
-    res = pd.read_csv(outfiles[0], sep="\t")
-
-    # calculate the probability of each ancestry as the product of all the
-    # genotype frequencies
-    # SNPs where a genotype not recorded in hapmap has been called are
-    # skipped
-    # decimal package is used because the results are very small floats
-    res = res.replace(0, float('nan'))
-    for anc in set(ancs):
-        r = res[anc][np.invert(res[anc].isnull())]
-        r = [decimal.Decimal(d) for d in r]
-        out.write("%s\t%s\n" % (anc, np.product(r)))
-    out.close()
-
-
-@cluster_runnable
-def MakePEDFile(infiles, outfiles):
-    '''
-    Generates the required input for the Plink and King software packages.
-       - PED file - columns are SNPs and rows are samples, each cell is the
-         genotype of the sample at this SNP
-       - MAP file - rows are SNPs in the same order as the columns in the ped
-         file, each row shows chromosome, snp id and position.
-    '''
-    # Record the chromosome and position of each SNP
-    chromposdict = dict()
-    for line in iotools.open_file(infiles[-1]).readlines():
-        line = line.strip().split("\t")
-        chromposdict[line[0]] = ((line[2], line[3]))
-
-    # Generate the PED file - takes the genotype column of the table from
-    # the CalculateAncestry step and transposes it across a row
-
-    out = iotools.open_file(outfiles[0], "w")
-    for inf in infiles[0:-1]:
-        inf = inf[0]
-        df = pd.read_csv(inf, sep="\t")
-        geno = df['genotype']
-        genolist = []
-        for g in geno:
-            genolist.append(" ".join(list(g)))
-        out.write("%s\t%s\n" % (inf.split("/")[-1], "\t".join(genolist)))
-    out.close()
-
-    # Generate the MAP file - chromosome and position of each SNP in the same
-    # order as the PED file
-    mapf = iotools.open_file(outfiles[1], "w")
-    slist = df['snp']
-    for snp in slist:
-        mapf.write("%s\t%s\t%s\n" % (chromposdict[snp][0], snp,
-                                     chromposdict[snp][1]))
-    mapf.close()
-
-
-def CalculateFamily(infile, outfile):
-    '''
-    Translates and filters the output from King.
-    Pairs of related samples are written to the output file along with the
-    degree of relatedness.  Degrees are decided using thresholds from
-    the King documentation, here
-    http://people.virginia.edu/~wc9c/KING/manual.html
-    '''
-    out = iotools.open_file(outfile, "w")
-    input = open(infile).readlines()[1:]
-    for line in input:
-        line = line.strip().split("\t")
-        score = float(line[14])
-        # if there is a significant relatedness score
-        if score >= 0.0442:
-            ID1 = line[0]
-            ID2 = line[2]
-            # 2nd degree relatives
-            if score <= 0.177:
-                degree = 2
-            # 1st degree relatives
-            elif score <= 0.354:
-                degree = 1
-            # 0th degree relatives (ID twins or duplicate)
-            else:
-                degree = 0
-            out.write("%s\t%s\t%s\n" % (ID1, ID2, degree))
-    out.close()
-
-
-@cluster_runnable
-def CalculateSex(infiles, outfile):
-    '''
-    Approximates the sex of the sample using the data in the variant table.
-    Basic estimate based on heterozygosity on the X chromosome - genotypes
-    are taken for all called variants on X passing QC and the percentage
-    of heterozygotes is taken.
-    This tends to produce two clear populations so Kmeans clustering
-    is used to split the data into two - male and female.  Samples which are
-    unclear are marked in the output.
-    '''
-    percs = []
-    for f in infiles:
-        homs = 0
-        tot = 0
-        # Count the homozygous and total variants called on the
-        # X chromosome in each input file (after QC)
-        with iotools.open_file(f) as inf:
-            for line in inf:
-                line = line.strip().split("\t")
-                if line[0] == "chrX" and line[4] == "PASS":
-                    geno = line[7]
-                    s = set(list(geno))
-                    if len(s) == 2:
-                        homs += 1
-                    tot += 1
-        perc = float(homs) / float(tot)
-        percs.append(perc)
-
-    # impose a grid on the data so clustering is possible
-    ests = np.array(percs)
-    ones = np.array([1] * len(ests))
-    grid = np.column_stack((ones, ests))
-
-    # divide data into two clusters
-    means = KMeans(n_clusters=2)
-    fit1 = means.fit(grid)
-    res = fit1.predict(grid)
-
-    # find the position of the cluster centres on the y axis
-    ccs = fit1.cluster_centers_
-    cc1 = ccs[0, 1]
-    cc2 = ccs[1, 1]
-    ccs = [cc1, cc2]
-
-    # pick out the scores from each cluster and calculate the standard dev
-    s1 = ests[res == 0]
-    s2 = ests[res == 1]
-    sds = [np.std(s1), np.std(s2)]
-    # males will be more homozygous - the bigger value represents males
-    if cc1 > cc2:
-        sexes = ["male", "female"]
-    else:
-        sexes = ["female", "male"]
-
-    x = 0
-    out = iotools.open_file(outfile, "w")
-    for f in infiles:
-        # for each input file take the group, the score, the standard deviation
-        # of this group, the distance from the cluster centre and the sex
-        group = res[x]
-        est = ests[x]
-        std = sds[group]
-        dist = est - ccs[group]
-        sex = sexes[group]
-
-        # is the value with 3 SDs of the mean for this sex
-        if sex == "male":
-            if ((3 * std) - abs(dist) >= 0 or dist >= 0):
-                sig = "*"
-            else:
-                sig = "-"
-        else:
-            if ((3 * std) - abs(dist) >= 0 or dist <= 0):
-                sig = "*"
-            else:
-                sig = "-"
-        out.write("%s\t%f\t%f\t%s\t%s\n" % (f, est, dist, sexes[group], sig))
-        x += 1
-    out.close()
-
-    # plot the output - blue dots male, red dots female
-    # lines represent 3 standard deviations from the cluster centre
-    p = plt.figure()
-    a = p.add_subplot('111')
-
-    a.axis([0, len(infiles),
-            (min([min(s1), min(s2)]) - 0.1), (max([max(s1), max(s2)]) + 0.1)])
-
-    dist1 = np.linspace(0, len(infiles), len(s1))
-    dist2 = np.linspace(0, len(infiles), len(s2))
-
-    a.plot(dist1, s1, 'b.')
-    a.plot(dist2, s2, 'r.')
-
-    top = ccs[0] + (3 * sds[0])
-    bottom = ccs[1] - (3 * sds[1])
-    a.plot([0, len(infiles)], [top, top], 'b-')
-    a.plot([0, len(infiles)], [bottom, bottom], 'r-')
-    a.tick_params(axis='x', labelbottom='off', bottom='off')
-    a.yaxis.set_label_text('percentage homozygous X variants')
-    fig = outfile.replace(".tsv", ".png")
-    p.savefig(fig)
-
-
-def PlotAncestry(infile):
-    '''
-    Draws a plot showing the score (probabilty) for each individual
-    for their assigned ancestry and the second closest match.
-    x = individual
-    y = score
-    large diamonds represent the best match and small triangles the second
-    best match
-    '''
-    ancestry = pd.read_csv(infile, sep="\t", header=None, dtype="str")
-
-    # score using the index of the probability e.g. 5e-1000 as -1000
-    ancestry[2] = [int(a[1]) * -1 for a in ancestry[2].str.split("-")]
-    ancestry[4] = [int(a[1]) * -1 for a in ancestry[4].str.split("-")]
-
-    # sort by assigned ancestry
-    ancestry = ancestry.sort_values([1])
-    ancs = list(set(ancestry[1].append(ancestry[3])))
-
-    # assign colours to points by ancestry
-    ancestry[5] = ancestry[1].apply(lambda x: ancs.index(x), 1)
-    ancestry[6] = ancestry[3].apply(lambda x: ancs.index(x), 1)
-    floats = np.linspace(0, 1, len(set(ancs)))
-    colours = cm.jet(floats)
-
-    f = plt.figure(figsize=(30, 15))
-    a = f.add_subplot('111')
-    ymin = min(ancestry[2]) - 100
-    ymax = max(ancestry[4]) + 100
-
-    a.axis([-5, len(ancestry) + 5, ymin, ymax])
-
-    # plot best scoring ancestry
-    a.scatter(list(range(len(ancestry))),
-              ancestry[2], c=colours[ancestry[5].values],
-              s=500, marker='D', edgecolors='None')
-
-    # plot second best scoring ancestry
-    a.scatter(list(range(len(ancestry))),
-              ancestry[4], c=colours[ancestry[6].values],
-              s=150, marker="^", edgecolors='None')
-    sancs = set(ancestry[1])
-    i = 20
-
-    # draw lines between individuals assigned to different ancestries
-    for anc in sancs:
-        p = max(np.where(ancestry[1] == anc)[0]) + 0.5
-        p2 = min(np.where(ancestry[1] == anc)[0]) + 0.5
-        a.plot([p, p],
-               [ymin, ymax], 'k-')
-        a.text(p2, ymax - i, anc, ha='center')
-        i += 10
-
-    # legend
-    ps = [mpatches.Patch(color=col) for col in colours]
-    a.legend(handles=ps, labels=ancs, loc=3)
-
-    f.savefig(infile.replace(".tsv", ".png"))
diff --git a/cgatpipelines/tasks/exploratory.R b/cgatpipelines/tasks/exploratory.R
new file mode 100755
index 000000000..d22da0cbd
--- /dev/null
+++ b/cgatpipelines/tasks/exploratory.R
@@ -0,0 +1,302 @@
+#' Differential expression analysis Script
+#'
+#' 
+#' Example usage:
+#' 
+#' cgatflow R exploratory --model=~ group --contrast=group --factor=mouse_id,collection_date,slice_depth,slice_number,pipette_visual,timepoint
+#'
+#' Primary Input: `filename.rds` - output from the readandfiltercounts.R file or DESeqExpermient object or DEGList object
+#' Additional inputs: model and contrast as well as factors of interest
+#'
+#' Output: png images of: PCA, clustering, Surrogate Variable Analysis, Heatmaps
+#'
+
+suppressMessages(library(getopt))
+suppressMessages(library(tidyverse))
+suppressMessages(library(data.table))
+suppressMessages(library(DESeq2))
+suppressMessages(library(edgeR))
+suppressMessages(library(rhdf5))
+suppressMessages(library(sva))
+suppressMessages(library(ggplot2))
+suppressMessages(library(biomaRt))
+suppressMessages(library(Cairo))
+suppressMessages(library(pheatmap))
+suppressMessages(library(RColorBrewer))
+suppressMessages(library(ggforce))
+
+source(file.path(Sys.getenv("R_ROOT"), "experiment.R"))
+
+
+# getmart
+# Function: wrapper for getBM for ENSEMBL gene list
+# Dependencies: biomaRt package
+# Input: vector of ensembl gene ids
+# Output: data frame with description, symbol and entrez gene
+mart = useMart(biomart = "ENSEMBL_MART_ENSEMBL",dataset="hsapiens_gene_ensembl", host = "jul2018.archive.ensembl.org")
+getmart <- function(values){
+  data<- getBM(
+    filters= "ensembl_gene_id", 
+    attributes= c("ensembl_gene_id", "external_gene_name", "description","entrezgene"),
+    values= values,
+    mart= mart,
+    useCache = FALSE)
+  data$description <- gsub("\t", "", data$description)
+  return(data)
+}
+
+
+start_plot <- function(section, height = 6, width = 6, type = "png", outdir="") {
+    file = get_output_filename(paste0(outdir,"/",section, ".", type))
+  Cairo(file = file,
+        type = type,
+        width = width,
+        height = height,
+        units="in",
+        dpi = 300,
+        bg = "white")
+  #opar <- par(lwd=0.5)
+}
+
+end_plot <- function() {
+  dev.off()
+}
+
+
+run <- function(opt) {
+
+  ### READING DATA ###
+  futile.logger::flog.info(paste("reading Experiment object from", normalizePath(opt$rds_filename)))
+  experiment <- readRDS(opt$rds_filename)
+  if (class(experiment) == "DESeqDataSet"){
+    dds <- experiment
+  } else if (class(experiment) == "DGEList"){
+    dds = DESeqDataSetFromMatrix(experiment$counts, experiment$sample, design = formula(opt$model))
+  }
+  futile.logger::flog.info(paste("reading  Experiment object", paste(dim(counts(experiment)), collapse = ",")))
+
+  ### TRANSFORMATION OF DATA ###
+  futile.logger::flog.info(paste("Transforming data"))
+  rld<- rlog(dds)
+  vsd<- vst(dds)
+  df <- bind_rows(
+    as_tibble(log2(counts(dds, normalized=TRUE)[, 1:2]+1)) %>%
+      mutate(transformation = "log2(x + 1)"),
+    as_tibble(assay(vsd)[, 1:2]) %>% mutate(transformation = "vst"),
+    as_tibble(assay(rld)[, 1:2]) %>% mutate(transformation = "rlog"))
+  colnames(df)[1:2] <- c("x", "y")
+  start_plot('Variance_Transformations', outdir=opt$outdir)
+    print(ggplot(df, aes(x = x, y = y)) + geom_hex(bins = 80) +
+      coord_fixed() + facet_grid( . ~ transformation))
+  end_plot()
+  
+  ### PRINCIPAL COMPONENT ANALYSIS ###
+  futile.logger::flog.info(paste("Performing Principal Component Analysis"))
+  pca = prcomp(t(assay(vsd)))
+  for(factor in opt$factors){
+    variable.group <- colData(dds)[, factor]
+    names(variable.group) <- factor
+    percentVar <- round(100 * summary(pca)$importance[2,])
+    futile.logger::flog.info(paste("number of PCA dimensions", dim(pca$x)))
+    if(dim(pca$x)[1]>7){
+      dim_pca = 8
+    } else {
+      dim_pca = dim(pca$x)
+    }
+    scores <- data.frame(variable.group, pca$x[,1:dim_pca])
+    start_plot(paste0('PCA_', factor), outdir=opt$outdir)
+      print(ggplot(scores,aes(x=PC1, y=PC2)) +
+        geom_point(colour=factor(variable.group)) +
+        theme(legend.position="right") +  
+        labs(colour=factor, x=paste0("PC1 (", percentVar[1],"% of variance)"),
+             y=paste0("PC2 (", percentVar[2],"% of variance)")) + 
+        ggtitle("Principal Component Analysis") + theme_grey(base_size = 15) +
+        theme(plot.title = element_text(lineheight=1, face="bold"))  + geom_point(size=2) +
+        theme(text=element_text(family='serif')))
+    end_plot()
+    start_plot(paste0('PCA_13_', factor), outdir=opt$outdir)
+          print(ggplot(scores,aes(x=PC1, y=PC3)) +
+        geom_point(colour=factor(variable.group)) +
+        theme(legend.position="right") +  
+        labs(colour=factor, x=paste0("PC1 (", percentVar[1],"% of variance)"),
+             y=paste0("PC3 (", percentVar[3],"% of variance)")) + 
+        ggtitle("Principal Component Analysis") + theme_grey(base_size = 15) +
+        theme(plot.title = element_text(lineheight=1, face="bold"))  + geom_point(size=2) +
+        theme(text=element_text(family='serif')))
+    end_plot()
+  }
+  variable.group <- colData(dds)[, opt$contrast]
+  names(variable.group) <- opt$contrast
+  scores <- data.frame(variable.group, pca$x[,1:dim_pca])
+  start_plot(paste0('PCA_grid'), outdir=opt$outdir)
+  if(dim_pca==8){
+    print(ggplot(scores, aes(x = .panel_x, y = .panel_y, fill = variable.group, colour = variable.group)) + 
+      geom_point(shape = 16, size = 0.5, position = 'auto') + 
+      geom_autodensity(alpha = 0.3, colour = NA, position = 'identity') + 
+      facet_matrix(vars(PC1:PC8), layer.diag = 2))
+  }else{
+    print(ggplot(scores, aes(x = .panel_x, y = .panel_y, fill = variable.group, colour = variable.group)) + 
+      geom_point(shape = 16, size = 0.5, position = 'auto') + 
+      geom_autodensity(alpha = 0.3, colour = NA, position = 'identity') + 
+      facet_matrix(vars(everything()), layer.diag = 2))
+  }
+  end_plot()
+  
+  loadings <- pca$rotation[,1:dim_pca]
+  loadings <- data.frame(loadings[order(loadings[,1]), ])
+  data <- getmart(rownames(loadings))
+  loadings$symbol<-data$mgi_symbol[match(rownames(loadings), data$ensembl_gene_id)]
+  loadings$description <- data$description[match(rownames(loadings), data$ensembl_gene_id)]
+  write.table(loadings, file=paste0(opt$outdir,"/",'PCA_loadings.tsv'), quote=FALSE, sep='\t', row.names = TRUE, col.names = NA) 
+  
+  ### HEATMAPS ###
+  futile.logger::flog.info(paste("Performing Heatmap"))
+  df <- as.data.frame(colData(dds)[,opt$factors])
+  rownames(df) <- colData(dds)$track
+  # Heatmap of Top 20 Expressed Genes
+  select <- order(rowMeans(counts(dds,normalized=TRUE)),decreasing=TRUE)[1:20]
+  start_plot('Heatmap_topExpressed', outdir=opt$outdir)
+    pheatmap(assay(vsd)[select,], cluster_rows=FALSE, cluster_cols=FALSE, show_rownames=FALSE, annotation_col=df)
+  end_plot()
+  # Heatmap of Top 20 Variable Genes
+  topVarGenes <- head(order(rowVars(assay(vsd)),decreasing=TRUE),20)
+  mat <- assay(vsd)[ topVarGenes, ]
+  temp <- getmart(rownames(mat))
+  row.names(temp) <- temp$ensembl_gene_id
+  rownames(mat) <- temp[rownames(mat),"external_gene_name"]
+  start_plot('Heatmap_topVariable', outdir=opt$outdir)
+    pheatmap(mat, annotation_col=df, cluster_rows=FALSE,fontsize_row = 6)
+  end_plot()
+  # Heatmap of Genes of interest
+  if (!is.null(opt$genes_of_interest)) {
+    print(rownames(assay(vsd)))
+    goi <- opt$genes_of_interest
+    goi <- goi[goi %in% rownames(assay(vsd))]
+    mat <- assay(vsd)[goi, ]
+    mat <- mat - rowMeans(mat)
+    temp <- getmart(rownames(mat))
+    row.names(temp) <- temp$ensembl_gene_id
+    rownames(mat) <- temp[rownames(mat),"external_gene_name"]
+    start_plot('Heatmap_ofInterest', outdir=opt$outdir)
+      pheatmap(mat, annotation_col=df, scale = "row", cluster_cols = FALSE)
+    end_plot()
+  }
+
+  ### CLUSTERING ###
+  futile.logger::flog.info(paste("Clustering"))
+  # for all genes
+  sampleDists <- dist(t(assay(vsd)))
+  sampleDistMatrix <- as.matrix( sampleDists )
+  rownames(sampleDistMatrix) <- paste(colData(vsd)[,opt$contrast], vsd$track, sep = " - " )
+  colnames(sampleDistMatrix) <- NULL
+  colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
+  start_plot('Heatmap_all', outdir=opt$outdir)
+    pheatmap(sampleDistMatrix,
+    clustering_distance_rows = sampleDists,
+    clustering_distance_cols = sampleDists,
+    col = colors)
+  end_plot()
+  # for top 500 genes
+  top500 <- head(assay(vsd)[ order(rowMeans(assay(vsd)), decreasing=TRUE ),  ], n=500)
+  distsRL500 <- dist(t(top500))
+  sampleDistMatrix500 <- as.matrix( distsRL500 )
+  rownames(sampleDistMatrix500) <- paste(colData(vsd)[,opt$contrast], vsd$track, sep = " - " )
+  colnames(sampleDistMatrix500) <- NULL
+  colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
+  start_plot('Heatmap_top500', outdir=opt$outdir)
+    pheatmap(sampleDistMatrix500,
+    clustering_distance_rows = distsRL500,
+    clustering_distance_cols = distsRL500,
+    col = colors)
+  end_plot()
+  
+  ### EXPLORE BATCH EFFECTS ###
+  futile.logger::flog.info(paste("Exploring Batch Effects"))
+  futile.logger::flog.info(dim_pca)
+  if(dim_pca == 8){
+    for(factor in opt$factors){
+      factor_transformed <- vsd
+      assay(factor_transformed) <- limma::removeBatchEffect(assay(factor_transformed), colData(factor_transformed)[,factor])
+      pca = prcomp(t(assay(factor_transformed)))
+      sample.group <- as_factor(colData(dds)[, factor])
+      variable.group <- colData(dds)[, opt$contrast]
+      percentVar <- round(100 * summary(pca)$importance[2,])
+      scores <- data.frame(variable.group, sample.group, pca$x[,1:2])
+      start_plot(paste0('PCA_', factor, '_removed'), outdir=opt$outdir)
+      print(ggplot(scores,aes(x=PC1, y=PC2)) +
+            geom_point(colour=factor(variable.group), shape=factor(sample.group)) +
+            theme(legend.position="right") +  
+            labs(colour=opt$contrast, shape=factor, x=paste0("PC1 (", percentVar[1],"% of variance)"),
+                 y=paste0("PC2 (", percentVar[2],"% of variance)")) + 
+            ggtitle(paste0("Principal Component Analysis\n after batch correction for ", factor)) + 
+            theme_grey(base_size = 15) +
+            theme(plot.title = element_text(lineheight=1, face="bold"))  + geom_point(size=2) +
+            theme(text=element_text(family='serif')))
+      end_plot()
+    }
+  }
+  else {
+      futile.logger::flog.info(paste("... not done - too few degrees of freedom"))
+  }
+}
+
+main <- function() {
+  option_list <- list(
+    make_option(
+      "--rds-filename",
+      dest = "rds_filename",
+      type = "character",
+      default = "experiment.rds",
+      help = paste("filename with input data of counts")
+    ),
+    make_option(
+      "--model",
+      dest = "model",
+      type = "character",
+      default = "~ group",
+      help = paste("formula for multivariate model")
+    ),
+    make_option(
+      "--contrast",
+      dest = "contrast",
+      type = "character",
+      default = "group",
+      help = paste("formula for multivariate model")
+    ),
+    make_option(
+      "--factors",
+      dest = "factors",
+      type = "character",
+      default = "",
+      help = paste("formula for multivariate model")
+    ),
+    make_option(
+      "--genes_of_interest",
+      dest = "genes_of_interest",
+      type = "character",
+      default = NULL,
+      help = paste("genes of interest for plotting")
+    ),
+    make_option(
+      c("-o","--outdir"),
+      dest = "outdir",
+      type = "character",
+      default = "",
+      help = paste("genes of interest for plotting")
+    )
+  )
+  opt <- experiment_start(option_list = option_list,
+                          description = description)
+  
+  if (!is.null(opt$factors)) {
+    opt$factors = unlist(strsplit(opt$factors, ","))
+  }
+  if (!is.null(opt$genes_of_interest)) {
+    opt$genes_of_interest = unlist(strsplit(opt$genes_of_interest, ","))
+  }
+  run(opt)
+  
+  experiment_stop()
+}
+
+main()
diff --git a/cgatpipelines/tasks/filtercounts.R b/cgatpipelines/tasks/filtercounts.R
new file mode 100755
index 000000000..6f8d7ebd7
--- /dev/null
+++ b/cgatpipelines/tasks/filtercounts.R
@@ -0,0 +1,250 @@
+#' Basic filtering analysis Script
+#'
+#' 
+#' Example usage:
+#' 
+#' Rscript PATH/TO/filtercounts.R 
+#' 
+#' input: directory containing read count files or tsv file containing reads
+#' additional input variables: method used to generate file, model
+#' output: `experiment_out.rds` is an experiment object after filtering
+#' 
+
+suppressMessages(library(futile.logger))
+suppressMessages(library(getopt))
+suppressMessages(library(tidyverse))
+suppressMessages(library(data.table))
+suppressMessages(library(DESeq2))
+suppressMessages(library(edgeR))
+suppressMessages(library(csaw))
+suppressMessages(library(rhdf5))
+suppressMessages(library(tximport))
+suppressMessages(library(DEXSeq))
+
+source(file.path(Sys.getenv("R_ROOT"), "experiment.R"))
+
+run <- function(opt) {
+  
+  ### READING DATA ###
+  # Read in sampleData Table
+  futile.logger::flog.info(paste("reading sampleData table from", normalizePath(opt$sampleData)))
+  sampleData <- read_tsv(opt$sampleData)
+  sampleData <-sampleData[sampleData$include ==1, ]
+  futile.logger::flog.info(paste("read sampleData ", paste(dim(sampleData), collapse = ",")))
+  futile.logger::flog.info(paste(sampleData))
+  rownames(sampleData) <- sampleData$track
+
+  futile.logger::flog.info(paste("reading in data from ", opt$source))
+  if(opt$source %in% c("salmon", "kallisto")){
+    # Read in Transcript to Gene Map
+    tx2gene <- read_tsv(opt$tx2gene)
+    colnames(tx2gene) <- c("ensembl_transcript_id", "ensembl_gene_id")
+    if(opt$tx2gene_regex != "None"){
+      tx2gene <- filter(tx2gene, !grepl(opt$tx2gene_regex,ensembl_transcript_id))
+    }
+    # Read in Data
+    futile.logger::flog.info(opt$counts_dir)
+    futile.logger::flog.info(sampleData$track)
+    files <- file.path(opt$counts_dir, sampleData$track, "quant.sf")
+    names(files) <- sampleData$track
+    txi <- tximport(files, type = opt$source, tx2gene = tx2gene)
+    if(opt$method == "deseq2"){
+      dataset <- DESeqDataSetFromTximport(txi, colData = sampleData, design = formula(opt$model))
+    } else if(opt$method == "edger"){
+      cts <- txi$counts
+      normMat <- txi$length
+      normMat <- normMat/exp(rowMeans(log(normMat)))
+      normCts <- cts/normMat
+      normMat <- sweep(normMat, 2, eff.lib, "*")
+      normMat <- log(normMat)
+      dataset <- DGEList(counts=cts, 
+                   samples=sampleData)
+      dataset <- scaleOffset(dataset,normMat)
+    } else if(opt$method == "Sleuth"){
+      stop("Sleuth method not yet implemented. Sorry.")
+      
+    } else{
+      stop("Method not defined. Allowable methods are \"DESeq2\", \"EdgeR\" or \"Sleuth\"")
+    }
+  }
+  
+  if(opt$source == "dexseq"){
+    # Read in Data
+    files <- file.path(opt$counts_dir, paste0(sampleData$track, ".txt"))
+    names(files) <- sampleData$track
+    if(opt$method != "dexseq"){
+      stop("DEXSeq input is handled by diffexonexpression. Please correct the method argument.")
+    }
+    dataset = DEXSeqDataSetFromHTSeq(
+      files,
+      sampleData=sampleData,
+      design=formula(opt$model),
+      flattenedfile=opt$flattenedFile)
+  } else if(opt$source == "counts_table"){
+    # Read in Data
+    raw <- read.table(file = gzfile(opt$counts_tsv), header=TRUE, row.name=1, check.names = FALSE)
+    futile.logger::flog.info(paste("Colnames: ", colnames(raw)))
+    experiment_tsv <- raw[,sampleData$track,drop=FALSE]
+    if(opt$method == "deseq2"){
+      dataset = DESeqDataSetFromMatrix(experiment_tsv, sampleData, design = formula(opt$model))
+    } else if(opt$method == "edger"){
+      dataset <- DGEList(counts=experiment_tsv, samples=sampleData)
+    } else if(opt$method == "sleuth"){
+      stop("Sleuth method not yet implemented. Sorry.")
+    } else{
+      stop("Method not defined. Allowable methods are \"DESeq2\", \"EdgeR\" or \"Sleuth\"")
+    }
+  }
+  
+  ### FILTERING ###
+  if(opt$filter == TRUE) {
+    futile.logger::flog.info(paste("filtering data ", opt$source))
+    if(opt$method == "edger"){
+      futile.logger::flog.info(paste("Counts before filtering ", paste(dim(dataset$counts), collapse = ",")))
+      keep <- filterByExpr(dataset)
+      dataset <- dataset[keep, , keep.lib.sizes=FALSE]
+      counts_table <- dataset$counts
+      futile.logger::flog.info(paste("Counts after filtering ", paste(dim(dataset$counts), collapse = ",")))
+    } else if(opt$method == "deseq2"){
+      futile.logger::flog.info(paste("Counts before filtering ", paste(dim(counts(dataset)), collapse = ",")))
+      keep <- rowSums(counts(dataset)) >= 10
+      dataset <- dataset[keep,]
+      #dataset <- dataset[Reduce('|', dataset[sapply(dataset, is.numeric)]),]
+      #dataset <- dataset[rowSums(dataset[,sapply(dataset, is.numeric)]==0)<6,]
+      #dataset <- subset(dataset, !rowSums(assay(dataset) == 0) > 6)
+      counts_table <- counts(dataset)
+      futile.logger::flog.info(paste("Counts after filtering ", paste(dim(counts(dataset)), collapse = ",")))
+    } else if(opt$method == "dexseq"){
+      futile.logger::flog.info(paste("Filtering for DEXSeq not implemented "))
+      counts_table <- counts(dataset)
+    }
+  }
+  else {
+    futile.logger::flog.info(paste("No filtering on dataset performed.", opt$source))
+  }
+
+  ### SAVING DATA ###
+  file = get_output_filename(paste0(opt$outdir,"/experiment_out.rds"))
+  flog.info(paste("saving experiment data to", file))
+  saveRDS(dataset, file = file)
+  
+  ## Set up gene lengths for RPKM
+  flog.info("outputting counts data")
+  write.table(counts(dataset),
+              file = paste0(opt$outdir,"/Counts_Data.tsv"),
+              sep = "\t",
+              quote = FALSE,
+              row.names = TRUE,
+              col.names = NA)
+  if(opt$source %in% c("salmon", "kallisto")){
+    write.table(txi$abundance,
+                file = paste0(opt$outdir,"/tpm.tsv"),
+                sep = "\t",
+                quote = FALSE,
+                row.names = TRUE,
+                col.names = NA)
+  }
+}
+
+  
+main <- function() {
+  option_list <- list(
+    make_option(
+      "--counts-dir",
+      dest="counts_dir",
+      type="character",
+      help=paste("directory containing expression estimates",
+                 "from salmon/kallisto/DEXSeq.")
+    ),
+    make_option(
+      "--counts-tsv",
+      dest="counts_tsv",
+      type="character",
+      help=paste("file containing counts generated",
+                 "by e.g. featurecounts.")
+    ),
+    make_option(
+      c("-d", "--sampleData"),
+      dest="sampleData",
+      type="character",
+      default = "",
+      help=paste("input file with experimental design/sample info")
+    ),
+    make_option(
+      c("--outdir"),
+      dest="outdir",
+      type="character",
+      default = "results.dir",
+      help=paste("output directory")
+    ),
+    make_option(
+      "--model",
+      dest = "model",
+      type = "character",
+      default = "~group",
+      help = paste("formula for multivariate model")
+    ),
+    make_option(
+      c("-s", "--source"),
+      dest="source", 
+      type="character",
+      default="salmon",
+      help=paste("Source of data. Possible options are ",
+                 "\"salmon\", \"kallisto\", \"counts_table\", \"dexseq\"")
+    ),
+    make_option(
+      c("--tx2gene"),
+      dest="tx2gene", 
+      type="character",
+      default="transcript2geneMap.tsv",
+      help=paste("Path to transcript to gene tsv.")
+    ),
+    make_option(
+      c("--tx2gene_regex"),
+      dest="tx2gene_regex", 
+      type="character",
+      default="None",
+      help=paste("Regex/Prefix for removal of certain features from ",
+                 "experiment (e.g. removal of spike-ins)")
+    ),  
+    make_option(
+      "--method",
+      dest="method", 
+      type="character",
+      default="deseq2",
+      help=paste("differential expression method to apply ")
+    ),
+    make_option(
+      "--filter",
+      dest="filter",
+      type="logical",
+      default=TRUE,
+      help=paste("adopt filtering strategy. ",
+                 "For EDGER, the default strategy is applied. ",
+                 "For DESeq2 basic rowsum filtering < 10 is applied.")
+    ),
+    make_option(
+      "--dexseq-flattened-file",
+      dest="flattenedFile",
+      type="character",
+      help=paste("directory containing flat gtf for dexseq. DEXSeq ",
+                 "expects this to be generated by the",
+                 "DEXSeq_prepare_annotations.py script")
+    )
+  )
+   
+  opt <- experiment_start(option_list = option_list,
+                          description = description)
+  if (!is.null(opt$method)) {
+    opt$method = str_to_lower(opt$method)
+  }
+  
+  if (!is.null(opt$source)) {
+    opt$source = str_to_lower(opt$source)
+  }
+  run(opt)
+  
+  experiment_stop()
+}
+
+main()
diff --git a/cgatpipelines/tasks/gtfsubset.py b/cgatpipelines/tasks/gtfsubset.py
index 62c467fc8..cd9c1c0fd 100644
--- a/cgatpipelines/tasks/gtfsubset.py
+++ b/cgatpipelines/tasks/gtfsubset.py
@@ -14,6 +14,7 @@
 import cgat.GTF as GTF
 import cgatcore.pipeline as P
 import cgatcore.database as Database
+from sqlalchemy import text
 
 
 class SubsetGTF():
@@ -142,10 +143,12 @@ def getRepeatDataFromUCSC(dbhandle,
        expression given.
 
     '''
-    cc = dbhandle.execute("SHOW TABLES LIKE '%%rmsk'")
-    tables = [x[0] for x in cc.fetchall()]
-    if len(tables) == 0:
-        raise ValueError("could not find any `rmsk` tables")
+    with dbhandle.connect() as conn:
+        query = "SHOW TABLES LIKE '%%rmsk'"
+        cc = conn.execute(text(query))
+        tables = [x[0] for x in cc.fetchall()]
+        if len(tables) == 0:
+            raise ValueError("could not find any `rmsk` tables")
 
     # now collect repeats
     tmpfile = P.get_temp_file(".")
@@ -166,9 +169,10 @@ def getRepeatDataFromUCSC(dbhandle,
         sql = sql % locals()
 
         E.debug("executing sql statement: %s" % sql)
-        cc = dbhandle.execute(sql)
-        for data in cc.fetchall():
-            tmpfile.write("\t".join(map(str, data)) + "\n")
+        with dbhandle.connect() as conn:
+            cc = conn.execute(text(sql))
+            for data in cc.fetchall():
+                tmpfile.write("\t".join(map(str, data)) + "\n")
 
     tmpfile.close()
 
diff --git a/cgatpipelines/tasks/mappingqc.py b/cgatpipelines/tasks/mappingqc.py
deleted file mode 100644
index a9cb44845..000000000
--- a/cgatpipelines/tasks/mappingqc.py
+++ /dev/null
@@ -1,807 +0,0 @@
-"""
-mappingqc.py - Tasks for QC'ing mapping
-===============================================
-
-Reference
----------
-
-"""
-
-import cgatcore.experiment as E
-import os
-import re
-import pandas
-import cgatcore.iotools as iotools
-import cgat.BamTools.bamtools as BamTools
-from cgatcore import pipeline as P
-
-PICARD_MEMORY = "9G"
-
-
-def getNumReadsFromReadsFile(infile):
-    '''get number of reads from a .nreads file.'''
-    with iotools.open_file(infile) as inf:
-        line = inf.readline()
-        if not line.startswith("nreads"):
-            raise ValueError(
-                "parsing error in file '%s': "
-                "expected first line to start with 'nreads'")
-        nreads = int(line[:-1].split("\t")[1])
-    return nreads
-
-
-def buildPicardInsertSizeStats(infile, outfile, genome_file):
-    '''run Picard:CollectInsertSizeMetrics
-
-    Collect insert size statistics.
-
-    Arguments
-    ---------
-    infile : string
-        Input filename in :term:`BAM` format.
-    outfile : string
-        Output filename with picard output.
-    genome_file : string
-        Filename with genomic sequence.
-    '''
-
-    job_memory = PICARD_MEMORY
-    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
-    job_threads = 3
-
-    if BamTools.getNumReads(infile) == 0:
-        E.warn("no reads in %s - no metrics" % infile)
-        iotools.touch_file(outfile)
-        return
-
-    statement = '''picard %(picard_opts)s CollectInsertSizeMetrics
-    INPUT=%(infile)s
-    REFERENCE_SEQUENCE=%(genome_file)s
-    ASSUME_SORTED=true
-    OUTPUT=%(outfile)s
-    VALIDATION_STRINGENCY=SILENT
-    >& %(outfile)s'''
-
-    P.run(statement)
-
-
-def buildPicardAlignmentStats(infile, outfile, genome_file):
-    '''run picard:CollectMultipleMetrics
-
-    Arguments
-    ---------
-    infile : string
-        Input filename in :term:`BAM` format.
-    outfile : string
-        Output filename with picard output.
-    genome_file : string
-        Filename with genomic sequence.
-    '''
-
-    job_memory = PICARD_MEMORY
-    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
-    job_threads = 3
-    tmp_bam = P.get_temp_filename(".")
-
-    if BamTools.getNumReads(infile) == 0:
-        E.warn("no reads in %s - no metrics" % infile)
-        iotools.touch_file(outfile)
-        return
-
-    # Picard seems to have problem if quality information is missing
-    # or there is no sequence/quality information within the bam file.
-    # Thus, add it explicitly.
-    statement = '''cat %(infile)s
-    | cgat bam2bam
-    -v 0
-    --method=set-sequence
-    --output-sam
-    --log=%(outfile)s.bam2bam.log
-    > %(tmp_bam)s &&
-    picard %(picard_opts)s CollectMultipleMetrics
-    INPUT=%(tmp_bam)s
-    REFERENCE_SEQUENCE=%(genome_file)s
-    ASSUME_SORTED=true
-    OUTPUT=%(outfile)s
-    VALIDATION_STRINGENCY=SILENT
-    >& %(outfile)s'''
-
-    P.run(statement)
-
-    os.unlink(tmp_bam)
-
-def buildPicardDuplicationStats(infile, outfile):
-    '''run picard:MarkDuplicates
-
-    Record duplicate metrics using Picard, the marked records
-    are discarded.
-
-    Arguments
-    ---------
-    infile : string
-        Input filename in :term:`BAM` format.
-    outfile : string
-        Output filename with picard output.
-    '''
-
-    job_memory = PICARD_MEMORY
-    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
-    job_threads = 3
-
-    if BamTools.getNumReads(infile) == 0:
-        E.warn("no reads in %s - no metrics" % infile)
-        iotools.touch_file(outfile)
-        return
-
-    # currently, MarkDuplicates cannot handle split alignments from gsnap
-    # these can be identified by the custom XT tag.
-    if ".gsnap.bam" in infile:
-        tmpf = P.get_temp_file(".")
-        tmpfile_name = tmpf.name
-        statement = '''samtools view -h %(infile)s
-        | awk "!/\\tXT:/"
-        | samtools view /dev/stdin -S -b > %(tmpfile_name)s;
-        ''' % locals()
-        data_source = tmpfile_name
-    else:
-        statement = ""
-        data_source = infile
-
-    statement += '''picard %(picard_opts)s MarkDuplicates
-    INPUT=%(data_source)s
-    ASSUME_SORTED=true
-    METRICS_FILE=%(outfile)s
-    OUTPUT=/dev/null
-    VALIDATION_STRINGENCY=SILENT
-    '''
-    P.run(statement)
-
-    if ".gsnap.bam" in infile:
-        os.unlink(tmpfile_name)
-
-
-def buildPicardDuplicateStats(infile, outfile):
-    '''run picard:MarkDuplicates
-
-    Record duplicate metrics using Picard and keep the dedupped .bam
-    file.
-
-    Pair duplication is properly handled, including inter-chromosomal
-    cases. SE data is also handled.  These stats also contain a
-    histogram that estimates the return from additional sequecing.  No
-    marked bam files are retained (/dev/null...)  Note that picards
-    counts reads but they are in fact alignments.
-
-    Arguments
-    ---------
-    infile : string
-        Input filename in :term:`BAM` format.
-    outfile : string
-        Output filename with picard output.
-
-    '''
-    job_memory = PICARD_MEMORY
-    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
-    job_threads = 3
-
-    if BamTools.getNumReads(infile) == 0:
-        E.warn("no reads in %s - no metrics" % infile)
-        iotools.touch_file(outfile)
-        return
-
-    statement = '''picard %(picard_opts)s MarkDuplicates
-    INPUT=%(infile)s
-    ASSUME_SORTED=true
-    METRICS_FILE=%(outfile)s.duplicate_metrics
-    OUTPUT=%(outfile)s
-    VALIDATION_STRINGENCY=SILENT;
-    '''
-    statement += '''samtools index %(outfile)s ;'''
-    P.run(statement)
-
-
-def buildPicardCoverageStats(infile, outfile, baits, regions):
-    '''run picard:CollectHSMetrics
-
-    Generate coverage statistics for regions of interest from a bed
-    file using Picard.
-
-    Arguments
-    ---------
-    infile : string
-        Input filename in :term:`BAM` format.
-    outfile : string
-        Output filename with picard output.
-    baits : :term:`bed` formatted file of bait regions
-    regions : :term:`bed` formatted file of target regions
-
-    '''
-
-    job_memory = PICARD_MEMORY
-    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
-    job_threads = 3
-
-    if BamTools.getNumReads(infile) == 0:
-        E.warn("no reads in %s - no metrics" % infile)
-        iotools.touch_file(outfile)
-        return
-
-    statement = '''picard %(picard_opts)s CollectHsMetrics
-    BAIT_INTERVALS=%(baits)s
-    TARGET_INTERVALS=%(regions)s
-    INPUT=%(infile)s
-    OUTPUT=%(outfile)s
-    VALIDATION_STRINGENCY=LENIENT''' % locals()
-    P.run(statement)
-
-
-def buildPicardGCStats(infile, outfile, genome_file):
-    """picard:CollectGCBiasMetrics
-
-    Collect GC bias metrics.
-
-    Arguments
-    ---------
-    infile : string
-        Input filename in :term:`BAM` format.
-    outfile : string
-        Output filename with picard output.
-    genome_file : string
-        Filename with genomic sequence.
-
-    """
-
-    job_memory = PICARD_MEMORY
-    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
-    job_threads = 3
-
-    if BamTools.getNumReads(infile) == 0:
-        E.warn("no reads in %s - no metrics" % infile)
-        iotools.touch_file(outfile)
-        return
-
-    statement = '''picard %(picard_opts) CollectGcBiasMetrics
-    INPUT=%(infile)s
-    REFERENCE_SEQUENCE=%(genome_file)s
-    OUTPUT=%(outfile)s
-    VALIDATION_STRINGENCY=SILENT
-    CHART_OUTPUT=%(outfile)s.pdf
-    SUMMARY_OUTPUT=%(outfile)s.summary
-    >& %(outfile)s'''
-
-    P.run(statement)
-
-
-def loadPicardMetrics(infiles, outfile, suffix,
-                      pipeline_suffix=".picard_stats",
-                      tablename=None):
-    '''load picard metrics.
-
-    Arguments
-    ---------
-    infiles : string
-        Filenames of files with picard metric information. Each file
-        corresponds to a different track.
-    outfile : string
-        Logfile.
-    suffix : string
-        Suffix to append to table name.
-    pipeline_suffix : string
-        Suffix to remove from track name.
-    tablename : string
-        Tablename to use. If unset, the table name will be derived
-        from `outfile` and suffix as ``to_table(outfile) + "_" +
-        suffix``.
-
-    '''
-
-    if not tablename:
-        tablename = "%s_%s" % (P.to_table(outfile), suffix)
-
-    outf = P.get_temp_file(".")
-
-    filenames = ["%s.%s" % (x, suffix) for x in infiles]
-
-    first = True
-    for filename in filenames:
-        track = P.snip(os.path.basename(filename), "%s.%s" %
-                       (pipeline_suffix, suffix))
-
-        if not os.path.exists(filename):
-            E.warn("File %s missing" % filename)
-            continue
-
-        lines = iotools.open_file(filename, "r").readlines()
-
-        # extract metrics part
-        rx_start = re.compile("## METRICS CLASS")
-        for n, line in enumerate(lines):
-            if rx_start.search(line):
-                lines = lines[n + 1:]
-                break
-
-        for n, line in enumerate(lines):
-            if not line.strip():
-                lines = lines[:n]
-                break
-
-        if len(lines) == 0:
-            E.warn("no lines in %s: %s" % (track, filename))
-            continue
-
-        if first:
-            outf.write("%s\t%s" % ("track", lines[0]))
-            fields = lines[0][:-1].split("\t")
-        else:
-            f = lines[0][:-1].split("\t")
-            if f != fields:
-                raise ValueError(
-                    "file %s has different fields: expected %s, got %s" %
-                    (filename, fields, f))
-
-        first = False
-        for i in range(1, len(lines)):
-            outf.write("%s\t%s" % (track, lines[i]))
-
-    outf.close()
-
-    P.load(outf.name,
-           outfile,
-           tablename=tablename,
-           options="--add-index=track --allow-empty-file")
-
-    os.unlink(outf.name)
-
-
-def loadPicardHistogram(infiles, outfile, suffix, column,
-                        pipeline_suffix=".picard_stats", tablename=False):
-    '''extract a histogram from a picard output file and load
-    it into database.
-
-    Arguments
-    ---------
-    infiles : string
-        Filenames of files with picard metric information. Each file
-        corresponds to a different track.
-    outfile : string
-        Logfile.
-    suffix : string
-        Suffix to append to table name.
-    column : string
-        Column name to take from the histogram.
-    pipeline_suffix : string
-        Suffix to remove from track name.
-    tablename : string
-        Tablename to use. If unset, the table name will be derived
-        from `outfile` and suffix as ``to_table(outfile) + "_" +
-        suffix``.
-    '''
-
-    if not tablename:
-        tablename = "%s_%s" % (P.to_table(outfile), suffix)
-        tablename = tablename.replace("_metrics", "_histogram")
-
-    # some files might be missing
-    xfiles = [x for x in infiles if os.path.exists("%s.%s" % (x, suffix))]
-
-    if len(xfiles) == 0:
-        E.warn("no files for %s" % tablename)
-        return
-
-    header = ",".join([os.path.basename(x) for x in xfiles])
-    filenames = " ".join(["%s.%s" % (x, suffix) for x in xfiles])
-
-    # there might be a variable number of columns in the tables
-    # only take the first ignoring the rest
-
-    load_statement = P.build_load_statement(
-        tablename,
-        options="--add-index=track "
-        " --header-names=%s,%s"
-        " --allow-empty-file"
-        " --replace-header" % (column, header))
-
-    statement = """cgat combine_tables
-    --regex-start="## HISTOGRAM"
-    --missing-value=0
-    --take=2
-    %(filenames)s
-    | %(load_statement)s
-    >> %(outfile)s
-    """
-
-    to_cluster = False
-
-    P.run(statement)
-
-
-def loadPicardAlignmentStats(infiles, outfile):
-    '''load all output from Picard's CollectMultipleMetrics into database.
-
-    Loads tables into database with prefix derived from outfile:
-       * [outfile]_alignment_summary_metric
-       * [outfile]_insert_size_metrics
-       * [outfile]_quality_by_cycle_metrics
-       * [outfile]_quality_distribution_metrics
-       * [outfile]_insert_size_metrics
-
-    Arguments
-    ---------
-    infiles : string
-        Filenames of files with picard metric information. Each file
-        corresponds to a different track.
-    outfile : string
-        Logfile. The table name will be derived from `outfile`.
-
-    '''
-
-    loadPicardMetrics(infiles, outfile, "alignment_summary_metrics")
-
-    # insert size metrics only available for paired-ended data
-    loadPicardMetrics(infiles, outfile, "insert_size_metrics")
-
-    histograms = (("quality_by_cycle_metrics", "cycle"),
-                  ("quality_distribution_metrics", "quality"),
-                  ("insert_size_metrics", "insert_size"))
-
-    for suffix, column in histograms:
-        loadPicardHistogram(infiles, outfile, suffix, column)
-
-
-def loadPicardDuplicationStats(infiles, outfiles):
-    '''load picard duplicate filtering stats into database.
-
-    Loads two tables into the database
-       * picard_duplication_metrics
-       * picard_complexity_histogram
-
-    Arguments
-    ---------
-    infiles : string
-        Filenames of files with picard metric information. Each file
-        corresponds to a different track.
-    outfile : string
-        Logfile. The table name will be derived from `outfile`.
-    '''
-    # SNS: added to enable naming consistency
-
-    outfile_metrics, outfile_histogram = outfiles
-
-    suffix = "picard_duplication_metrics"
-
-    # the loading functions expect "infile_name.pipeline_suffix" as the infile
-    # names.
-    infile_names = [x[:-len("." + suffix)] for x in infiles]
-
-    loadPicardMetrics(infile_names, outfile_metrics, suffix, "",
-                      tablename="picard_duplication_metrics")
-
-    infiles_with_histograms = []
-
-    # The complexity histogram is only present for PE data, so we must check
-    # because by design the pipeline does not track endedness
-    for infile in infile_names:
-        with_hist = False
-        with open(".".join([infile, suffix]), "r") as open_infile:
-            for line in open_infile:
-                if line.startswith("## HISTOGRAM"):
-                    infiles_with_histograms.append(infile)
-                    break
-
-    if len(infiles_with_histograms) > 0:
-        loadPicardHistogram(infiles_with_histograms,
-                            outfile_histogram,
-                            suffix,
-                            "coverage_multiple",
-                            "",
-                            tablename="picard_complexity_histogram")
-    else:
-        with open(outfile_histogram, "w") as ofh:
-            ofh.write("No histograms detected, no data loaded.")
-
-
-def loadPicardDuplicateStats(infiles, outfile, pipeline_suffix=".bam"):
-    '''load picard duplicate filtering stats.
-
-    Arguments
-    ---------
-    infiles : string
-        Filenames of files with picard metric information. Each file
-        corresponds to a different track.
-    outfile : string
-        Logfile. The table name will be derived from `outfile`.
-    pipeline_suffix : string
-        Suffix appended to pipeline output file, will be removed to
-        define track.
-    '''
-
-    loadPicardMetrics(
-        infiles, outfile, "duplicate_metrics",
-        pipeline_suffix=pipeline_suffix)
-    loadPicardHistogram(infiles,
-                        outfile,
-                        "duplicate_metrics",
-                        "duplicates",
-                        pipeline_suffix=pipeline_suffix)
-
-
-def loadPicardCoverageStats(infiles, outfile):
-    '''import coverage statistics into database.
-
-    Arguments
-    ---------
-    infiles : string
-        Filenames of files with picard metric information. Each file
-        corresponds to a different track.
-    outfile : string
-        Logfile. The table name will be derived from `outfile`.
-    '''
-
-    outf = P.get_temp_file(".")
-    first = True
-    for f in infiles:
-        track = P.snip(os.path.basename(f), ".cov")
-        lines = [x for x in open(f, "r").readlines()
-                 if not x.startswith("#") and x.strip()]
-        if first:
-            outf.write("%s\t%s" % ("track", lines[0]))
-        first = False
-        outf.write("%s\t%s" % (track, lines[1]))
-    outf.close()
-    P.load(outf.name,
-           outfile,
-           options="--ignore-empty --add-index=track")
-    os.unlink(outf.name)
-
-
-def buildBAMStats(infile, outfile):
-    '''Count number of reads mapped, duplicates, etc. using
-    bam2stats.py
-
-    Arguments
-    ---------
-    infile : string
-        Input file in :term:`BAM` format
-    outfile : string
-        Output file in :term:`tsv` format.
-
-    '''
-
-    statement = '''cgat bam2stats
-    --force-output
-    --output-filename-pattern=%(outfile)s.%%s
-    < %(infile)s
-    > %(outfile)s'''
-    P.run(statement)
-
-
-def loadBAMStats(infiles, outfile):
-    '''load output of :func:`buildBAMStats` into database.
-
-    Arguments
-    ---------
-    infiles : string
-        Input files, output from :func:`buildBAMStats`.
-    outfile : string
-        Logfile. The table name will be derived from `outfile`.
-    '''
-
-    header = ",".join([P.snip(os.path.basename(x), ".readstats")
-                       for x in infiles])
-    filenames = " ".join(["<( cut -f 1,2 < %s)" % x for x in infiles])
-    tablename = P.to_table(outfile)
-
-    load_statement = P.build_load_statement(
-        tablename,
-        options="--add-index=track "
-        " --allow-empty-file")
-
-    E.info("loading bam stats - summary")
-    statement = """cgat combine_tables
-    --header-names=%(header)s
-    --missing-value=0
-    --ignore-empty
-    %(filenames)s
-    | perl -p -e "s/bin/track/"
-    | cgat table2table --transpose
-    | %(load_statement)s
-    > %(outfile)s"""
-
-    to_cluster = False
-
-    P.run(statement)
-
-    for suffix in ("nm", "nh"):
-        E.info("loading bam stats - %s" % suffix)
-        filenames = " ".join(["%s.%s" % (x, suffix) for x in infiles])
-
-        load_statement = P.build_load_statement(
-            "%s_%s" % (tablename, suffix),
-            options="--allow-empty-file")
-
-        statement = """cgat combine_tables
-        --header-names=%(header)s
-        --skip-titles
-        --missing-value=0
-        --ignore-empty
-        %(filenames)s
-        | perl -p -e "s/bin/%(suffix)s/"
-        | %(load_statement)s
-        >> %(outfile)s """
-
-        to_cluster = False
-
-        P.run(statement)
-
-    # load mapping qualities, there are two columns per row
-    # 'all_reads' and 'filtered_reads'
-    # Here, only filtered_reads are used (--take=3)
-    for suffix in ("mapq",):
-        E.info("loading bam stats - %s" % suffix)
-        filenames = " ".join(["%s.%s" % (x, suffix) for x in infiles])
-
-        load_statement = P.build_load_statement(
-            "%s_%s" % (tablename, suffix),
-            options=" --allow-empty-file")
-
-        statement = """cgat combine_tables
-        --header-names=%(header)s
-        --skip-titles
-        --missing-value=0
-        --ignore-empty
-        --take=3
-        %(filenames)s
-        | perl -p -e "s/bin/%(suffix)s/"
-        | %(load_statement)s
-        >> %(outfile)s """
-
-        to_cluster = False
-
-        P.run(statement)
-
-
-def buildPicardRnaSeqMetrics(infiles, strand, outfile):
-    '''run picard:RNASeqMetrics
-
-
-
-    Arguments
-    ---------
-    infiles : string
-        Input filename in :term:`BAM` format.
-        Genome file in refflat format
-            (http://genome.ucsc.edu/goldenPath/gbdDescriptionsOld.html#RefFlat)
-    outfile : string
-        Output filename with picard output.
-
-    '''
-    job_memory = PICARD_MEMORY
-    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
-    job_threads = 3
-    infile, genome = infiles
-
-    if BamTools.getNumReads(infile) == 0:
-        E.warn("no reads in %s - no metrics" % infile)
-        iotools.touch_file(outfile)
-        return
-
-    statement = '''picard %(picard_opts)s CollectRnaSeqMetrics
-    REF_FLAT=%(genome)s
-    INPUT=%(infile)s
-    ASSUME_SORTED=true
-    OUTPUT=%(outfile)s
-    STRAND=%(strand)s
-    VALIDATION_STRINGENCY=SILENT
-    '''
-    P.run(statement)
-
-
-def loadPicardRnaSeqMetrics(infiles, outfiles):
-    '''load picard rna stats into database.
-
-    Loads tables into the database
-       * picard_rna_metrics
-       * picard_rna_histogram
-
-    Arguments
-    ---------
-    infile : string
-        Filenames of files with picard metric information. Each file
-        corresponds to a different track.
-    outfiles : string
-        Logfile. The table names will be derived from `outfile`.
-    '''
-
-    outfile_metrics, outfile_histogram = outfiles
-
-    suffix = "picard_rna_metrics"
-
-    # the loading functions expect "infile_name.pipeline_suffix" as the infile
-    # names.
-    infile_names = [x[:-len("." + suffix)] for x in infiles]
-
-    loadPicardMetrics(infile_names, outfile_metrics, suffix, "",
-                      tablename="picard_rna_metrics")
-
-    infiles_with_histograms = []
-
-    # Checking if histogram is present (?is this necessary)
-    for infile in infile_names:
-        with_hist = False
-        with open(".".join([infile, suffix]), "r") as open_infile:
-            for line in open_infile:
-                if line.startswith("## HISTOGRAM"):
-                    infiles_with_histograms.append(infile)
-                    break
-
-    if len(infiles_with_histograms) > 0:
-        loadPicardHistogram(infiles_with_histograms,
-                            outfile_histogram,
-                            suffix,
-                            "coverage_multiple",
-                            "",
-                            tablename="picard_rna_histogram")
-    else:
-        with open(outfile_histogram, "w") as ofh:
-            ofh.write("No histograms detected, no data loaded.")
-
-
-def loadIdxstats(infiles, outfile):
-    '''take list of file paths to samtools idxstats output files
-    and merge to create single dataframe containing mapped reads per
-    contig for each track. This dataframe is then loaded into
-    database.
-
-    Loads tables into the database
-        * idxstats_reads_per_chromosome
-
-    Arguments
-    ---------
-    infiles : list
-        list where each element is a string of the filename containing samtools
-        idxstats output. Filename format is expected to be 'sample.idxstats'
-    outfile : string
-        Logfile. The table name will be derived from `outfile`.
-    '''
-
-    outf = P.get_temp_file(".")
-    dfs = []
-    for f in infiles:
-        track = P.snip(f, ".idxstats").split('/')[-1]
-
-        if not os.path.exists(f):
-            E.warn("File %s missing" % f)
-            continue
-
-        # reformat idx stats
-        df = pandas.read_csv(f, sep='\t', header=None)
-        df.columns = ['region', 'length', 'mapped', 'unmapped']
-
-        # calc total reads mapped & unmappedpep
-        total_reads = df.unmapped.sum() + df.mapped.sum()
-        total_mapped_reads = df.mapped.sum()
-
-        reformatted_df = pandas.DataFrame([['total_mapped_reads', total_mapped_reads],
-                                           ['total_reads', total_reads],
-                                           ['track', track]], columns=(['region', 'mapped']))
-
-        # reformat the df
-        df = df.append(reformatted_df, ignore_index=True)
-        df.set_index('region', inplace=True)
-        df1 = df[['mapped']].T
-        # set track as index
-        df1.set_index('track', inplace=True)
-        dfs.append(df1)
-
-    # merge dataframes into single table
-    master_df = pandas.concat(dfs)
-    master_df.drop('*', axis=1, inplace=True)
-    # transform dataframe to avoid reaching column limit
-    master_df = master_df.T
-    master_df.to_csv(outf, sep='\t', index=True)
-    outf.close()
-
-    P.load(outf.name,
-           outfile,
-           options="--ignore-empty --add-index=track")
-    os.unlink(outf.name)
diff --git a/cgatpipelines/tasks/script_template.R b/cgatpipelines/tasks/script_template.R
new file mode 100644
index 000000000..84ee66d8d
--- /dev/null
+++ b/cgatpipelines/tasks/script_template.R
@@ -0,0 +1,66 @@
+#' filtering single cell data based on QC metrics
+#'
+#' WARNING: This script is work-in-progress
+#' 
+#' Example usage:
+#' 
+#' cgat sc-counts2counts --counts-filename=featurecounts.tsv --phenotypes-filename=phenodata.tsv --factor=group,mouse_id,collection_date,slice_depth,slice_number,pipette_visual,timepoint > filtered_counts.tsv
+#'
+#' `feature_counts.tsv` is a table (tab-separated) of ngenes x ncells,
+#' that is the genes are in rows and the columns are cells.
+#'
+#' `phenodata.tsv` is a table (tab-separated) of ncells x nfeatures,
+#' that is rows are cells and features are in columns. The table should contain
+#' a column called `sample_id` that will match the columns in the table
+#' `feature_counts.tsv`.
+#'
+#' Features can then be selected in the `--factor` option to be
+#' plotted.
+#'
+#' -> todo: parameterize detection of ERCC (pattern?)
+#' -> todo: parameterize definition of mitochondrial genes - currently hardcoded for mouse.
+
+## conda dependencies: bioconductor-scater r-cairo
+
+suppressMessages(library(futile.logger))
+suppressMessages(library(getopt))
+
+source(file.path(Sys.getenv("R_ROOT"), "experiment.R"))
+
+
+run <- function(opt) {
+
+    ## do something
+    
+}
+
+main <- function() {
+
+    option_list <- list(
+        make_option(
+            "--counts-filename",
+            dest = "counts_filename",
+            type = "character",
+            default = "featurecounts.tsv",
+            help = paste("filename with input data of counts")
+        ),
+        make_option(
+            "--factor",
+            dest = "factors",
+            type = "character",
+            default = "group,collection_date",
+            help = paste("factors to colour QC plots by.")
+        )
+    )
+    opt <- experiment_start(option_list = option_list,
+                            description = description)
+
+    if (!is.null(opt$factors)) {
+        opt$factors = unlist(strsplit(opt$factors, ","))
+    }
+    run(opt)
+    
+    experiment_stop()
+}
+
+main()
diff --git a/cgatpipelines/tasks/venn.R b/cgatpipelines/tasks/venn.R
new file mode 100644
index 000000000..16f3ca19e
--- /dev/null
+++ b/cgatpipelines/tasks/venn.R
@@ -0,0 +1,132 @@
+#' Wrapper for VennDiagram with basic customization and command line functionality
+#'
+#' WARNING: This script is work-in-progress
+#' 
+#' Example usage:
+#' 
+#' cgat sc-counts2counts --counts-filename=featurecounts.tsv --phenotypes-filename=phenodata.tsv --factor=group,mouse_id,collection_date,slice_depth,slice_number,pipette_visual,timepoint > filtered_counts.tsv
+#'
+#' `feature_counts.tsv` is a table (tab-separated) of ngenes x ncells,
+#' that is the genes are in rows and the columns are cells.
+#'
+#' `phenodata.tsv` is a table (tab-separated) of ncells x nfeatures,
+#' that is rows are cells and features are in columns. The table should contain
+#' a column called `sample_id` that will match the columns in the table
+#' `feature_counts.tsv`.
+#'
+#' Features can then be selected in the `--factor` option to be
+#' plotted.
+#'
+#' -> todo: parameterize detection of ERCC (pattern?)
+#' -> todo: parameterize definition of mitochondrial genes - currently hardcoded for mouse.
+
+## conda dependencies: bioconductor-scater r-cairo
+
+suppressMessages(library(futile.logger))
+suppressMessages(library(getopt))
+suppressMessages(library(VennDiagram))
+
+
+source(file.path(Sys.getenv("R_ROOT"), "experiment.R"))
+
+
+run <- function(opt) {
+  
+  dataframes <- lapply(opt$filenames, read.table)
+  names(dataframes) <- opt$names
+  x1 <- lapply(dataframes, function(x) subset(x, padj < opt$pvalue))
+  x <- lapply(x1, function(x) subset(x, abs(log2FoldChange) >= opt$lfc))
+  if(opt$sign == "positive"){
+    x <- lapply(x, function(y) subset(y, log2FoldChange >0))}
+  if(opt$sign == "negative"){
+    x <- lapply(x, function(y) subset(y, log2FoldChange <0))}
+  x <- lapply(x, rownames)
+  flog.info(str(x))
+  lapply(calculate.overlap(x), write, "venn.txt", append=TRUE, ncolumns=1000)
+
+
+  venn.diagram(x,
+               filename = 'venn_diagramm.png',
+               output = TRUE ,
+               imagetype="png" ,
+               height = 480 ,
+               width = 480 ,
+               resolution = 300,
+               compression = "lzw",
+               lwd = 2,
+               lty = 'blank',
+               fill = opt$colours,
+               cex = 0.8,
+               fontface = "bold",
+               fontfamily = "sans",
+               cat.cex = 0.4,
+               cat.fontface = "bold",
+               cat.default.pos = "outer",
+               cat.fontfamily = "sans"
+  )
+}
+
+main <- function() {
+  
+  option_list <- list(
+    make_option(
+      "--filenames",
+      dest = "filenames",
+      type = "character",
+      default = "file1.tsv,file2.tsv",
+      help = paste("filenames with results data")
+    ),
+    make_option(
+      "--colours",
+      dest = "colours",
+      type = "character",
+      default = "blue,red",
+      help = paste("Venn Diagram Colours")
+    ),
+    make_option(
+      "--names",
+      dest = "names",
+      type = "character",
+      default = "A,B",
+      help = paste("Names")
+    ),
+    make_option(
+      "--pvalue",
+      dest = "pvalue",
+      type = "numeric",
+      default = 0.05,
+      help = paste("p value threshold")
+    ),
+    make_option(
+      "--lfc",
+      dest = "lfc",
+      type = "numeric",
+      default = 0,
+      help = paste("LFC value threshold")
+    ),
+    make_option(
+      "--sign",
+      dest = "sign",
+      type = "character",
+      default = "",
+      help = paste("optional - possibilities are positive/negative")
+    )
+  )
+  opt <- experiment_start(option_list = option_list,
+                          description = description)
+  
+  if (!is.null(opt$filenames)) {
+    opt$filenames = unlist(strsplit(opt$filenames, ","))
+  }
+  if (!is.null(opt$colours)) {
+    opt$colours = unlist(strsplit(opt$colours, ","))
+  }
+  if (!is.null(opt$names)) {
+    opt$names = unlist(strsplit(opt$names, ","))
+  }
+  run(opt)
+  
+  experiment_stop()
+}
+
+main()
diff --git a/cgatpipelines/tools/pipeline_bamstats.py b/cgatpipelines/tools/pipeline_bamstats.py
index 5df60e88f..f2d12771a 100644
--- a/cgatpipelines/tools/pipeline_bamstats.py
+++ b/cgatpipelines/tools/pipeline_bamstats.py
@@ -291,22 +291,6 @@ def buildRrnaIntervals(infile, outfile):
 #########################################################################
 
 
-@follows(mkdir("StrandSpec.dir"))
-@transform("*.bam",
-           suffix(".bam"),
-           r"StrandSpec.dir/\1.strand")
-def strandSpecificity(infile, outfile):
-    '''This function will determine the strand specificity of your library
-    from the bam file'''
-
-    statement = (
-        "cgat bam2libtype "
-        "--max-iterations 10000 "
-        "< {infile} "
-        "> {outfile}".format(**locals()))
-    return P.run(statement)
-
-
 @follows(mkdir("BamFiles.dir"))
 @transform("*.bam",
            regex("(.*).bam$"),
@@ -356,6 +340,26 @@ def buildPicardDuplicationStats(infile, outfile):
                                          PICARD_MEMORY)
 
 
+@follows(mkdir("RSeQC.dir"))
+@follows(countReads)
+@transform(intBam,
+           regex("BamFiles.dir/(.*).bam$"),
+           r"RSeQC.dir/\1.read_distribution")
+def runRSeQC(infile, outfile):
+
+    bed_file = "/ceph/project/talbotlab/jscaber/iPS-C9TrapYan/bamstats-fullgenome/hg38_GENCODE.v38.bed"
+
+    job_memory = "6G"
+
+    statement = '''
+    read_distribution.py
+         -i %(infile)s
+         -r %(bed_file)s
+    > %(outfile)s
+    '''
+
+    P.run(statement)
+
 @follows(mkdir("BamStats.dir"))
 @follows(countReads)
 @transform(intBam,
@@ -804,14 +808,6 @@ def loadTranscriptProfile(infiles, outfile):
     bamstats.loadTranscriptProfile(infiles, outfile)
 
 
-@P.add_doc(bamstats.loadStrandSpecificity)
-@jobs_limit(PARAMS.get("jobs_limit_db", 1), "db")
-@follows(loadTranscriptProfile)
-@merge(strandSpecificity, "strand_spec.load")
-def loadStrandSpecificity(infiles, outfile):
-    ''' merge strand specificity data into a single table'''
-    bamstats.loadStrandSpecificity(infiles, outfile)
-
 
 @merge((loadBAMStats, loadPicardStats, loadContextStats), "view_mapping.load")
 def createViewMapping(infile, outfile):
@@ -856,8 +852,7 @@ def views():
          loadIdxStats,
          loadExonValidation,
          loadPicardRnaSeqMetrics,
-         loadTranscriptProfile,
-         loadStrandSpecificity)
+         loadTranscriptProfile)
 def full():
     '''a dummy task to run all tasks in the pipeline'''
     pass
diff --git a/cgatpipelines/tools/pipeline_exome.py b/cgatpipelines/tools/pipeline_exome.py
index 30e3adf3a..1c2c06bdc 100644
--- a/cgatpipelines/tools/pipeline_exome.py
+++ b/cgatpipelines/tools/pipeline_exome.py
@@ -18,19 +18,18 @@
    3. Local realignment and BQSR in GATK and deduplication in Picard
    4. Calculate the ratio of reads on the X and Y chromosomes to assert sex
    5. Variant calling in families (SNVs & indels) using GATK HaplotypeCaller
-   6. HapMap genotyping in individuals using GATK HaplotypeCaller
-   7. Comparison of Hapmap genotypes to assess relatedness (VCFtools)
-   8. Variant annotation using SNPeff, GATK VariantAnnotator, and SnpSift
-   9. Variant quality score recalibration (GATK)
-   10. Flags variants within genes of interest (such as known disease genes)
+   6. Comparison of join VCF genotypes to assess relatedness (somalier)
+   7. Variant annotation using SNPeff, GATK VariantAnnotator, and SnpSift
+   8. Variant quality score recalibration (GATK)
+   9. Flags variants within genes of interest (such as known disease genes)
       (GATK) (optional)
-   11. Filters potential de novo variants
-   12. Filters potential de novo variants using lower stringency criteria
-   13. Filters potential dominant mutations
-   14. Filters potential homozygous recessive mutations
-   15. Filters potential compound heterozygous mutations
-   16. Generates summary statistics for unfiltered vcf file
-   17. Generates report
+   10. Filters potential de novo variants
+   11. Filters potential de novo variants using lower stringency criteria
+   12. Filters potential dominant mutations
+   13. Filters potential homozygous recessive mutations
+   14. Filters potential compound heterozygous mutations
+   15. Generates summary statistics for unfiltered vcf file
+   16. Generates report
 
 .. note::
 
@@ -152,6 +151,7 @@
 * samtools >= 1.1
 * GATK >= 2.7
 * snpEff >= 4.0
+* somalier
 * Gemini >= ?
 * VCFtools >= 0.1.8a
 
@@ -163,8 +163,8 @@
 # load modules
 from ruffus import transform, mkdir, follows, merge, regex, suffix, \
     jobs_limit, files, collate, add_inputs, formatter, \
-    active_if
-from ruffus.combinatorics import permutations
+    active_if, originate, subdivide
+from ruffus.combinatorics import permutations, product
 import sys
 import os
 import csv
@@ -177,9 +177,8 @@
 import cgatcore.iotools as iotools
 from cgatcore import pipeline as P
 import cgatpipelines.tasks.mapping as mapping
-import cgatpipelines.tasks.mappingqc as mappingqc
+import cgatpipelines.tasks.bamstats as bamstats
 import cgatpipelines.tasks.exome as exome
-import cgatpipelines.tasks.exomeancestry as exomeancestry
 
 ###############################################################################
 ###############################################################################
@@ -196,9 +195,14 @@
 matches = glob.glob("*.fastq.1.gz") + glob.glob(
     "*.fastq.gz") + glob.glob("*.sra") + glob.glob("*.csfasta.gz")
 
+PICARD_MEMORY = PARAMS["picard_memory"]
+GATK_MEMORY = PARAMS["gatk_memory"]
+ANNOTATION_MEMORY = PARAMS["annotation_memory"]
+
+PANELS = glob.glob("*.panel.tsv")
+
+FILTERS = glob.glob ("*.filter.tsv")
 
-def getGATKOptions():
-    return "-l mem_free=1.4G"
 
 ###############################################################################
 ###############################################################################
@@ -207,12 +211,10 @@ def getGATKOptions():
 # The following functions are designed to upload meta-data to the csvdb
 # These haven't been fully implemented yet
 
-
 @jobs_limit(PARAMS.get("jobs_limit_db", 1), "db")
 @files(PARAMS["roi_bed"], "roi.load")
 def loadROI(infile, outfile):
     '''Import regions of interest bed file into SQLite.'''
-    scriptsdir = PARAMS["general_scriptsdir"]
     header = "chr,start,stop,feature"
     tablename = P.to_table(outfile)
     statement = '''cat %(infile)s
@@ -224,14 +226,10 @@ def loadROI(infile, outfile):
             > %(outfile)s  '''
     P.run(statement)
 
-###############################################################################
-
-
 @jobs_limit(PARAMS.get("jobs_limit_db", 1), "db")
 @files(PARAMS["roi_to_gene"], "roi2gene.load")
 def loadROI2Gene(infile, outfile):
     '''Import genes mapping to regions of interest bed file into SQLite.'''
-    scriptsdir = PARAMS["general_scriptsdir"]
     tablename = P.to_table(outfile)
     statement = '''cat %(infile)s
             | cgat csv2db %(csv2db_options)s
@@ -241,14 +239,10 @@ def loadROI2Gene(infile, outfile):
             > %(outfile)s  '''
     P.run(statement)
 
-###############################################################################
-
-
 @jobs_limit(PARAMS.get("jobs_limit_db", 1), "db")
 @files(PARAMS["samples"], "samples.load")
 def loadSamples(infile, outfile):
     '''Import sample information into SQLite.'''
-    scriptsdir = PARAMS["general_scriptsdir"]
     tablename = P.to_table(outfile)
     statement = '''cat %(infile)s
             | cgat csv2db %(csv2db_options)s
@@ -258,11 +252,60 @@ def loadSamples(infile, outfile):
             > %(outfile)s  '''
     P.run(statement)
 
+@originate("genome.dict")
+def createSequenceDictionary(outfile):
+    '''creates a sequence dictionary required for running
+    gatk/picard tools'''
+    genome_file = os.path.abspath(
+        os.path.join(PARAMS["genome_dir"], PARAMS["genome"] + ".fa"))
+
+    statement = '''picard CreateSequenceDictionary 
+                        -R %(genome_file)s -O %(outfile)s'''
+    P.run(statement)
+
+@originate("ensembl.dict")
+def createEnsemblDictionary(outfile):
+    '''creates a dictionary converting ensembl gene_id
+    to symbol/gene_name'''
+    geneset = PARAMS['geneset']
+
+    statement = '''zcat %(geneset)s|
+    cgat gtf2tsv --attributes-as-columns|grep -v ^#|
+    awk -F '\\t' '{print $9, $16}'|uniq > %(outfile)s'''
+
+    P.run(statement)
+
+@mkdir("panels")
+@subdivide([x for x in PANELS],
+           regex("(\S+).panel.tsv"),
+           add_inputs(createEnsemblDictionary),
+           r"panels/\1.bed")
+def createPanelBed(infiles, outfile):
+
+    geneset = PARAMS['geneset']
+    panel_path,dict_path = infiles
+    ensembl_dict = pd.read_csv(dict_path, sep=" ")
+    with open(panel_path) as f:
+        panellist = [line.strip() for line in f]
+    panel = ensembl_dict[ensembl_dict["gene_name"].isin(panellist)]
+    panelgenes = " -e ".join(panel['gene_id'].to_list())
+    
+
+    statement = '''zcat  %(geneset)s
+    |grep -e %(panelgenes)s|cgat gtf2gtf --method=sort|
+    cgat gff2bed --is-gtf --log %(outfile)s.log | sort -k1,1 -k2,2n|
+    cgat bed2bed --method=merge --merge-by-name 
+    --log %(outfile)s.log > %(outfile)s'''
+
+    P.run(statement)
+
 ###############################################################################
 ###############################################################################
 ###############################################################################
-# Alignment to a reference genome
+# Pre-Processing for Variant Discovery
 
+###############################################################################
+# Alignment & post-alignment QC & GATK preparation
 
 @follows(mkdir("bam"))
 @transform(INPUT_FORMATS, REGEX_FORMATS, r"bam/\1.bam")
@@ -274,109 +317,71 @@ def mapReads(infiles, outfile):
     statement = m.build((infiles,), outfile)
     P.run(statement, job_memory="8G", job_threads=PARAMS["bwa_threads"])
 
-###############################################################################
-###############################################################################
-###############################################################################
-# Post-alignment QC
-
 
 @transform(mapReads, regex(r"bam/(\S+).bam"), r"bam/\1.picard_stats")
 def PicardAlignStats(infile, outfile):
     '''Run Picard CollectMultipleMetrics on each BAM file'''
-    genome = PARAMS["bwa_index_dir"] + "/" + PARAMS["genome"] + ".fa"
-    mappingqc.buildPicardAlignmentStats(infile, outfile, genome)
-
-###############################################################################
+    genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa"
+    bamstats.buildPicardAlignmentStats(infile, outfile, genome, PICARD_MEMORY)
 
 
 @jobs_limit(PARAMS.get("jobs_limit_db", 1), "db")
 @merge(PicardAlignStats, "picard_stats.load")
 def loadPicardAlignStats(infiles, outfile):
     '''Merge Picard alignment stats into single table and load into SQLite.'''
-    mappingqc.loadPicardAlignmentStats(infiles, outfile)
-
-###############################################################################
-###############################################################################
-###############################################################################
-# GATK
+    bamstats.loadPicardAlignmentStats(infiles, outfile)
 
 
 @follows(loadPicardAlignStats, mkdir("gatk"))
-@transform(mapReads, regex(r"bam/(\S+).bam"), r"gatk/\1.readgroups.bam")
-def GATKReadGroups(infile, outfile):
+@transform(mapReads, regex(r"bam/(\S+).bam"),
+           add_inputs(createSequenceDictionary),
+           r"gatk/\1.readgroups.bam")
+def GATKReadGroups(infiles, outfile):
     '''Reorders BAM according to reference fasta and adds read groups using
     GATK'''
-    '''Reorders BAM according to reference fasta and add read groups using
-    SAMtools, realigns around indels and recalibrates base quality
-    scores using GATK
-
-    '''
 
+    infile, dictionary = infiles
     track = re.sub(r'-\w+-\w+\.bam', '', os.path.basename(infile))
-    tmpdir_gatk = P.get_temp_dir('.')
-    job_options = getGATKOptions()
     job_threads = PARAMS["gatk_threads"]
     library = PARAMS["readgroup_library"]
     platform = PARAMS["readgroup_platform"]
     platform_unit = PARAMS["readgroup_platform_unit"]
-    genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa"
-    exome.GATKReadGroups(infile, outfile, genome,
+    exome.GATKReadGroups(infile, outfile, dictionary,
                                  library, platform,
-                                 platform_unit, track)
+                                 platform_unit, track,
+                                 GATK_MEMORY)
     iotools.zap_file(infile)
 
-###############################################################################
-###############################################################################
+
 ###############################################################################
 # Remove duplicates, realign and recalibrate lane-by-lane
 
-
 @transform(GATKReadGroups,
            regex(r"gatk/(\S+).readgroups.bam"),
            r"gatk/\1.dedup.bam")
 def RemoveDuplicatesLane(infile, outfile):
     '''Merge Picard duplicate stats into single table and load into SQLite.'''
-    mappingqc.buildPicardDuplicateStats(infile, outfile)
+    bamstats.buildPicardDuplicateStats(infile, outfile, PICARD_MEMORY)
     iotools.zap_file(infile)
 
-###############################################################################
-
 
 @jobs_limit(PARAMS.get("jobs_limit_db", 1), "db")
 @merge(RemoveDuplicatesLane, "picard_duplicate_stats_lane.load")
 def loadPicardDuplicateStatsLane(infiles, outfile):
     '''Merge Picard duplicate stats into single table and load into SQLite.'''
-    mappingqc.loadPicardDuplicateStats(infiles, outfile)
-
-###############################################################################
+    bamstats.loadPicardDuplicateStats(infiles, outfile)
 
 
 @transform(RemoveDuplicatesLane,
            regex(r"gatk/(\S+).dedup.bam"),
-           r"gatk/\1.realigned2.bam")
-def GATKIndelRealignLane(infile, outfile):
-    '''realigns around indels using GATK'''
-    threads = PARAMS["gatk_threads"]
-    genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa"
-    intervals = PARAMS["roi_intervals"]
-    padding = PARAMS["roi_padding"]
-    exome.GATKIndelRealign(infile, outfile, genome, intervals, padding,
-                                   threads)
-    iotools.zap_file(infile)
-
-###############################################################################
-
-
-@transform(GATKIndelRealignLane,
-           regex(r"gatk/(\S+).realigned2.bam"),
            r"gatk/\1.bqsr.bam")
 def GATKBaseRecal(infile, outfile):
     '''recalibrates base quality scores using GATK'''
     intrack = P.snip(os.path.basename(infile), ".bam")
     outtrack = P.snip(os.path.basename(outfile), ".bam")
     dbsnp = PARAMS["gatk_dbsnp"]
-    solid_options = PARAMS["gatk_solid_options"]
-    genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa"
+    options = PARAMS["gatk_baserecalibrator_options"]
+    genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa"
     intervals = PARAMS["roi_intervals"]
     padding = PARAMS["roi_padding"]
     if PARAMS["targetted"]:
@@ -384,15 +389,14 @@ def GATKBaseRecal(infile, outfile):
         shutil.copyfile(intrack + ".bai", outtrack + ".bai")
     else:
         exome.GATKBaseRecal(infile, outfile, genome, intervals,
-                                    padding, dbsnp, solid_options)
+                                    padding, dbsnp, options,
+                                    GATK_MEMORY)
     iotools.zap_file(infile)
 
-###############################################################################
-###############################################################################
+
 ###############################################################################
 # Merge BAMs across different lanes for the same sample
 
-
 @collate(GATKBaseRecal,
          regex(r"gatk/(.*).bqsr.bam"),
          r"gatk/\1.merged.bam")
@@ -413,12 +417,10 @@ def mergeBAMs(infiles, outfile):
     for inputfile in infiles:
         iotools.zap_file(inputfile)
 
-###############################################################################
-###############################################################################
+
 ###############################################################################
 # Remove duplicates sample-by-sample
 
-
 @transform(mergeBAMs,
            regex(r"gatk/(\S+).merged.bam"),
            add_inputs(r"gatk/\1.merged.bam.count"),
@@ -428,582 +430,429 @@ def RemoveDuplicatesSample(infiles, outfile):
     infile, countfile = infiles
     countf = open(countfile, "r")
     if countf.read() != '1':
-        mappingqc.buildPicardDuplicateStats(infile, outfile)
+        bamstats.buildPicardDuplicateStats(infile, outfile, PICARD_MEMORY)
     else:
         shutil.copyfile(infile, outfile)
         shutil.copyfile(infile + ".bai", outfile + ".bai")
         iotools.zap_file(infile)
 
 
-###############################################################################
-
-
 @jobs_limit(PARAMS.get("jobs_limit_db", 1), "db")
 @merge(RemoveDuplicatesSample, "picard_duplicate_stats_sample.load")
 def loadPicardDuplicateStatsSample(infiles, outfile):
     '''Merge Picard duplicate stats into single table and load into SQLite.'''
-    mappingqc.loadPicardDuplicateStats(infiles, outfile)
-
-###############################################################################
-###############################################################################
-###############################################################################
-# Realign sample-by-sample
+    bamstats.loadPicardDuplicateStats(infiles, outfile)
 
 
-@transform(RemoveDuplicatesSample,
-           regex(r"gatk/(\S+).dedup2.bam"),
-           add_inputs(r"gatk/\1.merged.bam.count"),
-           r"gatk/\1.realigned.bam")
-def GATKIndelRealignSample(infiles, outfile):
-    '''realigns around indels using GATK'''
-    infile, countfile = infiles
-    threads = PARAMS["gatk_threads"]
-    genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa"
-    intervals = PARAMS["roi_intervals"]
-    padding = PARAMS["roi_padding"]
-    countf = open(countfile, "r")
-    if countf.read() > '1':
-        exome.GATKIndelRealign(infiles[0], outfile, genome, intervals,
-                                       padding, threads)
-    else:
-        shutil.copyfile(infile, outfile)
-        shutil.copyfile(infile + ".bai", outfile + ".bai")
-#    iotools.zap_file(infile)
-
-###############################################################################
-###############################################################################
 ###############################################################################
 # Coverage of targetted area
 
 
-@transform(GATKIndelRealignSample,
-           regex(r"gatk/(\S+).realigned.bam"),
+@transform(RemoveDuplicatesSample,
+           regex(r"gatk/(\S+).dedup2.bam"),
            r"gatk/\1.cov")
 def buildCoverageStats(infile, outfile):
     '''Generate coverage statistics for regions of interest from a bed
     file using Picard'''
     baits = PARAMS["roi_baits"]
     regions = PARAMS["roi_regions"]
-    mappingqc.buildPicardCoverageStats(infile, outfile,
-                                               baits, regions)
-
-###############################################################################
+    bamstats.buildPicardCoverageStats(infile, outfile,
+                                               baits, regions, PICARD_MEMORY)
 
 
 @jobs_limit(PARAMS.get("jobs_limit_db", 1), "db")
 @merge(buildCoverageStats, "coverage_stats.load")
 def loadCoverageStats(infiles, outfile):
     '''Import coverage statistics into SQLite'''
-    mappingqc.loadPicardCoverageStats(infiles, outfile)
-
-###############################################################################
-###############################################################################
-###############################################################################
-# Guess sex
-
+    bamstats.loadPicardCoverageStats(infiles, outfile)
 
-@follows(mkdir("xy_ratio"))
-@transform(GATKIndelRealignSample,
-           regex(r"gatk/(\S+).realigned.bam"),
-           r"xy_ratio/\1.sex")
-def calcXYratio(infile, outfile):
-    '''Guess the sex of a sample based on ratio of reads
-    per megabase of sequence on X and Y'''
-    exome.guessSex(infile, outfile)
-
-###############################################################################
 
 
-@merge(calcXYratio, "xy_ratio/xy_ratio.tsv")
-def mergeXYRatio(infiles, outfile):
-    '''merge XY ratios from all samples and load into database'''
-    inlist = " ".join(infiles)
-    statement = '''cgat combine_tables
-                   --add-file-prefix --regex-filename="xy_ratio/(\S+).sex"
-                   --no-titles --missing-value=0 --ignore-empty
-                   -L %(outfile)s.log -v 6
-                   --cat=Track %(inlist)s
-                   > %(outfile)s'''
-    P.run(statement)
-
-###############################################################################
-
-
-@jobs_limit(PARAMS.get("jobs_limit_db", 1), "db")
-@transform(mergeXYRatio, regex(r"xy_ratio/xy_ratio.tsv"),
-           r"xy_ratio/xy_ratio.load")
-def loadXYRatio(infile, outfile):
-    '''load into database'''
-    P.load(infile, outfile, "--header-names=Track,X,Y,XY_ratio")
-
 ###############################################################################
 ###############################################################################
 ###############################################################################
-# Variant Calling
+# GATK germline cohort calling
 
 
 @follows(mkdir("variants"))
-@transform(GATKIndelRealignSample, regex(r"gatk/(\S+).realigned.bam"),
+@transform(RemoveDuplicatesSample, regex(r"gatk/(\S+).dedup2.bam"),
            r"variants/\1.haplotypeCaller.g.vcf")
 def haplotypeCaller(infile, outfile):
     '''Call SNVs and indels using GATK HaplotypeCaller in individuals'''
-    genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa"
-    job_options = getGATKOptions()
+    genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa"
     job_threads = PARAMS["gatk_threads"]
     dbsnp = PARAMS["gatk_dbsnp"]
     intervals = PARAMS["roi_intervals"]
     padding = PARAMS["roi_padding"]
     options = PARAMS["gatk_hc_options"]
     exome.haplotypeCaller(infile, outfile, genome, dbsnp,
-                                  intervals, padding, options)
+                                  intervals, padding, options, GATK_MEMORY)
 
-###############################################################################
-
-
-@merge(haplotypeCaller, "variants/all_samples.vcf")
-def genotypeGVCFs(infiles, outfile):
-    '''Joint genotyping of all samples together'''
-    genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa"
-    inputfiles = " --variant ".join(infiles)
-    options = PARAMS["gatk_genotype_options"]
-    exome.genotypeGVCFs(inputfiles, outfile, genome, options)
-
-###############################################################################
-###############################################################################
-###############################################################################
-# HapMap Genotypes
-
-
-@follows(mkdir("hapmap"))
-@files(PARAMS["hapmap_vcf"], "hapmap/hapmap_exome.bed")
-def SelectExonicHapmapVariants(infile, outfile):
-    '''Select variants from HapMap project that are located within
-    exome target regions. Assumes bgzipped & tabix indexed Hapmap VCF file.'''
-    bed = PARAMS["roi_regions"]
-    statement = '''tabix -B %(infile)s %(bed)s |
-                   awk '{OFS="\\t" if (!/^#/){print $1,$2-1,$2}}'
-                   > %(outfile)s''' % locals()
-    P.run(statement)
-
-###############################################################################
-
-
-@follows(SelectExonicHapmapVariants)
-@transform(GATKIndelRealignSample,
-           regex(r"gatk/(\S+).realigned.bam"),
-           add_inputs("hapmap/hapmap_exome.bed"),
-           r"hapmap/\1.hapmap.vcf")
-def HapMapGenotype(infiles, outfile):
-    '''Genotype HapMap SNPs using HaplotypeCaller in each individual'''
-    infile, intervals = infiles
-    genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa"
-    dbsnp = PARAMS["gatk_dbsnp"]
-    padding = PARAMS["hapmap_padding"]
-    options = PARAMS["hapmap_hc_options"]
-    exome.haplotypeCaller(infile, outfile, genome, dbsnp,
-                                  intervals, padding, options)
-
-###############################################################################
 
+@merge(haplotypeCaller, "variants/genomicsdb.log")
+def consolidateGVCFs(infiles, outfile):
+    '''generates a GenomicsDB workspace from all GVCF files, this is an
+       easy-access database of all samples developed by the Intel-Broad team.'''
+    inputlen = len(infiles)
+    inputfiles = " -V ".join(infiles)
+    
+    
+    DB_MEMORY  = PARAMS["gatk_dbmem"]
 
-@transform(HapMapGenotype, regex(r"hapmap/(\S+).hapmap.vcf"),
-           r"hapmap/\1.hapmap.vcf.gz")
-def indexVCFs(infile, outfile):
-    '''Genotype HapMap SNPs using HaplotypeCaller in each individual'''
-    statement = '''bgzip -c %(infile)s > %(outfile)s &&
-                   tabix -p vcf %(outfile)s; '''
-    P.run(statement)
+    exome.consolidateGVCFs(inputfiles, outfile, inputlen, DB_MEMORY)
 
-###############################################################################
-###############################################################################
-###############################################################################
-# Compare Hapmap genotypes to assess relatedness
-
-
-@follows(indexVCFs, mkdir("hapmap/vcfcompare"))
-@permutations("hapmap/*.hapmap.vcf.gz",
-              formatter("hapmap/(\S+).hapmap.vcf.gz$"),
-              2,
-              r"hapmap/vcfcompare/{basename[0][0]}_vs_{basename[1][0]}.vcfcompare")
-def vcfCompare(infiles, outfile):
-    '''Compare HapMap genotypes from each pair of calls '''
-    sample1, sample2 = infiles
-    name1 = P.snip(os.path.basename(sample1), ".hapmap.vcf.gz")
-    name2 = P.snip(os.path.basename(sample2), ".hapmap.vcf.gz")
-    statement = '''vcf-compare -g -m %(name1)s:%(name2)s
-                   %(sample1)s %(sample2)s > %(outfile)s'''
-    P.run(statement)
 
-###############################################################################
-
-
-@transform(vcfCompare,
-           regex(r"hapmap/vcfcompare/(\S+).vcfcompare"),
-           r"hapmap/vcfcompare/\1.ndr")
-def parseVcfCompare(infile, outfile):
-    '''Extract non-reference discordance rate for each comparison'''
-    statement = '''cat %(infile)s
-                   | grep "Non-reference Discordance Rate (NDR):"
-                   | cut -f 3
-                   > %(outfile)s'''
-    P.run(statement)
-
-###############################################################################
-
-
-@merge(parseVcfCompare, "hapmap/vcfcompare/ndr.tsv")
-def mergeNDR(infiles, outfile):
-    '''Merge NDR values for all files'''
-    outf = open(outfile, "w")
-    outf.write("sample1\tsample2\tNDR\n")
-    for infile in infiles:
-        inbase = P.snip(os.path.basename(infile), ".ndr")
-        sample1, sample2 = inbase.split("_vs_")
-        inf = open(infile, "r")
-        ndr = inf.readline().strip()
-        result = "\t".join([sample1, sample2, ndr])
-        outf.write(result + "\n")
-        inf.close()
-    outf.close()
-
-###############################################################################
-
-
-@jobs_limit(PARAMS.get("jobs_limit_db", 1), "db")
-@transform(mergeNDR, regex(r"hapmap/vcfcompare/ndr.tsv"),
-           r"hapmap/vcfcompare/ndr.load")
-def loadNDR(infile, outfile):
-    '''Load NDR into database'''
-    P.load(infile, outfile)
-
-###############################################################################
-###############################################################################
-###############################################################################
-# Variant Annotation
-
-
-@transform(genotypeGVCFs,
-           regex(r"variants/all_samples.vcf"),
-           r"variants/all_samples.snpeff.vcf")
-def annotateVariantsSNPeff(infile, outfile):
-    '''Annotate variants using SNPeff'''
-    job_memory = PARAMS["annotation_memory"]
-    job_threads = PARAMS["annotation_threads"]
-    snpeff_genome = PARAMS["annotation_snpeff_genome"]
-    config = PARAMS["annotation_snpeff_config"]
-    statement = (
-        "snpEff -Xmx%(job_memory)s "
-        "eff "
-        "-c %(config)s "
-        "-v %(snpeff_genome)s "
-        "-o gatk %(infile)s > %(outfile)s" % locals())
-    P.run(statement, job_memory=PARAMS["annotation_memory"])
-
-###############################################################################
-
-
-@transform(annotateVariantsSNPeff,
-           regex(r"variants/all_samples.snpeff.vcf"),
-           r"variants/all_samples.snpeff.table")
-def vcfToTableSnpEff(infile, outfile):
-    '''Converts vcf to tab-delimited file'''
-    genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa"
-    columns = PARAMS["annotation_snpeff_to_table"]
-    exome.vcfToTable(infile, outfile, genome, columns)
-
-
-@jobs_limit(PARAMS.get("jobs_limit_db", 1), "db")
-@transform(vcfToTableSnpEff, regex(r"variants/(\S+).table"),
-           r"variants/\1.table.load")
-def loadTableSnpEff(infile, outfile):
-    '''Load VCF annotations into database'''
-    P.load(infile, outfile, options="--retry --ignore-empty")
-
-
-###############################################################################
-# GATK Variant Annotator
-
-
-@merge(GATKIndelRealignSample, "gatk/all_samples.list")
-def listOfBAMs(infiles, outfile):
-    '''generates a file containing a list of BAMs for use in VQSR'''
-    with iotools.open_file(outfile, "w") as outf:
-        for infile in infiles:
-            outf.write(infile + '\n')
-
-###############################################################################
+@transform(consolidateGVCFs, regex(r"variants/genomicsdb.log"),
+          "variants/all_samples.vcf")
+def genotypeGVCFs(infile, outfile):
+    '''Joint genotyping of all samples together'''
+    genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa"
+    options = PARAMS["gatk_genotype_options"]
+    
+    exome.genotypeGVCFs(infile, outfile, genome, options)
 
 
-@follows(annotateVariantsSNPeff)
 @transform(genotypeGVCFs,
            regex(r"variants/all_samples.vcf"),
-           add_inputs(listOfBAMs, r"variants/all_samples.snpeff.vcf"),
-           r"variants/all_samples.annotated.vcf")
-def variantAnnotator(infiles, outfile):
-    '''Annotate variant file using GATK VariantAnnotator'''
-    vcffile, bamlist, snpeff_file = infiles
-    genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa"
-    dbsnp = PARAMS["gatk_dbsnp"]
-    annotations = PARAMS["gatk_variant_annotations"]
-    exome.variantAnnotator(vcffile, bamlist, outfile, genome,
-                                   dbsnp, annotations, snpeff_file)
+           r"variants/all_samples_excesshet.vcf.gz")
+def filterExcessHets(infile, outfile):
+    '''
+    Hard filter on Excess Heterozygotes
+    Should not make a difference on small cohorts unless related
+    54.69 corresponds to a Z score of -4.5
+    '''
+    job_memory = GATK_MEMORY
+    statement = '''
+    gatk --java-options "-Xmx%(job_memory)s -Xms%(job_memory)s" 
+    VariantFiltration
+    -V %(infile)s
+    --filter-expression "ExcessHet > 54.69"
+    --filter-name ExcessHet
+    -O %(outfile)s
+    > %(outfile)s.log 2>&1
+    '''
+    P.run(statement)
 
 
 ###############################################################################
 # SNP Recalibration
 
+@transform(filterExcessHets,
+           regex(r"variants/all_samples_excesshet.vcf.gz"),
+           r"variants/all_samples_sitesonly.vcf.gz")
+def sitesOnlyVcf(infile,outfile):
+    '''
+    Create sites-only VCF with MakeSitesOnlyVcf
+    '''
+    job_memory = GATK_MEMORY
+    statement = '''gatk MakeSitesOnlyVcf
+    -I %(infile)s
+    -O %(outfile)s
+    > %(outfile)s.log 2>&1 '''
+    P.run(statement)
+
 
-@transform(variantAnnotator,
-           regex(r"variants/all_samples.annotated.vcf"),
+@transform(sitesOnlyVcf,
+           regex(r"variants/all_samples_sitesonly.vcf.gz"),
            r"variants/all_samples.snp_vqsr.recal")
 def variantRecalibratorSnps(infile, outfile):
     '''Create variant recalibration file'''
-    genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa"
+    genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa"
     dbsnp = PARAMS["gatk_dbsnp"]
-    job_options = getGATKOptions()
     job_threads = PARAMS["gatk_threads"]
-    track = P.snip(outfile, ".recal")
     kgenomes = PARAMS["gatk_kgenomes"]
     hapmap = PARAMS["gatk_hapmap"]
     omni = PARAMS["gatk_omni"]
     mode = 'SNP'
     exome.variantRecalibrator(infile, outfile, genome, mode, dbsnp,
-                                      kgenomes, hapmap, omni)
-
-###############################################################################
+                                      kgenomes, hapmap, omni, GATK_MEMORY)
 
 
 @follows(variantRecalibratorSnps)
-@transform(variantAnnotator,
-           regex(r"variants/all_samples.annotated.vcf"),
+@transform(filterExcessHets,
+           regex(r"variants/all_samples_excesshet.vcf.gz"),
            add_inputs(r"variants/all_samples.snp_vqsr.recal",
                       r"variants/all_samples.snp_vqsr.tranches"),
-           r"variants/all_samples.snp_vqsr.vcf")
-def applyVariantRecalibrationSnps(infiles, outfile):
+           r"variants/all_samples.snp_vqsr.vcf.gz")
+def applyVQSRSnps(infiles, outfile):
     '''Perform variant quality score recalibration using GATK '''
     vcf, recal, tranches = infiles
-    genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa"
+    genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa"
     mode = 'SNP'
-    exome.applyVariantRecalibration(vcf, recal, tranches,
-                                            outfile, genome, mode)
+    exome.applyVQSR(vcf, recal, tranches,
+                                            outfile, genome, mode, GATK_MEMORY)
 
 ###############################################################################
 # Indel recalibration
 
 
-@transform(applyVariantRecalibrationSnps,
-           regex(r"variants/all_samples.snp_vqsr.vcf"),
-           r"variants/all_samples.vqsr.recal")
+@transform(sitesOnlyVcf,
+           regex(r"variants/all_samples_sitesonly.vcf.gz"),
+           r"variants/all_samples.indel_vqsr.recal")
 def variantRecalibratorIndels(infile, outfile):
     '''Create variant recalibration file'''
-    genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa"
-    job_options = getGATKOptions()
+    genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa"
     job_threads = PARAMS["gatk_threads"]
-    track = P.snip(outfile, ".recal")
     mills = PARAMS["gatk_mills"]
     dbsnp = PARAMS["gatk_dbsnp"]
+    axiom = PARAMS["gatk_axiom"]
     mode = 'INDEL'
     exome.variantRecalibrator(infile, outfile, genome, mode,
-                                      mills=mills)
-
-###############################################################################
+                                      mills=mills, axiom=axiom, dbsnp=dbsnp, gatkmem=GATK_MEMORY)
 
 
 @follows(variantRecalibratorIndels)
-@transform(applyVariantRecalibrationSnps,
-           regex(r"variants/all_samples.snp_vqsr.vcf"),
-           add_inputs(r"variants/all_samples.vqsr.recal",
-                      r"variants/all_samples.vqsr.tranches"),
-           r"variants/all_samples.vqsr.vcf")
-def applyVariantRecalibrationIndels(infiles, outfile):
+@transform(applyVQSRSnps,
+           regex(r"variants/all_samples.snp_vqsr.vcf.gz"),
+           add_inputs(r"variants/all_samples.indel_vqsr.recal",
+                      r"variants/all_samples.indel_vqsr.tranches"),
+           r"variants/all_samples.vqsr.vcf.gz")
+def applyVQSRIndels(infiles, outfile):
     '''Perform variant quality score recalibration using GATK '''
     vcf, recal, tranches = infiles
-    genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa"
+    genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa"
     mode = 'INDEL'
-    exome.applyVariantRecalibration(vcf, recal, tranches,
-                                            outfile, genome, mode)
-
+    exome.applyVQSR(vcf, recal, tranches,
+                                            outfile, genome, mode, GATK_MEMORY)
 
 ###############################################################################
-# SnpSift
-
-@transform(applyVariantRecalibrationIndels,
-           regex(r"variants/all_samples.vqsr.vcf"),
-           r"variants/all_samples_dbnsfp.snpsift.vcf")
-def annotateVariantsDBNSFP(infile, outfile):
-    '''Add annotations using SNPsift'''
-    job_options = "-l mem_free=6G"
-    job_threads = PARAMS["annotation_threads"]
-    dbNSFP = PARAMS["annotation_dbnsfp"]
-    dbN_annotators = PARAMS["annotation_dbnsfpannotators"]
-    if len(dbN_annotators) == 0:
-        annostring = ""
-    else:
-        annostring = "-f %s" % dbN_annotators
+# coverage over candidate genes
 
-    statement = """SnpSift.sh dbnsfp -db %(dbNSFP)s -v %(infile)s
-                   %(annostring)s >
-                   %(outfile)s;"""
-    P.run(statement)
+@merge(RemoveDuplicatesSample, "gatk/all_samples.list")
+def listOfBAMs(infiles, outfile):
+    '''generates a file containing a list of BAMs'''
+    with iotools.open_file(outfile, "w") as outf:
+        for infile in infiles:
+            outf.write(infile + '\n')
 
+@active_if(PARAMS["coverage_calculate"] == 1)
+@transform(listOfBAMs, regex(r"gatk/all_samples.list"), r"candidate.sample_interval_summary")
+def candidateCoverage(infile, outfile):
+    '''Calculate coverage over exons of candidate genes'''
+    all_exons = PARAMS["coverage_all_exons"]
+    candidates = PARAMS["coverage_candidates"]
+    candidates = candidates.replace(",", " -e ")
+    genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa"
+    threshold = PARAMS["coverage_threshold"]
+    statement = '''zcat %(all_exons)s | grep -e %(candidates)s
+                   | awk '{print $1 ":" $4 "-" $5}' - | sed 's/chr//' - >
+                   candidate.interval_list ; ''' % locals()
+    statement += '''zcat %(all_exons)s | grep -e %(candidates)s
+                    | awk '{print $16}' - | sed 's/"//g;s/;//g' - >
+                    candidate_gene_names.txt ;''' % locals()
+    statement += '''GenomeAnalysisTK -T DepthOfCoverage -R %(genome)s
+                    -o candidate -I %(infile)s -ct %(threshold)s -L candidate.interval_list ;''' % locals()
+    P.run(statement)
 
-@transform(annotateVariantsDBNSFP,
-           regex(r"variants/all_samples_dbnsfp.snpsift.vcf"),
-           r"variants/all_samples_clinvar.snpsift.vcf")
-def annotateVariantsClinvar(infile, outfile):
-    '''Add annotations using SNPsift'''
-    job_options = "-l mem_free=6G"
-    job_threads = PARAMS["annotation_threads"]
-    clinvar = PARAMS["annotation_clinvar"]
-    intout = outfile.replace("samples", "samples_clinvar")
-    statement = """SnpSift.sh annotate %(clinvar)s
-                %(infile)s > %(outfile)s;"""
+@active_if(PARAMS["coverage_calculate"] == 1)
+@transform(candidateCoverage, regex(r"candidate.sample_interval_summary"), r"candidate_coverage_plot.pdf")
+def candidateCoveragePlots(infile, outfile):
+    '''Produce plots of coverage'''
+    rscript = PARAMS["coverage_rscript"]
+    threshold = PARAMS["coverage_threshold"]
+    statement = '''Rscript %(rscript)s %(infile)s candidate_gene_names.txt %(threshold)s %(outfile)s ;'''
     P.run(statement)
 
 
-@transform(annotateVariantsClinvar,
-           regex(r"variants/all_samples_clinvar.snpsift.vcf"),
-           r"variants/all_samples_exac.snpsift.vcf")
-def annotateVariantsExAC(infile, outfile):
-    '''Add annotations using SNPsift'''
-    job_options = "-l mem_free=6G"
-    job_threads = PARAMS["annotation_threads"]
-    exac = PARAMS["annotation_exac"]
-    statement = """SnpSift.sh annotate
-                %(exac)s
-                %(infile)s > %(outfile)s;"""
-    P.run(statement)
 
+###############################################################################
+###############################################################################
+###############################################################################
+# Variant Annotation
 
-@transform(annotateVariantsExAC,
-           regex(r"variants/all_samples_exac.snpsift.vcf"),
-           r"variants/all_samples_gwasc.snpsift.vcf")
-def annotateVariantsGWASC(infile, outfile):
-    '''Add annotations using SNPsift'''
-    job_options = "-l mem_free=6G"
-    job_threads = PARAMS["annotation_threads"]
 
-    gwas_catalog = PARAMS["annotation_gwas_catalog"]
-    statement = """SnpSift.sh gwasCat -db %(gwas_catalog)s
-                   %(infile)s > %(outfile)s;"""
-    P.run(statement)
+###############################################################################
+# Somalier - assessing relatedness of samples
 
+@transform(applyVQSRIndels,
+           regex(r"variants/all_samples.vqsr.vcf.gz"),
+           r"variants/somalier/somalier.log")
+def extractSomalier(infile,outfile):
+    '''extract list of polymorphic sites using Somalier'''
 
-@transform(annotateVariantsGWASC,
-           regex(r"variants/all_samples_gwasc.snpsift.vcf"),
-           r"variants/all_samples_phastcons.snpsift.vcf")
-def annotateVariantsPhastcons(infile, outfile):
-    '''Add annotations using SNPsift'''
-    job_options = "-l mem_free=6G"
-    job_threads = PARAMS["annotation_threads"]
-    genomeind = "%s/%s.fa.fai" % (
-        PARAMS['general_genome_dir'],
-        PARAMS['general_genome'])
-    phastcons = PARAMS["annotation_phastcons"]
-    intout = outfile.replace("samples", "samples_phastc")
-    statement = """ln -sf %(genomeind)s %(phastcons)s/genome.fai &&
-                   SnpSift.sh phastCons %(phastcons)s %(infile)s >
-                   %(outfile)s;"""
+    genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa"
+    somalier = PARAMS['annotation_somalier_path']
+    sites = PARAMS['annotation_somalier_sites']
+    outdir = os.path.dirname(outfile)
+    statement = '''%(somalier)s extract 
+    -d %(outdir)s
+    --sites %(sites)s
+    -f %(genome)s 
+    %(infile)s
+    > %(outfile)s 2>&1
+    '''
     P.run(statement)
 
 
-@transform(annotateVariantsPhastcons,
-           regex(r"variants/all_samples_phastcons.snpsift.vcf"),
-           r"variants/all_samples_1000G.snpsift.vcf")
-def annotateVariants1000G(infile, outfile):
-    '''Add annotations using SNPsift'''
-    job_options = "-l mem_free=6G"
-    job_threads = PARAMS["annotation_threads"]
+@transform(extractSomalier,
+           regex(r"variants/somalier/somalier.log"),
+           r"variants/somalier/relate.log")
+def relateSomalier(infile,outfile):
+    '''Calculate relatedness based on 1000G project'''
 
-    vcfs = []
-    for f in os.listdir(PARAMS["annotation_tgdir"]):
-        if f.endswith(".vcf.gz"):
-            if PARAMS['test'] == 1:
-                if "chr14" in f:
-                    vcfs.append("%s/%s" % (PARAMS['annotation_tgdir'], f))
-            else:
-                vcfs.append("%s/%s" % (PARAMS['annotation_tgdir'], f))
-
-    T = P.get_temp_filename(".")
-    shutil.copy(infile, T)
-    tempin = T
-    tempout = P.get_temp_filename(".")
-
-    for vcf in vcfs:
-        statement = """SnpSift.sh annotate
-                       %(vcf)s
-                       %(tempin)s > %(tempout)s &&
-                       mv %(tempout)s %(tempin)s"""
-        P.run(statement)
+    somalier = PARAMS['annotation_somalier_path']
+    outdir = os.path.dirname(outfile)
+    statement ='''%(somalier)s relate
+    %(outdir)s/*.somalier
+    > %(outfile)s 2>&1'''
+    P.run(statement)
 
-    shutil.move(tempin, outfile)
 
+@transform(extractSomalier,
+           regex(r"variants/somalier/somalier.log"),
+           r"variants/somalier/ancestry.log")
+def ancestrySomalier(infile,outfile):
+    '''Calculate Ancestry based on 1000G project'''
+
+    somalier = PARAMS['annotation_somalier_path']
+    labels = PARAMS['annotation_somalier_labels']
+    reference = PARAMS['annotation_somalier_1kg']
+    outdir = os.path.dirname(outfile)
+    statement ='''%(somalier)s ancestry 
+    --labels %(labels)s %(reference)s/*.somalier ++ 
+    %(outdir)s/*.somalier
+    > %(outfile)s 2>&1'''
+    P.run(statement)
 
-@transform(annotateVariants1000G,
-           regex(r"variants/all_samples_1000G.snpsift.vcf"),
-           r"variants/all_samples_dbsnp.snpsift.vcf")
-def annotateVariantsDBSNP(infile, outfile):
-    '''Add annotations using SNPsift'''
-    job_options = "-l mem_free=6G"
-    job_threads = PARAMS["annotation_threads"]
 
-    dbsnp = PARAMS["annotation_dbsnp"]
-    statement = """SnpSift.sh annotate
-                %(dbsnp)s
-                %(infile)s > %(outfile)s;"""
+###############################################################################
+# Expansionhunter
 
-    P.run(statement)
 
+@follows(mkdir("expansionhunter"))
+@transform(RemoveDuplicatesSample, regex(r"gatk/(\S+).dedup2.bam"),
+           r"expansionhunter/\1.vcf")
+def expansionHunter(infile, outfile):
+    
+    genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa"
+    catalog = PARAMS["expansionhunter"]
+    outfile = P.snip(outfile)
 
-@follows(annotateVariantsDBSNP)
-def annotateVariantsSNPsift():
-    pass
+    statement = '''ExpansionHunter
+    --reads %(infile)s
+    --reference %(genome)s
+    --variant-catalog %(catalog)s
+    --output-prefix %(outfile)s
+    > %(outfile)s.log 2>&1
+    '''
+    P.run(statement, job_condaenv="expansionhunter")
 
+###############################################################################
+# VEP
 
-@transform(annotateVariantsDBSNP,
-           regex(r"variants/all_samples_dbsnp.snpsift.vcf"),
-           r"variants/all_samples.vep.vcf")
+@transform(applyVQSRIndels,
+           regex(r"variants/all_samples.vqsr.vcf.gz"),
+           r"variants/all_samples.vep.vcf.gz")
 def annotateVariantsVEP(infile, outfile):
-    '''
-    Adds annotations as specified in the pipeline.yml using Ensembl
-    variant effect predictor (VEP).
-    '''
+   
+    #Adds annotations as specified in the pipeline.yml using Ensembl
+    #variant effect predictor (VEP).
+    
     # infile - VCF
     # outfile - VCF with vep annotations
-    job_options = "-l mem_free=6G"
-    job_threads = 4
+    job_memory = PARAMS["annotation_memory"]
+    job_threads = PARAMS["annotation_threads"]
 
-    VEP = PARAMS["annotation_vepannotators"].split(",")
-    vep_annotators = PARAMS["annotation_vepannotators"]
-    vep_path = PARAMS["annotation_veppath"]
+    genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa"
+    vep_options = PARAMS["annotation_vepoptions"]
     vep_cache = PARAMS["annotation_vepcache"]
     vep_species = PARAMS["annotation_vepspecies"]
     vep_assembly = PARAMS["annotation_vepassembly"]
-    if len(vep_annotators) != 0:
-        annostring = vep_annotators
-        statement = '''perl %(vep_path)s/variant_effect_predictor.pl
-                       --cache --dir %(vep_cache)s --vcf
-                       --species %(vep_species)s
-                       --fork 2
-                       --assembly %(vep_assembly)s --input_file %(infile)s
-                       --output_file %(outfile)s --force_overwrite
-                       %(annostring)s --offline;'''
-        P.run(statement)
+    spliceAI_snv = PARAMS["annotation_spliceAI_snv"]
+    spliceAI_indel = PARAMS["annotation_spliceAI_indel"]
+    dbNSFP_path = PARAMS["annotation_dbnsfp_path"]
+    dbNSFP_columns = PARAMS["annotation_dbnsfp_columns"]
+    if dbNSFP_columns is None:
+        dbNSFP_columns = "ALL"
+
+    statement = '''vep
+    -i %(infile)s
+    --cache --dir %(vep_cache)s --vcf
+    --species %(vep_species)s
+    --fork %(job_threads)s
+    --compress_output bgzip
+    --assembly %(vep_assembly)s
+    --plugin SameCodon
+    -o %(outfile)s --force_overwrite'''
+    if dbNSFP_path is not None:
+        statement +='''
+        --plugin dbNSFP,%(dbNSFP_path)s,%(dbNSFP_columns)s'''
+    if spliceAI_snv is not None:
+        statement +='''
+        --plugin SpliceAI,snv=%(spliceAI_snv)s,indel=%(spliceAI_indel)s'''
+    if vep_options is not None:
+        statement += "  %(vep_options)s" 
+    statement += ''' > %(outfile)s.log 2>&1;
+    tabix -p vcf %(outfile)s'''
+    P.run(statement)
+
+
+
+@subdivide([x for x in FILTERS],
+           regex("(\S+).filter.tsv"),
+           add_inputs(annotateVariantsVEP),
+           r"variants/all_samples.\1.vcf.gz")
+def filterVEP(infiles, outfile):
+    '''
+    Filter variants using vep-filter.
+    More docs
+    '''
+    
+    filter_file,infile = infiles
+    with open(filter_file) as f:
+        filters = f.read()
+    outfile = P.snip(outfile, ".gz")
+
+    statement = '''filter_vep
+    --gz
+    -i %(infile)s
+    -o %(outfile)s
+    --only_matched
+    --force_overwrite
+    -f "%(filters)s";
+    > %(outfile)s.log  2>&1;
+    bgzip %(outfile)s;
+    tabix -p vcf %(outfile)s.gz
+    '''
+    P.run(statement)
+
+
+@product(filterVEP, formatter("variants/all_samples.(?P<FILTER>.*).vcf.gz$"),
+         createPanelBed, formatter("panels/(?P<PANEL>.*).bed$"),
+         "results/{FILTER[0][0]}/{PANEL[1][0]}.vcf.gz")
+def filterPanel(infiles, outfile):
+    '''
+    Filter variants using vep-filter.
+    More docs
+    '''
+    vcf, panelbed = infiles
+    outfile = P.snip(outfile, ".gz")
+
+    statement = '''gatk
+    SelectVariants
+    -V %(vcf)s
+    -O %(outfile)s
+    -L %(panelbed)s
+    > %(outfile)s.log  2>&1;
+    bgzip %(outfile)s;
+    tabix -p vcf %(outfile)s.gz
+    '''
+    P.run(statement)
 
 
-@follows(mkdir("variant_tables"))
-@transform(GATKIndelRealignSample, regex(r"gatk/(.*).realigned.bam"),
-           add_inputs(annotateVariantsVEP), r"variant_tables/\1.tsv")
+@product(RemoveDuplicatesSample, formatter("gatk/(?P<TRACK>.*).dedup2.bam$"),
+         filterPanel, formatter("results/(?P<FILTER>.*)/(?P<PANEL>.*).vcf.gz$"),
+         "results/{FILTER[1][0]}/{PANEL[1][0]}/{TRACK[0][0]}.tsv")
 def makeAnnotationsTables(infiles, outfile):
     '''
-    Converts the multi sample vcf generated with Haplotype caller into
+    Converts the multi sample vcf into
     a single table for each sample contain only positions called as variants
     in that sample and all columns from the vcf.
     '''
     bamname = infiles[0]
     inputvcf = infiles[1]
     TF = P.get_temp_filename(".")
-    samplename = bamname.replace(".realigned.bam",
+    samplename = bamname.replace(".dedup2.bam",
                                  ".bam").replace("gatk/", "")
 
     statement = '''bcftools view -h %(inputvcf)s |
@@ -1021,7 +870,8 @@ def makeAnnotationsTables(infiles, outfile):
             cols2.append(line.split("\t")[0])
     l1 = "\t".join(cols2)
     l2 = "\t".join(colds)
-    out = open(outfile, "w")
+    os.unlink(TF)
+    out = open(TF, "w")
     out.write('''CHROM\tPOS\tQUAL\tID\tFILTER\tREF1\tALT\tGT\t%s\n\
                  chromosome\tposition\tquality\tid\tfilter\tref\talt\t\
                  genotype\t%s\n''' % (l1, l2))
@@ -1029,239 +879,81 @@ def makeAnnotationsTables(infiles, outfile):
     cstring = "\\t".join(cols)
     cstring = "%CHROM\\t%POS\\t%QUAL\\t%ID\\t%FILTER\\t%REF\\t\
                %ALT\\t[%GT]\\t" + cstring
-    if PARAMS['test'] == 1:
-        statement = '''bcftools query
-                   -f '%(cstring)s\\n'
-                   -i 'FILTER=="PASS" && GT!="0/0" && GT!="./."'
-                   %(inputvcf)s >> %(outfile)s'''
-    else:
-        statement = '''bcftools query -s %(samplename)s
-                   -f '%(cstring)s\\n'
-                   -i 'FILTER=="PASS" && GT!="0/0" && GT!="./."'
-                   %(inputvcf)s >> %(outfile)s'''
-    P.run(statement)
-
-
-###############################################################################
-# Ancestry Functions #
-HAPMAP = "%s/*txt.gz" % PARAMS['hapmap_loc']
-
-
-@merge(HAPMAP, ["snpdict.json",
-                "randomsnps.tsv"])
-def makeRandomSNPSet(infiles, outfiles):
-    '''
-    Generates a random set of SNPs to use to characterise the ancestry
-    and relatedness of the samples.
-
-    1. Finds all SNPs which have a known genotype frequency for all 11
-       hapmap ancestries
-    2. Picks a random 50000 of these SNPs
-    3. Stores a list of these SNPs as randomsnps.tsv
-    4. Builds a dictionary of dictionaries of dictionaries where:
-           At Level 1 each key is a HapMap ancestry ID
-               each of these keys leads to another dictionary - level 2.
-           At Level 2 each key is a dbSNP SNP ID
-               each of these keys leads to another dictionary - level 3.
-           At Level 3 each key is a genotype:
-               the values of these keys are the genotype frequency of this
-               genotype at this SNP in this ancestry.
-       e.g. snpdict['ASW']['rs000000001']['CT'] -->
-            frequency of the CT genotype at the rs0000001 SNP in the
-            ASW population
-    5.  Stores this dictionary in json format as randomsnps.json
-    '''
-    rs = PARAMS['general_randomseed']
-    exomeancestry.MakeSNPFreqDict(infiles, outfiles, rs, submit=True)
-
-
-@follows(mkdir("sample_genotypes.dir"))
-@transform(makeAnnotationsTables, regex("variant_tables/(.*).tsv"),
-           add_inputs(makeRandomSNPSet), r"sample_genotypes.dir/\1.tsv")
-def getSampleGenotypes(infiles, outfile):
-    '''
-    Fetches the genotype from the variant tables for all samples
-    for SNPs in the hapmap sample from makeRandomSNPSet.
-
-    Complex sites are ignored (as simple SNPs are sufficient for these
-    calculations).
-    These are:
-        Sites which failed QC (column 3 in the variant table is not PASS)
-        Sites with more than 2 alleles defined (column 6 in the variant table
-        contains more than one alternative allele)
-        SNPs with more than one ID
-        Indels
-    '''
-    snps = set([line.strip()
-                for line in iotools.open_file(infiles[1][1]).readlines()])
-    exomeancestry.GenotypeSNPs(infiles[0], snps, outfile, submit=True)
-
-
-@merge(getSampleGenotypes, "calledsnps.tsv")
-def concatenateSNPs(infiles, outfile):
-    '''
-    Lists all the SNPs in the sample from makeRandomSNPSet where a variant has
-    been called in any of the input files.
-    Stores the reference genotype, chromosome and position to use later.
-    '''
-    snps = set()
-    for f in infiles:
-        with iotools.open_file(f) as inp:
-            for line in inp:
-                line = line.strip().split("\t")
-                snps.add("%s\t%s\t%s\t%s" % (line[0], line[4],
-                                             line[1], line[2]))
-    out = iotools.open_file(outfile, "w")
-    for snp in snps:
-        out.write("%s\n" % (snp))
-    out.close()
-
-
-@follows(mkdir("sample_freqs"))
-@transform(getSampleGenotypes, regex("sample_genotypes.dir/(.*).tsv"),
-           add_inputs(concatenateSNPs, makeRandomSNPSet),
-           [r"sample_freqs/\1.tsv", r"sample_freqs/\1_anc.tsv"])
-def calculateAncestry(infiles, outfiles):
-    '''
-    Takes the data stored in MakeRandomSNPSet and the genotype of each sample
-    at each site in calledsnps.tsv and tabulates the frequency of this
-    genotype in each of the HapMap ancestry categories.
-    The overall probability of each ancestry is then calculated as the
-    product of these frequencies. These can only be used in comparison to
-    each other - to show which of the 11 ancestries is most probable.
-    '''
-    calledsnps = infiles[1]
-    snpdict = infiles[2][0]
-    exomeancestry.CalculateAncestry(infiles[0], calledsnps, snpdict,
-                                            outfiles, submit=True)
-
-
-@merge(calculateAncestry, "ancestry_estimate.tsv")
-def mergeAncestry(infiles, outfile):
-    '''
-    Merges the output of the ancestry calculations for each sample into a
-    single table.
-    Draws a plot showing the score (probabilty) for each individual
-    for their assigned ancestry and the second closest match.
-    x = individual
-    y = score
-    large diamonds represent the best match and small triangles the second
-    best match
-    '''
-    out = iotools.open_file(outfile, "w")
-    for f in infiles:
-        infile = f[1]
-        scores = []
-        ancs = []
-        for line in iotools.open_file(infile).readlines():
-            line = line.strip().split("\t")
-            anc = line[0]
-            score = decimal.Decimal(line[1])
-            ancs.append(anc)
-            scores.append(score)
-        z = list(zip(ancs, scores))
-        s = sorted(z, key=lambda x: x[1])[::-1]
-        out.write("%s\t%s\t%s\t%s\t%s\n" % (f[0],
-                                            s[0][0], s[0][1],
-                                            s[1][0], s[1][1]))
-    out.close()
-    exomeancestry.PlotAncestry(outfile)
-
-
-@active_if(len(matches) > 1)
-@merge((calculateAncestry, concatenateSNPs),
-       ["all_samples.ped", "all_samples.map"])
-def makePed(infiles, outfiles):
-    '''
-    Generates the required input for the Plink and King software packages.
-       - PED file - columns are SNPs and rows are samples, each cell is the
-         genotype of the sample at this SNP
-       - MAP file - rows are SNPs in the same order as the columns in the ped
-         file, each row shows chromosome, snp id and position.
-    '''
-    exomeancestry.MakePEDFile(infiles, outfiles)
-
-
-@active_if(len(matches) > 1)
-@transform(makePed, suffix(".ped"), ".ibs0")
-def runPlinkandKing(infiles, outfile):
-    '''
-    Uses King software to calculate relatedness of pairs of samples as
-    described here - http://people.virginia.edu/~wc9c/KING/manual.html
-    Plink is used just to reformat the PED files into the format required by
-    King.
-    '''
-    pref = infiles[0].split(".")[0]
-    k = PARAMS['king_path']
-    p = PARAMS['king_plink']
-    statement = """
-    %(p)s/plink --file %(pref)s --make-bed --no-fid --noweb --map3
-          --no-parents --no-sex --no-pheno &&
-    %(k)s/king -b plink.bed --binary --prefix %(pref)s &&
-    %(k)s/king -b %(pref)s.bgeno --kinship --ibs --prefix %(pref)s"""
+    statement = '''bcftools query -s %(samplename)s
+                -f '%(cstring)s\\n'
+                -i 'FILTER=="PASS" && GT!="0/0" && GT!="./."'
+                %(inputvcf)s >> %(TF)s'''
     P.run(statement)
 
-
-@active_if(len(matches) > 1)
-@merge(runPlinkandKing, "family_estimate.tsv")
-def calculateFamily(infile, outfile):
-    '''
-    Translates and filters the output from King.
-    Pairs of related samples are written to the output file along with the
-    degree of relatedness.  Degrees are decided using thresholds from
-    the King documentation, here
-    http://people.virginia.edu/~wc9c/KING/manual.html
-    '''
-    exomeancestry.CalculateFamily(infile, outfile)
+    maxInt = sys.maxsize
+    while True:
+        # decrease the maxInt value by factor 10 
+        # as long as the OverflowError occurs.
+        try:
+            csv.field_size_limit(maxInt)
+            break
+        except OverflowError:
+            maxInt = int(maxInt/10)
+
+    df = pd.read_table(TF, sep = '\t', engine='python', quoting=3)
+    CSQ_index = df['CSQ'][0].split("Format:_")[-1].rstrip('\"')
+    df = df.drop(0)
+
+    if len(df)==0:
+        df.drop('CSQ',axis=1,inplace=True)
+        temp_df = pd.DataFrame(columns=CSQ_index.split("|"))
+    else:
+        df['CSQ'] = df.CSQ.str.split(",")
+        df = df.explode('CSQ').reset_index(drop=True)
+        temp_df = df.CSQ.str.split("|", expand = True)
+        temp_df.columns = CSQ_index.split("|")
+        df.drop('CSQ',axis=1,inplace=True)
+    df = df.join(temp_df, rsuffix="_VEP")
+    if "SYMBOL" in df.columns:
+        col = df.pop("SYMBOL")
+        df.insert(0, "SYMBOL", col)
+    if "HGVSp_VEP" in df.columns:
+        col = df.pop("HGVSp_VEP")
+        df.insert(0, "HGVSp_VEP", col)
+    if  len(df)!=0 and "CANONICAL" in df.columns and "PolyPhen" in df.columns:
+        # Select canonical variant, then most deleterious
+        df_final = df.sort_values(by = ['CANONICAL', 'PolyPhen'], ascending = [False, False]).drop_duplicates(["CHROM", "POS"])
+        df_final.to_csv(outfile, sep="\t")
+        outfile_full = P.snip(outfile, ".tsv") + ".tsv_full"
+        df.to_csv(outfile_full, sep="\t")
+    else:
+        df.to_csv(outfile, sep="\t")
+    os.unlink(TF)
 
 
-@active_if(len(matches) > 1)
-@merge(makeAnnotationsTables, "sex_estimate.tsv")
-def calculateSex(infiles, outfile):
-    '''
-    Approximates the sex of the sample using the data in the variant table.
-    Basic estimate based on heterozygosity on the X chromosome - genotypes
-    are taken for all called variants on X passing QC and the percentage
-    of heterozygotes is taken.
-    This tends to produce two clear populations so Kmeans clustering
-    is used to split the data into two - male and female.  Samples which are
-    unclear are marked in the output.
-    '''
-    exomeancestry.CalculateSex(infiles, outfile, submit=True)
 
+@collate(makeAnnotationsTables, formatter("results/(?P<FILTER>.*)/(?P<PANEL>.*)/.*.tsv$"),
+         "results/{FILTER[0]}/{PANEL[0]}_merged.tsv")
+def mergeAnnotationsTables(infiles, outfile):
 
-@merge((mergeAncestry, calculateFamily, calculateSex), "summary.tsv")
-def summarise(infiles, outfile):
-    '''
-    Builds a summary table for ancestry, family and sex.
-    '''
-    ancestry = pd.read_csv(infiles[0], sep="\t", header=None)
-    if len(matches) > 1:
-        family = pd.read_csv(infiles[1], sep="\t", header=None)
-        sex = pd.read_csv(infiles[2], sep="\t", header=None)
-        ancestry[0] = [a[-1] for a in ancestry[0].str.split("/")]
-        family[0] = [a[-1] for a in family[0].str.split("/")]
-        sex[0] = [a[-1] for a in sex[0].str.split("/")]
-        sex = sex.drop([1, 2], 1)
-        sex.columns = ['id', 'sex', 'sex_significance']
-        family.columns = ['id', 'related_to', 'degree_relatedness']
-        family['degree_relatedness'] = family['degree_relatedness'].astype(int)
-        ancestry = ancestry.drop([2, 3, 4], 1)
-        ancestry.columns = ['id', 'ancestry']
-        summary = ancestry.merge(sex).merge(family, 'left')
-        summary = summary.fillna("NA")
-        summary.to_csv(outfile, sep="\t")
+    result = False
+    for infile in infiles:
+        lines = 0
+        with open(infile) as f:
+            for line in f:
+                lines = lines + 1
+        if lines > 1:
+            result = True
+            break
+
+    path = os.path.dirname(infiles[1])
+    infiles = " ".join(infiles)
+
+    if result == True:
+        statement = ''' cgat tables2table %(infiles)s
+        --cat patient_id
+        --regex-filename '%(path)s/(\S+).tsv'
+        | awk '!/^#/{print > "%(outfile)s" ;next} 1'
+        > %(outfile)s.log 2>&1'''
     else:
-        ancestry[0] = [a[-1] for a in ancestry[0].str.split("/")]
-        ancestry = ancestry.drop([2, 3, 4], 1)
-        ancestry.columns = ['id', 'ancestry']
-        ancestry = ancestry.fillna("NA")
-        ancestry.to_csv(outfile, sep="\t")
-
+        statement = '''touch %(outfile)s'''
+    P.run(statement)
 
-@follows(summarise)
-def ancestry():
-    pass
 
 
 ###############################################################################
@@ -1280,144 +972,25 @@ def qualityFilterVariants(infile, outfiles):
     '''
     qualstring = PARAMS['filtering_quality']
     qualfilter = PARAMS['filtering_quality_ft']
-    exome.filterQuality(infile, qualstring, qualfilter,
-                                outfiles, submit=True)
-
-
-@follows(mkdir("variant_tables_rare"))
-@transform(qualityFilterVariants, regex("variant_tables_highqual/(.*).tsv"),
-           [r'variant_tables_rare/\1.tsv',
-            r'variant_tables_rare/\1_failed.tsv'])
-def rarityFilterVariants(infiles, outfiles):
-    '''
-    Filter out variants which are common in any of the exac or other
-    population datasets as specified in the pipeline.yml.
-    '''
-    infile = infiles[0]
-    thresh = PARAMS['filtering_rarethresh']
-    freqs = PARAMS['filtering_freqs']
-    exac = PARAMS['filtering_exac']
-    exome.filterRarity(infile, exac, freqs, thresh, outfiles,
-                               submit=True)
-
-
-@follows(mkdir("variant_tables_damaging"))
-@transform(rarityFilterVariants, regex("variant_tables_rare/(.*).tsv"),
-           [r'variant_tables_damaging/\1.tsv',
-            r'variant_tables_damaging/\1_failed.tsv'])
-def damageFilterVariants(infiles, outfiles):
-    '''
-    Filter variants which have not been assessed as damaging by any
-    of the specified tools.
-    Tools and thresholds can be specified in the pipeline.yml.
-
-    Does not account for multiple alt alleles - if any ALT allele has
-    been assessed as damaging with any tool the variant is kept,
-    regardless of if this is the allele called in the sample.
-
-    '''
-    infile = infiles[0]
-    exome.filterDamage(infile, PARAMS['filtering_damage'], outfiles,
-                               submit=True)
-
-
-@follows(mkdir("variant_tables_family"))
-@transform(damageFilterVariants, regex("variant_tables_damaging/(.*).tsv"),
-           add_inputs(calculateFamily),
-           [r'variant_tables_family/\1.tsv',
-            r'variant_tables_family/\1_failed.tsv'])
-def familyFilterVariants(infiles, outfiles):
-    '''
-    Filter variants according to the output of calculateFamily -
-    only variants shared by both members of a family will be kept.
-    '''
-    if len(matches) > 1:
-        infile = infiles[0][0]
-
-        infilenam = infile.split("/")[-1]
-        infilestem = "/".join(infile.split("/")[:-1])
-
-        # figure out who is related to who
-        families = [line.strip().split("\t")[:2]
-                    for line in iotools.open_file(infiles[1]).readlines()]
-        infam = [line[0] for line in families] + [line[1] for line in families]
-
-        # no relatives - copy the input file to the output file and generate
-        # a blank "failed" file
-        if infilenam not in infam or PARAMS['filtering_family'] == 0:
-            shutil.copy(infile, outfiles[0])
-            o = iotools.open_file(outfiles[1], "w")
-            o.close()
-        else:
-            for line in families:
-                if infilenam in line:
-                    i = line.index(infilenam)
-                    if i == 0:
-                        infile2 = "%s/%s" % (infilestem, line[1])
-                    else:
-                        infile2 = "%s/%s" % (infilestem, line[0])
-            exome.filterFamily(infile, infile2, outfiles)
+    outfile1 = outfiles[0]
+    outfile2 = outfiles[1]
+    
+    if qualstring is not None:
+        exome.filterQuality(infile, qualstring, qualfilter,
+                            outfiles, submit=True)
     else:
-        infile = infiles[0][0]
-        shutil.copy(infile, outfiles[0])
-        out = iotools.open_file(outfiles[1], "w")
-        out.close()
+        statement = '''cp %(infile)s %(outfile1)s &&
+        touch %(outfile2)s'''
+        P.run(statement)
 
 
-@follows(familyFilterVariants)
+@follows(filterVEP)
 def filter():
     pass
 
-
-@follows(mkdir("genelists"))
-@transform(familyFilterVariants, regex("variant_tables_family/(.*).tsv"),
-           [r'genelists/\1.tsv',
-            r'genelists/\1.bed'])
-def makeGeneLists(infiles, outfiles):
-    infile = infiles[0]
-    outfile = outfiles[1]
-    genes = PARAMS['general_geneset']
-
-    # four %s because they need to be escaped in generating the statement
-    # then again when submitting the P.run(statement)
-    statement = '''awk 'NR > 2 {printf("%%%%s\\t%%%%s\\t%%%%s\\n",\
-    $1, $2, $2 + 1)}'\
-    %(infile)s |\
-    bedtools intersect -wo -a stdin -b %(genes)s > %(outfile)s''' % locals()
-    P.run(statement)
-
-    geneids = set()
-    with iotools.open_file(outfile) as inp:
-        for line in inp:
-            line = line.strip().split("\t")
-            details = line[11].split(";")
-            for detail in details:
-                r = re.search('gene_id', detail)
-                if r:
-                    geneid = detail.split(" ")[-1]
-                    geneids.add(geneid.replace("\"", ""))
-    out = iotools.open_file(outfiles[0], "w")
-    for geneid in geneids:
-        out.write("%s\n" % geneid)
-    out.close()
-
-
-@follows(mkdir('final_variant_tables'))
-@collate((makeGeneLists, familyFilterVariants), regex('(.*)/(.*).tsv'),
-         r'final_variant_tables/\2.tsv')
-def finalVariantTables(infiles, outfile):
-    genes = infiles[0]
-    variants = infiles[1][0]
-    cols = PARAMS['filtering_columns'].split(",")
-    exome.CleanVariantTables(genes, variants, cols, outfile,
-                                     submit=True)
-
-###############################################################################
-###############################################################################
 ###############################################################################
 # Genes of interest
 
-
 @active_if(PARAMS["annotation_add_genes_of_interest"] == 1)
 @transform((annotateVariantsVEP),
            regex(r"variants/all_samples.snpsift.vcf"),
@@ -1429,7 +1002,7 @@ def findGenes(infile, outfile):
     geneList = P.as_list(PARAMS["annotation_genes_of_interest"])
     expression = '\'||SNPEFF_GENE_NAME==\''.join(geneList)
     statement = '''GenomeAnalysisTK -T VariantFiltration
-                   -R %%(genome_dir)s/%%(gatkgenome)s.fa
+                   -R %%(genome_dir)s/%%(genome)s.fa
                    --variant %(infile)s
                    --filterExpression "SNPEFF_GENE_NAME=='%(expression)s'"
                    --filterName "GENE_OF_INTEREST" -o %(outfile)s''' % locals()
@@ -1451,9 +1024,9 @@ def findGenes(infile, outfile):
            r"variants/all_samples.snpsift.table")
 def vcfToTable(infile, outfile):
     '''Converts vcf to tab-delimited file'''
-    genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa"
+    genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa"
     columns = PARAMS["gatk_vcf_to_table"]
-    exome.vcfToTable(infile, outfile, genome, columns)
+    exome.vcfToTable(infile, outfile, genome, columns, GATK_MEMORY)
 
 
 @jobs_limit(PARAMS.get("jobs_limit_db", 1), "db")
@@ -1463,528 +1036,6 @@ def loadVariantAnnotation(infile, outfile):
     '''Load VCF annotations into database'''
     P.load(infile, outfile, options="--retry --ignore-empty")
 
-##############################################################################
-###############################################################################
-###############################################################################
-# Confirm parentage (do novo trios only)
-
-
-@follows(loadVariantAnnotation)
-@transform("*Trio*.ped",
-           regex(r"(\S*Trio\S+).ped"),
-           add_inputs(r"variants/all_samples.snpsift.vcf"),
-           r"variants/\1.parentage")
-def confirmParentage(infiles, outfile):
-    '''Filter variants according to autosomal recessive disease model'''
-    pedfile, infile = infiles
-    pedigree = csv.DictReader(open(pedfile), delimiter='\t', fieldnames=[
-        'family', 'sample', 'father', 'mother', 'sex', 'status'])
-    trio = P.snip(os.path.basename(pedfile), ".ped")
-    trio = trio.replace(".", "_").replace("-", "_")
-    database = PARAMS["database_name"]
-    proband = None
-    mother = None
-    father = None
-    for row in pedigree:
-        if row['status'] == '2':
-            proband = row['sample'].replace(".", "_").replace("-", "_")
-            mother = row['mother'].replace(".", "_").replace("-", "_")
-            father = row['father'].replace(".", "_").replace("-", "_")
-    E.info("proband:" + proband)
-    E.info("mother:" + mother)
-    E.info("father:" + father)
-
-    query = '''SELECT '%(trio)s' as family,
-               hom_nonref_trio/(hom_nonref_parents+0.0) as hom_nonref_conc,
-               child_mat_het_nonref/(maternal_hom_nonref+0.0) as maternal_conc,
-               child_pat_het_nonref/(paternal_hom_nonref+0.0) as paternal_conc,
-                *
-               FROM
-               (SELECT count(*) as hom_nonref_trio
-               FROM %(trio)s_genes_table
-               where  CHROM NOT IN ('X', 'Y')
-               AND %(mother)s_PL LIKE '%%%%,0'
-               AND cast(%(mother)s_DP as INTEGER) > 10
-               AND  %(father)s_PL LIKE '%%%%,0'
-               AND cast(%(father)s_DP as INTEGER) > 10
-               AND %(proband)s_PL LIKE '%%%%,0'
-               AND cast(%(proband)s_DP as INTEGER) > 10),
-               (SELECT count(*) as hom_nonref_parents
-               FROM %(trio)s_genes_table
-               where  CHROM NOT IN ('X', 'Y')
-               AND %(mother)s_PL LIKE '%%%%,0'
-               AND cast(%(mother)s_DP as INTEGER) > 10
-               AND  %(father)s_PL LIKE '%%%%,0'
-               AND cast(%(father)s_DP as INTEGER) > 10
-               AND cast(%(proband)s_DP as INTEGER) > 10),
-               (SELECT count(*) as maternal_hom_nonref
-               FROM %(trio)s_genes_table
-               where  CHROM NOT IN ('X', 'Y')
-               AND %(mother)s_PL LIKE '%%%%,0'
-               AND cast(%(mother)s_DP as INTEGER) > 10
-               AND  %(father)s_PL LIKE '0,%%%%'
-               AND cast(%(father)s_DP as INTEGER) > 10
-               AND cast(%(proband)s_DP as INTEGER) > 10),
-               (SELECT count(*) as child_mat_het_nonref
-               FROM %(trio)s_genes_table
-               where  CHROM NOT IN ('X', 'Y')
-               AND %(mother)s_PL LIKE '%%%%,0'
-               AND cast(%(mother)s_DP as INTEGER) > 10
-               AND  %(father)s_PL LIKE '0,%%%%'
-               AND cast(%(father)s_DP as INTEGER) > 10
-               AND %(proband)s_PL LIKE '%%%%,0,%%%%'
-               AND cast(%(proband)s_DP as INTEGER) > 10),
-               (SELECT count(*) as paternal_hom_nonref
-               FROM %(trio)s_genes_table
-               where  CHROM NOT IN ('X', 'Y')
-               AND  %(father)s_PL LIKE '%%%%,0'
-               AND cast(%(father)s_DP as INTEGER) > 10
-               AND  %(mother)s_PL LIKE '0,%%%%'
-               AND cast(%(mother)s_DP as INTEGER) > 10
-               AND cast(%(proband)s_DP as INTEGER) > 10),
-               (SELECT count(*) as child_pat_het_nonref
-               FROM %(trio)s_genes_table
-               where  CHROM NOT IN ('X', 'Y')
-               AND  %(father)s_PL LIKE '%%%%,0'
-               AND cast(%(father)s_DP as INTEGER) > 10
-               AND  %(mother)s_PL LIKE '0,%%%%'
-               AND cast(%(mother)s_DP as INTEGER) > 10
-               AND %(proband)s_PL LIKE '%%%%,0,%%%%'
-               AND cast(%(proband)s_DP as INTEGER) > 10)''' % locals()
-    statement = '''sqlite3 %(database)s "%(query)s" > %(outfile)s
-                   2> %(outfile)s.log''' % locals()
-    P.run(statement)
-
-###############################################################################
-###############################################################################
-###############################################################################
-# De novos
-
-
-@follows(loadVariantAnnotation)
-@transform("*Trio*.ped",
-           regex(r"(\S*Trio\S+).ped"),
-           add_inputs(r"variants/all_samples.snpsift.vcf"),
-           r"variants/\1.filtered.vcf")
-def deNovoVariants(infiles, outfile):
-    '''Filter de novo variants based on provided jexl expression'''
-    job_options = getGATKOptions()
-    genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa"
-    pedfile, infile = infiles
-    pedigree = csv.DictReader(
-        iotools.open_file(pedfile), delimiter='\t', fieldnames=[
-            'family', 'sample', 'father', 'mother', 'sex', 'status'])
-    for row in pedigree:
-        if row['status'] == '2':
-            father = row['father']
-            mother = row['mother']
-            child = row['sample']
-    select = '''vc.getGenotype("%(father)s").getDP()>=10&&vc.getGenotype("%(mother)s").getDP()>=10&&vc.getGenotype("%(child)s").getPL().0>20&&vc.getGenotype("%(child)s").getPL().1==0&&vc.getGenotype("%(child)s").getPL().2>0&&vc.getGenotype("%(father)s").getPL().0==0&&vc.getGenotype("%(father)s").getPL().1>20&&vc.getGenotype("%(father)s").getPL().2>20&&vc.getGenotype("%(mother)s").getPL().0==0&&vc.getGenotype("%(mother)s").getPL().1>20&&vc.getGenotype("%(mother)s").getPL().2>20&&vc.getGenotype("%(child)s").getAD().1>=3&&((vc.getGenotype("%(child)s").getAD().1)/(vc.getGenotype("%(child)s").getDP().floatValue()))>=0.25&&(vc.getGenotype("%(father)s").getAD().1==0||(vc.getGenotype("%(father)s").getAD().1>0&&((vc.getGenotype("%(father)s").getAD().1)/(vc.getGenotype("%(father)s").getDP().floatValue()))<0.05))&&(vc.getGenotype("%(mother)s").getAD().1==0||(vc.getGenotype("%(mother)s").getAD().1>0&&((vc.getGenotype("%(mother)s").getAD().1)/(vc.getGenotype("%(mother)s").getDP().floatValue()))<0.05))&&(SNPEFF_IMPACT=="HIGH"||SNPEFF_IMPACT=="MODERATE")''' % locals()
-    exome.selectVariants(infile, outfile, genome, select)
-
-
-@transform(deNovoVariants,
-           regex(r"variants/(\S+).filtered.vcf"),
-           r"variants/\1.filtered.table")
-def tabulateDeNovos(infile, outfile):
-    '''Tabulate de novo variants'''
-    genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa"
-    columns = PARAMS["gatk_vcf_to_table"]
-    exome.vcfToTable(infile, outfile, genome, columns)
-
-
-@jobs_limit(PARAMS.get("jobs_limit_db", 1), "db")
-@transform(tabulateDeNovos,
-           regex(r"variants/(\S+).filtered.table"),
-           r"variants/\1.filtered.table.load")
-def loadDeNovos(infile, outfile):
-    '''Load de novos into database'''
-    P.load(infile, outfile,
-           options="--retry --ignore-empty --allow-empty-file")
-
-###############################################################################
-
-
-@follows(loadVariantAnnotation)
-@transform("*Trio*.ped",
-           regex(r"(\S*Trio\S+).ped"),
-           add_inputs(r"variants/all_samples.snpsift.vcf"),
-           r"variants/\1.denovos.vcf")
-def lowerStringencyDeNovos(infiles, outfile):
-    '''Filter lower stringency de novo variants based on provided jexl
-    expression'''
-    genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa"
-    pedfile, infile = infiles
-    pedigree = csv.DictReader(
-        iotools.open_file(pedfile), delimiter='\t', fieldnames=[
-            'family', 'sample', 'father', 'mother', 'sex', 'status'])
-    for row in pedigree:
-        if row['status'] == '2':
-            father = row['father']
-            mother = row['mother']
-            child = row['sample']
-    select = '''vc.getGenotype("%(child)s").getPL().1==0&&vc.getGenotype("%(father)s").getPL().0==0&&vc.getGenotype("%(mother)s").getPL().0==0&&(SNPEFF_IMPACT=="HIGH"||SNPEFF_IMPACT=="MODERATE")''' % locals(
-    )
-    exome.selectVariants(infile, outfile, genome, select)
-
-
-@transform(lowerStringencyDeNovos,
-           regex(r"variants/(\S+).denovos.vcf"),
-           r"variants/\1.denovos.table")
-def tabulateLowerStringencyDeNovos(infile, outfile):
-    '''Tabulate lower stringency de novo variants'''
-    genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa"
-    columns = PARAMS["gatk_vcf_to_table"]
-    exome.vcfToTable(infile, outfile, genome, columns)
-
-
-@jobs_limit(PARAMS.get("jobs_limit_db", 1), "db")
-@transform(tabulateLowerStringencyDeNovos,
-           regex(r"variants/(\S+).denovos.table"),
-           r"variants/\1.denovos.table.load")
-def loadLowerStringencyDeNovos(infile, outfile):
-    '''Load lower stringency de novos into database'''
-    P.load(infile, outfile,
-           options="--retry --ignore-empty --allow-empty-file")
-
-###############################################################################
-###############################################################################
-###############################################################################
-# Dominant
-
-
-@follows(loadVariantAnnotation)
-@transform("*Multiplex*.ped",
-           regex(r"(\S*Multiplex\S+).ped"),
-           add_inputs(r"variants/all_samples.snpsift.vcf"),
-           r"variants/\1.dominant.vcf")
-def dominantVariants(infiles, outfile):
-    '''Filter variants according to autosomal dominant disease model'''
-    pedfile, infile = infiles
-    genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa"
-    pedigree = csv.DictReader(open(pedfile), delimiter='\t',
-                              fieldnames=['family', 'sample', 'father',
-                                          'mother', 'sex', 'status'])
-    affecteds = []
-    unaffecteds = []
-    for row in pedigree:
-        if row['status'] == '2':
-            affecteds += [row['sample']]
-        if row['status'] == '1':
-            unaffecteds += [row['sample']]
-    affecteds_exp = '").getPL().1==0&&vc.getGenotype("'.join(affecteds)
-    affecteds_exp2 = '").getAD().1>=1&&vc.getGenotype("'.join(affecteds)
-    if len(unaffecteds) == 0:
-        unaffecteds_exp = ''
-    else:
-        unaffecteds_exp = '&&vc.getGenotype("' + \
-            ('").isHomRef()&&vc.getGenotype("'.join(unaffecteds)) + \
-            '").isHomRef()'
-    # for some weird reason the 1000G filter doesn't work on these particular
-    # files - will add later when I've figured out what's wrong
-    # currently 1000G filter is performed at the report stage (not in csvdb)
-    select = '''vc.getGenotype("%(affecteds_exp)s").getPL().1==0&&vc.getGenotype("%(affecteds_exp2)s").getAD().1>=1%(unaffecteds_exp)s&&(SNPEFF_IMPACT=="HIGH"||SNPEFF_IMPACT=="MODERATE")''' % locals()
-    exome.selectVariants(infile, outfile, genome, select)
-
-###############################################################################
-
-
-@transform(dominantVariants,
-           regex(r"variants/(\S+).dominant.vcf"),
-           r"variants/\1.dominant.table")
-def tabulateDoms(infile, outfile):
-    '''Tabulate dominant disease candidate variants'''
-    genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa"
-    columns = PARAMS["gatk_vcf_to_table"]
-    exome.vcfToTable(infile, outfile, genome, columns)
-
-###############################################################################
-
-
-@jobs_limit(PARAMS.get("jobs_limit_db", 1), "db")
-@transform(tabulateDoms, regex(r"variants/(\S+).dominant.table"),
-           r"variants/\1.dominant.table.load")
-def loadDoms(infile, outfile):
-    '''Load dominant disease candidates into database'''
-    P.load(infile, outfile,
-           options="--retry --ignore-empty --allow-empty-file")
-
-###############################################################################
-###############################################################################
-###############################################################################
-# Recessive
-
-
-@follows(loadVariantAnnotation)
-@transform("*.ped",
-           regex(
-               r"(\S*Trio\S+|\S*Multiplex\S+).ped"),
-           add_inputs(r"variants/all_samples.snpsift.vcf"),
-           r"variants/\1.recessive.vcf")
-def recessiveVariants(infiles, outfile):
-    '''Filter variants according to autosomal recessive disease model'''
-    genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa"
-    pedfile, infile = infiles
-    pedigree = csv.DictReader(open(pedfile), delimiter='\t',
-                              fieldnames=['family', 'sample', 'father',
-                                          'mother', 'sex', 'status'])
-    affecteds = []
-    parents = []
-    unaffecteds = []
-    for row in pedigree:
-        if row['status'] == '2':
-            affecteds += [row['sample']]
-            if row['father'] != '0':
-                parents += [row['father']]
-            if row['mother'] != '0':
-                parents += [row['mother']]
-        elif row['status'] == '1' and row['sample'] not in parents:
-            unaffecteds += [row['sample']]
-    affecteds_exp = '").getPL().2==0&&vc.getGenotype("'.join(affecteds)
-    if len(unaffecteds) == 0:
-        unaffecteds_exp = ''
-    else:
-        unaffecteds_exp = '&&vc.getGenotype("' + \
-            ('").getPL().2!=0&&vc.getGenotype("'.join(unaffecteds)) + \
-            '").getPL().2!=0'
-    if len(parents) == 0:
-        parents_exp = ''
-    else:
-        parents_exp = '&&vc.getGenotype("' + \
-            ('").getPL().1==0&&vc.getGenotype("'.join(parents)) + \
-            '").getPL().1==0'
-    # need a way of specifying that other unaffecteds eg. sibs can't be
-    # homozygous for alt allele
-    select = '''vc.getGenotype("%(affecteds_exp)s").getPL().2==0%(unaffecteds_exp)s%(parents_exp)s&&(SNPEFF_IMPACT=="HIGH"||SNPEFF_IMPACT=="MODERATE")''' % locals(
-    )
-    exome.selectVariants(infile, outfile, genome, select)
-
-###############################################################################
-
-
-@transform(recessiveVariants,
-           regex(r"variants/(\S+).recessive.vcf"),
-           r"variants/\1.recessive.table")
-def tabulateRecs(infile, outfile):
-    '''Tabulate potential homozygous recessive disease variants'''
-    genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa"
-    columns = PARAMS["gatk_vcf_to_table"]
-    exome.vcfToTable(infile, outfile, genome, columns)
-
-###############################################################################
-
-
-@jobs_limit(PARAMS.get("jobs_limit_db", 1), "db")
-@transform(tabulateRecs, regex(r"variants/(\S+).recessive.table"),
-           r"variants/\1.recessive.table.load")
-def loadRecs(infile, outfile):
-    '''Load homozygous recessive disease candidates into database'''
-    P.load(infile, outfile,
-           options="--retry --ignore-empty --allow-empty-file")
-
-###############################################################################
-###############################################################################
-###############################################################################
-# X-linked
-
-
-@follows(loadVariantAnnotation)
-@transform("*.ped", regex(r"(\S*Trio\S+|\S*Multiplex\S+).ped"),
-           add_inputs(r"variants/all_samples.snpsift.vcf"),
-           r"variants/\1.xlinked.vcf")
-def xlinkedVariants(infiles, outfile):
-    '''Find maternally inherited X chromosome variants in male patients'''
-    track = P.snip(os.path.basename(outfile), ".vcf")
-    genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa"
-    pedfile, infile = infiles
-    pedigree = csv.DictReader(open(pedfile), delimiter='\t',
-                              fieldnames=['family', 'sample', 'father',
-                                          'mother', 'sex', 'status'])
-    affecteds = []
-    mothers = []
-    male_unaffecteds = []
-    female_unaffecteds = []
-    for row in pedigree:
-        if row['status'] == '2' and row['mother'] != '0':
-            if row['mother'] not in mothers:
-                mothers += [row['mother']]
-            affecteds += [row['sample']]
-        elif row['status'] == '1' and row['sex'] == '1':
-            male_unaffecteds += [row['sample']]
-        elif row['status'] == '1' and row['sex'] != '1' and row['sample'] not in mothers:
-            female_unaffecteds += [row['sample']]
-    if len(affecteds) == 0:
-        affecteds_exp = ''
-    else:
-        affecteds_exp = '").isHomRef()&&!vc.getGenotype("'.join(affecteds)
-#        affecteds_exp = '&&!vc.getGenotype("' + \
-#        ('").isHomRef()&&!vc.getGenotype("'.join(affecteds)) + \
-#        '").isHomRef()'
-    if len(mothers) == 0:
-        mothers_exp = ''
-    else:
-        mothers_exp = '&&vc.getGenotype("' + \
-                      ('").getPL().1==0&&vc.getGenotype("'.join(mothers)) + \
-                      '").getPL().1==0'
-    if len(male_unaffecteds) == 0:
-        male_unaffecteds_exp = ''
-    else:
-        male_unaffecteds_exp = '&&vc.getGenotype("' + \
-                               ('").isHomRef()&&vc.getGenotype("'.join(
-                                   male_unaffecteds)) + \
-                               '").isHomRef()'
-    if len(female_unaffecteds) == 0:
-        female_unaffecteds_exp = ''
-    else:
-        female_unaffecteds_exp = '&&vc.getGenotype("' + \
-                                 ('").getPL().2!=0&&vc.getGenotype("'.join(
-                                     female_unaffecteds)) + \
-            '").getPL().2!=0'
-    select = '''CHROM=="X"&&!vc.getGenotype("%(affecteds_exp)s").isHomRef()%(mothers_exp)s%(male_unaffecteds_exp)s%(female_unaffecteds_exp)s&&(SNPEFF_IMPACT=="HIGH"||SNPEFF_IMPACT=="MODERATE")''' % locals()
-    exome.selectVariants(infile, outfile, genome, select)
-
-###############################################################################
-
-
-@transform(xlinkedVariants,
-           regex(r"variants/(\S+).xlinked.vcf"),
-           r"variants/\1.xlinked.table")
-def tabulateXs(infile, outfile):
-    '''Tabulate potential X-linked disease variants'''
-    genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa"
-    columns = PARAMS["gatk_vcf_to_table"]
-    exome.vcfToTable(infile, outfile, genome, columns)
-
-###############################################################################
-
-
-@jobs_limit(PARAMS.get("jobs_limit_db", 1), "db")
-@transform(tabulateXs, regex(r"variants/(\S+).xlinked.table"),
-           r"variants/\1.xlinked.table.load")
-def loadXs(infile, outfile):
-    '''Load X-linked disease candidates into database'''
-    P.load(infile, outfile,
-           options="--retry --ignore-empty --allow-empty-file")
-
-###############################################################################
-###############################################################################
-###############################################################################
-# Compound hets
-
-
-# why does this not work from snpsift VCF file? DS
-# @transform(annotateVariantsSNPsift,
-#    regex(r"variants/(\S*Trio\S+|\S*Multiplex\S+).haplotypeCaller.snpsift.vcf"),
-@transform(annotateVariantsSNPeff,
-           regex(
-               r"variants/all_samples.snpeff.vcf"),
-           add_inputs(r"all_samples.ped", r"gatk/all_samples.list"),
-           r"variants/all_samples.phased.vcf")
-def phasing(infiles, outfile):
-    '''phase variants with GATK'''
-    infile, pedfile, bamlist = infiles
-    genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa"
-    statement = '''GenomeAnalysisTK -T PhaseByTransmission
-                   -R %(genome)s
-                   -V %(infile)s
-                   -ped %(pedfile)s
-                   -mvf %(infile)s.mvf
-                   -o %(outfile)s ;''' % locals()
-    P.run(statement)
-
-###############################################################################
-
-
-@transform(phasing, regex(r"variants/all_samples.phased.vcf"),
-           add_inputs(r"gatk/all_samples.list"),
-           r"variants/all_samples.rbp.vcf")
-def readbackedphasing(infiles, outfile):
-    '''phase variants with ReadBackedPhasing'''
-    job_memory = "32G"
-    infile, bamlist = infiles
-    genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa"
-    statement = '''GenomeAnalysisTK -T ReadBackedPhasing -nt 4
-                   -R %(genome)s
-                   -I %(bamlist)s
-                   -V %(infile)s
-                   -o %(outfile)s; ''' % locals()
-    P.run(statement)
-
-###############################################################################
-
-
-@follows(readbackedphasing)
-@transform("*.ped",
-           regex(r"(\S*Multiplex\S+|\S*Trio\S+).ped"),
-           add_inputs(r"no_multiallelic_all_samples.rbp.vcf",
-                      r"all_samples.ped"),
-           r"variants/\1.compound_hets.table")
-def compoundHets(infiles, outfile):
-    '''Identify potentially pathogenic compound heterozygous variants
-    (phasing with GATK followed by compound het calling using Gemini'''
-    family, infile, pedfile = infiles
-    family_id = P.snip(os.path.basename(family), ".ped")
-    statement = '''gemini load -v %(infile)s
-                   -p %(pedfile)s -t snpEff %(family_id)s.db &&
-                   gemini comp_hets
-                   --families %(family_id)s
-                   --columns "chrom, start, end, ref, alt, codon_change, gene, qual, depth"
-                   --filter
-                   "(impact_severity = 'HIGH' OR impact_severity = 'MED')
-                   AND (in_esp = 0 OR aaf_esp_all < 0.01)
-                   AND (in_1kg = 0 OR aaf_1kg_all < 0.01)"
-                   %(family_id)s.db > %(outfile)s;'''
-    # rm -f %(family_id)s.db'''
-    P.run(statement)
-
-###############################################################################
-
-
-@jobs_limit(PARAMS.get("jobs_limit_db", 1), "db")
-@transform(compoundHets, regex(r"variants/(\S+).compound_hets.table"),
-           r"variants/\1.compound_hets.table.load")
-def loadCompoundHets(infile, outfile):
-    '''Load compound heterozygous variants into database'''
-    P.load(infile, outfile,
-           options="--retry --ignore-empty --allow-empty-file")
-
-###############################################################################
-###############################################################################
-###############################################################################
-# coverage over candidate genes
-
-
-@active_if(PARAMS["coverage_calculate"] == 1)
-@transform(listOfBAMs, regex(r"gatk/all_samples.list"), r"candidate.sample_interval_summary")
-def candidateCoverage(infile, outfile):
-    '''Calculate coverage over exons of candidate genes'''
-    all_exons = PARAMS["coverage_all_exons"]
-    candidates = PARAMS["coverage_candidates"]
-    candidates = candidates.replace(",", " -e ")
-    genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa"
-    threshold = PARAMS["coverage_threshold"]
-    statement = '''zcat %(all_exons)s | grep -e %(candidates)s
-                   | awk '{print $1 ":" $4 "-" $5}' - | sed 's/chr//' - >
-                   candidate.interval_list ; ''' % locals()
-    statement += '''zcat %(all_exons)s | grep -e %(candidates)s
-                    | awk '{print $16}' - | sed 's/"//g;s/;//g' - >
-                    candidate_gene_names.txt ;''' % locals()
-    statement += '''GenomeAnalysisTK -T DepthOfCoverage -R %(genome)s
-                    -o candidate -I %(infile)s -ct %(threshold)s -L candidate.interval_list ;''' % locals()
-    P.run(statement)
-
-###############################################################################
-
-
-@active_if(PARAMS["coverage_calculate"] == 1)
-@transform(candidateCoverage, regex(r"candidate.sample_interval_summary"), r"candidate_coverage_plot.pdf")
-def candidateCoveragePlots(infile, outfile):
-    '''Produce plots of coverage'''
-    rscript = PARAMS["coverage_rscript"]
-    threshold = PARAMS["coverage_threshold"]
-    statement = '''Rscript %(rscript)s %(infile)s candidate_gene_names.txt %(threshold)s %(outfile)s ;'''
-    P.run(statement)
-
 ###############################################################################
 ###############################################################################
 ###############################################################################
@@ -2006,7 +1057,6 @@ def buildVCFstats(infile, outfile):
 @merge(buildVCFstats, "vcf_stats.load")
 def loadVCFstats(infiles, outfile):
     '''Import variant statistics into SQLite'''
-    scriptsdir = PARAMS["general_scriptsdir"]
     filenames = " ".join(infiles)
     tablename = P.to_table(outfile)
     E.info("Loading vcf stats...")
@@ -2032,9 +1082,7 @@ def loadVCFstats(infiles, outfile):
 # Targets
 
 
-@follows(loadVariantAnnotation,
-         finalVariantTables,
-         ancestry)
+@follows(loadVariantAnnotation)
 def testFromVariantRecal():
     pass
 
@@ -2054,70 +1102,24 @@ def mapping_tasks():
 
 @follows(GATKBaseRecal,
          loadPicardDuplicateStatsLane,
-         GATKIndelRealignSample,
+         RemoveDuplicatesSample,
          loadCoverageStats)
 def gatk():
     pass
 
 
-@follows(loadXYRatio,
-         indexVCFs,
-         loadNDR)
-def sampleFeatures():
-    pass
-
-
 @follows(haplotypeCaller,
          genotypeGVCFs)
 def callVariants():
     pass
 
 
-@follows(loadTableSnpEff,
-         listOfBAMs,
-         loadVariantAnnotation,
-         finalVariantTables)
+@follows(listOfBAMs,
+         loadVariantAnnotation)
 def annotation():
     pass
 
 
-@follows(loadDeNovos)
-def denovo():
-    pass
-
-
-@follows(loadLowerStringencyDeNovos)
-def denovo2():
-    pass
-
-
-@follows(loadDoms)
-def dominant():
-    pass
-
-
-@follows(loadRecs)
-def recessive():
-    pass
-
-
-@follows(loadXs)
-def xlinked():
-    pass
-
-
-@follows(loadCompoundHets)
-def compoundHet():
-    pass
-
-
-@follows(denovo,
-         dominant,
-         recessive,
-         compoundHet)
-def filtering():
-    pass
-
 
 @follows(buildVCFstats,
          loadVCFstats)
@@ -2127,11 +1129,8 @@ def vcfstats():
 
 @follows(mapping_tasks,
          gatk,
-         # sampleFeatures,
          callVariants,
          annotation,
-         filtering,
-         ancestry,
          makeAnnotationsTables)
 def full():
     pass
diff --git a/cgatpipelines/tools/pipeline_exome/pipeline.yml b/cgatpipelines/tools/pipeline_exome/pipeline.yml
index 1d8158e1c..ec86a4889 100644
--- a/cgatpipelines/tools/pipeline_exome/pipeline.yml
+++ b/cgatpipelines/tools/pipeline_exome/pipeline.yml
@@ -4,16 +4,11 @@
 ## Exome pipeline parameters
 ##########################################################
 
-
-
-# Script directory
-scriptsdir: /ifs/devel/katherineb/cgat/scripts
-
 # the genome to use
-genome: hg19
+genome: hg38
 
 # location of indexed genome for SAMtools
-genome_dir: /ifs/mirror/genomes/gatk
+genome_dir: /path/to/genome/dir
 
 ########################################################
 # directory where exported data is located
@@ -25,20 +20,19 @@ samples: ''
 web_dir: ../web
 
 # If you are not running exome data but targetted sequencing on a smaller region then change to 1
+# Effect of this option is to skip base quality recalibration step
 targetted: 0
 
-#pedfile=/ifs/projects/proj024/analysis/illumina_fam8-16/fastq_decon_o5_filt_qc/mapping_human_g1k/samples.ped
-
-# seed for random number generation
-randomseed: 1000
-
 # gtf file containing exon annotations
-geneset: /ifs/mirror/annotations/hg19_ensembl75_hierarchical/ensembl.dir/geneset_all.gtf.gz
+geneset: /path/to/ensembl.dir/geneset_all.gtf.gz
+
+#picard memory
+picard_memory: 9G
 
 bwa:
 
     # location od BWA indexed genome
-    index_dir: /ifs/mirror/genomes/gatk
+    index_dir: /path/to/genome
 
     # threads
     threads: 8
@@ -50,7 +44,7 @@ bwa:
     mem_options: ''
 
     # BWA end-pairing parameters
-    sampe_options: ''
+    sample_options: ''
 
 readgroup:
 
@@ -62,110 +56,118 @@ readgroup:
 
     sample: blood
 
-dedup:
-
-    method: picard
-
 gatk:
 
     threads: 8
 
-    dbsnp: /ifs/mirror/genomes/gatk/dbsnp_138.hg19.vcf
-
-    hapmap: /ifs/mirror/genomes/gatk/hapmap_3.3.hg19.sites.vcf
+    memory: 2G
 
-    omni: /ifs/mirror/genomes/gatk/1000G_omni2.5.hg19.sites.vcf
+    # memory required for merging GVCF files
+    # PLEASE NOTE: this is the memory allocated to picard
+    # 1.2x this memory will be requested from the cluster to provide sufficent memory for the database tool.
+    dbmem: 20G
 
-    kgenomes: /ifs/mirror/genomes/gatk/1000G_phase1.snps.high_confidence.hg19.sites.vcf
+    # path to dbsnp VCF (provided by GATK)
+    dbsnp: /path/to/dbsnp.vcf
 
-    mills: /ifs/mirror/genomes/gatk/Mills_and_1000G_gold_standard.indels.hg19.sites.vcf
+    # path to hapmap VCF (provided by GATK)
+    hapmap: /path/to/hapmap.vcf
 
-    solid_options: ''
+    # path to OMNI VCF (privided by GATK)
+    omni: /path/to/1000G_omni
 
-    hc_options: -A StrandOddsRatio -A DepthPerSampleHC
+    # path to 1000G VDF (provided by GATK)
+    kgenomes: /path/to/1000G_phase1.snps.high_confidence
 
-    genotype_options: ''
+    mills: /path/to/Mills_and_1000G_gold_standard
 
-    variant_annotations: [AlleleBalance,AlleleBalanceBySample,AlleleCountBySample,BaseCounts,BaseQualityRankSumTest,ChromosomeCounts,ClippingRankSumTest,Coverage,DepthPerAlleleBySample,FisherStrand,GCContent,GenotypeSummaries,HardyWeinberg,HomopolymerRun,LikelihoodRankSumTest,LowMQ,MappingQualityRankSumTest,MappingQualityZero,MappingQualityZeroBySample,NBaseCount,QualByDepth,RMSMappingQuality,ReadPosRankSumTest,SampleList,SpanningDeletions,StrandBiasBySample,StrandOddsRatio,TandemRepeatAnnotator,VariantType]
+    axiom: /path/to/axiom
 
-    vcf_to_table: -F CHROM -F POS -F ID -F REF -F ALT -F QUAL -F FILTER -F AC -F AF -F AN -F BaseQRankSum -F DB -F DP -F Dels -F FS -F HaplotypeScore -F MLEAC -F MLEAF -F MQ -F MQ0 -F MQRankSum -F QD -F ReadPosRankSum -F SB -F SNPEFF_EFFECT -F SNPEFF_IMPACT -F SNPEFF_FUNCTIONAL_CLASS -F SNPEFF_CODON_CHANGE -F SNPEFF_AMINO_ACID_CHANGE -F SNPEFF_GENE_NAME -F SNPEFF_GENE_BIOTYPE -F SNPEFF_TRANSCRIPT_ID -F SNPEFF_EXON_ID -F dbNSFP_GERP++_RS -F dbNSFP_GERP++_NR -F dbNSFP_Ensembl_transcriptid -F dbNSFP_Uniprot_acc -F dbNSFP_Interpro_domain -F dbNSFP_SIFT_score -F dbNSFP_Polyphen2_HVAR_pred -F dbNSFP_29way_logOdds -F dbNSFP_1000Gp1_AF -F dbNSFP_1000Gp1_AFR_AF -F dbNSFP_1000Gp1_EUR_AF -F dbNSFP_1000Gp1_AMR_AF -F dbNSFP_1000Gp1_ASN_AF -F dbNSFP_ESP6500_AA_AF -F dbNSFP_ESP6500_EA_AF -F RSPOS -F SSR -F SAO -F VP -F VC -F PM -F TPA -F PMC -F MUT -F VLD -F OTHERKG -F PH3 -F CDA -F MTP -F OM -F CAF -F COMMON -GF GT -GF AD -GF GQ -GF PL -GF PQ -GF TP -GF AB -GF DP
+    # Additional options for GATK baserecalibrator
+    baserecalibrator_options: ''
 
-hapmap:
-
-    loc: /ifs/mirror/hapmap/genotype_frequencies
+    # Additional options for GATK HaplotypeCaller
+    hc_options: -A StrandOddsRatio -A DepthPerSampleHC
 
-    # bgzip compressede and tabix indexed VCF file for Hapmap
-    vcf: /ifs/mirror/genomes/gatk/hapmap_3.3.hg19.sites.vcf
+    # Additional options for GATK GenotypeGVCFs
+    genotype_options: ''
 
-    padding: 0
+    vcf_to_table: -F CHROM -F POS -F ID -F REF -F ALT -F QUAL -F FILTER -F AC -F AF -F AN -F BaseQRankSum -F DB -F DP -F Dels -F FS -F HaplotypeScore -F MLEAC -F MLEAF -F MQ -F MQ0 -F MQRankSum -F QD -F ReadPosRankSum -F SB -F SNPEFF_EFFECT -F SNPEFF_IMPACT -F SNPEFF_FUNCTIONAL_CLASS -F SNPEFF_CODON_CHANGE -F SNPEFF_AMINO_ACID_CHANGE -F SNPEFF_GENE_NAME -F SNPEFF_GENE_BIOTYPE -F SNPEFF_TRANSCRIPT_ID -F SNPEFF_EXON_ID -F dbNSFP_GERP++_RS -F dbNSFP_GERP++_NR -F dbNSFP_Ensembl_transcriptid -F dbNSFP_Uniprot_acc -F dbNSFP_Interpro_domain -F dbNSFP_SIFT_score -F dbNSFP_Polyphen2_HVAR_pred -F dbNSFP_29way_logOdds -F dbNSFP_1000Gp1_AF -F dbNSFP_1000Gp1_AFR_AF -F dbNSFP_1000Gp1_EUR_AF -F dbNSFP_1000Gp1_AMR_AF -F dbNSFP_1000Gp1_ASN_AF -F dbNSFP_ESP6500_AA_AF -F dbNSFP_ESP6500_EA_AF -F RSPOS -F SSR -F SAO -F VP -F VC -F PM -F TPA -F PMC -F MUT -F VLD -F OTHERKG -F PH3 -F CDA -F MTP -F OM -F CAF -F COMMON -GF GT -GF AD -GF GQ -GF PL -GF PQ -GF TP -GF AB -GF DP
 
-    hc_options: --emitRefConfidence GVCF
 
+# annotation is done using Ensembl! VEP
 annotation:
 
     memory: 6G
 
-    threads: 3
-
-    #Config file specifies that the genome must be downloaded into your home directory
-    snpeff_config: /ifs/apps/bio/snpEff-4.3/snpEff.config
-
-    snpeff_genome: hg19
+    threads: 4
+
+    # path to dbNSFP database (aggregation database for annotations)
+    # leave blank if not required
+    dbnsfp_path: /path/to/dbNFSP.txt.gz
+
+    # dbNSFP columns to include
+    # If left empty will fetch ALL annotations.
+    # Examples: Uniprot_acc, Interpro_domain, 
+    # SIFT_pred, Polyphen2_HDIV_pred, Polyphen2_HVAR_pred, 
+    # LRT_pred, MutationTaster_pred, GERP++_NR, GERP++_RS, 
+    # phastCons100way_vertebrate, 1000Gp1_AF, 1000Gp1_AFR_AF
+    # ESP6500_AA_AF, ESP6500_EA_AF, MutationTaster_pred, 
+    # MutationAssessor_pred, FATHMM_pred, PROVEAN_pred, 
+    # CADD_phred, MetaSVM_pred, 1000Gp3_AC, 1000Gp3_AF, 
+    # 1000Gp3_AFR_AC, 1000Gp3_AFR_AF, 1000Gp3_EUR_AC,
+    # ESP6500_EA_AC, ESP6500_EA_AF, ExAC_AC, ExAC_AF,
+    # ExAC_Adj_AC, ExAC_Adj_AF, ExAC_AFR_AC, ExAC_AFR_AF
+    dbnsfp_columns:
 
-    # path to dbNSFP database
-    dbnsfp: /ifs/mirror/snpsift/data_2016/dbNFSP.txt.gz
+    # path to VEP scripts
+    veppath: /path/to/ensembl-vep-80/variant_effect_predictor
 
-    #dbNSFP columns to include
-    dbnsfpannotators: [SIFT_pred,Polyphen2_HDIV_pred,Polyphen2_HVAR_pred,MutationTaster_pred]
+    # location of cached annotations
+    # cached annotations should have the same version number
+    # as the VEP executable 
+    vepcache: /path/to/VEP
 
-    # path to clinvar vcf
-    clinvar: /ifs/mirror/clinvar/clinvar_20160531.vcf.gz
+    # species name
+    vepspecies: homo_sapiens
 
-    # path to ExAC vcf
-    exac: /ifs/mirror/ExAC/sites/ExAC.r0.3.1.sites.vep.vcf.gz
+    # assembly - needs to correspond to data in cache directory
+    vepassembly: GRCh38
 
-    # path to dbsnp vcf
-    dbsnp: /ifs/mirror/dbsnp/All_20160527.vcf.gz
+    # vep parameters - see vep docs here
+    # http://www.ensembl.org/info/docs/tools/vep/script/vep_options.html
+    # format as for command line e.g. as --sift b --regulatory --domains
+    # Option -e covers almost all annotations
+    vepoptions: -e
 
-    # path to chromosome by chromosome 1000g vcfs
-    tgdir: /ifs/mirror/1000_genomes/phase3
+    # Path to Somalier package (currently not available as conda package)
+    # Website: https://github.com/brentp/somalier
+    somalier_path: /path/to/somalier
 
-    # path to gwas catalog files
-    gwas_catalog: /ifs/mirror/gwascatalog/gwas_catalog_v1.0-downloaded_2015-07-14.txt
+    # Path to Somalier VCF specifying sites used for determination of relatedness
+    # Provided by Somalier Project
+    somalier_sites: /path/to/sites.hg38.vcf.gz
 
-    # path to "phastcons100way" files
-    phastcons: /ifs/mirror/phastcons
+    # Path to 1000G data folder (provided by Somalier project)
+    somalier_1kg: /path/to/1kg-somalier
 
-    snpeff_to_table: -F CHROM -F POS -F ID -F REF -F ALT -F EFF
+    # Path to Somalier Ancestry Labels for 1000G (provided by Somalier project)
+    somalier_labels: /path/to/ancestry-labels-1kg.tsv
 
     add_genes_of_interest: 0
 
     genes_of_interest: ''
 
-    # vep parameters - see vep docs here
-    # http://www.ensembl.org/info/docs/tools/vep/script/vep_options.html
-
-    # options to pass to vep - this string is appended directly
-    # to the command when vep is called so should be formatted
-    # e.g. as
-    # --sift b --regulatory --domains
-    vepannotators: --sift b --variant_class --regulatory --domains
-
-    # path to VEP scripts
-    veppath: /ifs/apps/bio/ensembl-vep-80/variant_effect_predictor
-
-    # location of cached annotations
-    vepcache: /ifs/mirror/VEP
-
-    # species name
-    vepspecies: homo_sapiens
-
-    # assembly - needs to correspond to data in cache directory
-    vepassembly: GRCh37
 
-    # filtering of vcfs to select variants of interest
+# filtering of vcfs to select variants of interest
+# filtering is done using 
 filtering:
 
+    # Quality Filtering can be done either by VQSR, a GATK machine-learning
+    # algorithm, or by hard-filtering QC scores based on prespecified conditions
+    quality: [VQSR OR hard]
+
+    # 
     # quality filters, seperated by commas, as column'symbol'score where
     # column is the column to filter
     # symbol is '>', '<', '>=', '<=' (including quote marks)
@@ -179,55 +181,27 @@ filtering:
     # any = keep variants filling any of the quality criteria specified above
     quality_ft: all
 
-    rarethresh: 0.01
-
-    # columns containing comma delimiated allele frequencies to filter by
-    freqs: [EUR_AF,EAS_AF,AFR_AF,AMR_AF,SAS_AF,ESP_AF_POPMAX,ESP_AF_GLOBAL]
-
-    #populations to use when filtering Exac allele frequencies
-    #xxx for any columns where an AC_xxx and AN_xxx column is provided in the
-    #Exac VCF, e.g. Adj will calculate allele frequency from
-    # the AC_Adj and AN_Adj columns
-    # currently available Adj (all populations), POPMAX (highest af in any population),
-    # specific populations e.g. EUR, AFR and others
-    exac: [Adj,POPMAX]
-
-    # columns containing information about predicted damage from this mutation
-    # and strings to search for in these columns
-    # columns containing the strings are kept
-    # specified as column|string-string-...,column|string...
-    # e.g. SNPEFF_IMPACT|HIGH-MODERATE,CSQ|HIGH would look for
-    # "HIGH" or "MODERATE" in the SNPEFF_IMPACT column and
-    # "HIGH" in the CSQ column
-    damage: [CSQ|HIGH,dbNSFP_MutationTaster_pred|A-D,dbNSFP_Polyphen2_HDIV_pred|D-P,dbNSFP_Polyphen2_HVAR_pred|D-P,dbNSFP_SIFT_pred|D,SNPEFF_IMPACT|HIGH]
-
-    # if family is 1, keep only variants shared by any family members identified in
-    # the calculateFamily step
-    family: 1
+    # VEP filtering
+    # Provide filtering command for filter_vep.
+    # Can be multiple commands separated
+    vep: 
 
     # columns to keep in the final variant tables (apart from CHROM,POS,ID,REF1,ALT,GT)
     columns: [GQ,DP,FS,SOR,POPMAX_calc,Adj_calc,EUR_AF_calc,EAS_AF_calc,AFR_AF_calc,SAS_AF_calc,ESP_AF_POPMAX_calc,ESP_AF_GLOBAL_calc,CSQ,dbNSFP_MutationTaster_pred,dbNSFP_Polyphen2_HDIV_pred,dbNSFP_Polyphen2_HVAR_pred,dbNSFP_SIFT_pred,SNPEFF_IMPACT]
 
-king:
-    #path to king
-    path: /ifs/apps/bio/king
-
-    #path to plink
-    plink: /ifs/apps/bio/plink-1.07/plink-1.07-x86_64
-
 coverage:
 
     # to activate calculating coverage of candidate genes, specified below.
-    calculate: 0
+    calculate: 1
 
     # supply gtf
-    all_exons: /ifs/mirror/annotations/hg19_ensembl75/geneset_coding_exons.gtf.gz
+    all_exons: /path/to/geneset_coding_exons.gtf.gz
 
     # comma-separated list of canonical ensembl identifiers for candidate genes of interest
     candidates: ''
 
     # path to r script for generating the coverage plots
-    rscript: /ifs/projects/proj024/analysis/GRIN1_coverage/coverage.r
+    rscript: /path/to/coverage.r
 
     # the minimum read depth for counting a base as covered
     threshold: 15
@@ -237,16 +211,16 @@ roi:
     bed: ''
 
     # Regions of interest
-    roi: /ifs/projects/proj048/agilent/S04380110_Regions.bed
+    roi: /path/to/agilent/S04380110_Regions.bed
 
-    intervals: /ifs/projects/proj048/agilent/S04380110_Regions.interval_list
+    intervals: /path/to/agilent/S04380110_Regions.interval_list
 
     # File mapping regions of interest to genes
-    to_gene: /ifs/projects/proj048/agilent/regions_to_genes.txt
+    to_gene: /path/to/agilent/regions_to_genes.txt
 
-    baits: /ifs/projects/proj048/agilent/S04380110_Probes_5col_samheader_ucsc.bed
+    baits: /path/to/agilent/S04380110_Probes_5col_samheader_ucsc.bed
 
-    regions: /ifs/projects/proj048/agilent/S04380110_Regions_5col_samheader_ucsc.bed
+    regions: /path/to/agilent/S04380110_Regions_5col_samheader_ucsc.bed
 
     padding: 100
 
diff --git a/cgatpipelines/tools/pipeline_exome_cancer.py b/cgatpipelines/tools/pipeline_exome_cancer.py
index 3b4dab741..a3edb709c 100644
--- a/cgatpipelines/tools/pipeline_exome_cancer.py
+++ b/cgatpipelines/tools/pipeline_exome_cancer.py
@@ -155,7 +155,7 @@
 import sqlite3
 import cgatcore.iotools as iotools
 import cgatpipelines.tasks.mapping as mapping
-import cgatpipelines.tasks.mappingqc as mappingqc
+import cgatpipelines.tasks.bamstats as bamstats
 from cgatcore import pipeline as P
 import re
 import cgatpipelines.tasks.exome as exome
@@ -187,9 +187,11 @@ def connect():
 
 PARAMS = P.PARAMS
 
+PICARD_MEMORY = PARAMS["picard_memory"]
+
 mapping.PARAMS = PARAMS
-mappingqc.PARAMS = PARAMS
 exome.PARAMS = PARAMS
+
 #########################################################################
 
 
@@ -251,7 +253,7 @@ def mapReads(infile, outfile):
 @merge(mapReads, "picard_duplicate_stats.load")
 def loadPicardDuplicateStats(infiles, outfile):
     '''Merge Picard duplicate stats into single table and load into SQLite.'''
-    mappingqc.loadPicardDuplicateStats(infiles, outfile)
+    bamstats.loadPicardDuplicateStats(infiles, outfile)
 
 
 #########################################################################
@@ -263,7 +265,7 @@ def loadPicardDuplicateStats(infiles, outfile):
 @merge("bam/*.picard_stats", "picard_stats.load")
 def loadPicardAlignStats(infiles, outfile):
     '''Merge Picard alignment stats into single table and load into SQLite.'''
-    mappingqc.loadPicardAlignmentStats(infiles, outfile)
+    bamstats.loadPicardAlignmentStats(infiles, outfile)
 
 #########################################################################
 
@@ -292,8 +294,8 @@ def buildCoverageStats(infile, outfile):
                 '''
     P.run(statement)
 
-    mappingqc.buildPicardCoverageStats(
-        infile, outfile, modified_baits, modified_baits)
+    bamstats.buildPicardCoverageStats(
+        infile, outfile, modified_baits, modified_baits, PICARD_MEMORY)
 
     iotools.zap_file(modified_baits)
 
@@ -301,7 +303,7 @@ def buildCoverageStats(infile, outfile):
 @follows(buildCoverageStats)
 @merge(buildCoverageStats, "coverage_stats.load")
 def loadCoverageStats(infiles, outfile):
-    mappingqc.loadPicardCoverageStats(infiles, outfile)
+    bamstats.loadPicardCoverageStats(infiles, outfile)
 
 #########################################################################
 #########################################################################
@@ -463,16 +465,16 @@ def runPicardOnRealigned(infile, outfile):
     genome = "%s/%s.fa" % (PARAMS["bwa_index_dir"],
                            PARAMS["genome"])
 
-    mappingqc.buildPicardAlignmentStats(infile, outfile, genome)
-    mappingqc.buildPicardAlignmentStats(infile_tumor,
-                                                outfile_tumor, genome)
+    bamstats.buildPicardAlignmentStats(infile, outfile, genome, PICARD_MEMORY)
+    bamstats.buildPicardAlignmentStats(infile_tumor,
+                                                outfile_tumor, genome, PICARD_MEMORY)
 
 
 @follows(runPicardOnRealigned)
 @merge("bam/*.realigned.picard_stats", "realigned_picard_stats.load")
 def loadPicardRealigenedAlignStats(infiles, outfile):
     '''Merge Picard alignment stats into single table and load into SQLite.'''
-    mappingqc.loadPicardAlignmentStats(infiles, outfile)
+    bamstats.loadPicardAlignmentStats(infiles, outfile)
 
 #########################################################################
 #########################################################################
diff --git a/cgatpipelines/tools/pipeline_exome_cancer/pipeline.yml b/cgatpipelines/tools/pipeline_exome_cancer/pipeline.yml
index 5ea8949bd..ce0761375 100644
--- a/cgatpipelines/tools/pipeline_exome_cancer/pipeline.yml
+++ b/cgatpipelines/tools/pipeline_exome_cancer/pipeline.yml
@@ -9,6 +9,8 @@
 # the genome to use
 genome: human_g1k_v37
 
+picard_memory: 9G
+
 web_dir: ../web
 
 report_engine: cgatreport
diff --git a/cgatpipelines/tools/pipeline_genesets.py b/cgatpipelines/tools/pipeline_genesets.py
index 20553b932..aaae674f6 100644
--- a/cgatpipelines/tools/pipeline_genesets.py
+++ b/cgatpipelines/tools/pipeline_genesets.py
@@ -638,7 +638,7 @@ def buildCdsTranscript(infile, outfile):
 
 
 @transform(buildUCSCGeneSet,
-           regex("*.gtf.gz"),
+           regex(".*.gtf.gz"),
            PARAMS['interface_geneset_exons_gtf'])
 def buildExonTranscript(infile, outfile):
     '''
diff --git a/cgatpipelines/tools/pipeline_longread.py b/cgatpipelines/tools/pipeline_longread.py
new file mode 100644
index 000000000..89f54af05
--- /dev/null
+++ b/cgatpipelines/tools/pipeline_longread.py
@@ -0,0 +1,180 @@
+"""===========================
+Pipeline template
+===========================
+
+.. Replace the documentation below with your own description of the
+   pipeline's purpose
+
+Overview
+========
+
+This pipeline computes the word frequencies in the configuration
+files :file:``pipeline.yml` and :file:`conf.py`.
+
+Usage
+=====
+
+See :ref:`PipelineSettingUp` and :ref:`PipelineRunning` on general
+information how to use cgat pipelines.
+
+Configuration
+-------------
+
+The pipeline requires a configured :file:`pipeline.yml` file.
+cgatReport report requires a :file:`conf.py` and optionally a
+:file:`cgatreport.yml` file (see :ref:`PipelineReporting`).
+
+Default configuration files can be generated by executing:
+
+   python <srcdir>/pipeline_@template@.py config
+
+Input files
+-----------
+
+None required except the pipeline configuration files.
+
+Requirements
+------------
+
+The pipeline requires the results from
+:doc:`pipeline_annotations`. Set the configuration variable
+:py:data:`annotations_database` and :py:data:`annotations_dir`.
+
+Pipeline output
+===============
+
+.. Describe output files of the pipeline here
+
+Glossary
+========
+
+.. glossary::
+
+
+Code
+====
+
+"""
+from ruffus import *
+import sys
+import os
+import glob
+import cgatcore.experiment as E
+from cgatcore import pipeline as P
+
+
+# load options from the config file
+PARAMS = P.get_parameters(
+    ["%s/pipeline.yml" % os.path.splitext(__file__)[0],
+     "../pipeline.yml",
+     "pipeline.yml"])
+
+NPUT_FORMATS = ("*.fastq.1.gz", "*.fastq.gz", "*.sra", "*.csfasta.gz")
+REGEX_FORMATS = regex(r"(\S+).(fastq.1.gz|fastq.gz|sra|csfasta.gz)")
+
+matches = glob.glob("*.fastq.1.gz") + glob.glob(
+    "*.fastq.gz") + glob.glob("*.sra") + glob.glob("*.csfasta.gz")
+
+PICARD_MEMORY = PARAMS["picard_memory"]
+GATK_MEMORY = PARAMS["gatk_memory"]
+ANNOTATION_MEMORY = PARAMS["annotation_memory"]
+
+PANELS = glob.glob("*.panel.tsv")
+
+FILTERS = glob.glob ("*.filter.tsv")
+
+
+# ---------------------------------------------------
+# Specific pipeline tasks
+@mkdir("results.dir")
+@transform(("/ceph/project/talbotlab/jscaber/nanopore/"),
+           regex("(.*)"),
+           r"/ceph/project/talbotlab/jscaber/nanopore/results.dir")
+def nextflowEpi2me(infolder, outfile):
+    '''count the number of words in the pipeline configuration files.'''
+
+    genome_file = os.path.abspath(
+        os.path.join(PARAMS["genome_dir"], PARAMS["genome"] + ".fa"))
+    target_regions = PARAMS["targets"]
+
+    # the command line statement we want to execute
+    statement = '''nextflow run epi2me-labs/wf-cas9
+        --fastq %(infolder)s 
+        --ref_genome %(genome_file)s
+        --targets %(target_regions)s 
+        --out_dir %(outfile)s
+        --full_report -profile singularity
+    
+    '''
+    P.run(statement)
+
+
+@mkdir("last.dir")
+@transform(("nextflow/fastq_pass/fastq_pass_ontarget.fastq"),
+           regex("(.*)"),
+           r"last.dir/fastq_pass_on_target.par")
+def lastTrain(infile, outfile):
+    '''count the number of words in the pipeline configuration files.'''
+
+    genome_file = os.path.abspath(
+        os.path.join(PARAMS["genome_dir"], PARAMS["genome"] + ".fa"))
+    last_index = PARAMS["LAST_index"]
+
+    # the command line statement we want to execute
+    statement = '''last-train
+        -P10 -Q0 %(last_index)s %(infile)s > %(outfile)s
+    '''
+    P.run(statement)
+
+@mkdir("last.dir")
+@transform(lastTrain,
+           regex("(.*).par"),
+           add_inputs(r"nextflow/fastq_pass/fastq_pass_ontarget.fastq"),
+           r"\1.maf")
+def lastAl(infiles, outfile):
+    '''count the number of words in the pipeline configuration files.'''
+
+    infile1,infile2 = infiles
+    genome_file = os.path.abspath(
+        os.path.join(PARAMS["genome_dir"], PARAMS["genome"] + ".fa"))
+    last_index = PARAMS["LAST_index"]
+
+    # the command line statement we want to execute
+    statement = '''lastal -P10 --split -p 
+        %(infile1)s %(last_index)s %(infile2)s > %(outfile)s
+    '''
+    P.run(statement)
+
+
+@transform(lastAl,
+           regex("(.*).maf"),
+           r"\1.txt")
+def tandemGenotypes(infile, outfile):
+    '''count the number of words in the pipeline configuration files.'''
+
+    genome_file = os.path.abspath(
+        os.path.join(PARAMS["genome_dir"], PARAMS["genome"] + ".fa"))
+    last_index = PARAMS["LAST_index"]
+    repeat_mask = PARAMS["repeat_mask"]
+
+
+    # the command line statement we want to execute
+    statement = '''tandem-genotypes -g target_regions.bed %(repeat_mask)s %(infile)s > %(outfile)s
+    '''
+    P.run(statement)
+
+# ---------------------------------------------------
+# Generic pipeline tasks
+@follows(nextflowEpi2me)
+def full():
+    pass
+
+
+def main(argv=None):
+    if argv is None:
+        argv = sys.argv
+    P.main(argv)
+
+
+if __name__ == "__main__":
+    sys.exit(P.main(sys.argv))
diff --git a/cgatpipelines/tools/pipeline_mapping.py b/cgatpipelines/tools/pipeline_mapping.py
index 607ccd661..3d6445674 100644
--- a/cgatpipelines/tools/pipeline_mapping.py
+++ b/cgatpipelines/tools/pipeline_mapping.py
@@ -225,7 +225,6 @@
 import cgat.BamTools.bamtools as BamTools
 import cgatpipelines.tasks.geneset as geneset
 import cgatpipelines.tasks.mapping as mapping
-import cgatpipelines.tasks.mappingqc as mappingqc
 
 # Pipeline configuration
 P.get_parameters(
@@ -247,7 +246,6 @@
     restrict_interface=True))
 
 geneset.PARAMS = PARAMS
-mappingqc.PARAMS = PARAMS
 
 # Helper functions mapping tracks to conditions, etc
 # determine the location of the input files (reads).
@@ -1982,7 +1980,7 @@ def mergeBAMFiles(infiles, outfile):
 
         infiles = " ".join(infiles)
         statement = '''
-        samtools merge %(outfile)s %(infiles)s >& %(outfile)s.log &&
+        samtools merge -f %(outfile)s %(infiles)s >& %(outfile)s.log &&
         samtools index %(outfile)s
         '''
         P.run(statement)
diff --git a/cgatpipelines/tools/pipeline_peakcalling.py b/cgatpipelines/tools/pipeline_peakcalling.py
index 423fd9afe..442dd2f1e 100644
--- a/cgatpipelines/tools/pipeline_peakcalling.py
+++ b/cgatpipelines/tools/pipeline_peakcalling.py
@@ -248,7 +248,7 @@
 import cgatcore.experiment as E
 import cgatcore.iotools as iotools
 from cgatcore import pipeline as P
-import cgatpipelines.tasks.mappingqc as mappingqc
+import cgatpipelines.tasks.bamstats as bamstats
 import cgatpipelines.tasks.peakcalling as peakcalling
 import cgat.BamTools.bamtools as Bamtools
 import cgatcore.database as DB
@@ -522,7 +522,7 @@ def loadIdxstats(infiles, outfile):
         idxstats output. Filename format is expected to be 'sample.idxstats'
     outfile : string
         Logfile. The table name will be derived from `outfile`.'''
-    mappingqc.loadIdxstats(infiles, outfile)
+    bamstats.loadIdxstats(infiles, outfile)
 
 
 @transform((filterChipBAMs, filterInputBAMs),
@@ -533,9 +533,10 @@ def buildPicardStats(infiles, outfile):
     infile = infiles[0]
     reffile = os.path.join(PARAMS["genome_dir"], PARAMS["genome"] + ".fa")
 
-    mappingqc.buildPicardAlignmentStats(infile,
-                                                outfile,
-                                                reffile)
+    bamstats.buildPicardAlignmentStats(infile,
+                                       outfile,
+                                       reffile,
+                                       picardmem="9G")
 
 
 @jobs_limit(PARAMS.get("jobs_limit_db", 1), "db")
@@ -543,7 +544,7 @@ def buildPicardStats(infiles, outfile):
 def loadPicardStats(infiles, outfile):
     '''merge alignment stats into single tables.
     '''
-    mappingqc.loadPicardAlignmentStats(infiles, outfile)
+    bamstats.loadPicardAlignmentStats(infiles, outfile)
 
 
 @follows(loadFilteringStats,
diff --git a/cgatpipelines/tools/pipeline_rnaseqdiffexpression.py b/cgatpipelines/tools/pipeline_rnaseqdiffexpression.py
index 552775a82..a9e80c85e 100644
--- a/cgatpipelines/tools/pipeline_rnaseqdiffexpression.py
+++ b/cgatpipelines/tools/pipeline_rnaseqdiffexpression.py
@@ -539,7 +539,7 @@ def buildSalmonIndex(infile, outfile):
        path to output file
     '''
 
-    job_memory = "30G"
+    job_memory = "40G"
     threads = 12
     if PARAMS["salmon_threads"] is not None:
         threads = PARAMS["salmon_threads"]
@@ -1037,6 +1037,7 @@ def count():
 def filterDESeq2(infiles, outfile, design_name, quantifier_name):
     ''' Load counts into RDS object and filter'''
 
+    job_memory = '20G'
     counts, design = infiles
     transcripts, genes = counts
     quantifier_name = quantifier_name.lower()
diff --git a/cgatpipelines/tools/pipeline_rnaseqqc.py b/cgatpipelines/tools/pipeline_rnaseqqc.py
index 3b9220396..6cced4a81 100644
--- a/cgatpipelines/tools/pipeline_rnaseqqc.py
+++ b/cgatpipelines/tools/pipeline_rnaseqqc.py
@@ -193,7 +193,7 @@
 import cgatcore.iotools as iotools
 import cgatpipelines.tasks.mapping as mapping
 import cgatpipelines.tasks.windows as windows
-import cgatpipelines.tasks.mappingqc as mappingqc
+import cgatpipelines.tasks.bamstats as bamstats
 import cgatpipelines.tasks.rnaseq as rnaseq
 from cgatcore import pipeline as P
 
@@ -634,12 +634,12 @@ def buildBAMStats(infile, outfile):
     P.run(statement)
 
 
-@P.add_doc(mappingqc.loadBAMStats)
+@P.add_doc(bamstats.loadBAMStats)
 @jobs_limit(PARAMS.get("jobs_limit_db", 1), "db")
 @merge(buildBAMStats, "bam_stats.load")
 def loadBAMStats(infiles, outfile):
     ''' load bam statistics into bam_stats table '''
-    mappingqc.loadBAMStats(infiles, outfile)
+    bamstats.loadBAMStats(infiles, outfile)
 
 
 @P.add_doc(windows.summarizeTagsWithinContext)
@@ -842,7 +842,7 @@ def buildRefFlat(infile, outfile):
     os.unlink(tmpflat)
 
 
-@P.add_doc(mappingqc.buildPicardRnaSeqMetrics)
+@P.add_doc(bamstats.buildPicardRnaSeqMetrics)
 @transform(mapReadsWithHisat,
            suffix(".bam"),
            add_inputs(buildRefFlat),
@@ -857,16 +857,16 @@ def buildPicardRnaSeqMetrics(infiles, outfile):
     else:
         strand = "NONE"
 
-    mappingqc.buildPicardRnaSeqMetrics(infiles, strand, outfile)
+    bamstats.buildPicardRnaSeqMetrics(infiles, strand, outfile, "9G")
 
 
-@P.add_doc(mappingqc.loadPicardRnaSeqMetrics)
+@P.add_doc(bamstats.loadPicardRnaSeqMetrics)
 @jobs_limit(PARAMS.get("jobs_limit_db", 1), "db")
 @merge(buildPicardRnaSeqMetrics, ["picard_rna_metrics.load",
                                   "picard_rna_histogram.load"])
 def loadPicardRnaSeqMetrics(infiles, outfiles):
     '''merge alignment stats into single tables.'''
-    mappingqc.loadPicardRnaSeqMetrics(infiles, outfiles)
+    bamstats.loadPicardRnaSeqMetrics(infiles, outfiles)
 
 
 ###################################################################
diff --git a/cgatpipelines/tools/pipeline_splicing.py b/cgatpipelines/tools/pipeline_splicing.py
index 1e9a9e0fb..9bee80f59 100644
--- a/cgatpipelines/tools/pipeline_splicing.py
+++ b/cgatpipelines/tools/pipeline_splicing.py
@@ -789,6 +789,9 @@ def collateMATS(infiles, outfile):
     MATS_fdr : string
        :term:`PARAMS`. User specified threshold for result counting
 
+    MATS_psi: string
+       :term: `PARAMS`. User specified PSI threshold for result counting
+
     outfile: string
         summary file containing number of results below FDR threshold
     '''
@@ -811,20 +814,51 @@ def collateMATS(infiles, outfile):
     for event in ["SE", "A5SS", "A3SS", "MXE", "RI"]:
         temp = pd.read_csv("%s/%s.MATS.JC.txt" %
                            (indir, event), sep='\t')
-        total.append(int(len(temp[(temp['FDR'] <
-                                float(PARAMS['MATS_fdr'])) & (abs(temp['IncLevelDifference']) > 0.1)])))
-        up.append(int(len(temp[(temp['FDR'] <
-                                float(PARAMS['MATS_fdr'])) & (temp['IncLevelDifference'] > 0.1)])))
-        down.append(int(len(temp[(temp['FDR'] <
-                                  float(PARAMS['MATS_fdr'])) & (temp['IncLevelDifference'] < -0.1)])))
+        # calculate mean coverage for particular junction for each variable
+        temp['SJC_SUM_1'] = [sum(x)/len(x) for x in temp.SJC_SAMPLE_1.str.split(',').apply(lambda x: [float(i) for i in x])]
+        temp['IJC_SUM_1'] = [sum(x)/len(x) for x in temp.IJC_SAMPLE_1.str.split(',').apply(lambda x: [float(i) for i in x])]
+        temp['SJC_SUM_2'] = [sum(x)/len(x) for x in temp.SJC_SAMPLE_2.str.split(',').apply(lambda x: [float(i) for i in x])]
+        temp['IJC_SUM_2'] = [sum(x)/len(x) for x in temp.IJC_SAMPLE_2.str.split(',').apply(lambda x: [float(i) for i in x])]
+        temp['1_MAX'] = temp[['SJC_SUM_1','IJC_SUM_1']].max(axis=1)
+        temp['2_MAX'] = temp[['SJC_SUM_2','IJC_SUM_2']].max(axis=1)
+        # three conditions to be met:
+        # 1. FDR thrshold
+        # 2. Fixed PSI threshold
+        # 3. minimum coverage in both conditions
+        total.append(int(len(temp[(temp['FDR'] < float(PARAMS['MATS_fdr'])) & 
+                                  (abs(temp['IncLevelDifference']) > PARAMS['MATS_psi']) &
+                                  (temp['1_MAX'] >= PARAMS['MATS_coverage']) &
+                                  (temp['2_MAX'] >= PARAMS['MATS_coverage'])])))
+        up.append(int(len(temp[(temp['FDR'] < float(PARAMS['MATS_fdr'])) &
+                               (temp['IncLevelDifference'] > PARAMS['MATS_psi']) &
+                               (temp['1_MAX'] >= PARAMS['MATS_coverage']) &
+                               (temp['2_MAX'] >= PARAMS['MATS_coverage'])])))
+        down.append(int(len(temp[(temp['FDR'] < float(PARAMS['MATS_fdr'])) &
+                                 (temp['IncLevelDifference'] < -PARAMS['MATS_psi']) &
+                                 (temp['1_MAX'] >= PARAMS['MATS_coverage']) &
+                                 (temp['2_MAX'] >= PARAMS['MATS_coverage'])])))
+
         temp = pd.read_csv("%s/%s.novelJunc.JC.tsv" %
                            (indir, event), sep='\t')
-        total_newJunc.append(int(len(temp[(temp['FDR'] <
-                                float(PARAMS['MATS_fdr'])) & (abs(temp['IncLevelDifference']) > 0.1)])))
-        up_newJunc.append(int(len(temp[(temp['FDR'] <
-                                float(PARAMS['MATS_fdr'])) & (temp['IncLevelDifference'] > 0.1)])))
-        down_newJunc.append(int(len(temp[(temp['FDR'] <
-                                  float(PARAMS['MATS_fdr'])) & (temp['IncLevelDifference'] < -0.1)])))
+        temp['SJC_SUM_1'] = [sum(x)/len(x) for x in temp.SJC_SAMPLE_1.str.split(',').apply(lambda x: [float(i) for i in x])]
+        temp['IJC_SUM_1'] = [sum(x)/len(x) for x in temp.IJC_SAMPLE_1.str.split(',').apply(lambda x: [float(i) for i in x])]
+        temp['SJC_SUM_2'] = [sum(x)/len(x) for x in temp.SJC_SAMPLE_2.str.split(',').apply(lambda x: [float(i) for i in x])]
+        temp['IJC_SUM_2'] = [sum(x)/len(x) for x in temp.IJC_SAMPLE_2.str.split(',').apply(lambda x: [float(i) for i in x])]
+        temp['1_MAX'] = temp[['SJC_SUM_1','IJC_SUM_1']].max(axis=1)
+        temp['2_MAX'] = temp[['SJC_SUM_2','IJC_SUM_2']].max(axis=1)
+        total_newJunc.append(int(len(temp[(temp['FDR'] < float(PARAMS['MATS_fdr'])) & 
+                                          (abs(temp['IncLevelDifference']) > PARAMS['MATS_psi']) &
+                                          (temp['1_MAX'] >= PARAMS['MATS_coverage']) &
+                                          (temp['2_MAX'] >= PARAMS['MATS_coverage'])])))
+        up_newJunc.append(int(len(temp[(temp['FDR'] < float(PARAMS['MATS_fdr'])) &
+                                       (temp['IncLevelDifference'] > PARAMS['MATS_psi']) &
+                                       (temp['1_MAX'] >= PARAMS['MATS_coverage']) &
+                                       (temp['2_MAX'] >= PARAMS['MATS_coverage'])])))
+        down_newJunc.append(int(len(temp[(temp['FDR'] < float(PARAMS['MATS_fdr'])) &
+                                         (temp['IncLevelDifference'] < -PARAMS['MATS_psi']) &
+                                         (temp['1_MAX'] >= PARAMS['MATS_coverage']) &
+                                         (temp['2_MAX'] >= PARAMS['MATS_coverage'])])))
+
         #experimental feature - deactivated       
         #temp = pd.read_csv("%s/%s.novelSS.JC.tsv" %
         #                   (indir, event), sep='\t')
@@ -949,8 +983,17 @@ def runPermuteMATS(infiles, outfile, design):
     for event in ["SE", "A5SS", "A3SS", "MXE", "RI"]:
         temp = pd.read_csv("%s/%s.MATS.JC.txt" %
                            (os.path.dirname(outfile), event), sep='\t')
-        collate.append(str(len(temp[(temp['FDR'] <
-                                float(PARAMS['MATS_fdr'])) & (abs(temp['IncLevelDifference']) > 0.1)])))
+        
+        temp['SJC_SUM_1'] = [sum(x)/len(x) for x in temp.SJC_SAMPLE_1.str.split(',').apply(lambda x: [float(i) for i in x])]
+        temp['IJC_SUM_1'] = [sum(x)/len(x) for x in temp.IJC_SAMPLE_1.str.split(',').apply(lambda x: [float(i) for i in x])]
+        temp['SJC_SUM_2'] = [sum(x)/len(x) for x in temp.SJC_SAMPLE_2.str.split(',').apply(lambda x: [float(i) for i in x])]
+        temp['IJC_SUM_2'] = [sum(x)/len(x) for x in temp.IJC_SAMPLE_2.str.split(',').apply(lambda x: [float(i) for i in x])]
+        temp['1_MAX'] = temp[['SJC_SUM_1','IJC_SUM_1']].max(axis=1)
+        temp['2_MAX'] = temp[['SJC_SUM_2','IJC_SUM_2']].max(axis=1)
+        collate.append(str(len(temp[(temp['FDR'] < float(PARAMS['MATS_fdr'])) & 
+                                  (abs(temp['IncLevelDifference']) > PARAMS['MATS_psi']) &
+                                  (temp['1_MAX'] >= PARAMS['MATS_coverage']) &
+                                  (temp['2_MAX'] >= PARAMS['MATS_coverage'])])))
     with open(outfile, "w") as f:
         f.write("Group1\tGroup2\tSE\tA5SS\tA3SS\tMXE\tRI\n")
         f.write('\t'.join(collate))
diff --git a/cgatpipelines/tools/pipeline_splicing/pipeline.yml b/cgatpipelines/tools/pipeline_splicing/pipeline.yml
index 505e2995b..6bf7975fc 100644
--- a/cgatpipelines/tools/pipeline_splicing/pipeline.yml
+++ b/cgatpipelines/tools/pipeline_splicing/pipeline.yml
@@ -44,17 +44,28 @@ annotations:
     dir: ''
 
 
-    ################################################################
-    #
-    # MATS options
-    #
-    ################################################################
+################################################################
+#
+# MATS options
+#
+################################################################
 MATS:
     # Cutoff splicing difference
     # The cutoff used in the null hypothesis test for differential splicing. 
     # The default is 0.0001 for 0.01% difference. Valid: 0 <= cutoff < 1
     cutoff: ''
 
+    # PSI threshold
+    # This allows for setting a fixed minimum PSI change (e.g. 0.1) for filtering
+    # results. This can be used with a more lenient cstat, to increase the number
+    # of results. The threshold is not hypothesis-tested
+    psi: ''
+
+    # Coverage threshold
+    # This allows for setting a fixed minimum number of reads that support this
+    # alternative splicing 
+    coverage: ''
+
     # Strandednes
     # fr-firststrand, fr-secondstrand or fr-unstranded
     libtype: ''
@@ -71,11 +82,11 @@ MATS:
 
 
 
-    ################################################################
-    #
-    # DEXSeq options
-    #
-    ################################################################
+################################################################
+#
+# DEXSeq options
+#
+ ################################################################
 DEXSeq:
 
     # Strand specific library prep
diff --git a/cgatpipelines/tools/pipeline_windows.py b/cgatpipelines/tools/pipeline_windows.py
index 108f44e6f..a09ba435f 100644
--- a/cgatpipelines/tools/pipeline_windows.py
+++ b/cgatpipelines/tools/pipeline_windows.py
@@ -120,7 +120,7 @@
 from cgatcore import pipeline as P
 import cgatpipelines.tasks.windows as windows
 import cgatpipelines.tasks.tracks as tracks
-import cgatpipelines.tasks.mappingqc as mappingqc
+import cgatpipelines.tasks.bamstats as bamstats
 
 
 #########################################################################
@@ -210,13 +210,12 @@ def loadTagCounts(infiles, outfile):
                      suffix=".tsv")
 
 
-# @P.add_doc(mappingqc.loadPicardDuplicateStats)
 @merge(prepareTags, "picard_duplicates.load")
 def loadPicardDuplicateStats(infiles, outfile):
     '''Merge Picard duplicate stats into single table and load into SQLite
     table picard_duplicates.
     '''
-    mappingqc.loadPicardDuplicateStats(
+    bamstats.loadPicardDuplicateStats(
         infiles, outfile, pipeline_suffix=".bed.gz")
 
 
diff --git a/conda/environments/cgat-flow.yml b/conda/environments/cgat-flow.yml
index 2224fcb59..e465dfba5 100644
--- a/conda/environments/cgat-flow.yml
+++ b/conda/environments/cgat-flow.yml
@@ -14,7 +14,6 @@ channels:
 dependencies:
 # python dependencies
 - python >= 3.6
-- rpy2 <= 2.9.4
 - tzlocal
 
 # exome.py
diff --git a/setup.py b/setup.py
index a1c95e7de..ec544f9ab 100644
--- a/setup.py
+++ b/setup.py
@@ -70,7 +70,7 @@
     raise SystemExit("""CGAT requires Python 3 or later.""")
 
 cgat_packages = find_packages()
-cgat_package_dirs = {'cgatpipelines': 'cgatpipelines'}
+cgat_package_dirs = {'cgatpipelines': 'cgatpipelines', 'Rtools': 'Rtools'}
 
 # Classifiers
 classifiers = """