cgat-developers · jscaber · Feb 10, 2022 · Feb 10, 2022 · Feb 14, 2022 · Feb 14, 2022
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -22,5 +22,8 @@ exclude scripts/*
 include scripts/__init__.py
 include scripts/version.py
 include scripts/cgat_ruffus_profile.py
+include cgatpipelines/Rtools/*
+include cgatpipelines/experiment.R
+
 
 # extensions
diff --git a/cgatpipelines/Rtools/diffexpression.R b/cgatpipelines/Rtools/diffexpression.R
@@ -22,11 +22,11 @@ suppressMessages(library(goseq))
 source(file.path(Sys.getenv("R_ROOT"), "experiment.R"))
 
 
-mart = useMart(biomart = "ENSEMBL_MART_ENSEMBL",dataset="hsapiens_gene_ensembl", host = "jul2018.archive.ensembl.org")
+mart = useMart(biomart = "ENSEMBL_MART_ENSEMBL",dataset="hsapiens_gene_ensembl", host = "https://aug2020.archive.ensembl.org")
 getmart <- function(values){
   data<- getBM(
     filters= "ensembl_gene_id", 
-    attributes= c("ensembl_gene_id", "external_gene_name", "description","entrezgene", 'chromosome_name',
+    attributes= c("ensembl_gene_id", "external_gene_name", "description","entrezgene_id", 'chromosome_name',
                    'start_position', 'end_position'),
     values= values,
     mart= mart,
@@ -35,7 +35,7 @@ getmart <- function(values){
   return(data)
 }
 
-start_plot <- function(section, height = 6, width = 6, type = "png", outdir="") {
+start_plot <- function(section, height = 10, width = 10, type = "png", outdir="") {
     file = get_output_filename(paste0(outdir,"/",section, ".", type))
     Cairo(file = file,
           type = type,
@@ -78,7 +78,8 @@ plotTPMs <- function(dftemp, contrast_name){
     ggplot(aes(x = contrast, y = value, color = contrast)) +
     geom_point(position = position_jitter(w = 0.15, h = 0)) +
     facet_wrap(~ var, scales = "free") + theme_bw() +
-    ylab("normalised counts") + xlab (contrast_name) + guides(color = "none")
+    ylab("normalised counts") + xlab (contrast_name) + guides(color = "none")+
+    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
 }
 
 
@@ -114,7 +115,10 @@ run <- function(opt) {
   flog.info("... plotting MA")
   ## MA Plot
   start_plot("MAPlot", outdir=opt$outdir)
-    DESeq2::plotMA(resLFC, ylim = c(-3,3))
+    # Scale the y-axis to the observed fold changes. ylim has to be given as
+    # c(lower, upper); NA fold changes are ignored when computing the range.
+    lfc_range <- range(resLFC$log2FoldChange, na.rm = TRUE)
+    DESeq2::plotMA(resLFC, ylim = c(floor(lfc_range[1]), ceiling(lfc_range[2])))
   end_plot()
 
   flog.info("... saving DE data")
@@ -178,6 +179,7 @@ run <- function(opt) {
 
   flog.info("... plotting user-defined genes")
   genelist <- unlist(strsplit(opt$userlist, ","))
+  genelist <- genelist[genelist %in% rownames(dds)]
   dftemp <- makeTPMtable(genelist, counts(dds, normalized=TRUE), colData(dds), opt$contrast)
   start_plot("Userdefined", outdir=opt$outdir)
     print(plotTPMs(dftemp, opt$contrast))

diff --git a/cgatpipelines/Rtools/exploratory.R b/cgatpipelines/Rtools/exploratory.R
@@ -33,11 +33,11 @@ source(file.path(Sys.getenv("R_ROOT"), "experiment.R"))
 # Dependencies: biomaRt package
 # Input: vector of ensembl gene ids
 # Output: data frame with description, symbol and entrez gene
-mart = useMart(biomart = "ENSEMBL_MART_ENSEMBL",dataset="hsapiens_gene_ensembl", host = "jul2018.archive.ensembl.org")
+mart = useMart(biomart = "ENSEMBL_MART_ENSEMBL",dataset="hsapiens_gene_ensembl", host = "https://aug2020.archive.ensembl.org")
 getmart <- function(values){
   data<- getBM(
     filters= "ensembl_gene_id", 
-    attributes= c("ensembl_gene_id", "external_gene_name", "description","entrezgene"),
+    attributes= c("ensembl_gene_id", "external_gene_name", "description","entrezgene_id"),
     values= values,
     mart= mart,
     useCache = FALSE)
@@ -74,35 +74,15 @@ run <- function(opt) {
     dds = DESeqDataSetFromMatrix(experiment$counts, experiment$sample, design = formula(opt$model))
   }
   futile.logger::flog.info(paste("reading  Experiment object", paste(dim(counts(experiment)), collapse = ",")))
-
-  ### SVA - ANALYSIS OF BIASES ###
-  futile.logger::flog.info(paste("Performing Surrogate Variable Analysis"))
-  dds <- estimateSizeFactors(dds)
-  dat  <- counts(dds, normalized = TRUE)
-  idx  <- rowMeans(dat) > 1
-  dat  <- dat[idx, ]
-  mod  <- model.matrix(formula(opt$model), colData(dds))
-  mod0 <- model.matrix(~ 1, colData(dds))
-  svseq <- svaseq(dat, mod, mod0, n.sv = 2)
-  for(factor in opt$factors){
-    start_plot(paste0('SVA for ', factor), outdir=opt$outdir)
-    par(mfrow = c(2, 1), mar = c(3,5,3,1))
-    for (i in 1:2) {
-      stripchart(svseq$sv[, i] ~ colData(dds)[, factor], vertical = TRUE, main = paste0("SV", i))
-      abline(h = 0)
-    }
-    end_plot()
-  }
-
+
   ### TRANSFORMATION OF DATA ###
   futile.logger::flog.info(paste("Transforming data"))
-  rld<- rlog(dds)
+  dds <- estimateSizeFactors(dds)
   vsd<- vst(dds)
   df <- bind_rows(
     as_tibble(log2(counts(dds, normalized=TRUE)[, 1:2]+1)) %>%
       mutate(transformation = "log2(x + 1)"),
-    as_tibble(assay(vsd)[, 1:2]) %>% mutate(transformation = "vst"),
-    as_tibble(assay(rld)[, 1:2]) %>% mutate(transformation = "rlog"))
+    as_tibble(assay(vsd)[, 1:2]) %>% mutate(transformation = "vst"))
   colnames(df)[1:2] <- c("x", "y")
   start_plot('Variance_Transformations', outdir=opt$outdir)
     print(ggplot(df, aes(x = x, y = y)) + geom_hex(bins = 80) +
@@ -120,11 +100,12 @@ run <- function(opt) {
     if(dim(pca$x)[1]>7){
       dim_pca = 8
     } else {
-      dim_pca = dim(pca$x)
+      dim_pca = dim(pca$x)[1]
     }
     scores <- data.frame(variable.group, pca$x[,1:dim_pca])
     start_plot(paste0('PCA_', factor), outdir=opt$outdir)
-      print(qplot(x=PC1, y=PC2, data=scores, colour=factor(variable.group)) +
+      print(ggplot(scores, aes(x=PC1, y=PC2, colour=factor(variable.group))) +
+	      geom_point() +
         theme(legend.position="right") +  
         labs(colour=factor, x=paste0("PC1 (", percentVar[1],"% of variance)"),
              y=paste0("PC2 (", percentVar[2],"% of variance)")) + 
@@ -133,7 +114,8 @@ run <- function(opt) {
         theme(text=element_text(family='serif')))
     end_plot()
     start_plot(paste0('PCA_13_', factor), outdir=opt$outdir)
-          print(qplot(x=PC1, y=PC3, data=scores, colour=factor(variable.group)) +
+          print(ggplot(scores, aes(x=PC1, y=PC3, colour=factor(variable.group))) +
+          geom_point() +
         theme(legend.position="right") +  
         labs(colour=factor, x=paste0("PC1 (", percentVar[1],"% of variance)"),
              y=paste0("PC3 (", percentVar[3],"% of variance)")) + 
@@ -146,7 +128,7 @@ run <- function(opt) {
   names(variable.group) <- opt$contrast
   scores <- data.frame(variable.group, pca$x[,1:dim_pca])
   start_plot(paste0('PCA_grid'), outdir=opt$outdir)
-  if(dim_pca>7){
+  if(dim_pca==8){
     print(ggplot(scores, aes(x = .panel_x, y = .panel_y, fill = variable.group, colour = variable.group)) + 
       geom_point(shape = 16, size = 0.5, position = 'auto') + 
       geom_autodensity(alpha = 0.3, colour = NA, position = 'identity') + 
@@ -171,14 +153,23 @@ run <- function(opt) {
   df <- as.data.frame(colData(dds)[,opt$factors])
   rownames(df) <- colData(dds)$track
   # Heatmap of Top 20 Expressed Genes
-  select <- order(rowMeans(counts(dds,normalized=TRUE)),decreasing=TRUE)[1:20]
+  # head() instead of [1:20]: with fewer than 20 genes [1:20] pads the index
+  # with NA, whereas head() simply returns what is available.
+  selected <- head(order(rowMeans(counts(dds,normalized=TRUE)),decreasing=TRUE),20)
   start_plot('Heatmap_topExpressed', outdir=opt$outdir)
-    pheatmap(assay(vsd)[select,], cluster_rows=FALSE, cluster_cols=FALSE, show_rownames=FALSE, annotation_col=df)
+    pheatmap(assay(vsd)[selected,], cluster_rows=FALSE, cluster_cols=FALSE, show_rownames=FALSE, annotation_col=df)
   end_plot()
   # Heatmap of Top 20 Variable Genes
   topVarGenes <- head(order(rowVars(assay(vsd)),decreasing=TRUE),20)
+  # Collapse vectors before logging so that each message is a single log line.
+  futile.logger::flog.info(paste("topVarGenes", paste(topVarGenes, collapse = ",")))
   mat <- assay(vsd)[ topVarGenes, ]
+  futile.logger::flog.info(paste("mat", paste(rownames(mat), collapse = ",")))
   temp <- getmart(rownames(mat))
+  # Row names must be unique, so keep only the first annotation row per
+  # ensembl gene id before using the ids as row names.
+  temp <- temp[!duplicated(temp$ensembl_gene_id), ]
+  futile.logger::flog.info(paste("temp", paste(dim(temp), collapse = ",")))
   row.names(temp) <- temp$ensembl_gene_id
   rownames(mat) <- temp[rownames(mat),"external_gene_name"]
   start_plot('Heatmap_topVariable', outdir=opt$outdir)
@@ -187,7 +173,14 @@ run <- function(opt) {
   # Heatmap of Genes of interest
   if (!is.null(opt$genes_of_interest)) {
     print(rownames(assay(vsd)))
-    mat <- assay(vsd)[opt$genes_of_interest, ]
+    # Keep only the requested genes that are present in the transformed data.
+    goi <- opt$genes_of_interest
+    goi <- goi[goi %in% rownames(assay(vsd))]
+    # drop = FALSE keeps a matrix even when a single gene is left, so that
+    # rowMeans() and rownames() below still work.
+    mat <- assay(vsd)[goi, , drop = FALSE]
     mat <- mat - rowMeans(mat)
     temp <- getmart(rownames(mat))
+    # Row names must be unique: drop repeated ensembl gene ids first.
+    temp <- temp[!duplicated(temp$ensembl_gene_id), ]
     row.names(temp) <- temp$ensembl_gene_id
@@ -227,6 +215,7 @@ run <- function(opt) {
 
   ### EXPLORE BATCH EFFECTS ###
   futile.logger::flog.info(paste("Exploring Batch Effects"))
+  futile.logger::flog.info(dim_pca)
   if(dim_pca == 8){
     for(factor in opt$factors){
       factor_transformed <- vsd
@@ -237,7 +226,8 @@ run <- function(opt) {
       percentVar <- round(100 * summary(pca)$importance[2,])
       scores <- data.frame(variable.group, sample.group, pca$x[,1:2])
       start_plot(paste0('PCA_', factor, '_removed'), outdir=opt$outdir)
-      print(qplot(x=PC1, y=PC2, data=scores, colour=factor(variable.group), shape=factor(sample.group)) +
+      print(ggplot(scores, aes(x=PC1, y=PC2, colour=factor(variable.group), shape=factor(sample.group))) + 
+            geom_point() +
             theme(legend.position="right") +  
             labs(colour=opt$contrast, shape=factor, x=paste0("PC1 (", percentVar[1],"% of variance)"),
                  y=paste0("PC2 (", percentVar[2],"% of variance)")) + 

diff --git a/cgatpipelines/Rtools/filtercounts.R b/cgatpipelines/Rtools/filtercounts.R
@@ -28,9 +28,13 @@ run <- function(opt) {
   ### READING DATA ###
   # Read in sampleData Table
   futile.logger::flog.info(paste("reading sampleData table from", normalizePath(opt$sampleData)))
-  sampleData <- read.table(opt$sampleData, header = TRUE)
+  # read_tsv() returns a tibble; convert it to a plain data.frame because row
+  # names are assigned below and setting row names on a tibble is deprecated.
+  sampleData <- as.data.frame(read_tsv(opt$sampleData))
   sampleData <-sampleData[sampleData$include ==1, ]
   futile.logger::flog.info(paste("read sampleData ", paste(dim(sampleData), collapse = ",")))
+  # Log the retained sample names as one line.
+  futile.logger::flog.info(paste("samples:", paste(sampleData$track, collapse = ",")))
   rownames(sampleData) <- sampleData$track
 
   futile.logger::flog.info(paste("reading in data from ", opt$source))
@@ -81,7 +82,9 @@ run <- function(opt) {
       flattenedfile=opt$flattenedFile)
   } else if(opt$source == "counts_table"){
     # Read in Data
-    raw <- read.table(file = gzfile(opt$counts_tsv), header=TRUE, row.name=1)
+    # check.names = FALSE keeps the column names exactly as written in the header.
+    raw <- read.table(file = gzfile(opt$counts_tsv), header=TRUE, row.names=1, check.names = FALSE)
+    futile.logger::flog.info(paste("Colnames: ", paste(colnames(raw), collapse = ",")))
     experiment_tsv <- raw[,sampleData$track,drop=FALSE]
     if(opt$method == "deseq2"){
       dataset = DESeqDataSetFromMatrix(experiment_tsv, sampleData, design = formula(opt$model))
@@ -107,6 +109,9 @@ run <- function(opt) {
       futile.logger::flog.info(paste("Counts before filtering ", paste(dim(counts(dataset)), collapse = ",")))
       keep <- rowSums(counts(dataset)) >= 10
       dataset <- dataset[keep,]
+      #dataset <- dataset[Reduce('|', dataset[sapply(dataset, is.numeric)]),]
+      #dataset <- dataset[rowSums(dataset[,sapply(dataset, is.numeric)]==0)<6,]
+      #dataset <- subset(dataset, !rowSums(assay(dataset) == 0) > 6)
       counts_table <- counts(dataset)
       futile.logger::flog.info(paste("Counts after filtering ", paste(dim(counts(dataset)), collapse = ",")))
     } else if(opt$method == "dexseq"){

diff --git a/cgatpipelines/cgatflow.py b/cgatpipelines/cgatflow.py
@@ -21,7 +21,7 @@
 import sys
 import re
 import glob
-import imp
+import importlib.util
 import cgatpipelines
 
 
@@ -84,9 +84,18 @@ def main(argv=None):
     # remove 'cgatflow' from sys.argv
     del sys.argv[0]
 
-    (file, pathname, description) = imp.find_module(pipeline, paths)
-
-    module = imp.load_module(pipeline, file, pathname, description)
+    # Search every directory in `paths` for the pipeline module, as
+    # imp.find_module() did, rather than assuming a single directory.
+    from importlib.machinery import PathFinder
+    spec = PathFinder.find_spec(pipeline, paths)
+    if spec is None or spec.loader is None:
+        raise ImportError(
+            "could not find pipeline module '%s' in %s" % (pipeline, paths))
+    module = importlib.util.module_from_spec(spec)
+    # Register the module before executing it so that imports by name
+    # made during execution resolve to this instance.
+    sys.modules[pipeline] = module
+    spec.loader.exec_module(module)
 
     module.main(sys.argv)
 

diff --git a/cgatpipelines/tasks/bamstats.py b/cgatpipelines/tasks/bamstats.py
@@ -67,7 +67,7 @@ def buildPicardInsertSizeStats(infile, outfile, genome_file,
         Filename with genomic sequence.
     '''
     job_memory = picardmem
-    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
+    picard_opts = '-Xmx%(job_memory)s -XX:+UseG1GC' % locals()
     job_threads = 3
 
     if BamTools.getNumReads(infile) == 0:
@@ -159,7 +159,7 @@ def buildPicardAlignmentStats(infile, outfile, genome_file,
     '''
 
     job_memory = picardmem
-    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
+    picard_opts = '-Xmx%(job_memory)s -XX:+UseG1GC ' % locals()
     job_threads = 3
 
     if BamTools.getNumReads(infile) == 0:
@@ -191,7 +191,7 @@ def buildPicardDuplicationStats(infile, outfile, picardmem):
     '''
 
     job_memory = picardmem
-    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
+    picard_opts = '-Xmx%(job_memory)s -XX:+UseG1GC ' % locals()
     job_threads = 3
 
     if BamTools.getNumReads(infile) == 0:
@@ -244,7 +244,7 @@ def buildPicardDuplicateStats(infile, outfile, picardmem):
         Output filename with picard output.
     '''
     job_memory = picardmem
-    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
+    picard_opts = '-Xmx%(job_memory)s -XX:+UseG1GC ' % locals()
     job_threads = 3
 
     if BamTools.getNumReads(infile) == 0:
@@ -279,7 +279,7 @@ def buildPicardCoverageStats(infile, outfile, baits, regions,
     '''
 
     job_memory = picardmem
-    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
+    picard_opts = '-Xmx%(job_memory)s -XX:+UseG1GC ' % locals()
     job_threads = 3
 
     if BamTools.getNumReads(infile) == 0:
@@ -310,7 +310,7 @@ def buildPicardGCStats(infile, outfile, genome_file, picardmem):
     """
 
     job_memory = picardmem
-    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
+    picard_opts = '-Xmx%(job_memory)s -XX:+UseG1GC ' % locals()
     job_threads = 3
 
     if BamTools.getNumReads(infile) == 0:
@@ -1060,7 +1060,7 @@ def loadIdxstats(infiles, outfile):
                                       columns=(['region', 'mapped']))
 
         # reformat the df
-        df = df.append(reformatted_df, ignore_index=True)
+        df = pd.concat([df, reformatted_df], ignore_index=True)  # public API; _append is private
         df.set_index('region', inplace=True)
         df1 = df[['mapped']].T
         dfs.append(df1)
@@ -1291,7 +1291,7 @@ def buildPicardRnaSeqMetrics(infiles, strand, outfile, picardmem):
 
     '''
     job_memory = picardmem
-    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
+    picard_opts = '-Xmx%(job_memory)s -XX:+UseG1GC ' % locals()
     job_threads = 3
     infile, genome, rRNA_intervals = infiles