Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
86fb87e
remove scriptsdir
jscaber Feb 10, 2022
b90a82f
remove scriptsdir
jscaber Feb 10, 2022
482a294
substatial changes to exome pipeline - bringing it in line with pipel…
jscaber Feb 14, 2022
40fb76e
remove mappingqc from pipeline_mapping.py
jscaber Feb 14, 2022
bfbb604
remove mappingqc from pipeline_windows.py
jscaber Feb 14, 2022
f2e5451
remove mappingqc from pipeline peakcalling
jscaber Feb 14, 2022
37f5213
remove mappingqc from rnaseqqc pipelin
jscaber Feb 14, 2022
cc83e76
remove mappingqc from exome cancer pipeline
jscaber Feb 14, 2022
5f63ab3
remove mappingqc.py - obsolete and superseded by bamstats.py
jscaber Feb 14, 2022
61427c4
fixing errors
jscaber Feb 14, 2022
d1792d1
troubleshooting
jscaber Feb 14, 2022
2c3aabf
troubleshooting
jscaber Feb 14, 2022
a433956
Removed realignment step as per GATK4
jscaber Apr 25, 2022
d377b33
updated BSQR procedure to gatk4 syntax
jscaber Apr 25, 2022
00ede33
Merge branch 'JS-current' into JS-exome
jscaber Apr 26, 2022
0b960e6
Gatk4 update: GVCF and GVCF database
jscaber May 4, 2022
5b8473d
Merge branch 'master' into JS-exome
jscaber May 4, 2022
52b67d0
gatk4: GenotypeGVCF now from database
jscaber May 6, 2022
6be599f
added options for rMATS output filtering (results table) - FDR, PSI a…
jscaber May 9, 2022
aaaf87f
options for pipeline splicing - yml file update
jscaber May 9, 2022
cb62f8c
snpeff updated
jscaber May 9, 2022
75b3ef4
bug corrections pipeline splicing
jscaber May 10, 2022
e406b90
further bugfixes splicing pipeline
jscaber May 10, 2022
37ed54b
major changes exome pipeline
jscaber May 10, 2022
0834b14
replace hapmap by somelier
jscaber May 11, 2022
a3bf625
remove xy prediction, which is provided by other tools and previous e…
jscaber May 11, 2022
2e56974
Exome Pipeline Development
jscaber May 11, 2022
4143583
bugfixes exome pipeline:snpsift and revomal exomeancestry.py
jscaber May 12, 2022
5f081d9
snpsift bugfixes
jscaber May 12, 2022
a589335
Add SpliceAI
jscaber May 13, 2022
39e993e
further exome pipeline - removed pedigree analysis
jscaber May 16, 2022
0769f35
remove pedigree pipeline
jscaber May 16, 2022
d6685f3
functions to establish gene 'Panel' BED from list
jscaber May 20, 2022
3231f31
various improvements to diff expression pipeline - able to handle une…
jscaber Mar 31, 2023
af5661a
Extensive changes to exome pipeline, bamstats and exploratory analysis
jscaber Sep 20, 2023
a0524b9
chages to exome.py
jscaber Sep 20, 2023
9342ccb
further development
jscaber Mar 30, 2024
a78a149
addition of setup instruction for R tools
jscaber Mar 30, 2024
5ed4039
draft long read pipeline
jscaber Mar 30, 2024
fccb2f1
Cchanges to cgatflow.py, compatibility with python 3.12
jscaber Dec 11, 2024
e6275ba
remove rpy2
jscaber Dec 11, 2024
32cac1e
add R scripts to new directory
jscaber Dec 11, 2024
00dffe9
add further R scripts, moved
jscaber Dec 11, 2024
563dd06
Changes to Make GTFsubset compatible with sqlachemy changes
jscaber Dec 17, 2024
d960eed
remove strand specificity which has been removed from cgat apps
jscaber Dec 17, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -22,5 +22,8 @@ exclude scripts/*
include scripts/__init__.py
include scripts/version.py
include scripts/cgat_ruffus_profile.py
include cgatpipelines/Rtools/*
include cgatpipelines/experiment.R


# extensions
12 changes: 7 additions & 5 deletions cgatpipelines/Rtools/diffexpression.R
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,11 @@ suppressMessages(library(goseq))
source(file.path(Sys.getenv("R_ROOT"), "experiment.R"))


mart = useMart(biomart = "ENSEMBL_MART_ENSEMBL",dataset="hsapiens_gene_ensembl", host = "jul2018.archive.ensembl.org")
mart = useMart(biomart = "ENSEMBL_MART_ENSEMBL",dataset="hsapiens_gene_ensembl", host = "https://aug2020.archive.ensembl.org")
getmart <- function(values){
data<- getBM(
filters= "ensembl_gene_id",
attributes= c("ensembl_gene_id", "external_gene_name", "description","entrezgene", 'chromosome_name',
attributes= c("ensembl_gene_id", "external_gene_name", "description","entrezgene_id", 'chromosome_name',
'start_position', 'end_position'),
values= values,
mart= mart,
Expand All @@ -35,7 +35,7 @@ getmart <- function(values){
return(data)
}

start_plot <- function(section, height = 6, width = 6, type = "png", outdir="") {
start_plot <- function(section, height = 10, width = 10, type = "png", outdir="") {
file = get_output_filename(paste0(outdir,"/",section, ".", type))
Cairo(file = file,
type = type,
Expand Down Expand Up @@ -78,7 +78,8 @@ plotTPMs <- function(dftemp, contrast_name){
ggplot(aes(x = contrast, y = value, color = contrast)) +
geom_point(position = position_jitter(w = 0.15, h = 0)) +
facet_wrap(~ var, scales = "free") + theme_bw() +
ylab("normalised counts") + xlab (contrast_name) + guides(color = "none")
ylab("normalised counts") + xlab (contrast_name) + guides(color = "none")+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
}


Expand Down Expand Up @@ -114,7 +115,7 @@ run <- function(opt) {
flog.info("... plotting MA")
## MA Plot
start_plot("MAPlot", outdir=opt$outdir)
DESeq2::plotMA(resLFC, ylim = c(-3,3))
DESeq2::plotMA(resLFC, ylim = c(ceiling(max(resLFC$log2FoldChange)),floor(min(resLFC$log2FoldChange))))
end_plot()

flog.info("... saving DE data")
Expand Down Expand Up @@ -178,6 +179,7 @@ run <- function(opt) {

flog.info("... plotting user-defined genes")
genelist <- unlist(strsplit(opt$userlist, ","))
genelist <- genelist[genelist %in% rownames(dds)]
dftemp <- makeTPMtable(genelist, counts(dds, normalized=TRUE), colData(dds), opt$contrast)
start_plot("Userdefined", outdir=opt$outdir)
print(plotTPMs(dftemp, opt$contrast))
Expand Down
56 changes: 23 additions & 33 deletions cgatpipelines/Rtools/exploratory.R
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,11 @@ source(file.path(Sys.getenv("R_ROOT"), "experiment.R"))
# Dependencies: biomaRt package
# Input: vector of ensembl gene ids
# Output: data frame with description, symbol and entrez gene
mart = useMart(biomart = "ENSEMBL_MART_ENSEMBL",dataset="hsapiens_gene_ensembl", host = "jul2018.archive.ensembl.org")
mart = useMart(biomart = "ENSEMBL_MART_ENSEMBL",dataset="hsapiens_gene_ensembl", host = "https://aug2020.archive.ensembl.org")
getmart <- function(values){
data<- getBM(
filters= "ensembl_gene_id",
attributes= c("ensembl_gene_id", "external_gene_name", "description","entrezgene"),
attributes= c("ensembl_gene_id", "external_gene_name", "description","entrezgene_id"),
values= values,
mart= mart,
useCache = FALSE)
Expand Down Expand Up @@ -74,35 +74,15 @@ run <- function(opt) {
dds = DESeqDataSetFromMatrix(experiment$counts, experiment$sample, design = formula(opt$model))
}
futile.logger::flog.info(paste("reading Experiment object", paste(dim(counts(experiment)), collapse = ",")))

### SVA - ANALYSIS OF BIASES ###
futile.logger::flog.info(paste("Performing Surrogate Variable Analysis"))
dds <- estimateSizeFactors(dds)
dat <- counts(dds, normalized = TRUE)
idx <- rowMeans(dat) > 1
dat <- dat[idx, ]
mod <- model.matrix(formula(opt$model), colData(dds))
mod0 <- model.matrix(~ 1, colData(dds))
svseq <- svaseq(dat, mod, mod0, n.sv = 2)
for(factor in opt$factors){
start_plot(paste0('SVA for ', factor), outdir=opt$outdir)
par(mfrow = c(2, 1), mar = c(3,5,3,1))
for (i in 1:2) {
stripchart(svseq$sv[, i] ~ colData(dds)[, factor], vertical = TRUE, main = paste0("SV", i))
abline(h = 0)
}
end_plot()
}


### TRANSFORMATION OF DATA ###
futile.logger::flog.info(paste("Transforming data"))
rld<- rlog(dds)
dds <- estimateSizeFactors(dds)
vsd<- vst(dds)
df <- bind_rows(
as_tibble(log2(counts(dds, normalized=TRUE)[, 1:2]+1)) %>%
mutate(transformation = "log2(x + 1)"),
as_tibble(assay(vsd)[, 1:2]) %>% mutate(transformation = "vst"),
as_tibble(assay(rld)[, 1:2]) %>% mutate(transformation = "rlog"))
as_tibble(assay(vsd)[, 1:2]) %>% mutate(transformation = "vst"))
colnames(df)[1:2] <- c("x", "y")
start_plot('Variance_Transformations', outdir=opt$outdir)
print(ggplot(df, aes(x = x, y = y)) + geom_hex(bins = 80) +
Expand All @@ -120,11 +100,12 @@ run <- function(opt) {
if(dim(pca$x)[1]>7){
dim_pca = 8
} else {
dim_pca = dim(pca$x)
dim_pca = dim(pca$x)[1]
}
scores <- data.frame(variable.group, pca$x[,1:dim_pca])
start_plot(paste0('PCA_', factor), outdir=opt$outdir)
print(qplot(x=PC1, y=PC2, data=scores, colour=factor(variable.group)) +
print(ggplot(scores, aes(x=PC1, y=PC2, colour=factor(variable.group))) +
geom_point() +
theme(legend.position="right") +
labs(colour=factor, x=paste0("PC1 (", percentVar[1],"% of variance)"),
y=paste0("PC2 (", percentVar[2],"% of variance)")) +
Expand All @@ -133,7 +114,8 @@ run <- function(opt) {
theme(text=element_text(family='serif')))
end_plot()
start_plot(paste0('PCA_13_', factor), outdir=opt$outdir)
print(qplot(x=PC1, y=PC3, data=scores, colour=factor(variable.group)) +
print(ggplot(scores, aes(x=PC1, y=PC3, colour=factor(variable.group))) +
geom_point() +
theme(legend.position="right") +
labs(colour=factor, x=paste0("PC1 (", percentVar[1],"% of variance)"),
y=paste0("PC3 (", percentVar[3],"% of variance)")) +
Expand All @@ -146,7 +128,7 @@ run <- function(opt) {
names(variable.group) <- opt$contrast
scores <- data.frame(variable.group, pca$x[,1:dim_pca])
start_plot(paste0('PCA_grid'), outdir=opt$outdir)
if(dim_pca>7){
if(dim_pca==8){
print(ggplot(scores, aes(x = .panel_x, y = .panel_y, fill = variable.group, colour = variable.group)) +
geom_point(shape = 16, size = 0.5, position = 'auto') +
geom_autodensity(alpha = 0.3, colour = NA, position = 'identity') +
Expand All @@ -171,14 +153,18 @@ run <- function(opt) {
df <- as.data.frame(colData(dds)[,opt$factors])
rownames(df) <- colData(dds)$track
# Heatmap of Top 20 Expressed Genes
select <- order(rowMeans(counts(dds,normalized=TRUE)),decreasing=TRUE)[1:20]
selected <- order(rowMeans(counts(dds,normalized=TRUE)),decreasing=TRUE)[1:20]
start_plot('Heatmap_topExpressed', outdir=opt$outdir)
pheatmap(assay(vsd)[select,], cluster_rows=FALSE, cluster_cols=FALSE, show_rownames=FALSE, annotation_col=df)
pheatmap(assay(vsd)[selected,], cluster_rows=FALSE, cluster_cols=FALSE, show_rownames=FALSE, annotation_col=df)
end_plot()
# Heatmap of Top 20 Variable Genes
topVarGenes <- head(order(rowVars(assay(vsd)),decreasing=TRUE),20)
futile.logger::flog.info(paste("topVarGenes ",topVarGenes))
mat <- assay(vsd)[ topVarGenes, ]
futile.logger::flog.info(paste("mat ",rownames(mat)))
temp <- getmart(rownames(mat))
temp <- temp[!duplicated(temp$ensembl_gene_id), ]
futile.logger::flog.info(paste("temp ",temp))
row.names(temp) <- temp$ensembl_gene_id
rownames(mat) <- temp[rownames(mat),"external_gene_name"]
start_plot('Heatmap_topVariable', outdir=opt$outdir)
Expand All @@ -187,7 +173,9 @@ run <- function(opt) {
# Heatmap of Genes of interest
if (!is.null(opt$genes_of_interest)) {
print(rownames(assay(vsd)))
mat <- assay(vsd)[opt$genes_of_interest, ]
goi <- opt$genes_of_interest
goi <- goi[goi %in% rownames(assay(vsd))]
mat <- assay(vsd)[goi, ]
mat <- mat - rowMeans(mat)
temp <- getmart(rownames(mat))
row.names(temp) <- temp$ensembl_gene_id
Expand Down Expand Up @@ -227,6 +215,7 @@ run <- function(opt) {

### EXPLORE BATCH EFFECTS ###
futile.logger::flog.info(paste("Exploring Batch Effects"))
futile.logger::flog.info(dim_pca)
if(dim_pca == 8){
for(factor in opt$factors){
factor_transformed <- vsd
Expand All @@ -237,7 +226,8 @@ run <- function(opt) {
percentVar <- round(100 * summary(pca)$importance[2,])
scores <- data.frame(variable.group, sample.group, pca$x[,1:2])
start_plot(paste0('PCA_', factor, '_removed'), outdir=opt$outdir)
print(qplot(x=PC1, y=PC2, data=scores, colour=factor(variable.group), shape=factor(sample.group)) +
print(ggplot(scores, aes(x=PC1, y=PC2, colour=factor(variable.group), shape=factor(sample.group))) +
geom_point() +
theme(legend.position="right") +
labs(colour=opt$contrast, shape=factor, x=paste0("PC1 (", percentVar[1],"% of variance)"),
y=paste0("PC2 (", percentVar[2],"% of variance)")) +
Expand Down
9 changes: 7 additions & 2 deletions cgatpipelines/Rtools/filtercounts.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,10 @@ run <- function(opt) {
### READING DATA ###
# Read in sampleData Table
futile.logger::flog.info(paste("reading sampleData table from", normalizePath(opt$sampleData)))
sampleData <- read.table(opt$sampleData, header = TRUE)
sampleData <- read_tsv(opt$sampleData)
sampleData <-sampleData[sampleData$include ==1, ]
futile.logger::flog.info(paste("read sampleData ", paste(dim(sampleData), collapse = ",")))
futile.logger::flog.info(paste(sampleData))
rownames(sampleData) <- sampleData$track

futile.logger::flog.info(paste("reading in data from ", opt$source))
Expand Down Expand Up @@ -81,7 +82,8 @@ run <- function(opt) {
flattenedfile=opt$flattenedFile)
} else if(opt$source == "counts_table"){
# Read in Data
raw <- read.table(file = gzfile(opt$counts_tsv), header=TRUE, row.name=1)
raw <- read.table(file = gzfile(opt$counts_tsv), header=TRUE, row.name=1, check.names = FALSE)
futile.logger::flog.info(paste("Colnames: ", colnames(raw)))
experiment_tsv <- raw[,sampleData$track,drop=FALSE]
if(opt$method == "deseq2"){
dataset = DESeqDataSetFromMatrix(experiment_tsv, sampleData, design = formula(opt$model))
Expand All @@ -107,6 +109,9 @@ run <- function(opt) {
futile.logger::flog.info(paste("Counts before filtering ", paste(dim(counts(dataset)), collapse = ",")))
keep <- rowSums(counts(dataset)) >= 10
dataset <- dataset[keep,]
#dataset <- dataset[Reduce('|', dataset[sapply(dataset, is.numeric)]),]
#dataset <- dataset[rowSums(dataset[,sapply(dataset, is.numeric)]==0)<6,]
#dataset <- subset(dataset, !rowSums(assay(dataset) == 0) > 6)
counts_table <- counts(dataset)
futile.logger::flog.info(paste("Counts after filtering ", paste(dim(counts(dataset)), collapse = ",")))
} else if(opt$method == "dexseq"){
Expand Down
9 changes: 5 additions & 4 deletions cgatpipelines/cgatflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
import sys
import re
import glob
import imp
import importlib.util
import cgatpipelines


Expand Down Expand Up @@ -84,9 +84,10 @@ def main(argv=None):
# remove 'cgatflow' from sys.argv
del sys.argv[0]

(file, pathname, description) = imp.find_module(pipeline, paths)

module = imp.load_module(pipeline, file, pathname, description)
spec = importlib.util.spec_from_file_location(pipeline, path+"/"+pipeline+".py")
module = importlib.util.module_from_spec(spec)
sys.modules[pipeline] = module
spec.loader.exec_module(module)

module.main(sys.argv)

Expand Down
16 changes: 8 additions & 8 deletions cgatpipelines/tasks/bamstats.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def buildPicardInsertSizeStats(infile, outfile, genome_file,
Filename with genomic sequence.
'''
job_memory = picardmem
picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
picard_opts = '-Xmx%(job_memory)s -XX:+UseG1GC' % locals()
job_threads = 3

if BamTools.getNumReads(infile) == 0:
Expand Down Expand Up @@ -159,7 +159,7 @@ def buildPicardAlignmentStats(infile, outfile, genome_file,
'''

job_memory = picardmem
picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
picard_opts = '-Xmx%(job_memory)s -XX:+UseG1GC ' % locals()
job_threads = 3

if BamTools.getNumReads(infile) == 0:
Expand Down Expand Up @@ -191,7 +191,7 @@ def buildPicardDuplicationStats(infile, outfile, picardmem):
'''

job_memory = picardmem
picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
picard_opts = '-Xmx%(job_memory)s -XX:+UseG1GC ' % locals()
job_threads = 3

if BamTools.getNumReads(infile) == 0:
Expand Down Expand Up @@ -244,7 +244,7 @@ def buildPicardDuplicateStats(infile, outfile, picardmem):
Output filename with picard output.
'''
job_memory = picardmem
picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
picard_opts = '-Xmx%(job_memory)s -XX:+UseG1GC ' % locals()
job_threads = 3

if BamTools.getNumReads(infile) == 0:
Expand Down Expand Up @@ -279,7 +279,7 @@ def buildPicardCoverageStats(infile, outfile, baits, regions,
'''

job_memory = picardmem
picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
picard_opts = '-Xmx%(job_memory)s -XX:+UseG1GC ' % locals()
job_threads = 3

if BamTools.getNumReads(infile) == 0:
Expand Down Expand Up @@ -310,7 +310,7 @@ def buildPicardGCStats(infile, outfile, genome_file, picardmem):
"""

job_memory = picardmem
picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
picard_opts = '-Xmx%(job_memory)s -XX:+UseG1GC ' % locals()
job_threads = 3

if BamTools.getNumReads(infile) == 0:
Expand Down Expand Up @@ -1060,7 +1060,7 @@ def loadIdxstats(infiles, outfile):
columns=(['region', 'mapped']))

# reformat the df
df = df.append(reformatted_df, ignore_index=True)
df = df._append(reformatted_df, ignore_index=True)
df.set_index('region', inplace=True)
df1 = df[['mapped']].T
dfs.append(df1)
Expand Down Expand Up @@ -1291,7 +1291,7 @@ def buildPicardRnaSeqMetrics(infiles, strand, outfile, picardmem):

'''
job_memory = picardmem
picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
picard_opts = '-Xmx%(job_memory)s -XX:+UseG1GC ' % locals()
job_threads = 3
infile, genome, rRNA_intervals = infiles

Expand Down
Loading