-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path12_TopGO.R
215 lines (143 loc) · 7.91 KB
/
12_TopGO.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
#####################################################################################################################################################################
#
# Script used in:
#
# "Gene expression profiling across ontogenetic stages in wood white (Leptidea sinapis) reveals pathways linked to butterfly diapause regulation."
# Luis Leal, Venkat Talla, Thomas Källman, Magne Friberg, Christer Wiklund, Vlad Dincă, Roger Vila, Niclas Backström
# Mol. Eco. (2018)
#
#
#####################################################################################################################################################################
#
# Luis Leal
# Uppsala University, Uppsala, Sweden, 2017
#
#####################################################################################################################################################################
###################################################################################################
#
# R/3.4.0 script used to carry out gene enrichment analysis using the TopGO R Bioconductor package
#
# Dependencies: DESeq2_OUTPUT_sva.csv >> DESeq2 results file
# OUTPUT_trinity_to_GO.txt >> two-column file listing each Trinity gene and its associated GO terms
#
#
###################################################################################################
########## Libraries
##Install TopGO (you only need to do this once)
#source("http://bioconductor.org/biocLite.R")
#biocLite("topGO")
#load libraries
library(topGO)
########## Clear all states and remove all plots
rm(list=ls(all=TRUE))
if(!is.null(dev.list())) dev.off()
########## input files and paths
### Path to folder containing DESeq2 results file
base_deseq <- "/DESeq2"
list.files(base_deseq)
### Name of DESeq2 results file
###
### IMPORTANT>> MUST EDIT DESeq2 file and add "gene_ID" to header
###
deseq_file <- "DESeq2_OUTPUT_sva.csv"
### Import DESeq2 results file
deseq_data <- read.table(file.path(base_deseq, deseq_file), header = TRUE)
head(deseq_data, 10)
### Path to folder containing Trinotate export file (two-column file listing each Trinity gene and its associated GO terms)
base_trinotate <- "/Trinotate"
list.files(base_trinotate)
### Two-column file listing trinity gene names and associated Go terms (IN CSV FORMAT)
## NOTE: headers should be as follows:
## "gene_id" <tab> "gene_ontology"
##
trinotate_file <- "OUTPUT_trinity_to_GO.txt"
### import Trinotate export file
annot <- read.table(file.path(base_trinotate, trinotate_file), header = TRUE)
head(annot, 50)
########## Define gene universe and the set of interesting genes
## genes of interest (p-adj<0.5)
goi_aux <- deseq_data[complete.cases(deseq_data), ] # filter-out genes with p-adj=N/A
goi_aux <- goi_aux[goi_aux[, "padj"] < 0.05,] # keep genes with p-adj < 0.05
goi_aux <- goi_aux[abs(goi_aux[, "log2FoldChange"]) > 1,] # keep genes with log2 fold change > 1
goi_aux <- goi_aux[abs(goi_aux[, "baseMean"]) > 10,] # keep genes with baseMean > 10
myInterestingGenes <- as.vector(goi_aux$gene_ID) # genes of interest
head(myInterestingGenes,10)
Numb_genes_of_interest <- length(myInterestingGenes)
Numb_genes_of_interest
## gene universe: genes with (baseMean >= 6) + myInterestingGenes
guniv_aux <- deseq_data[deseq_data[, "baseMean"] >= 6,] # keep genes with baseMean >= 6
guniv_names <- as.vector(guniv_aux$gene_ID) # gene names
head(guniv_names,10)
geneNames <- c(myInterestingGenes, guniv_names) # combine the two genes lists
geneNames <-unique(geneNames) # remove duplicate entries
Numb_background_genes <- length(geneNames)
Numb_background_genes
## Create a factor - geneList - which indicates for every gene in our universe
# (union of background and DE-genes), whether it is differentially expressed or not.
geneList <- factor(as.integer(geneNames %in% myInterestingGenes))
names(geneList) <- geneNames
str(geneList)
########## Establish association between genes and GO terms
GO_list <- as.list(as.character(annot[['gene_id']])) # generates list; element names are gene IDs
GO_list <- as.list(setNames(as.character(annot$gene_ontology), as.character(annot$gene_id))) # adds Gene Ontology data to list
GO_list <- lapply(GO_list, function(x) unlist(strsplit(x, split='\\`'))) # split single GO terms string into a character vector, one element per term
GO_list <- lapply(GO_list, function(x) substr(x, 1, 10)) # strips away GO descriptions, leaving just ID number for each element
geneID2GO <- GO_list
str(head(geneID2GO,50))
## make full list of transcript names, geneNames
#geneNames <- names(GO_list)
########## build the topGOdata object
## This object will contain all gene identifiers and their scores, the GO annotations,
## the GO hierarchical structure and all other information needed to perform the desired
## enrichment analysis.
GOdata <- new("topGOdata",
description = "DESeq2+sva+Trinotate-invertebrates session",
ontology = "BP",
allGenes = geneList,
geneSel = topDiffGenes,
nodeSize = 1,
annot = annFUN.gene2GO,
gene2GO = geneID2GO)
## notes:
## description : character string containing a short description of the study [optional].
## ontology : character string specifying the ontology of interest
## >> "BP" (Biological Process), "MF" (Molecular Function) or "CC" (Cellular Component).
## allGenes : named vector of type numeric or factor. The names attribute contains the genes identifiers.
## The genes listed in this object define the gene universe.
## geneSel : list of interesting genes (eg, genes with padj<0.05).
## nodeSize : an integer larger or equal to 1. This parameter is used to prune the GO hierarchy
## from the terms which have less than nodeSize annotated genes (after the true path rule
## is applied)
## annot : annotationFun is a function which maps genes identifiers to GO terms. The annotation
## functions take three arguments. One of those arguments is specifying where the mappings
## can be found, and needs to be provided by the user. There are several possibilies. Here we use:
## >> annFUN.gene2GO: this function is used when the annotations are provided as a gene-to-GOs mapping.
## gene2GO : mapping provided by the user
## Number of genes annotated to the GO terms
numGenes(GOdata)
## basic stats of GOdata object:
termStat(GOdata)
####### Perform enrichment tests
## Fisher (classic) >> high false positive rate
resultFisher.classic <- runTest(GOdata, algorithm = "classic", statistic = "fisher")
## Fisher (elim) [elimination algorithm] >> low # false positives; risk of discarding some relevant nodes]
resultFisher.elim <- runTest(GOdata, algorithm = "elim", statistic = "fisher")
## Fisher (weight) [not as strict as elim, and thus a higher # of false positives]
resultFisher.weight <- runTest(GOdata, algorithm = "weight", statistic = "fisher")
## Fisher (weight01) [mixture between the elim and the weight algorithms >>> TopGO default ]
resultFisher.weight01 <- runTest(GOdata, algorithm = "weight01", statistic = "fisher")
## Fisher (parentchild)
resultFisher.parentChild <- runTest(GOdata, algorithm = "parentChild", statistic = "fisher")
####### Analysis of results
allRes <- GenTable( GOdata,
classic_Fisher = resultFisher.classic,
elim_Fisher = resultFisher.elim,
weight_Fisher = resultFisher.weight,
weight01_Fisher = resultFisher.weight01,
parentChild_Fisher = resultFisher.parentChild,
orderBy = "elim_Fisher",
ranksOf = "classic_Fisher",
topNodes = 1000)
allRes
####### save results to file
#write.table(allRes, file = "TopGO_OUTPUT.csv", sep="\t")