-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathCpG2BED.R
131 lines (109 loc) · 3.76 KB
/
CpG2BED.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# Run this script to convert makeCGI results to BED and to create CGI plots
# Use "result" folder create by makeCGI as the script argument
#
# To start script execution run
#Rscript CpG2BED.R "makegci_directory/result"
# in the terminal
#
# To create plot make sure that ggplot2 is installed
#Clear environment
rm(list = ls())
#Define function for RDA to BED conversion
rda2bed <- function(rdafile, concat = F, verbose = F) {
print(paste('Analyzing file:', rdafile))
#Load file for processing
load(rdafile)
print(paste('Number of observations loaded for the current file:', nrow(cgi)))
bed <- data.frame(cgi$chr, cgi$start, cgi$end, cgi$length)
if (concat) {
filename = 'result.bed'
} else {
filename = paste(rdafile, '.bed', sep = '')
}
#Write bed file
write.table(bed, file = filename, append = concat,
quote = F, sep = '\t', row.names = F, col.names = F)
#In verbouse mode create additional table with full data
if (verbose) {
write.table(cgi, file = paste(filename, '.full'), append = concat,
quote = F, sep = '\t', row.names = F, col.names = T)
createplot(cgi, rdafile)
}
#Print summary
CGI <- nrow(cgi)
GF <- findGF(cgi)
print(paste(rdafile, 'HMM CGI:', CGI, 'Gardiner-Garden and Frommer:', GF))
}
#Create CGI plot
createplot <- function(cgi, title) {
library(ggplot2)
#Create borders
gccut <- data.frame(x=c(0, 1), y=c(0.5, 0.5))
cgi$GardinerFrommer <- cgi$obsExp > 0.6 & cgi$pctGC > 0.5
cgi$GFminLength <- cgi$length > 200
#Plot Gardiner-Garden and Frommer compilant CGI (without length)
p <- qplot(pctGC, obsExp, data=cgi,
xlab = 'GC content', ylab = 'Observerd/Expected',
colour = GardinerFrommer, xlim=c(0.3, 0.9), ylim=c(0, 2.5))
#Save to file
ggsave(filename=paste(title, '.png', sep = ''), plot=p)
#Plot Gardiner-Garden and Frommer compilant CGI (length only)
p1 <- qplot(length, data=cgi, geom="histogram", binwidth=50, colour = GFminLength)
ggsave(filename=paste(title, '_length.png', sep = ''), plot=p1)
#Plot Gardiner-Garden and Frommer compilant CGI (with length)
p <- qplot(pctGC, obsExp, data=cgi,
xlab = 'GC content', ylab = 'Observerd/Expected',
colour = GardinerFrommer & GFminLength, xlim=c(0.3, 0.9), ylim=c(0, 2.5))
#Save to file
ggsave(filename=paste(title, 'GGF_length.png', sep = ''), plot=p)
}
#Find number of CGI that conform to Gardiner-Garden and Frommer definition
findGF <- function(cgi) {
GCandOE <- nrow(subset(cgi, obsExp > 0.6 & pctGC > 0.5 & length > 200))
return (GCandOE)
}
#Separate CGI file by separate files by contig
splitcgi <- function(rdafile) {
print(paste('Analyzing file:', rdafile))
#Load file for processing
load(rdafile)
cgiall <- data.frame(cgi)
contigs <- levels(cgiall$chr)
for (contig in contigs) {
cgi <- cgiall[cgiall$chr==contig,]
save(cgi, file=paste(
sub('.rda', '', rdafile, ignore.case = T),
contig,
'rda',
sep = '.'
))
}
}
## EXECUTE SCRIPT ##
#Get command line arguments and working directory from it
args <- commandArgs()
#Check arguments length and
if (length(args) >= 6) {
#Get data folder
#CONFIGURATION
datafolder <- args[6]
#Or set datafolder manually
#datafolder <- 'makecgi/results'
#CONFIGURATION END
if (file.exists(datafolder)) {
print(paste('Analyzing CGI results file in the folder', datafolder))
} else {
print(paste('Can\'t open the folder specified:', datafolder))
q()
}
} else {
print('Pease specify working directory name as the script argument')
q()
}
#Get CGI result files in the folder
files <- list.files(path = datafolder, pattern = '^CGI.*\\.rda$')
setwd(datafolder)
#Perform conversion
for (file in files) {
rda2bed(file, verbose = T)
}