forked from bfriedrichgrube/OC_CPTAC_iTRAQ_SWATH
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path5_SWATH_processing.R
152 lines (110 loc) · 5.56 KB
/
5_SWATH_processing.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
## SWATH protein data preparation ##
## after mapDIA we use the Matrix fragments_for_protein_quantification for further steps
## Step 1: Prepare R session ##
sessionInfo()
setwd('')
## Step 2: Load in data ##
data <- read.delim('fragments_for_protein_quantification.txt', sep='\t', dec='.', as.is=T, header=T)
dim(data)
## Step 3: Filter data and select top3 fragments (we select overall top3 to have the same fragments in each sample)
# mapDIA filtered out fragments and set all their values to 0 and put them on the end
data_filtered <- data[1:90421,]
dim(data_filtered)
tail(data_filtered[,1])
data_filtered$fragment_sum <- apply(as.matrix(data_filtered[,4:106]), 1, sum, na.rm=T)
l <- data_filtered[order(data_filtered$fragment_sum, decreasing=T),]
d <- by(l, l$Peptide, head, n=3)
data_top3 <- Reduce(rbind, d)
dim(data_top3)
## Step 4: Aggregate fragments to peptides by summing the top3 fragments
length(unique(data_top3$Peptide))
data_peptide <- aggregate(data_top3[,4:106], by=list(data_top3$Protein, data_top3$Peptide), FUN=sum)
dim(data_peptide)
length(unique(data_peptide$Group.1))
## Step 5: Calculate peptide ratio to internal reference (peptide mean) this is comparable to the peptide ratios in iTRAQ
peptide_mean <- apply(as.matrix(data_peptide[,3:105]), 1, mean, na.rm=T)
length(peptide_mean)
data_pepratio <- as.matrix(data_peptide[,3:105])/peptide_mean
data_logratio <- log2(data_pepratio)
## Step 6: Calculate protein ratios by averaging peptide ratios
data_protein <- aggregate(data_logratio, by=list(data_peptide$Group.1), FUN=mean, na.rm=T)
dim(data_protein)
rownames(data_protein) <- data_protein[,1]
data_protein[,1]<- NULL
dim(data_protein)
save(data_peptide, data_pepratio, data_logratio, file='CPTAC_SWATH_peptide_level.Rdata')
save(data_protein, file='CPTAC_SWATH_protein_level.Rdata')
save(data_top3, file='CPTAC_SWATH_transition_level_top3.Rdata')
## Step 7: Filtering with adaption to abundance level, high abundant proteins are allowed more missing values, while low abundant protein might have missing values due to technical reason and are allowed less missing values
#mean protein abundance will be calculated from peptide level, and protein matrix will be sorted accordingly
data_protein_abund <- aggregate(data_peptide[,3:105], by=list(data_peptide$Group.1), FUN=mean, na.rm=T)
mean_abund <- apply(data_protein_abund[,2:104], 1, mean, na.rm=T)
names(mean_abund)<- data_protein_abund[,1]
head(mean_abund)
mean_abund <- sort(mean_abund)
head(mean_abund)
data_protein <- as.matrix(data_protein)
data_protein_sorted <- data_protein[names(mean_abund),]
#count missing values per protein
SWATH_missing <- NULL
for (i in 1:nrow(data_protein_sorted)) {
tmp <- length(which(is.nan(as.matrix(data_protein_sorted[i,]))))
SWATH_missing <- c(SWATH_missing, tmp)
}
length(SWATH_missing)
names(SWATH_missing) <- rownames(data_protein_sorted)
#plot missing values vs. abundance (Figure S1C)
library(ggplot2)
pdf('missing_vs_abund_20180227.pdf', width=10, height=10)
ggplot(a)+
geom_smooth(aes(index, SWATH_missing), se=FALSE, size=2)+
geom_smooth(aes(index, 5*(log2(mean_abund))), colour='red', size=2)+
coord_cartesian(ylim=c(1,100))+
scale_y_continuous(sec.axis= sec_axis(~./5))+theme_bw()+
theme(axis.text.x = element_text(colour='black', size=35, angle=45, hjust=1), axis.text.y=element_text(colour='black', size=35), axis.title.x=element_text(size=0), axis.title.y=element_text(size=0))
dev.off()
#filter now based on percentiles
#lowest 10% abundant protein are allowed no missing values, the next 10% are allowed less than 10% missing values, the highest abundant 10% proteins are allowed 90% missing values
quantile(mean_abund, c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1))
length(which(mean_abund<=3861.542))
#[1] 291
...
a <- names(which(SWATH_missing[1:291]==0))
...
SWATH_filtered <- data_protein_sorted[c(a,b,c,d,e,f,g,h,i,j),]
dim(SWATH_filtered)
#one protein could not be mapped to gene name and therefore removed
SWATH_filtered <- SWATH_filtered[grep('P62158', rownames(SWATH_filtered), invert=T),]
mapping_table <- read.delim('M2017110183C3DD8CE55183C76102DC5D3A26728BF22CD96.tab', sep='\t', dec='.', header=T, as.is=T)
mapping_table <- mapping_table[!duplicated(mapping_table$From),]
dim(mapping_table)
rownames(SWATH_filtered) <- mapping_table$To
#save output
save(data_protein_abund, mean_abund, data_protein_sorted, SWATH_missing, a, b, c, d, e, f, g, h, i, j, file='CPTAC_protein_filter_continuous.Rdata')
save(SWATH_filtered, file='CPTAC_protein_level_filtered.Rdata')
##Step 8: Imputing missing values
# similar to Perseus software, a normal distribution of random values at the left end of the measured value-distribution will be created
mean(SWATH_filtered, na.rm=T)
[1] -0.1921363
sd(SWATH_filtered, na.rm=T)
[1] 0.5570528
count_missing <- length(which(is.na(SWATH_filtered)))
count_missing
[1] 21914
new_mean <- mean(SWATH_filtered, na.rm=T)*9
new_sd <- sd(SWATH_filtered, na.rm=T)*0.48
set.seed(0)
background <- rnorm(count_missing, mean=new_mean, sd=new_sd)
#plot in order to check where random distribution lays
data_hist <- hist(SWATH_filtered)
back_hist <- hist(background)
plot(data_hist, xlim=c(-8,8), col='lightblue', main='Distribution of data and distribution of background', xlab='log2 intensities')
plot(back_hist, xlim=c(-8,8), col='red', main='Distribution of data and distribution of background', xlab='log2 intensities', add=T)
#insert imputed values
SWATH_imputed <- SWATH_filtered
length(which(is.na(SWATH_imputed)))
[1] 21914
SWATH_imputed[which(is.na(SWATH_imputed))]<-background
length(which(is.na(SWATH_imputed)))
[1] 0
save(SWATH_imputed, file='CPTAC_SWATH_imputed.Rdata')