Skip to content

Commit d8fba1a

Browse files
committed
add initial greedy method previously implemented by @ozanozisik
1 parent 9621330 commit d8fba1a

File tree

1 file changed

+301
-0
lines changed

1 file changed

+301
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,301 @@
1+
#' Calculate Background Score
2+
#'
3+
#' @param pin_graph \code{\link[igraph]{igraph}} object containing the protein-protein
4+
#' interaction network
5+
#' @param number_of_iterations the number of iterations for background score calculations
6+
#' @param pin_scores_vec vector of score for each gene in the PIN
7+
#'
8+
#' @return a list containing 2 elements \describe{
9+
#' \item{sampling_score_means}{vector of sampling score means}
10+
#' \item{sampling_score_stds}{vector of sampling score standard deviations}
11+
#' }
12+
calculate_background_score <- function(pin_graph, number_of_iterations, pin_scores_vec) {
13+
14+
sampling_score_sums <- rep(0, igraph::vcount(pin_graph))
15+
sampling_score_square_sums <- rep(0, igraph::vcount(pin_graph))
16+
17+
node_list_for_sampling <- igraph::V(pin_graph)
18+
19+
for (iter in seq_len(number_of_iterations)) {
20+
21+
# progress message
22+
if (iter %% 50 == 0) {
23+
message(round(iter / number_of_iterations, 2) * 100, "% of total iterations in background score calculation finished")
24+
}
25+
26+
# shuffle nodes
27+
node_list_for_sampling <- sample(node_list_for_sampling)
28+
29+
z_sum <- 0
30+
number_of_nodes_in_subnetwork <- 0
31+
32+
for (node in node_list_for_sampling) {
33+
34+
z_sum <- z_sum + pin_scores_vec[[node]] #Much faster than using V(pin_graph)$score[node]
35+
36+
number_of_nodes_in_subnetwork <- number_of_nodes_in_subnetwork + 1
37+
38+
score <- z_sum/sqrt(number_of_nodes_in_subnetwork)
39+
40+
sampling_score_sums[number_of_nodes_in_subnetwork] <- sampling_score_sums[number_of_nodes_in_subnetwork] + score
41+
sampling_score_square_sums[number_of_nodes_in_subnetwork] <- sampling_score_square_sums[number_of_nodes_in_subnetwork] + score ^ 2
42+
}
43+
}
44+
45+
# These are vector operations
46+
sampling_score_means <- sampling_score_sums / number_of_iterations
47+
48+
sampling_score_stds <- sampling_score_square_sums/number_of_iterations - sampling_score_means ^ 2
49+
sampling_score_stds <- sqrt(sampling_score_stds + 1e-10)
50+
51+
# Addition of small number has two purposes: 1.Prevents division by zero in calculate_component_score() 2.Corrects for very small negative numbers that might appear,
52+
# like -3.388132e-21
53+
54+
## Explanation of the operation above var = SUM((x-xmean)^2) / N var = SUM(x^2 - 2*xmean*x + xmean^2)/N var = SUM(x^2)/N - (2*xmean*SUM(x))/N + (N*xmean^2)/N var =
55+
## SUM(x^2)/N - 2*xmean^2 + xmean^2 var = SUM(x^2)/N - xmean^2
56+
return(list(sampling_score_means = sampling_score_means, sampling_score_stds = sampling_score_stds))
57+
}
58+
59+
60+
#' Calculate Component Score
61+
#'
62+
#' @inheritParams calculate_background_score
63+
#' @param component component (vector)
64+
#' @param sampling_score_means vector of background sampling score means
65+
#' @param sampling_score_stds vector of background sampling score standard deviations
66+
#'
67+
#' @return the score of the component
68+
calculate_component_score <- function(pin_scores_vec, component, sampling_score_means, sampling_score_stds) {
69+
numberOfNodes <- length(component)
70+
71+
if (numberOfNodes == 0)
72+
return(0)
73+
74+
score <- sum(pin_scores_vec[component]) / sqrt(numberOfNodes)
75+
score <- (score - sampling_score_means[numberOfNodes]) / sampling_score_stds[numberOfNodes]
76+
return(score)
77+
}
78+
79+
80+
#' Greedy Breadth-First Active Subnetwork search method implementation in R
81+
#'
82+
#' @param seed_node the seed node (an \code{\link[igraph]{igraph}} vs object)
83+
#' @inheritParams calculate_background_score
84+
#' @param input_scores_df data frame containing scores per each gene in the PIN
85+
#' @param background_sampling_result list containing vector of background sampling score means
86+
#' and vector of background sampling score standard deviations
87+
#' @param max_depth maximum depth in search
88+
#' @param check_second_neighbors boolean to indicate whether to check second
89+
#' neighbors when any direct neighbor does not improve the score
90+
#'
91+
#' @return an active module (vector)
92+
greedy_breadth_first_active_subnetwork_search <- function(seed_node, pin_graph, input_scores_df, pin_scores_vec, background_sampling_result, max_depth, check_second_neighbors) {
93+
94+
sampling_score_means <- background_sampling_result$sampling_score_means
95+
sampling_score_stds <- background_sampling_result$sampling_score_stds
96+
97+
comp <- c()
98+
99+
queue <- c(seed_node)
100+
checked_in_greedy <- c(seed_node)
101+
distances_from_seed <- c(0)
102+
103+
will_be_checked_for_neighbors <- c()
104+
# When check_second_neighbors==TRUE, a node, the nodes which do not increase the score by themselves are added to the end of the queue and they are checked when
105+
# there are only this kind of nodes. The reason is, an important neighbor might already be added by the help of another important node and we may not need this
106+
# unimportant node
107+
108+
while (length(queue) > 0) {
109+
if (setequal(queue, will_be_checked_for_neighbors)) {
110+
# all nodes in the queue are nodes whose neighbors will be checked for addition
111+
node <- queue[1] # get next node
112+
queue <- queue[-1] # remove the node
113+
114+
node_distance <- distances_from_seed[1]
115+
distances_from_seed <- distances_from_seed[-1]
116+
117+
neighbor_names <- names(igraph::neighbors(pin_graph, node))
118+
neighbor_scores_df <- input_scores_df[input_scores_df$Gene %in% neighbor_names, ]
119+
neighbor_names <- neighbor_scores_df$Gene[order(neighbor_scores_df$Score, decreasing = TRUE)]
120+
121+
current_score <- calculate_component_score(pin_graph = pin_graph, pin_scores_vec = pin_scores_vec, compo = comp, sampling_score_means = sampling_score_means, sampling_score_stds = sampling_score_stds)
122+
comp <- c(comp, node)
123+
will_be_checked_for_neighbors <- will_be_checked_for_neighbors[will_be_checked_for_neighbors != node] #remove from postponed list
124+
125+
neighbor_added <- FALSE
126+
for (neighbor_name in neighbor_names) {
127+
if (pin_scores_vec[neighbor_name] > 0) {
128+
neighbor_node <- igraph::V(pin_graph)[neighbor_name] #getting igraph.vs from gene name
129+
if (!neighbor_node %in% checked_in_greedy) {
130+
checked_in_greedy <- c(checked_in_greedy, neighbor_node)
131+
new_score <- calculate_component_score(pin_graph = pin_graph, pin_scores_vec = pin_scores_vec, compo = c(comp, neighbor_node), sampling_score_means = sampling_score_means,
132+
sampling_score_stds = sampling_score_stds)
133+
if (new_score > current_score) {
134+
queue <- c(queue, neighbor_node)
135+
distances_from_seed <- c(distances_from_seed, node_distance + 1)
136+
neighbor_added <- TRUE
137+
}
138+
}
139+
}
140+
}
141+
142+
if (!neighbor_added)
143+
comp <- comp[-length(comp)] # Removing node from comp
144+
145+
} else {
146+
node <- queue[1] # get next node
147+
queue <- queue[-1] # remove the node
148+
149+
node_distance <- distances_from_seed[1]
150+
distances_from_seed <- distances_from_seed[-1]
151+
152+
if (node %in% will_be_checked_for_neighbors) {
153+
# Sending to the end of the queue, will be checked when there are only this kind of nodes
154+
queue <- c(queue, neighbor_node)
155+
distances_from_seed <- c(distances_from_seed, node_distance)
156+
} else {
157+
current_score <- calculate_component_score(pin_graph = pin_graph, pin_scores_vec = pin_scores_vec, compo = comp, sampling_score_means = sampling_score_means, sampling_score_stds = sampling_score_stds)
158+
new_score <- calculate_component_score(pin_graph = pin_graph, pin_scores_vec = pin_scores_vec, compo = c(comp, node), sampling_score_means = sampling_score_means, sampling_score_stds = sampling_score_stds)
159+
160+
161+
if (new_score > current_score) {
162+
163+
comp <- c(comp, node)
164+
165+
if (node_distance < max_depth) {
166+
# Its distance is less than max_depth, which means we can go further and check its neighbors
167+
neighbor_names <- names(igraph::neighbors(pin_graph, node))
168+
neighbor_scores_df <- input_scores_df[input_scores_df$Gene %in% neighbor_names, ]
169+
neighbor_names <- neighbor_scores_df$Gene[order(neighbor_scores_df$Score, decreasing = TRUE)]
170+
171+
for (neighbor_name in neighbor_names) {
172+
if (pin_scores_vec[neighbor_name] > 0)
173+
{
174+
neighbor_node <- igraph::V(pin_graph)[neighbor_name] # getting igraph.vs from gene name
175+
if (!neighbor_node %in% checked_in_greedy) {
176+
checked_in_greedy <- c(checked_in_greedy, neighbor_node)
177+
queue <- c(queue, neighbor_node)
178+
distances_from_seed <- c(distances_from_seed, node_distance + 1)
179+
}
180+
} #else break
181+
}
182+
}
183+
184+
} else {
185+
if (check_second_neighbors) {
186+
if (node_distance < max_depth) {
187+
# if we will be able to add neighbors of this node Postponing
188+
will_be_checked_for_neighbors <- c(will_be_checked_for_neighbors, node)
189+
queue <- c(queue, node)
190+
distances_from_seed <- c(distances_from_seed, node_distance)
191+
}
192+
}
193+
}
194+
}
195+
}
196+
}
197+
198+
return(comp)
199+
}
200+
201+
202+
#' Preprocessing and data generation for active subnetwork search functions
203+
#'
204+
#' @inheritParams active_snw_search
205+
#'
206+
#' @return list of processed data to be used in active subnetwork search functions
207+
active_snw_search_preprocessing <- function(input_for_search, pin_name_path) {
208+
# read PIN
209+
pin_df <- utils::read.delim(pin_path, header = FALSE)
210+
pin_df <- subset(pin_df, select = -2)
211+
212+
pin <- igraph::graph_from_data_frame(pin_df, directed = FALSE, vertices = NULL)
213+
214+
# read scores file
215+
scores_df <- input_for_search
216+
colnames(scores_df) <- c("Gene", "Score")
217+
218+
# qnorm that is used for z-score conversion gives Inf for less than (1 - 5e-17)
219+
# we are applying a threshold at 1E-15 and converting any smaller value to 1e-15
220+
scores_df$Score <- vapply(scores_df$Score, function(x) ifelse(x < 1e-15, 1e-15, x), 0.01)
221+
222+
# p-values are converted to z-scores
223+
scores_df$Score <- stats::qnorm(1 - scores_df$Score)
224+
225+
# add rows with a score of 0 for missing genes (PIN genes not in input)
226+
scores_df_to_append <- data.frame(Gene = setdiff(igraph::V(pin)$name,
227+
scores_df$Gene),
228+
Score = 0)
229+
scores_df <- rbind(scores_df, scores_df_to_append)
230+
231+
# assign scores to the PIN graph
232+
idx <- match(igraph::V(pin)$name, scores_df$Gene)
233+
igraph::V(pin)$score <- scores_df$Score[idx]
234+
235+
# create a vector of scores (for faster execution)
236+
scores_vec <- scores_df$Score[idx]
237+
names(scores_vec) <- scores_df$Gene[idx]
238+
239+
return(list(pin = pin, scores_df = scores_df, scores_vec = scores_vec))
240+
}
241+
242+
243+
244+
#' Greed Active Subnetwork Search wrapper for pathfindR
245+
#'
246+
#' @inheritParams active_snw_search
247+
#' @param max_depth Sets max depth in greedy search, 0 for no limit (default = 1) #TODO Move to `active_snw_search` doc
248+
#' @param check_second_neighbors in greedy search, whether to check second
249+
#' neighbors when any direct neighbor does not improve the score (default = TRUE) #TODO Default TRUE? #TODO Move to `active_snw_search` doc
250+
#'
251+
#' @return list containing active subnetworks
252+
#' @export
253+
greedy_active_subnetwork_search <- function(input_for_search, pin_name_path, max_depth, check_second_neighbors) {
254+
255+
res <- active_snw_search_preprocessing(input_for_search = input_for_search, pin_name_path = pin_name_path)
256+
257+
input_scores_df <- res$scores_df
258+
pin_graph <- res$pin
259+
pin_scores_vec <- res$scores_vec
260+
sig_genes <- input_for_search$GENE
261+
262+
background_sampling_result <- calculate_background_score(
263+
pin_graph = pin_graph,
264+
number_of_iterations = 1000,
265+
scores_vec = pin_scores_vec
266+
)
267+
268+
active_modules <- list()
269+
num_of_seeds_used <- 0
270+
ratio_print_threshold <- 10
271+
for (seed in sig_genes) {
272+
# progress
273+
num_of_seeds_used <- num_of_seeds_used + 1
274+
used_seed_ratio <- round(num_of_seeds_used / length(sig_genes), 2) * 100
275+
if (used_seed_ratio >= ratio_print_threshold) {
276+
message(used_seed_ratio, "% of seeds are checked")
277+
ratio_print_threshold <- ratio_print_threshold + 10
278+
}
279+
280+
seed_node <- igraph::V(pin_graph)[seed] # getting igraph.vs from gene name
281+
comp <- greedy_breadth_first_active_subnetwork_search(seed_node = seed_node,
282+
pin_graph = pin_graph,
283+
input_scores_df = input_scores_df,
284+
pin_scores_vec = pin_scores_vec,
285+
background_sampling_result = background_sampling_result,
286+
max_depth = max_depth,
287+
check_second_neighbors = check_second_neighbors)
288+
# check if the same module exists
289+
same_exists <- FALSE
290+
for (active_module in active_modules){
291+
if(setequal(comp, active_module)){
292+
same_exists <- TRUE
293+
break
294+
}
295+
}
296+
297+
if(!same_exists)
298+
active_modules[[length(active_modules) + 1]] <- names(comp)
299+
}
300+
return(active_modules)
301+
}

0 commit comments

Comments
 (0)