|
| 1 | +#' Calculate Background Score |
| 2 | +#' |
| 3 | +#' @param pin_graph \code{\link[igraph]{igraph}} object containing the protein-protein |
| 4 | +#' interaction network |
| 5 | +#' @param number_of_iterations the number of iterations for background score calculations |
| 6 | +#' @param pin_scores_vec vector of score for each gene in the PIN |
| 7 | +#' |
| 8 | +#' @return a list containing 2 elements \describe{ |
| 9 | +#' \item{sampling_score_means}{vector of sampling score means} |
| 10 | +#' \item{sampling_score_stds}{vector of sampling score standard deviations} |
| 11 | +#' } |
| 12 | +calculate_background_score <- function(pin_graph, number_of_iterations, pin_scores_vec) { |
| 13 | + |
| 14 | + sampling_score_sums <- rep(0, igraph::vcount(pin_graph)) |
| 15 | + sampling_score_square_sums <- rep(0, igraph::vcount(pin_graph)) |
| 16 | + |
| 17 | + node_list_for_sampling <- igraph::V(pin_graph) |
| 18 | + |
| 19 | + for (iter in seq_len(number_of_iterations)) { |
| 20 | + |
| 21 | + # progress message |
| 22 | + if (iter %% 50 == 0) { |
| 23 | + message(round(iter / number_of_iterations, 2) * 100, "% of total iterations in background score calculation finished") |
| 24 | + } |
| 25 | + |
| 26 | + # shuffle nodes |
| 27 | + node_list_for_sampling <- sample(node_list_for_sampling) |
| 28 | + |
| 29 | + z_sum <- 0 |
| 30 | + number_of_nodes_in_subnetwork <- 0 |
| 31 | + |
| 32 | + for (node in node_list_for_sampling) { |
| 33 | + |
| 34 | + z_sum <- z_sum + pin_scores_vec[[node]] #Much faster than using V(pin_graph)$score[node] |
| 35 | + |
| 36 | + number_of_nodes_in_subnetwork <- number_of_nodes_in_subnetwork + 1 |
| 37 | + |
| 38 | + score <- z_sum/sqrt(number_of_nodes_in_subnetwork) |
| 39 | + |
| 40 | + sampling_score_sums[number_of_nodes_in_subnetwork] <- sampling_score_sums[number_of_nodes_in_subnetwork] + score |
| 41 | + sampling_score_square_sums[number_of_nodes_in_subnetwork] <- sampling_score_square_sums[number_of_nodes_in_subnetwork] + score ^ 2 |
| 42 | + } |
| 43 | + } |
| 44 | + |
| 45 | + # These are vector operations |
| 46 | + sampling_score_means <- sampling_score_sums / number_of_iterations |
| 47 | + |
| 48 | + sampling_score_stds <- sampling_score_square_sums/number_of_iterations - sampling_score_means ^ 2 |
| 49 | + sampling_score_stds <- sqrt(sampling_score_stds + 1e-10) |
| 50 | + |
| 51 | + # Addition of small number has two purposes: 1.Prevents division by zero in calculate_component_score() 2.Corrects for very small negative numbers that might appear, |
| 52 | + # like -3.388132e-21 |
| 53 | + |
| 54 | + ## Explanation of the operation above var = SUM((x-xmean)^2) / N var = SUM(x^2 - 2*xmean*x + xmean^2)/N var = SUM(x^2)/N - (2*xmean*SUM(x))/N + (N*xmean^2)/N var = |
| 55 | + ## SUM(x^2)/N - 2*xmean^2 + xmean^2 var = SUM(x^2)/N - xmean^2 |
| 56 | + return(list(sampling_score_means = sampling_score_means, sampling_score_stds = sampling_score_stds)) |
| 57 | +} |
| 58 | + |
| 59 | + |
| 60 | +#' Calculate Component Score |
| 61 | +#' |
| 62 | +#' @inheritParams calculate_background_score |
| 63 | +#' @param component component (vector) |
| 64 | +#' @param sampling_score_means vector of background sampling score means |
| 65 | +#' @param sampling_score_stds vector of background sampling score standard deviations |
| 66 | +#' |
| 67 | +#' @return the score of the component |
| 68 | +calculate_component_score <- function(pin_scores_vec, component, sampling_score_means, sampling_score_stds) { |
| 69 | + numberOfNodes <- length(component) |
| 70 | + |
| 71 | + if (numberOfNodes == 0) |
| 72 | + return(0) |
| 73 | + |
| 74 | + score <- sum(pin_scores_vec[component]) / sqrt(numberOfNodes) |
| 75 | + score <- (score - sampling_score_means[numberOfNodes]) / sampling_score_stds[numberOfNodes] |
| 76 | + return(score) |
| 77 | +} |
| 78 | + |
| 79 | + |
| 80 | +#' Greedy Breadth-First Active Subnetwork search method implementation in R |
| 81 | +#' |
| 82 | +#' @param seed_node the seed node (an \code{\link[igraph]{igraph}} vs object) |
| 83 | +#' @inheritParams calculate_background_score |
| 84 | +#' @param input_scores_df data frame containing scores per each gene in the PIN |
| 85 | +#' @param background_sampling_result list containing vector of background sampling score means |
| 86 | +#' and vector of background sampling score standard deviations |
| 87 | +#' @param max_depth maximum depth in search |
| 88 | +#' @param check_second_neighbors boolean to indicate whether to check second |
| 89 | +#' neighbors when any direct neighbor does not improve the score |
| 90 | +#' |
| 91 | +#' @return an active module (vector) |
| 92 | +greedy_breadth_first_active_subnetwork_search <- function(seed_node, pin_graph, input_scores_df, pin_scores_vec, background_sampling_result, max_depth, check_second_neighbors) { |
| 93 | + |
| 94 | + sampling_score_means <- background_sampling_result$sampling_score_means |
| 95 | + sampling_score_stds <- background_sampling_result$sampling_score_stds |
| 96 | + |
| 97 | + comp <- c() |
| 98 | + |
| 99 | + queue <- c(seed_node) |
| 100 | + checked_in_greedy <- c(seed_node) |
| 101 | + distances_from_seed <- c(0) |
| 102 | + |
| 103 | + will_be_checked_for_neighbors <- c() |
| 104 | + # When check_second_neighbors==TRUE, a node, the nodes which do not increase the score by themselves are added to the end of the queue and they are checked when |
| 105 | + # there are only this kind of nodes. The reason is, an important neighbor might already be added by the help of another important node and we may not need this |
| 106 | + # unimportant node |
| 107 | + |
| 108 | + while (length(queue) > 0) { |
| 109 | + if (setequal(queue, will_be_checked_for_neighbors)) { |
| 110 | + # all nodes in the queue are nodes whose neighbors will be checked for addition |
| 111 | + node <- queue[1] # get next node |
| 112 | + queue <- queue[-1] # remove the node |
| 113 | + |
| 114 | + node_distance <- distances_from_seed[1] |
| 115 | + distances_from_seed <- distances_from_seed[-1] |
| 116 | + |
| 117 | + neighbor_names <- names(igraph::neighbors(pin_graph, node)) |
| 118 | + neighbor_scores_df <- input_scores_df[input_scores_df$Gene %in% neighbor_names, ] |
| 119 | + neighbor_names <- neighbor_scores_df$Gene[order(neighbor_scores_df$Score, decreasing = TRUE)] |
| 120 | + |
| 121 | + current_score <- calculate_component_score(pin_graph = pin_graph, pin_scores_vec = pin_scores_vec, compo = comp, sampling_score_means = sampling_score_means, sampling_score_stds = sampling_score_stds) |
| 122 | + comp <- c(comp, node) |
| 123 | + will_be_checked_for_neighbors <- will_be_checked_for_neighbors[will_be_checked_for_neighbors != node] #remove from postponed list |
| 124 | + |
| 125 | + neighbor_added <- FALSE |
| 126 | + for (neighbor_name in neighbor_names) { |
| 127 | + if (pin_scores_vec[neighbor_name] > 0) { |
| 128 | + neighbor_node <- igraph::V(pin_graph)[neighbor_name] #getting igraph.vs from gene name |
| 129 | + if (!neighbor_node %in% checked_in_greedy) { |
| 130 | + checked_in_greedy <- c(checked_in_greedy, neighbor_node) |
| 131 | + new_score <- calculate_component_score(pin_graph = pin_graph, pin_scores_vec = pin_scores_vec, compo = c(comp, neighbor_node), sampling_score_means = sampling_score_means, |
| 132 | + sampling_score_stds = sampling_score_stds) |
| 133 | + if (new_score > current_score) { |
| 134 | + queue <- c(queue, neighbor_node) |
| 135 | + distances_from_seed <- c(distances_from_seed, node_distance + 1) |
| 136 | + neighbor_added <- TRUE |
| 137 | + } |
| 138 | + } |
| 139 | + } |
| 140 | + } |
| 141 | + |
| 142 | + if (!neighbor_added) |
| 143 | + comp <- comp[-length(comp)] # Removing node from comp |
| 144 | + |
| 145 | + } else { |
| 146 | + node <- queue[1] # get next node |
| 147 | + queue <- queue[-1] # remove the node |
| 148 | + |
| 149 | + node_distance <- distances_from_seed[1] |
| 150 | + distances_from_seed <- distances_from_seed[-1] |
| 151 | + |
| 152 | + if (node %in% will_be_checked_for_neighbors) { |
| 153 | + # Sending to the end of the queue, will be checked when there are only this kind of nodes |
| 154 | + queue <- c(queue, neighbor_node) |
| 155 | + distances_from_seed <- c(distances_from_seed, node_distance) |
| 156 | + } else { |
| 157 | + current_score <- calculate_component_score(pin_graph = pin_graph, pin_scores_vec = pin_scores_vec, compo = comp, sampling_score_means = sampling_score_means, sampling_score_stds = sampling_score_stds) |
| 158 | + new_score <- calculate_component_score(pin_graph = pin_graph, pin_scores_vec = pin_scores_vec, compo = c(comp, node), sampling_score_means = sampling_score_means, sampling_score_stds = sampling_score_stds) |
| 159 | + |
| 160 | + |
| 161 | + if (new_score > current_score) { |
| 162 | + |
| 163 | + comp <- c(comp, node) |
| 164 | + |
| 165 | + if (node_distance < max_depth) { |
| 166 | + # Its distance is less than max_depth, which means we can go further and check its neighbors |
| 167 | + neighbor_names <- names(igraph::neighbors(pin_graph, node)) |
| 168 | + neighbor_scores_df <- input_scores_df[input_scores_df$Gene %in% neighbor_names, ] |
| 169 | + neighbor_names <- neighbor_scores_df$Gene[order(neighbor_scores_df$Score, decreasing = TRUE)] |
| 170 | + |
| 171 | + for (neighbor_name in neighbor_names) { |
| 172 | + if (pin_scores_vec[neighbor_name] > 0) |
| 173 | + { |
| 174 | + neighbor_node <- igraph::V(pin_graph)[neighbor_name] # getting igraph.vs from gene name |
| 175 | + if (!neighbor_node %in% checked_in_greedy) { |
| 176 | + checked_in_greedy <- c(checked_in_greedy, neighbor_node) |
| 177 | + queue <- c(queue, neighbor_node) |
| 178 | + distances_from_seed <- c(distances_from_seed, node_distance + 1) |
| 179 | + } |
| 180 | + } #else break |
| 181 | + } |
| 182 | + } |
| 183 | + |
| 184 | + } else { |
| 185 | + if (check_second_neighbors) { |
| 186 | + if (node_distance < max_depth) { |
| 187 | + # if we will be able to add neighbors of this node Postponing |
| 188 | + will_be_checked_for_neighbors <- c(will_be_checked_for_neighbors, node) |
| 189 | + queue <- c(queue, node) |
| 190 | + distances_from_seed <- c(distances_from_seed, node_distance) |
| 191 | + } |
| 192 | + } |
| 193 | + } |
| 194 | + } |
| 195 | + } |
| 196 | + } |
| 197 | + |
| 198 | + return(comp) |
| 199 | +} |
| 200 | + |
| 201 | + |
| 202 | +#' Preprocessing and data generation for active subnetwork search functions |
| 203 | +#' |
| 204 | +#' @inheritParams active_snw_search |
| 205 | +#' |
| 206 | +#' @return list of processed data to be used in active subnetwork search functions |
| 207 | +active_snw_search_preprocessing <- function(input_for_search, pin_name_path) { |
| 208 | + # read PIN |
| 209 | + pin_df <- utils::read.delim(pin_path, header = FALSE) |
| 210 | + pin_df <- subset(pin_df, select = -2) |
| 211 | + |
| 212 | + pin <- igraph::graph_from_data_frame(pin_df, directed = FALSE, vertices = NULL) |
| 213 | + |
| 214 | + # read scores file |
| 215 | + scores_df <- input_for_search |
| 216 | + colnames(scores_df) <- c("Gene", "Score") |
| 217 | + |
| 218 | + # qnorm that is used for z-score conversion gives Inf for less than (1 - 5e-17) |
| 219 | + # we are applying a threshold at 1E-15 and converting any smaller value to 1e-15 |
| 220 | + scores_df$Score <- vapply(scores_df$Score, function(x) ifelse(x < 1e-15, 1e-15, x), 0.01) |
| 221 | + |
| 222 | + # p-values are converted to z-scores |
| 223 | + scores_df$Score <- stats::qnorm(1 - scores_df$Score) |
| 224 | + |
| 225 | + # add rows with a score of 0 for missing genes (PIN genes not in input) |
| 226 | + scores_df_to_append <- data.frame(Gene = setdiff(igraph::V(pin)$name, |
| 227 | + scores_df$Gene), |
| 228 | + Score = 0) |
| 229 | + scores_df <- rbind(scores_df, scores_df_to_append) |
| 230 | + |
| 231 | + # assign scores to the PIN graph |
| 232 | + idx <- match(igraph::V(pin)$name, scores_df$Gene) |
| 233 | + igraph::V(pin)$score <- scores_df$Score[idx] |
| 234 | + |
| 235 | + # create a vector of scores (for faster execution) |
| 236 | + scores_vec <- scores_df$Score[idx] |
| 237 | + names(scores_vec) <- scores_df$Gene[idx] |
| 238 | + |
| 239 | + return(list(pin = pin, scores_df = scores_df, scores_vec = scores_vec)) |
| 240 | +} |
| 241 | + |
| 242 | + |
| 243 | + |
| 244 | +#' Greed Active Subnetwork Search wrapper for pathfindR |
| 245 | +#' |
| 246 | +#' @inheritParams active_snw_search |
| 247 | +#' @param max_depth Sets max depth in greedy search, 0 for no limit (default = 1) #TODO Move to `active_snw_search` doc |
| 248 | +#' @param check_second_neighbors in greedy search, whether to check second |
| 249 | +#' neighbors when any direct neighbor does not improve the score (default = TRUE) #TODO Default TRUE? #TODO Move to `active_snw_search` doc |
| 250 | +#' |
| 251 | +#' @return list containing active subnetworks |
| 252 | +#' @export |
| 253 | +greedy_active_subnetwork_search <- function(input_for_search, pin_name_path, max_depth, check_second_neighbors) { |
| 254 | + |
| 255 | + res <- active_snw_search_preprocessing(input_for_search = input_for_search, pin_name_path = pin_name_path) |
| 256 | + |
| 257 | + input_scores_df <- res$scores_df |
| 258 | + pin_graph <- res$pin |
| 259 | + pin_scores_vec <- res$scores_vec |
| 260 | + sig_genes <- input_for_search$GENE |
| 261 | + |
| 262 | + background_sampling_result <- calculate_background_score( |
| 263 | + pin_graph = pin_graph, |
| 264 | + number_of_iterations = 1000, |
| 265 | + scores_vec = pin_scores_vec |
| 266 | + ) |
| 267 | + |
| 268 | + active_modules <- list() |
| 269 | + num_of_seeds_used <- 0 |
| 270 | + ratio_print_threshold <- 10 |
| 271 | + for (seed in sig_genes) { |
| 272 | + # progress |
| 273 | + num_of_seeds_used <- num_of_seeds_used + 1 |
| 274 | + used_seed_ratio <- round(num_of_seeds_used / length(sig_genes), 2) * 100 |
| 275 | + if (used_seed_ratio >= ratio_print_threshold) { |
| 276 | + message(used_seed_ratio, "% of seeds are checked") |
| 277 | + ratio_print_threshold <- ratio_print_threshold + 10 |
| 278 | + } |
| 279 | + |
| 280 | + seed_node <- igraph::V(pin_graph)[seed] # getting igraph.vs from gene name |
| 281 | + comp <- greedy_breadth_first_active_subnetwork_search(seed_node = seed_node, |
| 282 | + pin_graph = pin_graph, |
| 283 | + input_scores_df = input_scores_df, |
| 284 | + pin_scores_vec = pin_scores_vec, |
| 285 | + background_sampling_result = background_sampling_result, |
| 286 | + max_depth = max_depth, |
| 287 | + check_second_neighbors = check_second_neighbors) |
| 288 | + # check if the same module exists |
| 289 | + same_exists <- FALSE |
| 290 | + for (active_module in active_modules){ |
| 291 | + if(setequal(comp, active_module)){ |
| 292 | + same_exists <- TRUE |
| 293 | + break |
| 294 | + } |
| 295 | + } |
| 296 | + |
| 297 | + if(!same_exists) |
| 298 | + active_modules[[length(active_modules) + 1]] <- names(comp) |
| 299 | + } |
| 300 | + return(active_modules) |
| 301 | +} |
0 commit comments