diff --git a/Evaluation.Rmd b/Evaluation.Rmd new file mode 100644 index 0000000..9c236f0 --- /dev/null +++ b/Evaluation.Rmd @@ -0,0 +1,218 @@ +--- +title: "Evaluation" +author: "Tain Velasco Luquez" +date: "19/04/2018" +output: html_document +editor_options: + chunk_output_type: console +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` + +## Opening +This file contain sthe comparison of g2f against other widely used algortihms to find and fill gaps in GEMs. + +### Required packages +```{r message=FALSE} +library("minval") +library("sybilSBML") +library("sybil") +library("glpkAPI") +library("g2f") +library("tictoc") +library("readr") +``` + +### Useful functions + +If the models has some syntax incompatibilities, this function enables the localisation of the incompatibilities and replace them by the user define syntax. +```{r message=FALSE} +# FUN by Theodore lytras in https://stackoverflow.com/questions/15253954/replace-multiple-letters-with-accents-with-gsub/15256643?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa +mgsub <- function(pattern, replacement, x, ...) { + if (length(pattern)!=length(replacement)) { + stop("pattern and replacement do not have the same length.") + } + result <- x + for (i in 1:length(pattern)) { + result <- gsub(pattern[i], replacement[i], result, ...) + } + result +} +``` + +This function retrieves an organism's GEM, in .xml extension from biomodels, and the complete set of reactions, in .csv extension from KEGG. +```{r message=FALSE} +my_models <- function(biomodel_ID, kegg_ID){ + # Both arguments must be of class character + # Go to http://rest.kegg.jp/list/organism for the KEGG_ID code + # Go to https://wwwdev.ebi.ac.uk/biomodels/ for the BioModel ID + + # Let's replace the url for the selected biomodel ID + my_model_url <- gsub("MODEL(.{10})", biomodel_ID, "https://wwwdev.ebi.ac.uk/biomodels/model/download/MODEL1507180017.2?filename=MODEL1507180017_url.xml") + + # And download it + download.file(my_model_url, paste0(kegg_ID, ".xml")) + + # Now lets get the reaction list and save it + write.csv(g2f::getReference(kegg_ID), file = paste0(kegg_ID, "_all.csv")) + + message("It's done!") +} +``` + +Extract reactions from a model +```{r message=FALSE} +# Function extracted from MinVal, by Daniel Osorio +# modelData must be of class modelorg and kegg_ID of class character +my_reactions <- function(modelData, kegg_ID){ + + modelData <- minval:::convertData(model = modelData) + + outputData <- NULL + outputData$abbreviation <- modelData[["ID"]] + outputData$name <- modelData[["DESCRIPTION"]] + outputData$equation <- minval:::rearmReactions(S = minval::stoichiometricMatrix(modelData[["REACTION"]]), + reversible = grepl("<=>", modelData[["REACTION"]]), + type = "TSV", boundary = boundary) + outputData$reversible <- ifelse(test = grepl("<=>", modelData[["REACTION"]]), + yes = "reversible", no = "irreversible") + outputData$compartment <- unlist(lapply(modelData[["REACTION"]], + function(reaction) { + paste0(minval::compartments(reaction), collapse = " , ") + })) + outputData$lowbnd <- as.character(modelData[["LOWER.BOUND"]]) + outputData$uppbnd <- as.character(modelData[["UPPER.BOUND"]]) + outputData$obj_coef <- as.character(modelData[["OBJECTIVE"]]) + outputData$rule <- modelData[["GPR"]] + outputData$subsystem <- rep("", length(modelData[["REACTION"]])) + outputData <- data.frame(outputData) + + write.table(outputData, file = paste0(kegg_ID, "_react.csv"), + quote = TRUE, row.names = FALSE, sep = ",") +} +``` + +## Warming up + +We are going to load several GEMs their reaction lists and complete set of reactions from KEGG. +```{r message=FALSE} +# Download the models and its associated KEGG reactions +mapply(my_models, c("MODEL1507180048", "MODEL1507180017"), c("tma", "stm")) + +# Load all .xml models in the working directory to the .GlobalEnv and rename them adding _model +for (i in as.list(list.files(pattern = "\\.xml$"))) { + assign(gsub(".xml", "_model", i), sybilSBML::readSBMLmod(i), envir = .GlobalEnv) +} +# Load all reaction lists from KEGG per model +for (i in as.list(list.files(pattern = "\\_all.csv$"))) { + assign(gsub(".csv", "", i), read.csv(i), envir = .GlobalEnv) +} + +# Extract the reaction list of every .xml file in the working directory, save such reaction list to a .csv with the same name as the model +mapply(my_reactions, mget(setdiff(ls(pattern = "model"), lsf.str())), gsub("_all.csv", "", list.files(pattern = "\\_all.csv$"))) + +# And load them to the workspace +for (i in as.list(list.files(pattern = "\\_react.csv$"))) { + assign(gsub(".csv", "", i), read.csv(i), envir = .GlobalEnv) +} +``` + +Inasmuch as the gap filling function is sensitive to incorrect syntax, let's check it's correctness +```{r} +# Is the syntax for the models correct? +sapply(mget(setdiff(ls(pattern = "react"), lsf.str())), function(i) { + suppressMessages(suppressWarnings(sum( + minval::validateSyntax(reactionList = i$equation) + ) == nrow(i))) +}) +``` + +As seen, the reaction list have an incorrect syntax. General mistakes are described in `r ?minval::validateSyntax `. Let's fix them: +```{r message=FALSE, eval=FALSE} +# Value for all data files (excluding functions) in the global environment are returned in the form of list +temp_var <- mget(setdiff(ls(pattern = "react"), lsf.str())) + +sapply(temp_var[[i]][, 3], function(i) { + assign(temp_var[[i]][, 3], mgsub( + c("-->", '<==>', '->', "\\(|\\)"), + c("=>", '<=>', "=>", ""), + temp_var[[i]][, 3] + ), envir = .GlobalEnv) +}) + + + +``` +```{r} +stm_react$equation <- mgsub(c("-->", '<==>', '->', "\\(|\\)"), c("=>", '<=>', "=>", ""), stm_react$equation) + +tma_react$equation <- mgsub(c("-->", '<==>', '->', "\\(|\\)"), c("=>", '<=>', "=>", ""), tma_react$equation) + +# Is the syntax for the models correct? +sapply(mget(setdiff(ls(pattern = "react"), lsf.str())), function(i) { + suppressMessages(suppressWarnings(sum( + minval::validateSyntax(reactionList = i$equation) + ) == nrow(i))) +}) +``` + +## The workout +Once the syntax is checked, it is time to do the gapfinding and gapfilling +```{r} +# lets fire it up ! +tictoc::tic("Gapfilling time") +gapFill(reactionList = unlist(stm_react$equation), + reference = stm_all$reaction, + limit = 0.25, + woCompartment = TRUE, + consensus = FALSE) +tictoc::toc() + +``` + +## Now in pyhton +Now lets compare the g2f results against the gapfill algorithm implemented in Copbrapy +```{python} +# Importing the modules +from __future__ import print_function +from cobra.flux_analysis import gapfill +import cobra.test, time +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +import glob +``` + +# In order to supplement the universal reaction files to the gapfill algorithm, we are going to: +1. Read several .csv files at once +```{python} +all_all = {} +for file_all in glob.glob( '*_all.csv' ): + name = os.path.splitext( file_all )[0] + all_all[name] = cobra.Model( pd.read_csv( file_all ) ) +``` + +2. Import the models +```{python} +all_model = {} +for file_model in glob.glob( '*.xml' ): + name = os.path.splitext( file_model )[0] + all_model[name] = cobra.io.read_sbml_model( file_model ) +``` + +3. let's gapfill them! +```{python} +solution={} + +for every_model_key, every_all_key in zip(all_model.keys(), all_all.keys() ): + + start_time = time.time( ) + solution[every_model_key] = gapfill( every_model_key, every_all_key ) + print( "Gap filling process took %s seconds " % (time.time( ) - start_time) ) + print( "The number of filled reactions is:".format( len( solution.key( )[every_model_key].reactions ) ) ) + +``` + + diff --git a/README.md b/README.md index 64e6b2d..5bccbc6 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ For install the latest stable version this package directly from GitHub: # Install 'devtools' R package install.packages("devtools") -# Install 'minval' package +# Install 'g2f' package devtools::install_github("gibbslab/g2f") library("g2f") ```