From 8ccc53578e72d09a6f932786c0cf9f060d1133c9 Mon Sep 17 00:00:00 2001 From: theHumanBorch Date: Tue, 8 Oct 2024 15:04:15 -0500 Subject: [PATCH 1/6] Update percentVJ.R Fix issue #415 No longer assumes > 1 group for faceting --- R/percentVJ.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/percentVJ.R b/R/percentVJ.R index cb93983a..4e2e66dd 100644 --- a/R/percentVJ.R +++ b/R/percentVJ.R @@ -99,8 +99,10 @@ percentVJ <- function(input.data, geom_tile(lwd= 0.25, color = "black") + scale_fill_gradientn(name = "Percentage", colors = .colorizer(palette,21)) + theme_classic() + - facet_wrap(~L1) + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.title = element_blank()) + if(length(unique(mat_melt$L1)) > 1) { + plot <- plot + facet_wrap(~L1) + } return(plot) } From b42e312d2c34f385508431e5e506688ce22ffd9e Mon Sep 17 00:00:00 2001 From: theHumanBorch Date: Tue, 8 Oct 2024 15:05:01 -0500 Subject: [PATCH 2/6] Update NEWS.md --- NEWS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/NEWS.md b/NEWS.md index e262e8d7..b3409df8 100644 --- a/NEWS.md +++ b/NEWS.md @@ -3,6 +3,7 @@ ## UNDERLYING CHANGES * Fixed issue with single chain output for ```clonalLength()``` * Removed unnecessary code remnant in ```clonalLength()``` +* Allow one sample to be plotted by ```percentVJ()``` # scRepertoire VERSION 2.0.7 From 8026497de1af5082f9fdfb872aa3e8f65c7a546f Mon Sep 17 00:00:00 2001 From: theHumanBorch Date: Tue, 15 Oct 2024 15:41:17 -0500 Subject: [PATCH 3/6] Fix Issue with exportTable Per #420 mat_melt to mat --- R/positionalProperty.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/positionalProperty.R b/R/positionalProperty.R index 9b930ba1..399dde62 100644 --- a/R/positionalProperty.R +++ b/R/positionalProperty.R @@ -154,7 +154,7 @@ positionalProperty <- function(input.data, theme_classic() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) if (exportTable == TRUE) { - return(mat_melt) + return(mat) } return(plot) From 391b4eefcd77eb837b168fe472b3dcf178ee6dde Mon Sep 17 00:00:00 2001 From: theHumanBorch Date: Tue, 15 Oct 2024 15:42:41 -0500 Subject: [PATCH 4/6] Update NEWS.md Update fix for exporttable --- NEWS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/NEWS.md b/NEWS.md index b3409df8..69719082 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,6 +4,7 @@ * Fixed issue with single chain output for ```clonalLength()``` * Removed unnecessary code remnant in ```clonalLength()``` * Allow one sample to be plotted by ```percentVJ()``` +* Fixed issue with ```positionalProperty()``` and exportTable # scRepertoire VERSION 2.0.7 From fd10c0b7ac3bb905ddb8c9d152be4ccc59f49fb4 Mon Sep 17 00:00:00 2001 From: theHumanBorch Date: Thu, 17 Oct 2024 11:37:48 -0500 Subject: [PATCH 5/6] Update immApex.Rmd --- vignettes/articles/immApex.Rmd | 51 +++++++++++++++++++++------------- 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/vignettes/articles/immApex.Rmd b/vignettes/articles/immApex.Rmd index 6464590f..f7062910 100644 --- a/vignettes/articles/immApex.Rmd +++ b/vignettes/articles/immApex.Rmd @@ -42,19 +42,30 @@ suppressMessages(library(keras)) suppressMessages(library(ggplot2)) suppressMessages(library(viridis)) suppressMessages(library(dplyr)) +set.seed(42) ``` # Getting Started -**immApex** is meant to serve as an API for deep-learning models based on immune receptor sequencing. These functions extract or generate amino acid or nucleotide sequences and prepare them for deep learning tasks through [Keras](https://tensorflow.rstudio.com/guides/keras/basics). **immApex** is the underlying structure for the BCR models in [Ibex](https://github.com/ncborcherding/Ibex) and TCR models in [Trex](https://github.com/ncborcherding/Trex). It should be noted, although the tools here are created for immune receptor sequences, they will work more generally for nucleotide or amino acid sequences. +**immApex** is meant to serve as an API for deep-learning models based on immune receptor sequencing. These functions extract or generate amino acid or nucleotide sequences and prepare them for deep learning tasks through [Keras](https://tensorflow.rstudio.com/guides/keras/basics). **immApex** is the underlying structure for the BCR models in [Ibex](https://github.com/ncborcherding/Ibex) and TCR models in [Trex](https://github.com/ncborcherding/Trex). It should be noted that the tools here are created for immune receptor sequences; they will work more generally for nucleotide or amino acid sequences. The package itself supports AIRR, Adaptive, and 10x formats and interacts with the **scRepertoire** R package. More information is available at the [immApex GitHub Repo](https://github.com/ncborcherding/immApex). +## Loading Libraries + +```{r} +suppressMessages(library(immApex)) +suppressMessages(library(keras)) +suppressMessages(library(ggplot2)) +suppressMessages(library(viridis)) +suppressMessages(library(dplyr)) +``` + # Getting and Manipulating Sequences ## generateSequences -Generating synthetic sequences is a quick way to start testing the model code. ```generateSequences()``` can also generate realistic noise for generative adversarial networks. +Generating synthetic sequences is a quick way to start testing the model code. ```generateSequences()``` can also generate realistic noise for generative adversarial networks. Parameters for ```generateSequences()``` @@ -74,7 +85,7 @@ sequences <- generateSequences(prefix.motif = "CAS", head(sequences) ``` -If we want to generate nucleotide sequences instead of amino acids, we just need to change the **sequence.dictionary**. +If we want to generate nucleotide sequences instead of amino acids, we must to change the **sequence.dictionary**. ```{r tidy = FALSE} nucleotide.sequences <- generateSequences(number.of.sequences = 1000, @@ -119,9 +130,10 @@ variational.sequences <- variationalSequences(sequences, call.threshold = 0.1) head(variational.sequences) ``` + ## mutateSequences -A common approach is to mutate sequences randomly or at specific intervals. This can be particularly helpful if we have fewer sequences or want to test a model for accuracy given new, altered sequences. mutateSequences() allows us to tune the type of mutation, where along the sequences to introduce the mutation and the overall number of mutations. +A common approach is to mutate sequences randomly or at specific intervals. This can be particularly helpful if we have fewer sequences or want to test a model for accuracy given new, altered sequences. ```mutateSequences()``` allows us to tune the type of mutation, where along the sequences to introduce the mutation and the overall number of mutations. Parameters for ```mutateSequences()``` @@ -149,7 +161,7 @@ Parameters for ```formatGenes()``` * **input.data** Data frame of sequencing data or scRepertoire outputs * **region** Sequence gene loci to access - 'v', 'd', 'j', 'c' or a combination using c('v', 'd', 'j') -* **technology** The sequencing technology employed - 'TenX', "Adaptive', 'AIRR', or 'Omniscope'. +* **technology** The sequencing technology employed - 'TenX', "Adaptive', or 'AIRR' * **species** One or two word designation of species. Currently supporting: "human", "mouse", "rat", "rabbit", "rhesus monkey", "sheep", "pig", "platypus", "alpaca", "dog", "chicken", and "ferret" * **simplify.format** If applicable, remove the allelic designation (TRUE) or retain all information (FALSE) @@ -166,7 +178,7 @@ head(Adaptive_example[,c("aminoAcid","vGeneName", "v_IMGT", "v_IMGT.check")]) ## getIMGT -Depending on the sequencing technology and the version, we might want to expand the length of our sequence embedding approach. The first step in the process is pulling the reference sequences from the ImMunoGeneTics (IMGT) system using ```getIMGT()```. More information for IMGT can be found at [imgt.org](https://www.imgt.org/). +Depending on the sequencing technology and the version, we might want to expand the length of our sequence embedding approach. The first step in the process is pulling the reference sequences from the ImMunoGeneTics (IMGT) system using ```getIMGT()```. More information for IMGT can be found at [imgt.org](https://www.imgt.org/). Data from IMGT is under a CC BY-NC-ND 4.0 license. Please be aware that attribution is required for usage and should not be used to create commercial or derivative work. Parameters for ```getIMGT()``` @@ -196,7 +208,7 @@ Parameters for ```inferCDR``` * **input.data** Data frame of sequencing data or output from formatGenes(). * **reference** IMGT sequences from ```getIMGT()``` -* **technology** The sequencing technology employed - 'TenX', "Adaptive', 'AIRR', or 'Omniscope' +* **technology** The sequencing technology employed - 'TenX', "Adaptive', or 'AIRR', * **sequence.type** Type of sequence - "aa" for amino acid or "nt" for nucleotide * **sequences** The specific regions of the CDR loop to get from the data. @@ -280,7 +292,7 @@ head(median.property.matrix[,1:3]) ## geometricEncoder -One approach to encode amino acid sequences is geometric isometry, such as [GIANA](https://pubmed.ncbi.nlm.nih.gov/34349111/). +One approach to encoding amino acid sequences is geometric isometry, such as [GIANA](https://pubmed.ncbi.nlm.nih.gov/34349111/). Parameters for ```geometricEncoder()``` @@ -296,7 +308,7 @@ head(geometric.matrix) ## tokenizeSequences -Another approach to transforming a sequence into numerical values is tokenizing it into numbers. This is a common approach for recurrent neural networks where one letter corresponds to a single integer. In addition, we can add a start and stop tokens to our original sequences to differentiate between the beginning and end of the sequences. +Another approach to transforming a sequence into numerical values is tokenizing it into numbers. This is a common approach for recurrent neural networks where one letter corresponds to a single integer. In addition, we can add start and stop tokens to our original sequences to differentiate between the beginning and end of the sequences. Parameters for ```tokenizeSequences()``` @@ -352,7 +364,7 @@ adj.matrix ## sequenceDecoder -We have a function called ```sequenceDecoder()``` that extracts sequences from one-hot or property-encoded matrices or arrays. This function can be applied to any generative approach to sequence generation. +We have a function called ```sequenceDecoder()``` that extracts sequences from one-hot or property-encoded matrices or arrays. This function can be applied to any generative approach to sequence generation. Parameters for ```sequenceDecoder()``` @@ -400,7 +412,7 @@ The steps to train the model include: 4. Fitting the model ```{r tidy = FALSE} -#Sampling to make Training/Valid Data +#Sampling to make Training/Validation Data Cohorts set.seed(42) num_sequences <- nrow(sequence.matrix) indices <- 1:num_sequences @@ -418,11 +430,11 @@ encoding_dim <- 40 hidden_dim1 <- 256 # Hidden layer 1 size hidden_dim2 <- 128 # Hidden layer 2 size -es = callback_early_stopping(monitor = "val_loss", - min_delta = 0, - patience = 4, - verbose = 1, - mode = "min") +es <- callback_early_stopping(monitor = "val_loss", + min_delta = 0, + patience = 4, + verbose = 1, + mode = "min") # Define the Model input_seq <- layer_input(shape = c(input_shape)) @@ -472,7 +484,7 @@ plot(history) + We can also build classifiers directly using deep or shallow neural networks. Building deep classifiers requires more data than classical machine learning methods, like random forests, so the vignette may not be ideal. -The first step is to generate distinct types of sequences using ```generateSequences()``` and ```onehotEncoder()``` to prepare the data for the model. +The first step is to generate distinct types of sequences using ```generateSequences()``` and ```onehotEncoder()``` to prepare the data for the model. ```{r tidy = FALSE} class1.sequences <- generateSequences(prefix.motif = "CAS", @@ -512,7 +524,7 @@ classifier.model %>% compile( metrics = c("accuracy") ) -#Seperating data and labels +#Separating data and labels set.seed(42) val_indices <- sample(nrow(classifier.matrix), 10000*0.2) x_val <- classifier.matrix[val_indices,] @@ -541,11 +553,10 @@ Here, we can achieve a validation accuracy of 98.25%, which is impressive. But t *** # Conclusion -This has been a general overview of the capabilities of immApex for processing immune receptor sequences and making deep learning models. If you have any questions, comments, or suggestions, feel free to visit the [GitHub repository](https://github.com/ncborcherding/immApex). +This has been a general overview of the capabilities of **immApex** for processing immune receptor sequences and making deep learning models. If you have any questions, comments, or suggestions, feel free to visit the [GitHub repository](https://github.com/ncborcherding/immApex). ## Session Info ```{r} sessionInfo() ``` - From 34e696ccdf753dcedd48dac07239142bcc82bba5 Mon Sep 17 00:00:00 2001 From: pwwang Date: Thu, 17 Oct 2024 16:52:15 -0700 Subject: [PATCH 6/6] export table eagerly --- R/clonalAbundance.R | 101 +++++++++-------- R/clonalSizeDistribution.R | 224 ++++++++++++++++++------------------- R/percentAA.R | 48 ++++---- R/positionalEntropy.R | 68 +++++------ R/positionalProperty.R | 104 ++++++++--------- R/vizGenes.R | 118 +++++++++---------- 6 files changed, 333 insertions(+), 330 deletions(-) diff --git a/R/clonalAbundance.R b/R/clonalAbundance.R index 3e8bc946..344c2f50 100644 --- a/R/clonalAbundance.R +++ b/R/clonalAbundance.R @@ -1,89 +1,92 @@ #' Demonstrate the relative abundance of clones by group or sample #' -#' Displays the number of clones at specific frequencies by sample +#' Displays the number of clones at specific frequencies by sample #' or group. Visualization can either be a line graph ( -#' \strong{scale} = FALSE) using calculated numbers or density -#' plot (\strong{scale} = TRUE). Multiple sequencing runs can -#' be group together using the group parameter. If a matrix -#' output for the data is preferred, set +#' \strong{scale} = FALSE) using calculated numbers or density +#' plot (\strong{scale} = TRUE). Multiple sequencing runs can +#' be group together using the group parameter. If a matrix +#' output for the data is preferred, set #' \strong{exportTable} = TRUE. #' #' @examples #' #Making combined contig data -#' combined <- combineTCR(contig_list, -#' samples = c("P17B", "P17L", "P18B", "P18L", +#' combined <- combineTCR(contig_list, +#' samples = c("P17B", "P17L", "P18B", "P18L", #' "P19B","P19L", "P20B", "P20L")) -#' clonalAbundance(combined, -#' cloneCall = "gene", +#' clonalAbundance(combined, +#' cloneCall = "gene", #' scale = FALSE) #' -#' @param input.data The product of \code{\link{combineTCR}}, +#' @param input.data The product of \code{\link{combineTCR}}, #' \code{\link{combineBCR}}, or \code{\link{combineExpression}}. -#' @param cloneCall How to call the clone - VDJC gene (\strong{gene}), +#' @param cloneCall How to call the clone - VDJC gene (\strong{gene}), #' CDR3 nucleotide (\strong{nt}), CDR3 amino acid (\strong{aa}), -#' VDJC gene + CDR3 nucleotide (\strong{strict}) or a custom variable -#' in the data. -#' @param chain indicate if both or a specific chain should be used - +#' VDJC gene + CDR3 nucleotide (\strong{strict}) or a custom variable +#' in the data. +#' @param chain indicate if both or a specific chain should be used - #' e.g. "both", "TRA", "TRG", "IGH", "IGL" #' @param group.by The variable to use for grouping #' @param order.by A vector of specific plotting order or "alphanumeric" #' to plot groups in order -#' @param scale Converts the graphs into density plots in order to show +#' @param scale Converts the graphs into density plots in order to show #' relative distributions. #' @param exportTable Returns the data frame used for forming the graph #' to the visualization. -#' @param palette Colors to use in visualization - input any +#' @param palette Colors to use in visualization - input any #' \link[grDevices]{hcl.pals}. #' @importFrom ggplot2 ggplot #' @export #' @concept Visualizing_Clones -#' @return ggplot of the total or relative abundance of clones +#' @return ggplot of the total or relative abundance of clones #' across quanta -clonalAbundance <- function(input.data, - cloneCall = "strict", - chain = "both", - scale=FALSE, - group.by = NULL, +clonalAbundance <- function(input.data, + cloneCall = "strict", + chain = "both", + scale=FALSE, + group.by = NULL, order.by = NULL, exportTable = FALSE, palette = "inferno") { Con.df <- NULL xlab <- "Abundance" - input.data <- .data.wrangle(input.data, - group.by, - .theCall(input.data, cloneCall, check.df = FALSE), + input.data <- .data.wrangle(input.data, + group.by, + .theCall(input.data, cloneCall, check.df = FALSE), chain) cloneCall <- .theCall(input.data, cloneCall) - + names <- names(input.data) if (!is.null(group.by)) { for (i in seq_along(input.data)) { data1 <- .parseContigs(input.data, i, names, cloneCall) label <- input.data[[i]][1,group.by] data1[,paste(group.by)] <- label - Con.df<- rbind.data.frame(Con.df, data1) + Con.df<- rbind.data.frame(Con.df, data1) } Con.df <- data.frame(Con.df) + if (exportTable == TRUE) { + return(Con.df) + } col <- length(unique(Con.df[,group.by])) fill <- group.by if(!is.null(order.by)) { Con.df <- .ordering.function(vector = order.by, - group.by = group.by, + group.by = group.by, data.frame = Con.df) } - if (scale == TRUE) { + if (scale == TRUE) { ylab <- "Density of Clones" plot <- ggplot(Con.df, aes(x=Abundance, fill=Con.df[,group.by])) + - geom_density(aes(y=after_stat(scaled)), - alpha=0.5, - lwd=0.25, - color="black", + geom_density(aes(y=after_stat(scaled)), + alpha=0.5, + lwd=0.25, + color="black", bw=0.5) + scale_fill_manual(values = .colorizer(palette,col)) + labs(fill = fill) - } else { + } else { ylab <- "Number of Clones" - plot <- ggplot(Con.df, aes(x=Abundance, group.by = values, + plot <- ggplot(Con.df, aes(x=Abundance, group.by = values, color = Con.df[,group.by])) + geom_line(stat="count") + scale_color_manual(values = .colorizer(palette,col)) + @@ -92,39 +95,39 @@ clonalAbundance <- function(input.data, } else { for (i in seq_along(input.data)) { data1 <- .parseContigs(input.data, i, names, cloneCall) - Con.df<- rbind.data.frame(Con.df, data1) + Con.df<- rbind.data.frame(Con.df, data1) } Con.df <- data.frame(Con.df) + if (exportTable == TRUE) { + return(Con.df) + } if(!is.null(order.by)) { Con.df <- .ordering.function(vector = order.by, - group.by = "values", + group.by = "values", data.frame = Con.df) } - + col <- length(unique(Con.df$values)) fill <- "Samples" - if (scale == TRUE) { + if (scale == TRUE) { ylab <- "Density of Clones" plot <- ggplot(Con.df, aes(Abundance, fill=values)) + - geom_density(aes(y=after_stat(scaled)), - alpha=0.5, - lwd=0.25, - color="black", + geom_density(aes(y=after_stat(scaled)), + alpha=0.5, + lwd=0.25, + color="black", bw=0.5) + scale_fill_manual(values = .colorizer(palette,col)) + labs(fill = fill) - } else { + } else { ylab <- "Number of Clones" - plot <- ggplot(Con.df, aes(x=Abundance, group = values, + plot <- ggplot(Con.df, aes(x=Abundance, group = values, color = values)) + geom_line(stat="count") + scale_color_manual(values = .colorizer(palette,col)) + labs(color = fill) } } - if (exportTable == TRUE) { - return(Con.df) - } plot <- plot + scale_x_log10() + ylab(ylab) + xlab(xlab) + theme_classic() - return(plot) + return(plot) } diff --git a/R/clonalSizeDistribution.R b/R/clonalSizeDistribution.R index 5d76e6a2..14af0e82 100644 --- a/R/clonalSizeDistribution.R +++ b/R/clonalSizeDistribution.R @@ -1,14 +1,14 @@ #' Hierarchical clustering of clones using Gamma-GPD spliced threshold model #' -#' This function produces a hierarchical clustering of clones by sample -#' using discrete gamma-GPD spliced threshold model. If using this -#' model please read and cite powerTCR (more info available at +#' This function produces a hierarchical clustering of clones by sample +#' using discrete gamma-GPD spliced threshold model. If using this +#' model please read and cite powerTCR (more info available at #' \href{https://pubmed.ncbi.nlm.nih.gov/30485278/}{PMID: 30485278}). -#' +#' #' @details #' The probability density function (pdf) for the \strong{Generalized Pareto Distribution (GPD)} is given by: #' \deqn{f(x|\mu, \sigma, \xi) = \frac{1}{\sigma} \left( 1 + \xi \left( \frac{x - \mu}{\sigma} \right) \right)^{-\left( \frac{1}{\xi} + 1 \right)}} -#' +#' #' Where: #' \itemize{ #' \item{\eqn{\mu} is a location parameter} @@ -16,10 +16,10 @@ #' \item{\eqn{\xi} is a shape parameter} #' \item{\eqn{x \ge \mu} if \eqn{\xi \ge 0} and \eqn{\mu \le x \le \mu - \sigma/\xi} if \eqn{\xi < 0}} #' } -#' +#' #' The probability density function (pdf) for the \strong{Gamma Distribution} is given by: #' \deqn{f(x|\alpha, \beta) = \frac{x^{\alpha-1} e^{-x/\beta}}{\beta^\alpha \Gamma(\alpha)}} -#' +#' #' Where: #' \itemize{ #' \item{\eqn{\alpha > 0} is the shape parameter} @@ -27,30 +27,30 @@ #' \item{\eqn{x \ge 0}} #' \item{\eqn{\Gamma(\alpha)} is the gamma function of \eqn{\alpha}} #' } -#' +#' #' @examples #' #Making combined contig data -#' combined <- combineTCR(contig_list, -#' samples = c("P17B", "P17L", "P18B", "P18L", +#' combined <- combineTCR(contig_list, +#' samples = c("P17B", "P17L", "P18B", "P18L", #' "P19B","P19L", "P20B", "P20L")) #' clonalSizeDistribution(combined, cloneCall = "strict", method="ward.D2") #' -#' @param input.data The product of \code{\link{combineTCR}}, +#' @param input.data The product of \code{\link{combineTCR}}, #' \code{\link{combineBCR}}, or \code{\link{combineExpression}}. -#' @param cloneCall How to call the clone - VDJC gene (\strong{gene}), +#' @param cloneCall How to call the clone - VDJC gene (\strong{gene}), #' CDR3 nucleotide (\strong{nt}), CDR3 amino acid (\strong{aa}), -#' VDJC gene + CDR3 nucleotide (\strong{strict}) or a custom variable -#' in the data. -#' @param chain indicate if both or a specific chain should be used - +#' VDJC gene + CDR3 nucleotide (\strong{strict}) or a custom variable +#' in the data. +#' @param chain indicate if both or a specific chain should be used - #' e.g. "both", "TRA", "TRG", "IGH", "IGL". -#' @param threshold Numerical vector containing the thresholds +#' @param threshold Numerical vector containing the thresholds #' the grid search was performed over. #' @param method The clustering parameter for the dendrogram. #' @param group.by The variable to use for grouping. #' @param exportTable Returns the data frame used for forming the graph. -#' @param palette Colors to use in visualization - input any +#' @param palette Colors to use in visualization - input any #' \link[grDevices]{hcl.pals}. -#' @import ggplot2 +#' @import ggplot2 #' @importFrom dplyr bind_rows #' @importFrom ggdendro dendro_data segment label #' @importFrom stats hclust optim pgamma as.dist @@ -60,17 +60,17 @@ #' @author Hillary Koch clonalSizeDistribution <- function(input.data, - cloneCall ="strict", - chain = "both", - method = "ward.D2", - threshold = 1, + cloneCall ="strict", + chain = "both", + method = "ward.D2", + threshold = 1, group.by = NULL, - exportTable = FALSE, + exportTable = FALSE, palette = "inferno") { x <- xend <- yend <- mpg_div_hp <- NULL - input.data <- .data.wrangle(input.data, - group.by, - .theCall(input.data, cloneCall, check.df = FALSE), + input.data <- .data.wrangle(input.data, + group.by, + .theCall(input.data, cloneCall, check.df = FALSE), chain) cloneCall <- .theCall(input.data, cloneCall) sco <- is_seurat_object(input.data) | is_se_object(input.data) @@ -79,14 +79,14 @@ clonalSizeDistribution <- function(input.data, } data <- bind_rows(input.data) unique_df<- unique(data[,cloneCall]) - + #Forming data frame to store values Con.df <- data.frame(matrix(NA, length(unique_df), length(input.data))) Con.df <- data.frame(unique_df, Con.df, stringsAsFactors = FALSE) colnames(Con.df)[1] <- "clonotype" for (i in seq_along(input.data)) { data <- input.data[[i]] - data <- data.frame(table(data[,cloneCall]), + data <- data.frame(table(data[,cloneCall]), stringsAsFactors = FALSE) colnames(data) <- c(cloneCall, "Freq") for (y in seq_along(unique_df)){ # here is the first speed bottleneck that has speedup potential @@ -104,38 +104,38 @@ clonalSizeDistribution <- function(input.data, names(list) <- names(input.data) grid <- 0:10000 distances <- .get_distances(list, grid, modelType="Spliced") - mat_melt <- dendro_data(hclust(as.dist(distances), method = method), + if (exportTable) { + return(distances) + } + mat_melt <- dendro_data(hclust(as.dist(distances), method = method), type = "rectangle") - + #Plotting - plot <- ggplot() + - geom_segment(data = segment(mat_melt), - aes(x = x, - y = y, - xend = xend, + plot <- ggplot() + + geom_segment(data = segment(mat_melt), + aes(x = x, + y = y, + xend = xend, yend = yend)) + - geom_text(data = label(mat_melt), - aes(x = x, y = -0.02, - label = label, - hjust = 0), + geom_text(data = label(mat_melt), + aes(x = x, y = -0.02, + label = label, + hjust = 0), size = 4) + - geom_point(data = label(mat_melt), - aes(x = x, - y = -0.01, - color = as.factor(label)), - size = 2) + + geom_point(data = label(mat_melt), + aes(x = x, + y = -0.01, + color = as.factor(label)), + size = 2) + coord_flip() + - scale_y_reverse(expand = c(0.2, 0)) + - scale_color_manual(values = .colorizer(palette, nrow(label(mat_melt)))) + - theme_classic() + - guides(color = "none") + - theme(axis.title = element_blank(), - axis.ticks.y = element_blank(), - axis.text.y = element_blank()) - - if (exportTable) { - return(distances) - } + scale_y_reverse(expand = c(0.2, 0)) + + scale_color_manual(values = .colorizer(palette, nrow(label(mat_melt)))) + + theme_classic() + + guides(color = "none") + + theme(axis.title = element_blank(), + axis.ticks.y = element_blank(), + axis.text.y = element_blank()) + return(plot) } @@ -150,7 +150,7 @@ clonalSizeDistribution <- function(input.data, if(!is(x, "numeric")){ stop("x must be numeric.") } - + if(!is.null(shift)){ if(!is(shift, "numeric")){ stop("shift must be numeric.") @@ -159,31 +159,31 @@ clonalSizeDistribution <- function(input.data, stop("shift must be an integer.") } } - + if(!is(useq, "numeric")){ stop("useq must be numeric.") } - + if(any(x != round(x))){ stop("all elements in x must be integers.") } - + if(any(useq != round(useq))){ stop("all elements in useq must be integers.") } - + if(!is.null(pvector) & !(length(pvector) == 5)){ stop("pvector must contain 5 elements.") } - + if(!(is.logical(std.err))){ stop("std.err must be TRUE or FALSE.") } - + if(is.null(shift)){ shift <- min(x) } - + if (is.null(pvector)) { pvector <- rep(NA,5) s <- log(mean(x+0.5))-mean(log(x+0.5)) @@ -191,20 +191,20 @@ clonalSizeDistribution <- function(input.data, pvector[1] <- k pvector[2] <- k/mean(x) pvector[3] <- as.vector(quantile(x, 0.9)) - + xu <- x[x>=pvector[3]] initfgpd <- fgpd(xu, min(xu)-10^(-5)) pvector[4] <- initfgpd$mle[1] pvector[5] <- initfgpd$mle[2] } - + bulk <- lapply(seq_along(useq), function(idx,x,useq) x < useq[idx], x=x, useq=useq) tail <- lapply(seq_along(useq), function(idx,x,useq) x >= useq[idx], x=x, useq=useq) phiu <- lapply(seq_along(useq), function(idx,tail) mean(tail[[idx]]), tail=tail) - + gammfit <- list() gpdfit <- list() nllhu <- rep(NA, length(useq)) @@ -237,7 +237,7 @@ clonalSizeDistribution <- function(input.data, nllhu[i] <- tryCatch(expr = gammfit[[i]]$value + gpdfit[[i]]$value, error = function(err) NA) } - + bestfit <- which.min(nllhu) fit.out <- list(gammfit[[bestfit]], gpdfit[[bestfit]]) names(fit.out) <- c("bulk","tail") @@ -272,12 +272,12 @@ clonalSizeDistribution <- function(input.data, if(any(x != floor(x))){ stop("x must be an integer") } - + out <- rep(0, length(x)) - + up <- pgamma(x+1-shift, shape=shape, rate=rate) down <- pgamma(x-shift, shape=shape, rate=rate) - + if(!log){ b <- pgamma(thresh-shift, shape=shape, rate=rate) out[x < thresh] <- ((1-phiu)*(up-down)/b)[x < thresh] @@ -309,7 +309,7 @@ clonalSizeDistribution <- function(input.data, .ddiscgpd <- function(x, thresh, sigma, xi, phiu, log = FALSE){ up <- pgpd(x+1, u=thresh, sigmau=sigma, xi=xi) down <- pgpd(x, u=thresh, sigmau=sigma, xi=xi) - + if(!log){ phiu*(up-down) } else{ @@ -341,11 +341,11 @@ pdiscgpd <- function(q, thresh, sigma, xi, phiu){ .discgammanll <- function(param, dat, thresh, phiu, shift=0){ shape <- exp(param[1]) rate <- exp(param[2]) - + if(any(dat > thresh-1)){ warning("data must be less than the threshold") } - + ll <- log(.ddiscgamma(dat, shape, rate, thresh, phiu, shift)) - + sum(-ll) } @@ -353,7 +353,7 @@ pdiscgpd <- function(q, thresh, sigma, xi, phiu){ sigma <- exp(param[1]) xi <- param[2] ll <- log(.ddiscgpd(dat, thresh, sigma, xi, phiu)) - + sum(-ll) } @@ -379,12 +379,12 @@ pdiscgpd <- function(q, thresh, sigma, xi, phiu){ if(!(modelType %in% c("Spliced", "Desponds"))){ stop("modelType must be either \"Spliced\" or \"Desponds\".") } - + distances <- matrix(rep(0, length(fits)^2), nrow = length(fits)) if(!is.null(names(fits))){ rownames(distances) <- colnames(distances) <- names(fits) } - + for(i in seq_len((length(fits)-1))){ for(j in (i+1):length(fits)){ distances[i,j] <- .JS_dist(fits[[i]], @@ -401,7 +401,7 @@ pdiscgpd <- function(q, thresh, sigma, xi, phiu){ if(!(modelType %in% c("Spliced", "Desponds"))){ stop("modelType must be either \"Spliced\" or \"Desponds\".") } - + if(modelType == "Spliced"){ if(!all(c("x", "init", "useq", "nllhuseq", "nllh", "optim", "mle") %in% names(fit1))){ @@ -427,7 +427,7 @@ pdiscgpd <- function(q, thresh, sigma, xi, phiu){ sigmaq <- fit2$mle['sigma'] xip <- fit1$mle['xi'] xiq <- fit2$mle['xi'] - + out <- .JS_spliced(grid, shiftp, shiftq, phip, phiq, shapep, shapeq, ratep, rateq, threshp, threshq, sigmap, sigmaq, xip, xiq) @@ -457,45 +457,45 @@ pdiscgpd <- function(q, thresh, sigma, xi, phiu){ if(!is(grid, "numeric")){ stop("grid must be numeric.") } - + if(any(grid != round(grid))){ stop("all elements in grid must be integers.") } - + if(any(!is(c(shiftp, shiftq, phip, phiq, shapep, shapeq, ratep, rateq, threshp, threshq, sigmap, sigmaq, xip, xiq), "numeric"))){ stop("shiftp, shiftq, phip, phiq, shapep, shapeq, ratep, rateq, threshp, threshq, sigmap, sigmaq, xip, and xiq must be numeric.") } - + if(shiftp != round(shiftp) | shiftq != round(shiftq)){ stop("shiftp and shiftq must be integers.") } - + if(any(c(shapep, shapeq, ratep, rateq, sigmap, sigmaq) <= 0)){ - stop("shapep, shapeq, ratep, rateq, sigmap, and sigmaq must be + stop("shapep, shapeq, ratep, rateq, sigmap, and sigmaq must be greater than 0.") } - + if(any(c(phip, phiq) > 1) | any(c(phip, phiq) < 0)){ stop("phip and phiq must be in [0,1].") } - + if(ratep <= 0 | rateq <= 0){ stop("ratep and rateq must be greater than 0.") } - + if(shapep <= 0 | shapeq <= 0){ stop("shapep and shapeq must be greater than 0.") } - + if(threshp != round(threshp) | threshq != round(threshq)){ stop("threshp and threshq must be integers.") } - + K <- max(grid) - + P <- .ddiscgammagpd(min(grid):K, shape = shapep, rate = ratep, u=threshp, sigma = sigmap, xi = xip, phiu = phip, shift=shiftp, @@ -505,7 +505,7 @@ pdiscgpd <- function(q, thresh, sigma, xi, phiu){ P[adjp] <- dgpd(adjp+0.5, u=threshp, sigmau = sigmap, xi = xip, phiu = phip) } - + Q <- .ddiscgammagpd(min(grid):K, shape = shapeq, rate = rateq, u=threshq, sigma = sigmaq, xi = xiq, phiu = phiq, shift=shiftq, @@ -515,15 +515,15 @@ pdiscgpd <- function(q, thresh, sigma, xi, phiu){ Q[adjq] <- dgpd(adjq+0.5, u=threshq, sigmau = sigmaq, xi = xiq, phiu = phiq) } - + M <- 0.5*(P+Q) pzero <- which(P == 0) qzero <- which(Q == 0) - + sum1 <- sum2 <- rep(NA, length(grid)) sum1 <- P*(log(P) - log(M)) sum2 <- Q*(log(Q) - log(M)) - + if(length(intersect(pzero, qzero)) != 0){ sum1[intersect(pzero, qzero)] <- 0 sum2[intersect(pzero, qzero)] <- 0 @@ -534,7 +534,7 @@ pdiscgpd <- function(q, thresh, sigma, xi, phiu){ if(length(setdiff(qzero, pzero)) != 0){ sum2[setdiff(qzero, pzero)] <- 0 } - + out <- sqrt(0.5*(sum(sum1) + sum(sum2))) out } @@ -545,26 +545,26 @@ pdiscgpd <- function(q, thresh, sigma, xi, phiu){ if(!is(grid, "numeric")){ stop("grid must be numeric.") } - + if(any(grid != round(grid))){ stop("all elements in grid must be integers.") } - + if(any(!is(c(Cminp, Cminq, alphap, alphaq), "numeric"))){ stop("Cminp, Cminq, alphap, and alphaq must be numeric.") } - + if(Cminp != round(Cminp) | Cminq != round(Cminq)){ stop("Cminp and Cminq must be integers.") } - + if(alphap <= 0 | alphaq <= 0){ stop("alphap and alphaq must be greater than 0.") } - + lower <- min(grid) upper <- max(grid) - + out <- adaptIntegrate(.eval_desponds, lowerLimit = lower, upperLimit = upper, Cminp = Cminp, Cminq = Cminq, @@ -591,40 +591,40 @@ pdiscgpd <- function(q, thresh, sigma, xi, phiu){ if(!is(x, "numeric")){ stop("x must be numeric.") } - + if(!is(shift, "numeric")){ stop("shift must be numeric.") } - + if(any(x != floor(x))){ stop("x must be an integer") } - + if(shift != round(shift)){ stop("shift must be an integer.") } - + if(any(c(shape, rate, sigma) <= 0)){ stop("shape, rate, and sigma must all be positive.") } - + if(!is.null(phiu)){ if(phiu < 0 | phiu > 1){ stop("phiu must be in [0,1].") } } - + if(is.null(phiu)){ phiu <- 1-.pdiscgamma(u-1, shape=shape, rate=rate, thresh=Inf, phiu = 0, shift=shift) } - + out <- rep(NA, length(x)) - + if(sum(x>=u) != 0){ out[x>=u] <- .ddiscgpd(x[x>=u], u, sigma, xi, phiu, log=log) } - + if(sum(x% mutate_if(is.numeric, list(~ ./sum(.))) @@ -51,28 +51,28 @@ percentAA <- function(input.data, melt.res$group <- names(input.data)[x] melt.res }) -> res.list - + mat_melt <- do.call(rbind, res.list) + if (exportTable == TRUE) { + return(mat_melt) + } if(!is.null(order.by)) { mat_melt <- .ordering.function(vector = order.by, - group.by = "variable", + group.by = "variable", mat_melt) } - + plot <- ggplot(mat_melt, aes(x=as.factor(variable), y = value, fill=AA)) + geom_bar(stat = "identity", position="fill", lwd= 0.25, color = "black") + - scale_fill_manual(name = "Amino Acid", + scale_fill_manual(name = "Amino Acid", values = rev(.colorizer(palette,21))) + xlab("Amino Acid Residues") + ylab("Relative Percent") + - theme_classic() + + theme_classic() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) if(length(res.list) > 1) { plot <- plot + facet_grid(group~.) } - if (exportTable == TRUE) { - return(mat_melt) - } return(plot) -} - +} + diff --git a/R/positionalEntropy.R b/R/positionalEntropy.R index af2483b0..eec4b507 100644 --- a/R/positionalEntropy.R +++ b/R/positionalEntropy.R @@ -1,29 +1,29 @@ #' Examining the diversity of amino acids by position #' -#' This function the diversity amino acids along the residues -#' of the CDR3 amino acid sequence. Please see -#' \code{\link{clonalDiversity}} for more information on -#' the underlying methods for diversity/entropy calculations. -#' Positions without variance will have a value reported as 0 +#' This function the diversity amino acids along the residues +#' of the CDR3 amino acid sequence. Please see +#' \code{\link{clonalDiversity}} for more information on +#' the underlying methods for diversity/entropy calculations. +#' Positions without variance will have a value reported as 0 #' for the purposes of comparison. #' #' @examples #' #Making combined contig data -#' combined <- combineTCR(contig_list, -#' samples = c("P17B", "P17L", "P18B", "P18L", +#' combined <- combineTCR(contig_list, +#' samples = c("P17B", "P17L", "P18B", "P18L", #' "P19B","P19L", "P20B", "P20L")) -#' positionalEntropy(combined, -#' chain = "TRB", +#' positionalEntropy(combined, +#' chain = "TRB", #' aa.length = 20) -#' @param input.data The product of \code{\link{combineTCR}}, +#' @param input.data The product of \code{\link{combineTCR}}, #' \code{\link{combineBCR}}, or \code{\link{combineExpression}} #' @param chain "TRA", "TRB", "TRG", "TRG", "IGH", "IGL" #' @param group.by The variable to use for grouping #' @param order.by A vector of specific plotting order or "alphanumeric" #' to plot groups in order -#' @param aa.length The maximum length of the CDR3 amino acid sequence. -#' @param method The method to calculate the entropy/diversity - +#' @param aa.length The maximum length of the CDR3 amino acid sequence. +#' @param method The method to calculate the entropy/diversity - #' "shannon", "inv.simpson", "norm.entropy" #' @param exportTable Returns the data frame used for forming the graph #' @param palette Colors to use in visualization - input any \link[grDevices]{hcl.pals} @@ -32,38 +32,38 @@ #' @export #' @concept Summarize_Repertoire #' @return ggplot of line graph of diversity by position -positionalEntropy <- function(input.data, - chain = "TRB", - group.by = NULL, +positionalEntropy <- function(input.data, + chain = "TRB", + group.by = NULL, order.by = NULL, aa.length = 20, method = "norm.entropy", - exportTable = FALSE, + exportTable = FALSE, palette = "inferno") { - + if(method %!in% c("shannon", "inv.simpson", "norm.entropy")) { stop("Please select a compatible method.") } sco <- is_seurat_object(input.data) | is_se_object(input.data) - input.data <- .data.wrangle(input.data, - group.by, - .theCall(input.data, "CTaa", check.df = FALSE), + input.data <- .data.wrangle(input.data, + group.by, + .theCall(input.data, "CTaa", check.df = FALSE), chain) cloneCall <- .theCall(input.data, "CTaa") - + if(!is.null(group.by) & !sco) { input.data <- .groupList(input.data, group.by) } - + #Selecting Diversity Function diversityFunc <- switch(method, "norm.entropy" = .normentropy, "inv.simpson" = .invsimpson, "shannon" = .shannon, stop("Invalid method provided")) - + aa.count.list <- .aa.counter(input.data, "CTaa", aa.length) - + lapply(aa.count.list, function(x){ diversity <- sapply(x[,2:ncol(x)], diversityFunc) diversity[is.nan(diversity)] <- 0 @@ -72,28 +72,28 @@ positionalEntropy <- function(input.data, mat <- do.call(rbind, group.results) mat_melt <- suppressMessages(melt(mat)) - + if (exportTable == TRUE) { + return(mat_melt) + } + if(!is.null(order.by)) { mat_melt <- .ordering.function(vector = order.by, - group.by = "Var1", + group.by = "Var1", mat_melt) } - + plot <- ggplot(mat_melt, aes(x=Var2, y = value, group= Var1, color = Var1)) + geom_line(stat = "identity") + - geom_point() + - scale_color_manual(name = "Groups", + geom_point() + + scale_color_manual(name = "Groups", values = rev(.colorizer(palette,nrow(mat)))) + xlab("Amino Acid Residues") + ylab("Relative Diversity") + - theme_classic() + + theme_classic() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) - if (exportTable == TRUE) { - return(mat_melt) - } return(plot) } - + diff --git a/R/positionalProperty.R b/R/positionalProperty.R index 399dde62..5872564e 100644 --- a/R/positionalProperty.R +++ b/R/positionalProperty.R @@ -1,41 +1,41 @@ #' Examining the mean property of amino acids by position #' -#' This function calculates the mean selected property for -#' amino acids along the residues of the CDR3 amino acid sequence. -#' The ribbon surrounding the individual line represents the 95% +#' This function calculates the mean selected property for +#' amino acids along the residues of the CDR3 amino acid sequence. +#' The ribbon surrounding the individual line represents the 95% #' confidence interval. -#' +#' #' @details #' More information for the individual methods can be found at the following citations: -#' +#' #' \strong{Atchley:} \href{https://pubmed.ncbi.nlm.nih.gov/15851683/}{citation} -#' +#' #' \strong{Kidera:} \href{https://link.springer.com/article/10.1007/BF01025492}{citation} -#' +#' #' \strong{stScales:} \href{https://pubmed.ncbi.nlm.nih.gov/19373543/}{citation} -#' +#' #' \strong{tScales:} \href{https://www.sciencedirect.com/science/article/pii/S0022286006006314?casa_token=uDj97DwXDDEAAAAA:VZfahldPRwU1WObySJlohudtMSDwF7nJSUzcEGwPhvkY13ALLKhs08Cf0_FyyfYZjxJlj-fVf0SM}{citation} -#' +#' #' \strong{VHSE:} \href{https://pubmed.ncbi.nlm.nih.gov/15895431/}{citation} -#' +#' #' #' @examples #' #Making combined contig data -#' combined <- combineTCR(contig_list, -#' samples = c("P17B", "P17L", "P18B", "P18L", +#' combined <- combineTCR(contig_list, +#' samples = c("P17B", "P17L", "P18B", "P18L", #' "P19B","P19L", "P20B", "P20L")) -#' positionalProperty(combined, +#' positionalProperty(combined, #' chain = "TRB", -#' method = "Atchley", +#' method = "Atchley", #' aa.length = 20) -#' @param input.data The product of \code{\link{combineTCR}}, +#' @param input.data The product of \code{\link{combineTCR}}, #' \code{\link{combineBCR}}, or \code{\link{combineExpression}} #' @param chain "TRA", "TRB", "TRG", "TRG", "IGH", "IGL" #' @param group.by The variable to use for grouping #' @param order.by A vector of specific plotting order or "alphanumeric" #' to plot groups in order -#' @param aa.length The maximum length of the CDR3 amino acid sequence. +#' @param aa.length The maximum length of the CDR3 amino acid sequence. #' @param method The method to calculate the property - "Atchley", "Kidera", #' "stScales", "tScales", or "VHSE" #' @param exportTable Returns the data frame used for forming the graph @@ -43,35 +43,35 @@ #' @import ggplot2 #' @importFrom stringr str_split #' @importFrom stats qt -#' @importFrom dplyr %>% summarise n group_by +#' @importFrom dplyr %>% summarise n group_by #' @export #' @concept Summarize_Repertoire #' @return ggplot of line graph of diversity by position #' @author Florian Bach, Nick Borcherding -positionalProperty <- function(input.data, - chain = "TRB", - group.by = NULL, +positionalProperty <- function(input.data, + chain = "TRB", + group.by = NULL, order.by = NULL, aa.length = 20, method = "Atchley", - exportTable = FALSE, + exportTable = FALSE, palette = "inferno") { options( dplyr.summarise.inform = FALSE ) if(method %!in% c("Atchley", "Kidera", "stScales", "tScales", "VHSE")) { stop("Please select a compatible method.") } sco <- is_seurat_object(input.data) | is_se_object(input.data) - input.data <- .data.wrangle(input.data, - group.by, - .theCall(input.data, "CTaa", check.df = FALSE), + input.data <- .data.wrangle(input.data, + group.by, + .theCall(input.data, "CTaa", check.df = FALSE), chain) cloneCall <- .theCall(input.data, "CTaa") - + if(!is.null(group.by) & !sco) { input.data <- .groupList(input.data, group.by) } - + #Selecting Property Function propertyFunc <- switch(method, "Atchley" = .af.ref, @@ -80,12 +80,12 @@ positionalProperty <- function(input.data, "tScales" = .tscales.ref, "VHSE" = .vhse.ref, stop("Invalid method provided")) - + #Getting AA Counts aa.count.list <- .aa.counter(input.data, cloneCall, aa.length) - + aa.count.list <- lapply(aa.count.list, function(x)subset(x, x$AA %in% c("A", "R", "N", "D", "C", "Q", "E", "G", "H", "I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V"))) - + #Calculating properties and melting data lapply(seq_along(aa.count.list), function(x) { lapply(seq_len(nrow(aa.count.list[[x]]))[-1], function(y) { @@ -98,17 +98,17 @@ positionalProperty <- function(input.data, results }) -> output.values output.values <- unlist(output.values) - df <- data.frame(group = names(output.values), + df <- data.frame(group = names(output.values), value = unlist(output.values)) - + if(nrow(df) == 0) { #For only NA positions - summary <- data.frame(mean = rep(0, dim(propertyFunc)[2]), + summary <- data.frame(mean = rep(0, dim(propertyFunc)[2]), ci_lower = rep(0, dim(propertyFunc)[2]), ci_upper = rep(0, dim(propertyFunc)[2])) - + } else { - summary <- df %>% - group_by(group) %>% + summary <- df %>% + group_by(group) %>% summarise(mean = mean(value), sd = sd(value), # Standard deviation n = n(), # Number of observations per group @@ -116,7 +116,7 @@ positionalProperty <- function(input.data, ci_lower = ifelse(n > 1, mean - qt(0.975, n-1) * se, mean), ci_upper = ifelse(n > 1, mean + qt(0.975, n-1) * se, mean)) %>% as.data.frame() - + summary <- summary[,c("mean", "ci_lower", "ci_upper")] } summary$property <- colnames(propertyFunc) @@ -129,35 +129,35 @@ positionalProperty <- function(input.data, mat <- bind_rows(property.calculations, .id = "group") mat$position <- paste0("pos", mat$position) mat$position <- factor(mat$position, levels = str_sort(unique(mat$position), numeric = TRUE)) - + if (exportTable == TRUE) { + return(mat) + } + if(!is.null(order.by)) { mat <- .ordering.function(vector = order.by, - group.by = "group", + group.by = "group", mat) } - - plot <- ggplot(mat, aes(x = position, - y = mean, - group = group, + + plot <- ggplot(mat, aes(x = position, + y = mean, + group = group, color = group)) + geom_ribbon(aes(ymin = ci_lower, ymax = ci_upper, fill = group), alpha = 0.5, lwd = 0) + geom_line(stat = "identity", alpha = 0.5) + - geom_point() + - scale_color_manual(name = "Group", + geom_point() + + scale_color_manual(name = "Group", values = .colorizer(palette,length(unique(mat$group)))) + scale_fill_manual(values = .colorizer(palette,length(unique(mat$group)))) + xlab("Amino Acid Residues") + ylab("Mean Values") + - facet_grid(property~.) + - guides(fill = "none", - color = guide_legend(override.aes=list(fill=NA))) + - theme_classic() + + facet_grid(property~.) + + guides(fill = "none", + color = guide_legend(override.aes=list(fill=NA))) + + theme_classic() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) - if (exportTable == TRUE) { - return(mat) - } return(plot) - + } ############################### diff --git a/R/vizGenes.R b/R/vizGenes.R index 2f0002e1..0c607da4 100644 --- a/R/vizGenes.R +++ b/R/vizGenes.R @@ -1,36 +1,36 @@ #' Visualizing the distribution of gene usage -#' -#' This function will allow for the visualizing the distribution +#' +#' This function will allow for the visualizing the distribution #' of the any VDJ and C gene of the TCR or BCR using heatmap or -#' bar chart. This function requires assumes two chains were used in -#' defining clone, if not, it will default to the only chain +#' bar chart. This function requires assumes two chains were used in +#' defining clone, if not, it will default to the only chain #' present regardless of the chain parameter. #' #' @examples #' #Making combined contig data -#' combined <- combineTCR(contig_list, -#' samples = c("P17B", "P17L", "P18B", "P18L", +#' combined <- combineTCR(contig_list, +#' samples = c("P17B", "P17L", "P18B", "P18L", #' "P19B","P19L", "P20B", "P20L")) -#' -#' vizGenes(combined, +#' +#' vizGenes(combined, #' x.axis = "TRBV", #' y.axis = NULL, #' plot = "heatmap") #' -#' @param input.data The product of \code{\link{combineTCR}}, +#' @param input.data The product of \code{\link{combineTCR}}, #' \code{\link{combineBCR}}, or \code{\link{combineExpression}}. -#' @param plot The type of plot to return - heatmap or barplot. -#' @param x.axis Gene segments to separate the x-axis, such as "TRAV", +#' @param plot The type of plot to return - heatmap or barplot. +#' @param x.axis Gene segments to separate the x-axis, such as "TRAV", #' "TRBD", "IGKJ". -#' @param y.axis Variable to separate the y-axis, can be both categorical +#' @param y.axis Variable to separate the y-axis, can be both categorical #' or other gene gene segments, such as "TRAV", "TRBD", "IGKJ". #' @param group.by Variable in which to group the diversity calculation. -#' @param order Categorical variable to organize the x-axis, either +#' @param order Categorical variable to organize the x-axis, either #' "gene" or "variance" -#' @param scale Converts the individual count of genes to proportion using -#' the total respective repertoire size +#' @param scale Converts the individual count of genes to proportion using +#' the total respective repertoire size #' @param exportTable Returns the data frame used for forming the graph. -#' @param palette Colors to use in visualization - input any +#' @param palette Colors to use in visualization - input any #' \link[grDevices]{hcl.pals}. #' @import ggplot2 #' @importFrom stringr str_split @@ -40,18 +40,18 @@ #' @concept Visualizing_Clones #' @return ggplot bar diagram or heatmap of gene usage -vizGenes <- function(input.data, +vizGenes <- function(input.data, x.axis = "TRBV", y.axis = NULL, - group.by = NULL, - plot = "heatmap", + group.by = NULL, + plot = "heatmap", order = "gene", - scale = TRUE, + scale = TRUE, exportTable = FALSE, palette = "inferno") { element.names <- NULL sco <- is_seurat_object(input.data) | is_se_object(input.data) - + #Extracting group.by in case of null if(!grepl("TRA|TRB|TRG|TRD|IGH|IGL|IGK", y.axis) && !is.null(y.axis)) { group.by <- y.axis @@ -60,12 +60,12 @@ vizGenes <- function(input.data, if(is.null(group.by) & sco) { group.by <- "ident" } - + input.data <- .list.input.return(input.data, split.by = group.by) if(!is.null(group.by) & !sco) { input.data <- .groupList(input.data, group.by) } - + input.data <- .bound.input.return(input.data) #Parsing x.axis if gene used if (x.axis %!in% colnames(input.data)) { @@ -74,7 +74,7 @@ vizGenes <- function(input.data, colnames(input.data)[ncol(input.data)] <- x.axis ######## Check if need this } } - + #Parsing y.axis if gene used if (any(y.axis %!in% colnames(input.data)) | !is.null(y.axis)) { if (grepl("TRA|TRB|TRG|TRD|IGH|IGL|IGK", y.axis)) { @@ -86,7 +86,7 @@ vizGenes <- function(input.data, } else { y.axis <- "element.names" } - + #Filtering steps input.data[input.data == "NA"] <- NA input.data[input.data == ""] <- NA @@ -96,7 +96,7 @@ vizGenes <- function(input.data, if(!is.null(x.axis) & any(is.na(input.data[,c(x.axis)]))) { input.data <- subset(input.data, !is.na(input.data[,c(x.axis)])) } - + if (!scale) { col.lab <- "Total n" values <- "count" @@ -106,7 +106,7 @@ vizGenes <- function(input.data, } #Ensure only x.axis match is displayed input.data <- input.data[grep(x.axis, input.data[,x.axis]),] - + #Calculating the summary values if (!is.null(y.axis) && y.axis != "element.names") { mat <- input.data %>% @@ -114,7 +114,7 @@ vizGenes <- function(input.data, summarise(count = n()) %>% ungroup() %>% group_by(element.names) %>% - mutate(total = sum(count), + mutate(total = sum(count), proportion = count / total) %>% as.data.frame() %>% na.omit() @@ -122,18 +122,18 @@ vizGenes <- function(input.data, mat$n <- mat[,values] mat <- mat %>% group_by(y.axis, x.axis) %>% - mutate(varcount = sum(n), + mutate(varcount = sum(n), sd = sd(n, na.rm = TRUE), mean = mean(n)) %>% - as.data.frame() - + as.data.frame() + } else { mat <- input.data %>% group_by(element.names, input.data[,x.axis]) %>% summarise(count = n()) %>% ungroup() %>% group_by(element.names) %>% - mutate(total = sum(count), + mutate(total = sum(count), proportion = count / total) %>% as.data.frame() %>% na.omit() @@ -141,58 +141,58 @@ vizGenes <- function(input.data, mat$n <- mat[,values] mat <- mat %>% group_by(y.axis, x.axis) %>% - mutate(varcount = sum(n), + mutate(varcount = sum(n), sd = sd(n, na.rm = TRUE), mean = mean(n)) %>% - as.data.frame() + as.data.frame() } - - + + if (order == "variance") { varOrder <- unique(mat[order(mat$varcount, decreasing = TRUE),"x.axis"]) } else { varOrder <- str_sort(unique(mat$x.axis), numeric = TRUE) } - mat[,"x.axis"] <- factor(mat[,"x.axis"], levels = varOrder) + mat[,"x.axis"] <- factor(mat[,"x.axis"], levels = varOrder) + if (exportTable == TRUE) { + return(mat) + } if (plot == "barplot") { mat2 <- unique(mat[,c("x.axis", "y.axis", "sd", "mean")]) - plot <- ggplot(mat2, aes(x=x.axis, y = mean)) + - geom_bar(stat = "identity", color = "black", lwd = 0.25) + + plot <- ggplot(mat2, aes(x=x.axis, y = mean)) + + geom_bar(stat = "identity", color = "black", lwd = 0.25) + geom_errorbar(aes(ymin=mean, ymax=mean+sd), width=.2, - position=position_dodge(.9)) + - theme_classic() + + position=position_dodge(.9)) + + theme_classic() + theme(axis.title.x = element_blank(), #remove titles - axis.title.y = element_blank(), + axis.title.y = element_blank(), axis.ticks.x = element_blank(), #removes ticks - axis.text.x = element_text(angle = 90, - vjust = 0.5, hjust=1, size=rel(0.5))) + + axis.text.x = element_text(angle = 90, + vjust = 0.5, hjust=1, size=rel(0.5))) + facet_grid(y.axis~.) } else if (plot == "heatmap") { - - plot <- ggplot(mat, aes(x=x.axis, y = y.axis)) + - geom_tile(aes(fill =log(n)), color = "black", lwd = 0.25) + - theme_classic() + - labs(fill = paste0("Log(",col.lab, ")")) + + + plot <- ggplot(mat, aes(x=x.axis, y = y.axis)) + + geom_tile(aes(fill =log(n)), color = "black", lwd = 0.25) + + theme_classic() + + labs(fill = paste0("Log(",col.lab, ")")) + theme(axis.title.x = element_blank(), #remove titles axis.ticks.x = element_blank(), #removes ticks - axis.text.x = element_text(angle = 90, - vjust = 0.5, hjust=1, size=rel(0.5)), - axis.title.y = element_blank(), - axis.text.y = element_text(size=rel(0.5))) + + axis.text.x = element_text(angle = 90, + vjust = 0.5, hjust=1, size=rel(0.5)), + axis.title.y = element_blank(), + axis.text.y = element_text(size=rel(0.5))) + scale_fill_gradientn(colors = .colorizer(palette,5)) } - if (exportTable == TRUE) { - return(mat) - } return(plot) } #Parsing the CTgene .select.gene <- function(df, position, label) { - chains <- c("TRAV" = 1, "TRBV" = 1, "TRGV" = 1, "TRDV" = 1, + chains <- c("TRAV" = 1, "TRBV" = 1, "TRGV" = 1, "TRDV" = 1, "IGHV" = 1, "IGLV" = 1, "IGKV" = 1, "TRAJ" = 2, "TRGJ" = 2, "IGKJ" = 2, "IKLJ" = 2, - "TRBD" = 2, "TRDD" = 2, "IGHV" = 2, + "TRBD" = 2, "TRDD" = 2, "IGHV" = 2, "TRBJ" = 3, "TRDJ" = 2, "IGHJ" = 3) chain.poisiton <- chains[position] if(substring(position,3,3) %in% c("A", "G", "H")) { @@ -200,7 +200,7 @@ vizGenes <- function(input.data, } else { chain.gene <- str_split(df[,"CTgene"], "_", simplify = TRUE)[,2] } - genes <- str_split(chain.gene, "[.]", simplify = TRUE)[,chain.poisiton] + genes <- str_split(chain.gene, "[.]", simplify = TRUE)[,chain.poisiton] df[,label] <- NA df[,label] <- genes return(df)