Merge pull request #282 from Qile0317/v2

Attempted fix for the segfaults reported in issues #281 and #279
BorchLab · Dec 2, 2023 · a30dec0 · a30dec0
2 parents 7890fdb + e4e5c7b
commit a30dec0
Show file tree

Hide file tree

Showing 14 changed files with 69 additions and 63 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -9,4 +9,4 @@
 ^docs$
 ^pkgdown$
 ^index\.md$
-^dev\.R$
+^qile$
diff --git a/.gitignore b/.gitignore
@@ -7,4 +7,4 @@ local_tests.R
 docs
 vignettes/articles/scRep_example_full.rds
 .vscode
-dev.R
+qile
diff --git a/NEWS.md b/NEWS.md
@@ -33,7 +33,7 @@
 * Added support for Immcantation pipeline, .json, Omniscope, and MiXCR formats for ```loadContigs()```
 * Made GitHub.io website for support/vignettes/FAQ
 * Restructured NEWS Tracking
-* Added testhat for all exported and internal functions
+* Added testthat for all exported and internal functions
 
 ## DEPRECATED AND DEFUNCT
 
@@ -58,6 +58,7 @@
 * exportClones to clonalNetwork() to isolate clones shared across identities.
 
 ## UNDERLYING CHANGES
+
 * Fix issue with clonalDiversity() and skipping boots
 * Fixing underlying assumptions with clonalBias()
 * Adding reads variable to parseAIRR
@@ -233,7 +234,7 @@
 
 # scRepertoire VERSION 0.99.16
 
-* Added getCirclize() 
+* Added ```getCirclize()```
 
 
 # scRepertoire VERSION 0.99.15
@@ -245,7 +246,7 @@
 
 * Removed bracket from indexing function
 
-#scRepertoire VERSION 0.99.13
+# scRepertoire VERSION 0.99.13
 
 * Added exportTable to remaining viz functions
 * Modified morisita index to correct error
@@ -264,7 +265,7 @@
 
 # scRepertoire VERSION 0.99.9
 
-* Bioconductor had no love - changed the Seurat package to imports instead of required, see if that will address the compiling issue that results in a killed: 9 error. 
+* Bioconductor had no love - changed the Seurat package to imports instead of required, see if that will address the compiling issue that results in a killed: 9 error.
 
 # scRepertoire VERSION 0.99.8
 
@@ -328,7 +329,7 @@ DEPRECATED AND DEFUNCT
 
 SIGNIFICANT USER-VISIBLE CHANGES
 
-* Added support for SingleCellExperiement format.
+* Added support for ```SingleCellExperiment``` format.
 
 
 DEPRECATED AND DEFUNCT

diff --git a/R/clonalSizeDistribution.R b/R/clonalSizeDistribution.R
@@ -91,10 +91,10 @@ clonalSizeDistribution <- function(input.data,
     data <- data.frame(table(data[,cloneCall]), 
                        stringsAsFactors = FALSE)
     colnames(data) <- c(cloneCall, "Freq")
-    for (y in seq_along(unique_df)){
+    for (y in seq_along(unique_df)){ # here is the first speed bottleneck that has speedup potential
       clonotype.y <- Con.df$clonotype[y]
-      location.y <- which(clonotype.y == data[,cloneCall])
-      Con.df[y,i+1] <- data[location.y[1],"Freq"]
+      location.y <- which(clonotype.y == data[,cloneCall]) # some pre-indexing likely possible here to shave ~5s
+      Con.df[y,i+1] <- data[location.y[1],"Freq"] # assignment likely could be sped up by constant factor to shave ~8s
     }
   }
   colnames(Con.df)[2:(length(input.data)+1)] <- names(input.data)

diff --git a/R/exportClones.R b/R/exportClones.R
@@ -111,7 +111,7 @@ exportClones <- function(input.data,
 
     mat <- list()
 
-    # Define a function for processing each row
+    # Define a function for processing each row - main speed bottleneck
     .process_row <- function(row) {
       # Split gene, amino acid, and nucleotide columns
       genes <- str_split(row$CTgene, "_", simplify = TRUE)
@@ -150,7 +150,7 @@ exportClones <- function(input.data,
       chain1_gene <- .sort_gene_calls(chain1_gene)
       chain2_gene <- .sort_gene_calls(chain2_gene)
 
-      # Create the formatted data frame
+      # Create the formatted data frame - smaller speed bottleneck of ~10 sec
       tmp.out <- data.frame(
         cell_id = row$barcode,
         locus = c(locus1, locus2),
@@ -176,7 +176,7 @@ exportClones <- function(input.data,
 
     # Process each row and store the results in the list
     for (i in seq_len(nrow(input.data))) {
-      mat[[i]] <- .process_row(input.data[i, ])
+      mat[[i]] <- .process_row(input.data[i, ]) # main speed bottleneck (see above)
     }
   mat <- do.call(rbind, mat)
   return(mat)

diff --git a/R/utils.R b/R/utils.R
@@ -291,7 +291,7 @@ is_seurat_or_se_object <- function(obj) {
 
 #Assigning positions for BCR contig data
 #Now assumes lambda over kappa in the context of only 2 light chains
-#' @author Gloria Kraus, Nick Bormann, Nick Borcherding
+#' @author Gloria Kraus, Nick Bormann, Nick Borcherding, Qile Yang
 #' @keywords internal
 .parseBCR <- function (Con.df, unique_df, data2) {
   barcodeIndex <- rcppConstructBarcodeIndex(unique_df, data2$barcode)

diff --git a/README.md b/README.md
@@ -11,29 +11,33 @@
 <img align="right" src="https://www.borch.dev/uploads/screpertoire/reference/figures/screpertoire_hex.png" width="305" height="352">
 
 ### Introduction
-Single-cell sequencing is an emerging technology in the field of immunology and oncology that allows researchers to couple RNA quantification and other modalities, like immune cell receptor profiling at the level of an individual cell. A number of workflows and software packages have been created to process and analyze single-cell transcriptomic data. These packages allow users to take the vast dimensionality of the data generated in single-cell-based experiments and distill the data into novel insights. Unlike the transcriptomic field, there is a lack of options for software that allow for single-cell immune receptor profiling. Enabling users to easily combine RNA and immune profiling, scRepertoire was built to process data derived from the 10x Genomics Chromium Immune Profiling for both T-cell receptor (TCR) and immunoglobulin (Ig) enrichment workflows and subsequently interacts with the popular Seurat R package. 
+Single-cell sequencing is an emerging technology in the fields of immunology and oncology that allows researchers to couple RNA quantification and other modalities, like immune cell receptor profiling, at the level of an individual cell. A number of workflows and software packages have been created to process and analyze single-cell transcriptomic data. These packages allow users to take the vast dimensionality of the data generated in single-cell-based experiments and distill the data into novel insights. Unlike in the transcriptomic field, there is a lack of software options that allow for single-cell immune receptor profiling. Enabling users to easily combine RNA and immune profiling, scRepertoire was built to process data derived from the 10x Genomics Chromium Immune Profiling for both T-cell receptor (TCR) and immunoglobulin (Ig) enrichment workflows and subsequently interacts with the popular Seurat R package.
 
 ### Applying Deep Learning to VDJ data
-scRepertoire is compatible and integrated with the R packages [Trex](https://github.com/ncborcherding/Trex) for deep-learning-based autoencoding of the T cell receptor and [Ibex](https://github.com/ncborcherding/Ibex) for the B cell receptor. 
+scRepertoire is compatible and integrated with the R packages [Trex](https://github.com/ncborcherding/Trex) for deep-learning-based autoencoding of the T cell receptor and [Ibex](https://github.com/ncborcherding/Ibex) for the B cell receptor.
 
 ## Working with scRepertoire
 
 scRepertoire has a comprehensive [website](https://www.borch.dev/uploads/screpertoire/) for detailed tutorials and function information.
 
 ## Installation
 
-#### Installation of Master Branch
-```
+### Installation of Master Branch
+
+```R
 devtools::install_github("ncborcherding/scRepertoire")
 ```
-#### Most up-to-date version
-```
+
+### Most up-to-date version
+
+```R
 devtools::install_github("ncborcherding/scRepertoire@dev")
 ```
 
-#### Legacy Version 1
+### Legacy Version 1
 If you are looking for version 1 of scRepertoire - there is a static version available below:
-```
+
+```R
 devtools::install_github("ncborcherding/scRepertoire@v1")
 ```
 
@@ -60,5 +64,4 @@ be extremely helpful.
 Feel free to use, edit, modify scRepertoire, but if you do, please cite the [manuscript](https://f1000research.com/articles/9-47/v1). If you are building your own tool based on scRepertoire, reach out, I am happy to help and make things compatible.
 
 ## Contact
-Questions, comments, suggestions, please feel free to contact Nick Borcherding via this repository, [email](mailto:[email protected]), or using [twitter](https://twitter.com/theHumanBorch). 
-
+Questions, comments, suggestions, please feel free to contact Nick Borcherding via this repository, [email](mailto:[email protected]), or using [twitter](https://twitter.com/theHumanBorch).
diff --git a/inst/WORDLIST b/inst/WORDLIST
@@ -30,7 +30,6 @@ Contig
 Contigs
 Corvino
 Drs
-Embeddings
 FilteringMulti
 Francesco
 GSE
@@ -41,6 +40,7 @@ IGH
 IGKJ
 IGL
 Ig
+Immcantation
 Inv
 JC
 JSON
@@ -107,11 +107,9 @@ alluvialClones
 alluvialClonotype
 alluvialClonotypes
 autoencoder
-autoencoders
 autoencoding
 barcode
 barcodes
-bioc
 bioconductor
 bronchoalveolar
 cav
@@ -142,7 +140,6 @@ cloneCall
 cloneSize
 cloneSizes
 cloneType
-cloneTypes
 clonesizeDistribution
 clonotype
 clonotypeBias
@@ -205,7 +202,6 @@ hammingCompare
 hcl
 highlightClones
 highlightClonotypes
-https
 hypermutation
 iNEXT
 iedb
@@ -233,24 +229,20 @@ migr
 migra
 morisita
 multisystem
-mumosa
 na
 nt
 nucleotides
 occupiedClonotype
 occupiedRepertoire
 occupiedscRepertoire
-paramater
 parseAIRR
 parseBCR
 parseTCR
 percentAA
 percentGenes
 percentKmer
 percentVJ
-phateR
 powerTCR
-pseudotime
 quantContig
 quantifications
 rda
@@ -260,7 +252,6 @@ relevel
 removeMulti
 removeNA
 repo
-rescaling
 runIbex
 runTrex
 sc
@@ -280,6 +271,7 @@ subsetClones
 subtype
 subtypes
 summarise
+testthat
 theCall
 trackable
 tran

diff --git a/src/aaKmers.cpp b/src/aaKmers.cpp
@@ -24,7 +24,7 @@ class AaKmerCounter {
     int k;
     unsigned long int mask;
     std::unordered_map<char, unsigned long int> aaIndexMap;
-    std::vector<long double> bins;
+    std::vector<double> bins;
 
     std::unordered_map<unsigned long int, int> toAaUIntKmerMap(const std::vector<std::string>& motifs) {
         std::unordered_map<unsigned long int, int> map;
@@ -59,12 +59,13 @@ class AaKmerCounter {
         k = _k;
         mask = (unsigned long int) ((1 << (_k * 5)) - 1);
         aaUIntKmerMap = toAaUIntKmerMap(motifs);
-        bins = std::vector<long double> (motifs.size(), 0.0);
+        bins = std::vector<double> (motifs.size(), 0.0);
     }
 
     void countKmers(const std::vector<std::string>& seqs) {
         for (std::string seq : seqs) {
-            if ((int) seq.size() < k) {
+            int n = (int) seq.size();
+            if (n < k) {
                 continue;
             }
 
@@ -76,7 +77,7 @@ class AaKmerCounter {
                 updateSkip(skip, seq[i]);
             }
 
-            for (int i = (k - 1); i < (int) seq.size(); i++) {
+            for (int i = (k - 1); i < n; i++) {
                 kmer = ((kmer << 5) & mask) | toAaIndex(seq[i]);
                 updateSkip(skip, seq[i]);
                 if (skip == 0) {
@@ -86,7 +87,7 @@ class AaKmerCounter {
         }
     }
 
-    std::vector<long double> getCounts() {
+    std::vector<double> getCounts() {
         return bins;
     }
 };
@@ -95,20 +96,21 @@ class AaKmerCounter {
 Rcpp::NumericVector rcppGetAaKmerPercent(
     const std::vector<std::string>& seqs, const std::vector<std::string>& motifs, const int k
 ) {
+    int numKmers = (int) motifs.size();
 
     AaKmerCounter counter = AaKmerCounter(motifs, k);
     counter.countKmers(seqs);
-    std::vector<long double> bins = counter.getCounts();
+    std::vector<double> bins = counter.getCounts();
 
-    long double binSum = scRepHelper::sum(bins);
+    double binSum = scRepHelper::sum(bins);
     if (binSum == 0.0) { // pretty sure this can only happen if there aren't any valid seqs?
-        return Rcpp::NumericVector (motifs.size(), R_NaReal);
+        return Rcpp::NumericVector (numKmers, R_NaReal);
     }
 
     double scaleFactor = 1 / binSum;
-    for (int i = 0; i < (int) motifs.size(); i++) {
+    for (int i = 0; i < numKmers; i++) {
         bins[i] *= scaleFactor;
     }
 
-    return scRepHelper::convertZerosToNA(bins, motifs.size());
+    return scRepHelper::convertZerosToNA(bins, numKmers);
 }
diff --git a/src/constructConDfAndParseBCR.cpp b/src/constructConDfAndParseBCR.cpp
@@ -4,6 +4,7 @@
 #include <Rcpp.h>
 #include <string>
 #include <vector>
+#include <unordered_map>
 #include "scRepHelper.h"
 
 #define BarcodeIndciesMap std::unordered_map<std::string, std::vector<int>>

diff --git a/src/constructConDfAndparseTCR.cpp b/src/constructConDfAndparseTCR.cpp
@@ -39,8 +39,7 @@ class TcrParser {
     // variable for helper barcode index
     std::vector<std::vector<int>> barcodeIndex;
 
-    // constructor: in the future if more columns are ever added its probably a much
-    // more general algo would be to just find the indicies of the dataframe by looking at the colnames
+    // constructor
     TcrParser(
         Rcpp::DataFrame& data2, std::vector<std::string>& uniqueData2Barcodes
     ) {
@@ -51,17 +50,17 @@ class TcrParser {
         conDf[0] = uniqueData2Barcodes;
 
         // set references to fixed data2 columns
-        data2ChainTypes = data2[5];
-        data2Cdr3 = data2[12];
-        data2Cdr3Nt = data2[13];
+        data2ChainTypes = data2[data2.findName("chain")];
+        data2Cdr3 = data2[data2.findName("cdr3")];
+        data2Cdr3Nt = data2[data2.findName("cdr3_nt")];
 
         // set references to the TCR columns, looked up by column name
-        data2Tcr1 = data2[data2.size() - 2];
-        data2Tcr2 = data2[data2.size() - 1]; 
+        data2Tcr1 = data2[data2.findName("TCR1")];
+        data2Tcr2 = data2[data2.findName("TCR2")];
 
         // construct barcodeIndex
         barcodeIndex = constructBarcodeIndex(
-            uniqueData2Barcodes, Rcpp::as<std::vector<std::string>>(data2[0])
+            uniqueData2Barcodes, Rcpp::as<std::vector<std::string>>(data2[data2.findName("barcode")])
         );
     }
 
@@ -80,6 +79,7 @@ class TcrParser {
     }
 
     // parseTCR() helpers
+
     void handleTcr1(int y, int data2index) {
         handleTcr(y, data2index, data2Tcr1, 1, 2, 3);
     }
diff --git a/.gitignore b/.gitignore
@@ -7,4 +7,4 @@ local_tests.R
     docs
     vignettes/articles/scRep_example_full.rds
     .vscode
-    dev.R
+    qile