cmap · uwidocki · Mar 27, 2025 · Mar 10, 2025 · Mar 14, 2025 · Mar 17, 2025
diff --git a/scripts/collate_fastq_reads.R b/scripts/collate_fastq_reads.R
@@ -11,12 +11,12 @@ parser <- ArgumentParser()
 # specify desired options
 parser$add_argument("-v", "--verbose", action="store_true", default=TRUE, help="Print extra output [default]")
 parser$add_argument("-q", "--quietly", action="store_false", dest="verbose", help="Print little output")
-parser$add_argument('--raw_counts_uncollapsed', default= "raw_counts_uncollapsed.csv",
+parser$add_argument('--raw_counts_uncollapsed', default= "raw_counts_uncollapsed.csv.gz",
                     help= "path to file containing uncollapsed raw counts file")
 parser$add_argument("--sample_meta", default="sample_meta.csv", help= "Sample metadata")
 parser$add_argument("--cell_line_meta", default="cell_line_meta.csv", help= "Cell line metadata")
 parser$add_argument("--CB_meta", default= "CB_meta.csv", help= "Control Barcode metadata")
-parser$add_argument('--sequencing_index_cols', default= 'index_1,index_2', 
+parser$add_argument('--sequencing_index_cols', default= 'flowcell_names,index_1,index_2', 
                     help= 'List of sequencing columns in the sample meta.')
 parser$add_argument("--id_cols", default= "pcr_plate,pcr_well", 
                     help = "Columns that identify a unique PCR well")
@@ -38,9 +38,13 @@ if (args$out == "") {
 }
 
 # Read in metadata files as data.table objects ----
-sample_meta= data.table::fread(args$sample_meta, header= TRUE, sep= ',')
 cell_line_meta= data.table::fread(args$cell_line_meta, header= TRUE, sep= ',')
 CB_meta= data.table::fread(args$CB_meta, header= TRUE, sep= ',')
+sample_meta= data.table::fread(args$sample_meta, header= TRUE, sep= ',')
+## if flowcell_lanes one lane, the integer gets read in as numeric, so force this to be a string
+if(validate_columns_exist("flowcell_lanes", sample_meta)){
+  sample_meta %<>% mutate(flowcell_lanes = as.character(flowcell_lanes))
+}
 
 # Parse some parameters into vectors ----
 sequencing_index_cols= unlist(strsplit(args$sequencing_index_cols, ","))

diff --git a/scripts/filteredCounts_QC.R b/scripts/filteredCounts_QC.R
@@ -40,6 +40,7 @@ parser$add_argument('--count_threshold', default= 40, help= 'Low counts theshold
 parser$add_argument('--reverse_index2', type= "logical", default= FALSE,
                     help= 'Switch to reverse complement index_2 for some sequencers')
 parser$add_argument('-o', '--out', default= '', help= 'Output path, defaults to working directory')
+parser$add_argument("--pseudocount", default=20, help = "pseudo raw count used in normalization")
 
 # get command line options, if help option encountered print help and exit
 args <- parser$parse_args()
@@ -85,4 +86,5 @@ QC_images(raw_counts_uncollapsed_path= args$raw_counts_uncollapsed,
           control_type= args$control_type, 
           count_threshold= as.numeric(args$count_threshold), 
           reverse_index2= args$reverse_index2,
-          out= args$out)
+          out= args$out,
+          pseudocount = as.numeric(args$pseudocount))
diff --git a/scripts/filteredCounts_QC.sh b/scripts/filteredCounts_QC.sh
@@ -99,6 +99,7 @@ echo COUNT_THRESHOLD is: $COUNT_THRESHOLD
 echo RAW_COUNTS_UNCOLLAPSED is: $RAW_COUNTS_UNCOLLAPSED
 echo LFC is: $LFC
 echo REVERSE_INDEX2 is: $REVERSE_INDEX2
+echo PSEUDOCOUNT is: $PSEUDOCOUNT
 
 args=(
 --sample_meta "$SAMPLE_META"
@@ -117,6 +118,7 @@ args=(
 --reverse_index2 "$REVERSE_INDEX2"
 --chunk_size "$CHUNK_SIZE"
 --cell_line_cols "$CELL_LINE_COLS"
+--pseudocount "$PSEUDOCOUNT"
 )
 
 echo Rscript filteredCounts_QC.R "${args[@]}"

diff --git a/scripts/src/QC_images.R b/scripts/src/QC_images.R
@@ -286,10 +286,15 @@ create_cdf_plot= function(input_df, id_cols, counts_col= 'n', mark1= 0.5, mark2=
 #' @param normalized_counts Dataframe output from the normalize module.
 #' @param id_cols Vector of column names that identify every PCR well.
 #' @param value_col Name of the column that contains the values.
+#' @param pseudocount a pseudo raw count used for missing cell line counts, also used when computing log2 on raw counts
 #' @returns Returns a ggplot object.
-create_ctrlBC_scatterplots= function(normalized_counts, id_cols, value_col= 'log2_n') {
+create_ctrlBC_scatterplots= function(normalized_counts, id_cols, value_col= 'log2_n', pseudocount = 20) {
   # Validation: Check that id_cols and value_col exist in filtered counts.
-  if(!validate_columns_exist(c(id_cols, value_col), normalized_counts)) {
+  if(value_col == "log2_n" & validate_columns_exist(c(id_cols, "n"), normalized_counts) & 
+     !(validate_columns_exist(c("log2_n"), normalized_counts))){
+    print("Calculating log2_n")
+    normalized_counts %<>% mutate(log2_n = log2(n+1))
+  }else if(!validate_columns_exist(c(id_cols, value_col), normalized_counts)) {
     stop('Some input columns were not detected in normalized counts.')
   }
 
@@ -306,7 +311,7 @@ create_ctrlBC_scatterplots= function(normalized_counts, id_cols, value_col= 'log
                     residual2= (cb_log2_dose - log2_n)^2,
                     squares2= (cb_log2_dose - mean_y)^2,
                     norm_r2= 1 - sum(residual2) / sum(squares2),
-                    norm_mae= median(abs(cb_log2_dose- log2_n))) %>% ungroup()
+                    norm_mae= median(abs(cb_log2_dose - log2_n))) %>% ungroup()
   } 
 
   # Filter for just the control barcodes, create a profile_id for faceting, 
@@ -466,17 +471,18 @@ create_replicate_scatterplots= function(input_df, cell_line_cols, replicate_grou
 #' @param count_threshold Threshold for low read counts.
 #' @param reverse_index2 Boolean set to TRUE if the sequencing involved the reverse complement workflow.
 #' @param out Path to the directory to save the QC images.
+#' @param pseudocount a pseudo raw count used for missing cell line counts, also used when computing log2 on raw counts
 #' @returns NA. QC images are written out to the specified folder.
 QC_images= function(raw_counts_uncollapsed_path,
                     prism_barcode_counts, unknown_barcode_counts,
                     annotated_counts, normalized_counts= NA, l2fc, 
                     sample_meta,
                     barcode_col= 'forward_read_barcode',
                     id_cols= c('pcr_plate', 'pcr_well'),
-                    cell_line_cols= c('depmap_id'), 
+                    cell_line_cols= c('depmap_id', 'lua'), 
                     sig_cols,
                     control_type= 'ctl_vehicle', count_threshold= 40, 
-                    chunk_size= 10^6,
+                    chunk_size= 10^6, pseudocount = 20,
                     reverse_index2= FALSE, out= NA) {
 
   # Required packages ----
@@ -693,7 +699,8 @@ QC_images= function(raw_counts_uncollapsed_path,
     print('8. Generating control_barcode_trend image')
     potential_error= base::tryCatch({
       trend_sc= create_ctrlBC_scatterplots(normalized_counts %>% dplyr::filter(!cb_ladder %in% c(NA, FALSE, 'none', '')), 
-                                           id_cols, value_col= 'log2_n')
+                                           id_cols, value_col= 'log2_n',
+                                           pseudocount = pseudocount)
 
       pdf(file=paste(out, "control_barcode_trend.pdf", sep="/"),
           width= sqrt(num_profiles) * 2, height= sqrt(num_profiles) * 2)

diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R
@@ -180,6 +180,11 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta,
   # Calculate index purity ----
   # This is only accurate if the Nori input file is small enough to fit into a chunk.
   index_purity= sum(summed_reads$n) / sum(uncollapsed_raw_counts$n)
+  # if index_purity is NA, make it 0
+  if(is.na(index_purity)){
+    index_purity = 0
+  }
+
   # Throw an error if the purity is greater than 1.
   # Throw a warning if the purity is below 0.5.
   print(paste0('Index purity in chunk: ', round(index_purity, 4)))

diff --git a/scripts/src/qc_table_functions.R b/scripts/src/qc_table_functions.R
@@ -2,7 +2,7 @@ options(cli.unicode = FALSE)
 library(tidyverse)
 library(data.table)
 library(magrittr)
-library (PRROC)
+library(PRROC)
 
 # BY ID_COLS (PCR_PLATE, PCR_WELL) ----------