From 9d1c76e3037501b2c1686c28560c61a510705cc7 Mon Sep 17 00:00:00 2001
From: uwidocki <widocki@broadinstitute.org>
Date: Mon, 10 Mar 2025 14:41:11 -0400
Subject: [PATCH 1/6] if flowcell_lanes is a column, cforces it to be a string
 otherwise the collating function will not work if it is a single integer.
 Minor edits to make defaults in the script match new defaults we also use on
 Jenkins UI

---
 scripts/collate_fastq_reads.R | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/scripts/collate_fastq_reads.R b/scripts/collate_fastq_reads.R
index 03af8264..cbadc68a 100644
--- a/scripts/collate_fastq_reads.R
+++ b/scripts/collate_fastq_reads.R
@@ -11,12 +11,12 @@ parser <- ArgumentParser()
 # specify desired options
 parser$add_argument("-v", "--verbose", action="store_true", default=TRUE, help="Print extra output [default]")
 parser$add_argument("-q", "--quietly", action="store_false", dest="verbose", help="Print little output")
-parser$add_argument('--raw_counts_uncollapsed', default= "raw_counts_uncollapsed.csv",
+parser$add_argument('--raw_counts_uncollapsed', default= "raw_counts_uncollapsed.csv.gz",
                     help= "path to file containing uncollapsed raw counts file")
 parser$add_argument("--sample_meta", default="sample_meta.csv", help= "Sample metadata")
 parser$add_argument("--cell_line_meta", default="cell_line_meta.csv", help= "Cell line metadata")
 parser$add_argument("--CB_meta", default= "CB_meta.csv", help= "Control Barcode metadata")
-parser$add_argument('--sequencing_index_cols', default= 'index_1,index_2', 
+parser$add_argument('--sequencing_index_cols', default= 'flowcell_names,index_1,index_2', 
                     help= 'List of sequencing columns in the sample meta.')
 parser$add_argument("--id_cols", default= "pcr_plate,pcr_well", 
                     help = "Columns that identify a unique PCR well")
@@ -38,9 +38,13 @@ if (args$out == "") {
 }
 
 # Read in metadata files as data.table objects ----
-sample_meta= data.table::fread(args$sample_meta, header= TRUE, sep= ',')
 cell_line_meta= data.table::fread(args$cell_line_meta, header= TRUE, sep= ',')
 CB_meta= data.table::fread(args$CB_meta, header= TRUE, sep= ',')
+sample_meta= data.table::fread(args$sample_meta, header= TRUE, sep= ',')
+## if flowcell_lanes one lane, the integer gets read in as numeric, so force this to be a string
+if(validate_columns_exist("flowcell_lanes", sample_meta)){
+  sample_meta %<>% mutate(flowcell_lanes = as.character(flowcell_lanes))
+}
 
 # Parse some parameters into vectors ----
 sequencing_index_cols= unlist(strsplit(args$sequencing_index_cols, ","))

From b8a5a5a00bb7b69f511452de3533740c1e16f3d4 Mon Sep 17 00:00:00 2001
From: uwidocki <widocki@broadinstitute.org>
Date: Fri, 14 Mar 2025 09:32:49 -0400
Subject: [PATCH 2/6] added statement to make index_purity 0 if it is NA to
 prevent collate modules from quitting

---
 scripts/src/collate_fastq_reads.R | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R
index 264ceb6e..5f4b4bc3 100644
--- a/scripts/src/collate_fastq_reads.R
+++ b/scripts/src/collate_fastq_reads.R
@@ -180,6 +180,11 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta,
   # Calculate index purity ----
   # This is only accurate if the Nori input file is small enough to fit into a chunk.
   index_purity= sum(summed_reads$n) / sum(uncollapsed_raw_counts$n)
+  # if index_purity is NA, make it 0
+  if(is.na(index_purity)){
+    index_purity = 0
+  }
+  
   # Throw an error if the purity is greater than 1.
   # Throw a warning if the purity is below 0.5.
   print(paste0('Index purity in chunk: ', round(index_purity, 4)))

From f718c86e71d69066fd46a9bcfffe076173748c74 Mon Sep 17 00:00:00 2001
From: uwidocki <widocki@broadinstitute.org>
Date: Mon, 17 Mar 2025 09:36:21 -0400
Subject: [PATCH 3/6] edit to QC image #8 to calculate log2_n in normalized
 counts if not present but is the value_col parameter

---
 scripts/src/QC_images.R | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/scripts/src/QC_images.R b/scripts/src/QC_images.R
index 51467275..7e70d3fe 100755
--- a/scripts/src/QC_images.R
+++ b/scripts/src/QC_images.R
@@ -289,7 +289,10 @@ create_cdf_plot= function(input_df, id_cols, counts_col= 'n', mark1= 0.5, mark2=
 #' @returns Returns a ggplot object.
 create_ctrlBC_scatterplots= function(normalized_counts, id_cols, value_col= 'log2_n') {
   # Validation: Check that id_cols and value_col exist in filtered counts.
-  if(!validate_columns_exist(c(id_cols, value_col), normalized_counts)) {
+  if(value_col == "log2_n" & validate_columns_exist(c(id_cols, "n"), normalized_counts) & 
+     !(validate_columns_exist(c("log2_n"), normalized_counts))){
+    normalized_counts %<>% mutate(log2_n = log2(n+1))
+  }else if(!validate_columns_exist(c(id_cols, value_col), normalized_counts)) {
     stop('Some input columns were not detected in normalized counts.')
   }
   
@@ -303,10 +306,10 @@ create_ctrlBC_scatterplots= function(normalized_counts, id_cols, value_col= 'log
       dplyr::filter(!is.na(cb_ladder), n != 0) %>%
       dplyr::group_by(pick(all_of(id_cols))) %>%
       dplyr::mutate(mean_y= mean(cb_log2_dose),
-                    residual2= (cb_log2_dose - log2_n)^2,
+                    residual2= (cb_log2_dose - log2(n+1))^2,
                     squares2= (cb_log2_dose - mean_y)^2,
                     norm_r2= 1 - sum(residual2) / sum(squares2),
-                    norm_mae= median(abs(cb_log2_dose- log2_n))) %>% ungroup()
+                    norm_mae= median(abs(cb_log2_dose- log2(n+1)))) %>% ungroup()
   } 
   
   # Filter for just the control barcodes, create a profile_id for faceting, 

From c8aa298e1ed89207ae2717002d75da17180c0ef5 Mon Sep 17 00:00:00 2001
From: uwidocki <widocki@broadinstitute.org>
Date: Mon, 17 Mar 2025 09:52:20 -0400
Subject: [PATCH 4/6] added print statement

---
 scripts/src/QC_images.R | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/scripts/src/QC_images.R b/scripts/src/QC_images.R
index 7e70d3fe..f278d8d3 100755
--- a/scripts/src/QC_images.R
+++ b/scripts/src/QC_images.R
@@ -291,6 +291,7 @@ create_ctrlBC_scatterplots= function(normalized_counts, id_cols, value_col= 'log
   # Validation: Check that id_cols and value_col exist in filtered counts.
   if(value_col == "log2_n" & validate_columns_exist(c(id_cols, "n"), normalized_counts) & 
      !(validate_columns_exist(c("log2_n"), normalized_counts))){
+    print("Calculating log2_n")
     normalized_counts %<>% mutate(log2_n = log2(n+1))
   }else if(!validate_columns_exist(c(id_cols, value_col), normalized_counts)) {
     stop('Some input columns were not detected in normalized counts.')
@@ -306,10 +307,10 @@ create_ctrlBC_scatterplots= function(normalized_counts, id_cols, value_col= 'log
       dplyr::filter(!is.na(cb_ladder), n != 0) %>%
       dplyr::group_by(pick(all_of(id_cols))) %>%
       dplyr::mutate(mean_y= mean(cb_log2_dose),
-                    residual2= (cb_log2_dose - log2(n+1))^2,
+                    residual2= (cb_log2_dose - log2_n)^2,
                     squares2= (cb_log2_dose - mean_y)^2,
                     norm_r2= 1 - sum(residual2) / sum(squares2),
-                    norm_mae= median(abs(cb_log2_dose- log2(n+1)))) %>% ungroup()
+                    norm_mae= median(abs(cb_log2_dose - log2_n))) %>% ungroup()
   } 
   
   # Filter for just the control barcodes, create a profile_id for faceting, 

From 6ccb75cdc00bddc575752ed088f91d33293a44f5 Mon Sep 17 00:00:00 2001
From: uwidocki <widocki@broadinstitute.org>
Date: Mon, 17 Mar 2025 16:08:23 -0400
Subject: [PATCH 5/6] made changes to use pseudocount in create control barcode
 plot function in QC images

---
 scripts/filteredCounts_QC.R  |  4 +++-
 scripts/filteredCounts_QC.sh |  2 ++
 scripts/src/QC_images.R      | 11 +++++++----
 3 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/scripts/filteredCounts_QC.R b/scripts/filteredCounts_QC.R
index b33b2e47..a1aa8404 100755
--- a/scripts/filteredCounts_QC.R
+++ b/scripts/filteredCounts_QC.R
@@ -40,6 +40,7 @@ parser$add_argument('--count_threshold', default= 40, help= 'Low counts theshold
 parser$add_argument('--reverse_index2', type= "logical", default= FALSE,
                     help= 'Switch to reverse complement index_2 for some sequencers')
 parser$add_argument('-o', '--out', default= '', help= 'Output path, defaults to working directory')
+parser$add_argument("--pseudocount", default=20, help = "pseudo raw count used in normalization")
 
 # get command line options, if help option encountered print help and exit
 args <- parser$parse_args()
@@ -85,4 +86,5 @@ QC_images(raw_counts_uncollapsed_path= args$raw_counts_uncollapsed,
           control_type= args$control_type, 
           count_threshold= as.numeric(args$count_threshold), 
           reverse_index2= args$reverse_index2,
-          out= args$out)
+          out= args$out,
+          pseudocount = as.numeric(args$pseudocount))
diff --git a/scripts/filteredCounts_QC.sh b/scripts/filteredCounts_QC.sh
index fe1b1b21..0d084d1c 100644
--- a/scripts/filteredCounts_QC.sh
+++ b/scripts/filteredCounts_QC.sh
@@ -99,6 +99,7 @@ echo COUNT_THRESHOLD is: $COUNT_THRESHOLD
 echo RAW_COUNTS_UNCOLLAPSED is: $RAW_COUNTS_UNCOLLAPSED
 echo LFC is: $LFC
 echo REVERSE_INDEX2 is: $REVERSE_INDEX2
+echo PSEUDOCOUNT is: $PSEUDOCOUNT
 
 args=(
 --sample_meta "$SAMPLE_META"
@@ -117,6 +118,7 @@ args=(
 --reverse_index2 "$REVERSE_INDEX2"
 --chunk_size "$CHUNK_SIZE"
 --cell_line_cols "$CELL_LINE_COLS"
+--pseudocount "$PSEUDOCOUNT"
 )
 
 echo Rscript filteredCounts_QC.R "${args[@]}"
diff --git a/scripts/src/QC_images.R b/scripts/src/QC_images.R
index f278d8d3..d908e5ab 100755
--- a/scripts/src/QC_images.R
+++ b/scripts/src/QC_images.R
@@ -286,8 +286,9 @@ create_cdf_plot= function(input_df, id_cols, counts_col= 'n', mark1= 0.5, mark2=
 #' @param normalized_counts Dataframe output from the normalize module.
 #' @param id_cols Vector of column names that identify every PCR well.
 #' @param value_col Name of the column that contains the values.
+#' @param pseudocount a pseudo raw count used for missing cell line counts, also used when computing log2 on raw counts
 #' @returns Returns a ggplot object.
-create_ctrlBC_scatterplots= function(normalized_counts, id_cols, value_col= 'log2_n') {
+create_ctrlBC_scatterplots= function(normalized_counts, id_cols, value_col= 'log2_n', pseudocount = 20) {
   # Validation: Check that id_cols and value_col exist in filtered counts.
   if(value_col == "log2_n" & validate_columns_exist(c(id_cols, "n"), normalized_counts) & 
      !(validate_columns_exist(c("log2_n"), normalized_counts))){
@@ -470,6 +471,7 @@ create_replicate_scatterplots= function(input_df, cell_line_cols, replicate_grou
 #' @param count_threshold Threshold for low read counts.
 #' @param reverse_index2 Boolean set to TRUE if the sequencing involved the reverse complement workflow.
 #' @param out Path to the directory to save the QC images.
+#' @param pseudocount a pseudo raw count used for missing cell line counts, also used when computing log2 on raw counts
 #' @returns NA. QC images are written out to the specified folder.
 QC_images= function(raw_counts_uncollapsed_path,
                     prism_barcode_counts, unknown_barcode_counts,
@@ -477,10 +479,10 @@ QC_images= function(raw_counts_uncollapsed_path,
                     sample_meta,
                     barcode_col= 'forward_read_barcode',
                     id_cols= c('pcr_plate', 'pcr_well'),
-                    cell_line_cols= c('depmap_id'), 
+                    cell_line_cols= c('depmap_id', 'lua'), 
                     sig_cols,
                     control_type= 'ctl_vehicle', count_threshold= 40, 
-                    chunk_size= 10^6,
+                    chunk_size= 10^6, pseudocount = 20,
                     reverse_index2= FALSE, out= NA) {
   
   # Required packages ----
@@ -697,7 +699,8 @@ QC_images= function(raw_counts_uncollapsed_path,
     print('8. Generating control_barcode_trend image')
     potential_error= base::tryCatch({
       trend_sc= create_ctrlBC_scatterplots(normalized_counts %>% dplyr::filter(!cb_ladder %in% c(NA, FALSE, 'none', '')), 
-                                           id_cols, value_col= 'log2_n')
+                                           id_cols, value_col= 'log2_n',
+                                           pseudocount = pseudocount)
       
       pdf(file=paste(out, "control_barcode_trend.pdf", sep="/"),
           width= sqrt(num_profiles) * 2, height= sqrt(num_profiles) * 2)

From 7d35a89af570451b96a63aba9e87765550965116 Mon Sep 17 00:00:00 2001
From: uwidocki <widocki@broadinstitute.org>
Date: Thu, 20 Mar 2025 11:08:49 -0400
Subject: [PATCH 6/6] removed a space in one of the library functions

---
 scripts/src/qc_table_functions.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/src/qc_table_functions.R b/scripts/src/qc_table_functions.R
index f9520845..18aec5d4 100644
--- a/scripts/src/qc_table_functions.R
+++ b/scripts/src/qc_table_functions.R
@@ -2,7 +2,7 @@ options(cli.unicode = FALSE)
 library(tidyverse)
 library(data.table)
 library(magrittr)
-library (PRROC)
+library(PRROC)
 
 # BY ID_COLS (PCR_PLATE, PCR_WELL) ----------