From 9d1c76e3037501b2c1686c28560c61a510705cc7 Mon Sep 17 00:00:00 2001 From: uwidocki Date: Mon, 10 Mar 2025 14:41:11 -0400 Subject: [PATCH 1/6] if flowcell_lanes is a column, cforces it to be a string otherwise the collating function will not work if it is a single integer. Minor edits to make defaults in the script match new defaults we also use on Jenkins UI --- scripts/collate_fastq_reads.R | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/scripts/collate_fastq_reads.R b/scripts/collate_fastq_reads.R index 03af8264..cbadc68a 100644 --- a/scripts/collate_fastq_reads.R +++ b/scripts/collate_fastq_reads.R @@ -11,12 +11,12 @@ parser <- ArgumentParser() # specify desired options parser$add_argument("-v", "--verbose", action="store_true", default=TRUE, help="Print extra output [default]") parser$add_argument("-q", "--quietly", action="store_false", dest="verbose", help="Print little output") -parser$add_argument('--raw_counts_uncollapsed', default= "raw_counts_uncollapsed.csv", +parser$add_argument('--raw_counts_uncollapsed', default= "raw_counts_uncollapsed.csv.gz", help= "path to file containing uncollapsed raw counts file") parser$add_argument("--sample_meta", default="sample_meta.csv", help= "Sample metadata") parser$add_argument("--cell_line_meta", default="cell_line_meta.csv", help= "Cell line metadata") parser$add_argument("--CB_meta", default= "CB_meta.csv", help= "Control Barcode metadata") -parser$add_argument('--sequencing_index_cols', default= 'index_1,index_2', +parser$add_argument('--sequencing_index_cols', default= 'flowcell_names,index_1,index_2', help= 'List of sequencing columns in the sample meta.') parser$add_argument("--id_cols", default= "pcr_plate,pcr_well", help = "Columns that identify a unique PCR well") @@ -38,9 +38,13 @@ if (args$out == "") { } # Read in metadata files as data.table objects ---- -sample_meta= data.table::fread(args$sample_meta, header= TRUE, sep= ',') cell_line_meta= data.table::fread(args$cell_line_meta, header= TRUE, sep= ',') CB_meta= data.table::fread(args$CB_meta, header= TRUE, sep= ',') +sample_meta= data.table::fread(args$sample_meta, header= TRUE, sep= ',') +## if flowcell_lanes one lane, the integer gets read in as numeric, so force this to be a string +if(validate_columns_exist("flowcell_lanes", sample_meta)){ + sample_meta %<>% mutate(flowcell_lanes = as.character(flowcell_lanes)) +} # Parse some parameters into vectors ---- sequencing_index_cols= unlist(strsplit(args$sequencing_index_cols, ",")) From b8a5a5a00bb7b69f511452de3533740c1e16f3d4 Mon Sep 17 00:00:00 2001 From: uwidocki Date: Fri, 14 Mar 2025 09:32:49 -0400 Subject: [PATCH 2/6] added statement to make index_purity 0 if it is NA to prevent collate modules from quitting --- scripts/src/collate_fastq_reads.R | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R index 264ceb6e..5f4b4bc3 100644 --- a/scripts/src/collate_fastq_reads.R +++ b/scripts/src/collate_fastq_reads.R @@ -180,6 +180,11 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, # Calculate index purity ---- # This is only accurate if the Nori input file is small enough to fit into a chunk. index_purity= sum(summed_reads$n) / sum(uncollapsed_raw_counts$n) + # if index_purity is NA, make it 0 + if(is.na(index_purity)){ + index_purity = 0 + } + # Throw an error if the purity is greater than 1. # Throw a warning if the purity is below 0.5. print(paste0('Index purity in chunk: ', round(index_purity, 4))) From f718c86e71d69066fd46a9bcfffe076173748c74 Mon Sep 17 00:00:00 2001 From: uwidocki Date: Mon, 17 Mar 2025 09:36:21 -0400 Subject: [PATCH 3/6] edit to QC image #8 to calculate log2_n in normalized counts if not present but is the value_col parameter --- scripts/src/QC_images.R | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/scripts/src/QC_images.R b/scripts/src/QC_images.R index 51467275..7e70d3fe 100755 --- a/scripts/src/QC_images.R +++ b/scripts/src/QC_images.R @@ -289,7 +289,10 @@ create_cdf_plot= function(input_df, id_cols, counts_col= 'n', mark1= 0.5, mark2= #' @returns Returns a ggplot object. create_ctrlBC_scatterplots= function(normalized_counts, id_cols, value_col= 'log2_n') { # Validation: Check that id_cols and value_col exist in filtered counts. - if(!validate_columns_exist(c(id_cols, value_col), normalized_counts)) { + if(value_col == "log2_n" & validate_columns_exist(c(id_cols, "n"), normalized_counts) & + !(validate_columns_exist(c("log2_n"), normalized_counts))){ + normalized_counts %<>% mutate(log2_n = log2(n+1)) + }else if(!validate_columns_exist(c(id_cols, value_col), normalized_counts)) { stop('Some input columns were not detected in normalized counts.') } @@ -303,10 +306,10 @@ create_ctrlBC_scatterplots= function(normalized_counts, id_cols, value_col= 'log dplyr::filter(!is.na(cb_ladder), n != 0) %>% dplyr::group_by(pick(all_of(id_cols))) %>% dplyr::mutate(mean_y= mean(cb_log2_dose), - residual2= (cb_log2_dose - log2_n)^2, + residual2= (cb_log2_dose - log2(n+1))^2, squares2= (cb_log2_dose - mean_y)^2, norm_r2= 1 - sum(residual2) / sum(squares2), - norm_mae= median(abs(cb_log2_dose- log2_n))) %>% ungroup() + norm_mae= median(abs(cb_log2_dose- log2(n+1)))) %>% ungroup() } # Filter for just the control barcodes, create a profile_id for faceting, From c8aa298e1ed89207ae2717002d75da17180c0ef5 Mon Sep 17 00:00:00 2001 From: uwidocki Date: Mon, 17 Mar 2025 09:52:20 -0400 Subject: [PATCH 4/6] added print statement --- scripts/src/QC_images.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/src/QC_images.R b/scripts/src/QC_images.R index 7e70d3fe..f278d8d3 100755 --- a/scripts/src/QC_images.R +++ b/scripts/src/QC_images.R @@ -291,6 +291,7 @@ create_ctrlBC_scatterplots= function(normalized_counts, id_cols, value_col= 'log # Validation: Check that id_cols and value_col exist in filtered counts. if(value_col == "log2_n" & validate_columns_exist(c(id_cols, "n"), normalized_counts) & !(validate_columns_exist(c("log2_n"), normalized_counts))){ + print("Calculating log2_n") normalized_counts %<>% mutate(log2_n = log2(n+1)) }else if(!validate_columns_exist(c(id_cols, value_col), normalized_counts)) { stop('Some input columns were not detected in normalized counts.') @@ -306,10 +307,10 @@ create_ctrlBC_scatterplots= function(normalized_counts, id_cols, value_col= 'log dplyr::filter(!is.na(cb_ladder), n != 0) %>% dplyr::group_by(pick(all_of(id_cols))) %>% dplyr::mutate(mean_y= mean(cb_log2_dose), - residual2= (cb_log2_dose - log2(n+1))^2, + residual2= (cb_log2_dose - log2_n)^2, squares2= (cb_log2_dose - mean_y)^2, norm_r2= 1 - sum(residual2) / sum(squares2), - norm_mae= median(abs(cb_log2_dose- log2(n+1)))) %>% ungroup() + norm_mae= median(abs(cb_log2_dose - log2_n))) %>% ungroup() } # Filter for just the control barcodes, create a profile_id for faceting, From 6ccb75cdc00bddc575752ed088f91d33293a44f5 Mon Sep 17 00:00:00 2001 From: uwidocki Date: Mon, 17 Mar 2025 16:08:23 -0400 Subject: [PATCH 5/6] made changes to use pseudocount in create control barcode plot function in QC images --- scripts/filteredCounts_QC.R | 4 +++- scripts/filteredCounts_QC.sh | 2 ++ scripts/src/QC_images.R | 11 +++++++---- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/scripts/filteredCounts_QC.R b/scripts/filteredCounts_QC.R index b33b2e47..a1aa8404 100755 --- a/scripts/filteredCounts_QC.R +++ b/scripts/filteredCounts_QC.R @@ -40,6 +40,7 @@ parser$add_argument('--count_threshold', default= 40, help= 'Low counts theshold parser$add_argument('--reverse_index2', type= "logical", default= FALSE, help= 'Switch to reverse complement index_2 for some sequencers') parser$add_argument('-o', '--out', default= '', help= 'Output path, defaults to working directory') +parser$add_argument("--pseudocount", default=20, help = "pseudo raw count used in normalization") # get command line options, if help option encountered print help and exit args <- parser$parse_args() @@ -85,4 +86,5 @@ QC_images(raw_counts_uncollapsed_path= args$raw_counts_uncollapsed, control_type= args$control_type, count_threshold= as.numeric(args$count_threshold), reverse_index2= args$reverse_index2, - out= args$out) + out= args$out, + pseudocount = as.numeric(args$pseudocount)) diff --git a/scripts/filteredCounts_QC.sh b/scripts/filteredCounts_QC.sh index fe1b1b21..0d084d1c 100644 --- a/scripts/filteredCounts_QC.sh +++ b/scripts/filteredCounts_QC.sh @@ -99,6 +99,7 @@ echo COUNT_THRESHOLD is: $COUNT_THRESHOLD echo RAW_COUNTS_UNCOLLAPSED is: $RAW_COUNTS_UNCOLLAPSED echo LFC is: $LFC echo REVERSE_INDEX2 is: $REVERSE_INDEX2 +echo PSEUDOCOUNT is: $PSEUDOCOUNT args=( --sample_meta "$SAMPLE_META" @@ -117,6 +118,7 @@ args=( --reverse_index2 "$REVERSE_INDEX2" --chunk_size "$CHUNK_SIZE" --cell_line_cols "$CELL_LINE_COLS" +--pseudocount "$PSEUDOCOUNT" ) echo Rscript filteredCounts_QC.R "${args[@]}" diff --git a/scripts/src/QC_images.R b/scripts/src/QC_images.R index f278d8d3..d908e5ab 100755 --- a/scripts/src/QC_images.R +++ b/scripts/src/QC_images.R @@ -286,8 +286,9 @@ create_cdf_plot= function(input_df, id_cols, counts_col= 'n', mark1= 0.5, mark2= #' @param normalized_counts Dataframe output from the normalize module. #' @param id_cols Vector of column names that identify every PCR well. #' @param value_col Name of the column that contains the values. +#' @param pseudocount a pseudo raw count used for missing cell line counts, also used when computing log2 on raw counts #' @returns Returns a ggplot object. -create_ctrlBC_scatterplots= function(normalized_counts, id_cols, value_col= 'log2_n') { +create_ctrlBC_scatterplots= function(normalized_counts, id_cols, value_col= 'log2_n', pseudocount = 20) { # Validation: Check that id_cols and value_col exist in filtered counts. if(value_col == "log2_n" & validate_columns_exist(c(id_cols, "n"), normalized_counts) & !(validate_columns_exist(c("log2_n"), normalized_counts))){ @@ -470,6 +471,7 @@ create_replicate_scatterplots= function(input_df, cell_line_cols, replicate_grou #' @param count_threshold Threshold for low read counts. #' @param reverse_index2 Boolean set to TRUE if the sequencing involved the reverse complement workflow. #' @param out Path to the directory to save the QC images. +#' @param pseudocount a pseudo raw count used for missing cell line counts, also used when computing log2 on raw counts #' @returns NA. QC images are written out to the specified folder. QC_images= function(raw_counts_uncollapsed_path, prism_barcode_counts, unknown_barcode_counts, @@ -477,10 +479,10 @@ QC_images= function(raw_counts_uncollapsed_path, sample_meta, barcode_col= 'forward_read_barcode', id_cols= c('pcr_plate', 'pcr_well'), - cell_line_cols= c('depmap_id'), + cell_line_cols= c('depmap_id', 'lua'), sig_cols, control_type= 'ctl_vehicle', count_threshold= 40, - chunk_size= 10^6, + chunk_size= 10^6, pseudocount = 20, reverse_index2= FALSE, out= NA) { # Required packages ---- @@ -697,7 +699,8 @@ QC_images= function(raw_counts_uncollapsed_path, print('8. Generating control_barcode_trend image') potential_error= base::tryCatch({ trend_sc= create_ctrlBC_scatterplots(normalized_counts %>% dplyr::filter(!cb_ladder %in% c(NA, FALSE, 'none', '')), - id_cols, value_col= 'log2_n') + id_cols, value_col= 'log2_n', + pseudocount = pseudocount) pdf(file=paste(out, "control_barcode_trend.pdf", sep="/"), width= sqrt(num_profiles) * 2, height= sqrt(num_profiles) * 2) From 7d35a89af570451b96a63aba9e87765550965116 Mon Sep 17 00:00:00 2001 From: uwidocki Date: Thu, 20 Mar 2025 11:08:49 -0400 Subject: [PATCH 6/6] removed a space in one of the library functions --- scripts/src/qc_table_functions.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/src/qc_table_functions.R b/scripts/src/qc_table_functions.R index f9520845..18aec5d4 100644 --- a/scripts/src/qc_table_functions.R +++ b/scripts/src/qc_table_functions.R @@ -2,7 +2,7 @@ options(cli.unicode = FALSE) library(tidyverse) library(data.table) library(magrittr) -library (PRROC) +library(PRROC) # BY ID_COLS (PCR_PLATE, PCR_WELL) ----------