diff --git a/scripts/collate_fastq_reads.R b/scripts/collate_fastq_reads.R index 03af826..9daa941 100644 --- a/scripts/collate_fastq_reads.R +++ b/scripts/collate_fastq_reads.R @@ -42,6 +42,9 @@ sample_meta= data.table::fread(args$sample_meta, header= TRUE, sep= ',') cell_line_meta= data.table::fread(args$cell_line_meta, header= TRUE, sep= ',') CB_meta= data.table::fread(args$CB_meta, header= TRUE, sep= ',') +# Check that sample_meta has complete flowcell names and lanes +check_flowcell_names_in_sample_meta(sample_meta, args$out) + # Parse some parameters into vectors ---- sequencing_index_cols= unlist(strsplit(args$sequencing_index_cols, ",")) id_cols= unlist(strsplit(args$id_cols, ",")) @@ -86,7 +89,8 @@ chunked_results= process_in_chunks(large_file_path= args$raw_counts_uncollapsed, CB_meta[[args$barcode_col]])), reverse_index2= args$reverse_index2, barcode_col= args$barcode_col, - low_abundance_threshold= as.numeric(args$low_abundance_threshold)) + low_abundance_threshold= as.numeric(args$low_abundance_threshold), + out_dir = args$out) # From each chunk, extract prism_barcode_counts or unknown_barcode_counts and bind those rows together. # Then use data.table to aggregate and sum up reads across the chunks. diff --git a/scripts/create_cellDB_metadata.R b/scripts/create_cellDB_metadata.R index 03d2173..d95a0e0 100644 --- a/scripts/create_cellDB_metadata.R +++ b/scripts/create_cellDB_metadata.R @@ -115,11 +115,13 @@ if(all(cell_set_meta_long$cell_set %in% assay_pools_meta$davepool_id)) { cell_set_assay_pool_meta <- cell_set_meta_long %>% inner_join(assay_pools_meta, by = c("cell_set" = "davepool_id", "members" = "depmap_id")) %>% select(cell_set, pool_id, barcode_id, depmap_id = members) %>% - rename("lua" = "barcode_id") + rename("lua" = "barcode_id") %>% + dplyr::distinct() } else { print("One or more cell sets passed in sample_meta have not been registered in CellDB. Unable to pull pool_id in cell_set_meta.") cell_set_assay_pool_meta <- cell_set_meta_long %>% - select(cell_set, depmap_id = members) + select(cell_set, depmap_id = members) %>% + dplyr::distinct() } CB_meta <- control_bc_df diff --git a/scripts/create_sample_meta.py b/scripts/create_sample_meta.py index 1bcbd42..9a6ec46 100644 --- a/scripts/create_sample_meta.py +++ b/scripts/create_sample_meta.py @@ -50,7 +50,8 @@ def rename_sample_meta(df): "treatment": "pert_name", "dose": "pert_dose", "dose_unit": "pert_dose_unit", - "control_barcodes": "cb_ladder" + "control_barcodes": "cb_ladder", + "replicate": "replicate_plate" } return df.rename(columns=rename_map) @@ -60,15 +61,24 @@ def remove_sample_meta_columns(df, columns_to_remove): return df.drop(columns=columns_to_remove, errors='ignore') -def filter_nan_flowcells(df): +def filter_nan_flowcells(df, build_dir): # Count rows with NaN in "flowcell_names" nan_count = df["flowcell_names"].isna().sum() - # Print the count if any rows are dropped + # Print the count if any rows are dropped and append that output to a file if nan_count > 0: - print( - f"Warning: {nan_count} rows with NaN in 'flowcell_names' will be dropped." - ) + # If the build_dir/logs directory does not exist, create it + os.makedirs(os.path.join(build_dir, "logs"), exist_ok=True) + warning_message = f"Warning: {nan_count} rows with NaN in 'flowcell_names' will be dropped." 
+        print(warning_message)
+        output_path = os.path.join(build_dir, "logs/critical_output.txt")
+        with open(output_path, "a") as f:
+            f.write(warning_message + "\n")
+
+        # Write the dropped rows (those with NaN in "flowcell_names") to a CSV file for review
+        df_na = df[df["flowcell_names"].isna()]
+        na_path = os.path.join(build_dir, "logs/na_flowcells.csv")
+        df_na.to_csv(na_path, index=False, mode='a', header=False)
 
     # Drop rows with NaN in "flowcell_names"
     return df.dropna(subset=["flowcell_names"])
@@ -104,7 +114,7 @@ def main():
             api_key=api_key)
         .pipe(rename_sample_meta)
         .pipe(remove_sample_meta_columns, ["id", "pert_platemap_id", "prism_pcr_plate_id", "pcr_plate_well_id", "index_set"])
-        .pipe(filter_nan_flowcells)
+        .pipe(filter_nan_flowcells, build_dir)
     )
     print("Retrieved following sample_metadata from COMET: ")
     print(df.head())
diff --git a/scripts/filter_counts.R b/scripts/filter_counts.R
index ad0da33..af5e1a1 100755
--- a/scripts/filter_counts.R
+++ b/scripts/filter_counts.R
@@ -122,8 +122,7 @@ if(args$rm_data){
 }
 
 # Filter skipped wells if needed ----
-print(paste("Filtering skipped wells:", args$rm_data))
-if(skipped_wells & args$filter_skipped_wells){
+if(exists("skipped_wells") && args$filter_skipped_wells){
   print('filter_skipped_wells is TRUE and the skipped wells file contains entries, removing skipped wells.')
   print('Skipped wells:')
   print(head(skipped_wells))
diff --git a/scripts/generate_qc_tables.sh b/scripts/generate_qc_tables.sh
index fe6d01e..29df91b 100644
--- a/scripts/generate_qc_tables.sh
+++ b/scripts/generate_qc_tables.sh
@@ -28,11 +28,14 @@ enforce_abs_path ANNOTATED_COUNTS
 enforce_abs_path CELL_SET_AND_POOL_META
 enforce_abs_path FILTERED_COUNTS
 enforce_abs_path UNKNOWN_BARCODE_COUNTS
+enforce_abs_path QC_PARAMS
+enforce_abs_path SAMPLE_META
 
 args=(
 --normalized_counts "$NORMALIZED_COUNTS"
 --annotated_counts "$ANNOTATED_COUNTS"
 --filtered_counts "$FILTERED_COUNTS"
+--sample_meta "$SAMPLE_META"
 --negcon_type "$CTL_TYPES"
 --poscon_type "$POSCON_TYPE"
 --cell_set_and_pool_meta "$CELL_SET_AND_POOL_META"
@@ -43,6 +46,7 @@ args=(
 --control_barcode_meta "$BUILD_DIR/CB_meta.csv"
 --unknown_barcode_counts "$UNKNOWN_BARCODE_COUNTS"
 --filter_qc_flags "$FILTER_QC_FLAGS"
+--qc_params "$QC_PARAMS"
 )
 
 echo Rscript qc_tables.R "${args[@]}"
diff --git a/scripts/launch_job.sh b/scripts/launch_job.sh
index 785e236..a868b28 100644
--- a/scripts/launch_job.sh
+++ b/scripts/launch_job.sh
@@ -40,6 +40,8 @@ PARAMS=(
   VIABILITY_CAP
   # biomarker parameters
   UNIVARIATE_BIOMARKER MULTIVARIATE_BIOMARKER BIOMARKER_FILE DR_COLUMN
   LFC_BIOMARKER AUC_BIOMARKER DR_PATH COLLAPSED_L2FC_COLUMN
+  # qc parameters
+  QC_PARAMS
 )
 
 # Load parameters
@@ -135,6 +137,7 @@ echo "Running in container:"
   -e FILTER_SKIPPED_WELLS="$FILTER_SKIPPED_WELLS" \
   -e SKIPPED_WELLS="$SKIPPED_WELLS" \
   -e FILTER_QC_FLAGS="$FILTER_QC_FLAGS" \
+  -e QC_PARAMS="$QC_PARAMS" \
   -v "$WORKSPACE:/workspace" \
   -v /cmap/tools/analysis2clue/credentials:/root/.aws/credentials:ro \
   -v /local/jenkins/.clue_api_key:/local/jenkins/.clue_api_key:ro \
diff --git a/scripts/make_config_file.groovy b/scripts/make_config_file.groovy
index 1b5b727..19689cb 100644
--- a/scripts/make_config_file.groovy
+++ b/scripts/make_config_file.groovy
@@ -35,7 +35,7 @@ pipeline {
         )
         string(name: 'BUILD_DIR', defaultValue: '/cmap/obelix/pod/prismSeq/', description: 'Directory where the build output will go. Must contain the raw counts file from Nori. If not getting your metadata from cellDB & COMET, this directory must also include the sample and cell line/pool metadata.')
        string(name: 'BUILD_NAME', defaultValue: '', description: 'Build name; used to name output files from the adapter and QC scripts')
-        string(name: 'SIG_COLS', defaultValue: 'cell_set,pert_name,pert_id,pert_dose,pert_dose_unit,day,x_project_id,pert_plate', description: 'List of signature columns found in the sample meta that describe unique treatment conditions. Generally, this list should NOT include replicate information such as \"tech_rep\" or \"bio_rep\".')
+        string(name: 'SIG_COLS', defaultValue: 'cell_set,pert_type,pert_name,pert_id,pert_dose,pert_dose_unit,day,x_project_id,pert_plate', description: 'List of signature columns found in the sample meta that describe unique treatment conditions. Generally, this list should NOT include replicate information such as \"tech_rep\" or \"bio_rep\".')
         separator(
             name: "metadata",
@@ -101,7 +101,7 @@
         )
         string(name: 'CTL_TYPES', defaultValue: 'ctl_vehicle', description: 'Value in the pert_type column of the sample meta that identifies the negative controls.')
         string(name: 'POSCON_TYPE', defaultValue: 'trt_poscon', description: 'Value in the pert_type column of the sample meta that identifies the positive controls.')
-        string(name: 'CONTROL_COLS', defaultValue: 'cell_set,day,pert_plate', description: 'List of columns found in the sample meta that describe individual negative control conditions.')
+        string(name: 'CONTROL_COLS', defaultValue: 'cell_set,day,pcr_plate,replicate_plate', description: 'List of columns found in the sample meta that describe individual negative control conditions.')
 
         // Parameters that we don't expect users to change
         separator(
@@ -111,6 +111,23 @@
             sectionHeaderStyle: sectionHeaderStyleRed
         )
 
+        // QC parameters
+        string(name: 'QC_PARAMS', defaultValue: 'qc_params.json', description: 'File name in BUILD_DIR containing the QC parameters.')
+        string(name: 'nc_variability_threshold', defaultValue: '1', description: 'Threshold for negative control variability')
+        string(name: 'error_rate_threshold', defaultValue: '0.05', description: 'Threshold for error rate')
+        string(name: 'pc_viability_threshold', defaultValue: '0.25', description: 'Threshold for positive control viability')
+        string(name: 'nc_raw_count_threshold', defaultValue: '40', description: 'Threshold for negative control raw counts')
+        string(name: 'contamination_threshold', defaultValue: '0.8', description: 'Threshold for fraction of expected reads')
+        string(name: 'cb_mae_threshold', defaultValue: '1', description: 'Threshold for mean absolute error of control barcodes')
+        string(name: 'cb_spearman_threshold', defaultValue: '0.8', description: 'Threshold for control barcode spearman correlation')
+        string(name: 'cb_cl_ratio_low_negcon', defaultValue: '0', description: 'Low threshold for control barcode ratio in negative controls')
+        string(name: 'cb_cl_ratio_high_negcon', defaultValue: '100', description: 'High threshold for control barcode ratio in negative controls')
+        string(name: 'cb_cl_ratio_low_poscon', defaultValue: '0', description: 'Low threshold for control barcode ratio in positive controls')
+        string(name: 'cb_cl_ratio_high_poscon', defaultValue: '100', description: 'High threshold for control barcode ratio in positive controls')
+        string(name: 'well_reads_threshold', defaultValue: '100', description: 'Minimum median control barcode reads per well')
+        string(name: 'pool_well_delta_threshold', defaultValue: '5', description: 'Maximum delta of log2_normalized_n between a cell line and the pool median in a given well before it is considered an outlier')
+        string(name: 'pool_well_fraction_threshold', defaultValue: '0.4', description: 'Minimum fraction of cells in a pool that must be outliers in order to flag that pool/well')
+        string(name: 'fraction_expected_controls', defaultValue: '0.667', description: 'Fraction of expected controls that must be present in a given pcr_plate for it to be considered valid. If either the vehicle or poscon wells fall below this threshold, the entire pcr_plate will be removed.')
 
         // Sequencing tech
         string(name: 'SEQ_TYPE', defaultValue: 'DRAGEN', description: 'Choose DRAGEN, MiSeq, HiSeq, or NovaSeq. MiSeq and HiSeq/NovaSeq return files named differently. This setting sets the INDEX_1, INDEX_2, and BARCODE_SUFFIX parameters in fastq2readcount. Select DRAGEN if fastq files are from the DRAGEN pipeline from GP. Choosing NovaSeq reverses index 2.')
@@ -125,7 +142,7 @@
         string(name: 'SAMPLE_META', defaultValue: 'sample_meta.csv', description: 'File name in BUILD_DIR of the sample meta.')
         string(name: 'CELL_SET_AND_POOL_META', defaultValue: 'cell_set_and_pool_meta.csv', description: 'Cell set and pool information for this run.')
         string(name: 'CELL_LINE_META', defaultValue: 'cell_line_meta.csv', description: 'File in BUILD_DIR containing cell line metadata')
-        string(name: 'CONTROL_BARCODE_META', defaultValue: 'h-b', description: 'Metadata for control barcodes. If the CBs exist in cellDB, this can simply be the cb_ladder name (ie, h-a). Otherwise, this must be a csv file located in the build directory.')
+        string(name: 'CONTROL_BARCODE_META', defaultValue: 'h-a', description: 'Metadata for control barcodes. If the CBs exist in cellDB, this can simply be the lowercase cb_ladder name (ie, h-a). Otherwise, this must be a csv file located in the build directory.')
 
         // Additional parameters ordered by when they first appear
         string(name: 'BARCODE_COL', defaultValue: 'forward_read_barcode', description: 'Name of the column containing the barcode sequence. The column containing the barcode sequence should have the same name across the Nori output file, the cell line metadata, and the CB metadata. This defaults to \"forward_read_barcode\", and the parameter is first used in COLLATE_FASTQ_READS.')
@@ -134,7 +151,7 @@
         string(name: 'CHUNK_SIZE', defaultValue: '10000000', description: 'Number of rows for a chunk. Due to the large size of the Nori output, some actions are performed in chunks to conserve memory. This parameter sets the size of a chunk and defaults to 10^7 or \"10000000\". This parameter is first used in COLLATE_FASTQ_READS.')
         string(name: 'LOW_ABUNDANCE_THRESHOLD', defaultValue: '20', description: 'Threshold for unknown barcodes. Unknown barcodes below this threshold will be deidentified and their counts will be included under the term unknown_low_abundance_barcode. This parameter defaults to \"20\" and is used in COLLATE_FASTQ_READS.')
         string(name: 'PSEUDOCOUNT', defaultValue: '20', description: 'Pseudocount value added to all reads before log transformations. This defaults to \"20\" and is used in CBNORMALIZE.')
-        string(name: 'CELL_LINE_COLS', defaultValue: 'pool_id,depmap_id,lua', description: 'List of columns across the metadata files that are used to identify a unique cell line. This defaults to \"pool_id,depmap_id,lua\", but can also include \"cell_set\" or descriptive columns like \"project_code\" that you would like to pass through the pipeline. This parameter is first used in COMPUTE_LFC.')
+        string(name: 'CELL_LINE_COLS', defaultValue: 'pool_id,depmap_id,lua,cell_set', description: 'List of columns across the metadata files that are used to identify a unique cell line. This defaults to \"pool_id,depmap_id,lua,cell_set\", but can also include descriptive columns like \"project_code\" that you would like to pass through the pipeline. This parameter is first used in COMPUTE_LFC.')
         string(name: 'COUNT_COL_NAME', defaultValue: 'log2_normalized_n', description: 'Name of the numerical column that should be used to compute log2 fold change values. This defaults to \"log2_normalized_n\" and is used in COMPUTE_LFC.')
         string(name: 'COUNT_THRESHOLD', defaultValue: '40', description: 'Threshold for filtering the negative controls. In the negative control conditions, cell lines whose median counts are below this threshold are not confidently detected and thus are dropped. This defaults to \"40\" and is used in COMPUTE_LFC.')
         string(name: 'L2FC_COLUMN', defaultValue: 'l2fc', description: 'Name of the column containing the log2 fold change values used in DRC. This defaults to \"l2fc\".')
@@ -162,6 +179,7 @@
     environment {
         CONFIG_FILE_PATH = "${params.BUILD_DIR}/config.json"
+        QC_PARAMS_FILE_PATH = "${params.BUILD_DIR}/qc_params.json"
     }
 
     stages {
@@ -235,7 +253,10 @@
                         'DR_PATH',
 
                         //io parameters
-                        'MERGE_PATTERNS'
+                        'MERGE_PATTERNS',
+
+                        //qc parameters
+                        'QC_PARAMS', 'FRACTION_EXPECTED_CONTROLS'
                     ]
 
                     def config = [:]
@@ -268,6 +289,37 @@
                 }
             }
 
+        stage('Generate qc_flag_params') {
+            steps {
+                script {
+                    def paramList = [
+                        'nc_variability_threshold', 'error_rate_threshold', 'pc_viability_threshold',
+                        'nc_raw_count_threshold', 'contamination_threshold', 'cb_mae_threshold',
+                        'cb_spearman_threshold', 'cb_cl_ratio_low_negcon', 'cb_cl_ratio_high_negcon',
+                        'cb_cl_ratio_low_poscon', 'cb_cl_ratio_high_poscon', 'well_reads_threshold',
+                        'pool_well_delta_threshold', 'pool_well_fraction_threshold', 'fraction_expected_controls'
+                    ]
+
+                    def config = [:]
+
+                    // Load existing params if they exist
+                    if (fileExists(env.QC_PARAMS_FILE_PATH)) {
+                        def configText = readFile(file: env.QC_PARAMS_FILE_PATH)
+                        config = new HashMap(new JsonSlurper().parseText(configText))
+                    }
+
+                    // Add parameters from the Jenkins UI only if they don't exist in the config
+                    paramList.each { paramName ->
+                        if (!config.containsKey(paramName) && params.containsKey(paramName)) {
+                            config[paramName] = params[paramName]
+                        }
+                    }
+                    // Write the config back to file after all updates
+                    writeFile file: env.QC_PARAMS_FILE_PATH, text: groovy.json.JsonOutput.prettyPrint(groovy.json.JsonOutput.toJson(config))
+                }
+            }
+        }
+
         stage('Add Commit ID to Config') {
             steps {
                 script {
@@ -369,7 +421,7 @@
                 script {
                     def buildDir = params.BUILD_DIR
                     def buildName = params.BUILD_NAME ?: "default_build_name"
-                    def logFilePath = "${buildDir}/${buildName}_console_output.log"
+                    def logFilePath = "${buildDir}/logs/console_output.log"
 
                     // Capture the entire console log
                     def log = currentBuild.rawBuild.getLog(999999)
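For reference, the 'Generate qc_flag_params' stage above persists the QC thresholds to BUILD_DIR/qc_params.json. The following is a minimal sketch, not part of the patch, of an equivalent file written from R using the Jenkins defaults above; note that Jenkins stores the UI values as strings, so readers of the real file should coerce them to numeric.

library(jsonlite)

# QC threshold defaults, mirrored from the Jenkins parameters above
qc_defaults <- list(
  nc_variability_threshold = 1, error_rate_threshold = 0.05,
  pc_viability_threshold = 0.25, nc_raw_count_threshold = 40,
  contamination_threshold = 0.8, cb_mae_threshold = 1,
  cb_spearman_threshold = 0.8, cb_cl_ratio_low_negcon = 0,
  cb_cl_ratio_high_negcon = 100, cb_cl_ratio_low_poscon = 0,
  cb_cl_ratio_high_poscon = 100, well_reads_threshold = 100,
  pool_well_delta_threshold = 5, pool_well_fraction_threshold = 0.4,
  fraction_expected_controls = 0.667
)

# One scalar per key, pretty-printed (the Jenkins stage writes the same keys as strings)
writeLines(jsonlite::toJSON(qc_defaults, auto_unbox = TRUE, pretty = TRUE), "qc_params.json")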
diff --git a/scripts/qc_tables.R b/scripts/qc_tables.R
index 53c927e..f67fbf4 100644
--- a/scripts/qc_tables.R
+++ b/scripts/qc_tables.R
@@ -1,40 +1,46 @@
 options(cli.unicode = FALSE)
-suppressPackageStartupMessages(
-  {
-    library(argparse)
-    library(magrittr)
-    library(tidyverse)
-    library(data.table)
-    library(jsonlite)
-  }
-)
+suppressPackageStartupMessages({
+  library(argparse)
+  library(magrittr)
+  library(tidyverse)
+  library(data.table)
+  library(jsonlite)
+  library(dplyr)
+}) source("./src/qc_table_functions.R") source("./src/kitchen_utensils.R") # Argument parser ---- parser <- ArgumentParser() parser$add_argument( - "--cell_set_and_pool_meta", default = "cell_set_and_pool_meta.csv", - help = "Cell line metadata" + "--cell_set_and_pool_meta", + default = "cell_set_and_pool_meta.csv", + help = "Cell line metadata" ) parser$add_argument( - "--normalized_counts", default = "normalized_counts.csv", help = "normalized counts file" + "--normalized_counts", + default = "normalized_counts.csv", help = "normalized counts file" ) parser$add_argument( - "--annotated_counts", default = "annotated_counts.csv", help = "annotated counts file" + "--annotated_counts", + default = "annotated_counts.csv", help = "annotated counts file" ) parser$add_argument( - "--filtered_counts", default = "filtered_counts.csv", help = "filtered counts file" + "--filtered_counts", + default = "filtered_counts.csv", help = "filtered counts file" ) parser$add_argument( - "-o", "--out", default = getwd(), help = "Output path. Default is working directory" + "-o", "--out", + default = getwd(), help = "Output path. Default is working directory" ) parser$add_argument( - "--control_barcode_meta", default = "CB_meta.csv", help = "Control barcode metadata" + "--control_barcode_meta", + default = "CB_meta.csv", help = "Control barcode metadata" ) parser$add_argument( - "--unknown_barcode_counts", default = "unknown_barcode_counts.csv", - help = "Unknown barcode counts file" + "--unknown_barcode_counts", + default = "unknown_barcode_counts.csv", + help = "Unknown barcode counts file" ) parser$add_argument("-n", "--negcon_type", default = "ctl_vehicle") parser$add_argument("-p", "--poscon_type", default = "trt_poscon") @@ -42,31 +48,53 @@ parser$add_argument("--cell_line_cols", default = "depmap_id,pool_id,lua") parser$add_argument("--id_cols", default = "pcr_plate,pcr_well") parser$add_argument("--count_threshold", default = 40) parser$add_argument("--pseudocount", default = 20) -parser$add_argument("--filter_qc_flags", default = "true", - help = "Filter out wells with QC flags. Default is TRUE") +parser$add_argument("--filter_qc_flags", + default = "true", + help = "Filter out wells with QC flags. 
Default is TRUE" +) +parser$add_argument( + "--qc_params", + default = "qc_params.csv", + help = "File containing QC parameters" +) +parser$add_argument( + "--sample_meta", + default = "sample_meta.csv", + help = "Sample metadata file" +) args <- parser$parse_args() # Read in metadata files as data.table objects ---- -paste0("Reading in ", args$cell_set_and_pool_meta, ".....") +print(paste0("Reading in ", args$cell_set_and_pool_meta, ".....")) cell_set_meta <- data.table::fread(args$cell_set_and_pool_meta, header = TRUE, sep = ",") -paste0("Reading in ", args$normalized_counts, ".....") -normalized_counts <- data.table::fread(args$normalized_counts, header = TRUE, sep = ",") -paste0("Reading in ", args$annotated_counts, ".....") +print(paste0("Reading in ", args$annotated_counts, ".....")) annotated_counts <- data.table::fread(args$annotated_counts, header = TRUE, sep = ",") -paste0("Reading in ", args$filtered_counts, ".....") +print(paste0("Reading in ", args$filtered_counts, ".....")) filtered_counts <- data.table::fread(args$filtered_counts, header = TRUE, sep = ",") -paste0("Reading in ", args$control_barcode_meta, ".....") +print(paste0("Reading in ", args$control_barcode_meta, ".....")) cb_meta <- data.table::fread(args$control_barcode_meta, header = TRUE, sep = ",") -paste0("Reading in ", args$unknown_barcode_counts, header = TRUE, sep = ",") +print(paste0("Reading in ", args$unknown_barcode_counts, header = TRUE, sep = ",")) unknown_counts <- data.table::fread(args$unknown_barcode_counts, header = TRUE, sep = ",") +print(paste0("Reading in ", args$sample_meta, ".....")) +sample_meta <- data.table::fread(args$sample_meta, header = TRUE, sep = ",") +# If normzlied_counts_original.csv exists, use that, otherwise use args$normalized_counts +normalized_counts_original_path <- paste0(args$out, "/normalized_counts_original.csv") +if (file.exists(normalized_counts_original_path)) { + print("Original normalized counts found, will use this file for QC flags and tables.") + print("Reading in normalized_counts_original.csv.....") + normalized_counts <- data.table::fread(normalized_counts_original_path, header = TRUE, sep = ",") +} else { +print(paste0("Reading in ", args$normalized_counts, ".....")) +normalized_counts <- data.table::fread(args$normalized_counts, header = TRUE, sep = ",") +} # Check if the output directory exists, if not create it -if (!dir.exists(paste0(args$out, "/qc_tables"))) - { - dir.create(paste0(args$out, "/qc_tables")) +if (!dir.exists(paste0(args$out, "/qc_tables"))) { + dir.create(paste0(args$out, "/qc_tables")) } + # DEFINE COLUMNS cell_line_cols <- args$cell_line_cols cell_line_cols_list <- strsplit(cell_line_cols, ",")[[1]] @@ -77,101 +105,172 @@ count_threshold <- as.numeric(args$count_threshold) pseudocount <- as.numeric(args$pseudocount) filter_qc_flags <- as.logical(toupper(args$filter_qc_flags)) -# BY WELL (PCR_PLATE, PCR_WELL) ---------- + +# LOAD QC PARAMETERS +thresholds <- load_thresholds_from_json(args$qc_params) + + +# Calculate the number of expected poscons and negcons +n_expected_controls <- sample_meta %>% + filter(pert_type %in% c("trt_poscon", "ctl_vehicle")) %>% + group_by(pert_plate, pcr_plate, pert_type) %>% + summarize(unique_bio_rep = n_distinct(bio_rep), .groups = "drop") %>% + pivot_wider( + names_from = pert_type, + values_from = unique_bio_rep, + names_prefix = "n_expected_" + ) + +# ID COLS id_cols_table <- generate_id_cols_table( - normalized_counts = normalized_counts, annotated_counts = annotated_counts, unknown_counts = 
unknown_counts, - cell_set_meta = cell_set_meta, id_cols_list = id_cols_list, cell_line_cols = cell_line_cols_list, - count_threshold = count_threshold, cb_meta = cb_meta, pseudocount = pseudocount -) - -# ID_COLS QC FLAGS -# Generate QC flags for the id_cols table and filter out flagged wells -result <- id_cols_qc_flags( - annotated_counts = annotated_counts, - normalized_counts = normalized_counts, - unknown_counts = unknown_counts, - cb_meta = cb_meta -) -id_cols_qc_flags_table <- result$well_flags -filtered_normalized_counts <- result$result - -# CELL LINE BY PLATE (pcr_plate,depmap_id) --------- -# Filter out control barcodes (where depmap_id is NA) from normalized and filtered counts -normalized_counts_rm_ctl <- filtered_normalized_counts %>% - filter_control_barcodes() + normalized_counts = normalized_counts, annotated_counts = annotated_counts, unknown_counts = unknown_counts, + cell_set_meta = cell_set_meta, id_cols_list = id_cols_list, cell_line_cols = cell_line_cols_list, + count_threshold = count_threshold, cb_meta = cb_meta, pseudocount = pseudocount +) + +id_cols_qc_flags_table <- id_cols_qc_flags(id_cols_table = id_cols_table, + contamination_threshold = thresholds$contamination_threshold, + cb_mae_threshold = thresholds$cb_mae_threshold, + cb_spearman_threshold = thresholds$cb_spearman_threshold, + cb_cl_ratio_low_negcon = thresholds$cb_cl_ratio_low_negcon, + cb_cl_ratio_high_poscon = thresholds$cb_cl_ratio_high_poscon, + cb_cl_ratio_low_poscon = thresholds$cb_cl_ratio_low_poscon, + cb_cl_ratio_high_negcon = thresholds$cb_cl_ratio_high_negcon, + well_reads_threshold = thresholds$well_reads_threshold) + +id_cols_filtered_normalized_counts <- dplyr::anti_join(normalized_counts, id_cols_qc_flags_table, by = c("pcr_plate", "pcr_well")) + + +# POOL WELL +pool_well_table <- generate_pool_well_qc_table( + normalized_counts = id_cols_filtered_normalized_counts, + pool_well_delta_threshold = thresholds$pool_well_delta_threshold, + pool_well_fraction_threshold = thresholds$pool_well_fraction_threshold +) + +pool_well_qc_flags_table <- pool_well_table %>% + dplyr::filter(!is.na(qc_flag)) %>% + unique() + +pool_well_filtered_normalized_counts <- dplyr::anti_join(id_cols_filtered_normalized_counts, pool_well_qc_flags_table, by = c("pcr_plate", "pcr_well", "pool_id")) + + +# PLATE CELL + +filtered_normalized_counts_rm_ctl <- pool_well_filtered_normalized_counts %>% + filter_control_barcodes() filtered_counts_rm_ctl <- filtered_counts %>% - filter_control_barcodes() -plate_cell_table <- generate_cell_plate_table( - normalized_counts = normalized_counts_rm_ctl, filtered_counts = filtered_counts_rm_ctl, - cell_line_cols = cell_plate_list, pseudocount = pseudocount) + filter_control_barcodes() -# POOL WELL QC FLAGS -# Generate QC flags for the pool + well table and filter out flagged wells -pool_well_qc_table <- generate_pool_well_qc_table( - normalized_counts = filtered_normalized_counts +plate_cell_table <- generate_cell_plate_table( + normalized_counts = filtered_normalized_counts_rm_ctl, filtered_counts = filtered_counts_rm_ctl, + cell_line_cols = cell_plate_list, pseudocount = pseudocount ) -result <- pool_well_qc_flags( - normalized_counts = filtered_normalized_counts, - pool_well_qc = pool_well_qc_table +plate_cell_qc_flags_table <- plate_cell_qc_flags( + plate_cell_table = plate_cell_table, + nc_variability_threshold = thresholds$nc_variability_threshold, + error_rate_threshold = thresholds$error_rate_threshold, + pc_viability_threshold = thresholds$pc_viability_threshold, + 
nc_raw_count_threshold = thresholds$nc_raw_count_threshold) %>%
+  dplyr::filter(!is.na(qc_flag))
+
+
+plate_cell_filtered_normalized_counts <-
+  dplyr::anti_join(
+    pool_well_filtered_normalized_counts, plate_cell_qc_flags_table,
+    by = c("pcr_plate", "depmap_id", "lua", "pool_id")
   )
-pool_well_qc_flags_table <- result$pool_well_flags
-filtered_normalized_counts <- result$result
+
+# PCR PLATE
+
+pcr_plate_qc_flags_table <- generate_pcr_plate_qc_flags_table(plate_cell_table = plate_cell_table,
+                                                              fraction_expected_controls = thresholds$fraction_expected_controls)
+
+final_filtered_normalized_counts <-
+  dplyr::anti_join(
+    plate_cell_filtered_normalized_counts, pcr_plate_qc_flags_table,
+    by = c("pcr_plate", "pert_plate"))
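generate_pcr_plate_qc_flags_table() is called above but is not defined anywhere in this patch. The following is a minimal sketch of the behavior implied by its call site, by the anti_join keys, and by the fraction_expected_controls parameter description; the _sketch suffix, the median aggregation, and the flag label are assumptions, not the pipeline's actual implementation.

# Sketch only: flag whole PCR plates where either control type falls below the
# expected-controls fraction (input columns come from generate_cell_plate_table)
generate_pcr_plate_qc_flags_table_sketch <- function(plate_cell_table,
                                                     fraction_expected_controls = 0.667) {
  plate_cell_table %>%
    dplyr::group_by(pcr_plate, pert_plate) %>%
    dplyr::summarise(
      fraction_expected_poscon = stats::median(fraction_expected_poscon, na.rm = TRUE),
      fraction_expected_negcon = stats::median(fraction_expected_negcon, na.rm = TRUE),
      .groups = "drop"
    ) %>%
    # Either control type below the cutoff fails the entire pcr_plate
    dplyr::filter(fraction_expected_poscon < fraction_expected_controls |
                    fraction_expected_negcon < fraction_expected_controls) %>%
    dplyr::mutate(qc_flag = "fraction_expected_controls")
}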
 
 # WRITE OUT RESULTS --------
-
-# Write to file for internal use ----------
-plate_cell_outpath <- paste0(args$out, "/qc_tables/plate_cell_qc_table_internal.csv")
-print(paste0("Writing out internal plate_cell_qc_table to ", plate_cell_outpath))
+
+# Write pcr_plate_qc_flags table
+pcr_plate_qc_flags_outpath <- paste0(args$out, "/qc_tables/pcr_plate_qc_flags.csv")
+print(paste0("Writing out pcr_plate_qc_flags to ", pcr_plate_qc_flags_outpath))
 write.csv(
-  x = plate_cell_table, file = plate_cell_outpath, row.names = FALSE,
-  quote = FALSE
+  x = pcr_plate_qc_flags_table, file = pcr_plate_qc_flags_outpath, row.names = FALSE,
+  quote = FALSE
 )
-check_file_exists(plate_cell_outpath)
+check_file_exists(pcr_plate_qc_flags_outpath)
 
+# Write plate_cell_qc_table for internal use
+plate_cell_qc_table_outpath <- paste0(args$out, "/qc_tables/plate_cell_qc_table_internal.csv")
+print(paste0("Writing out internal plate_cell_qc_table to ", plate_cell_qc_table_outpath))
+write.csv(
+  x = plate_cell_table, file = plate_cell_qc_table_outpath, row.names = FALSE,
+  quote = FALSE
+)
+check_file_exists(plate_cell_qc_table_outpath)
 
-# Write to file for portal use----------
-plate_cell_outpath <- paste0(args$out, "/qc_tables/plate_cell_qc_table.csv")
-print(paste0("Writing out external plate_cell_qc_table to ", plate_cell_outpath))
+# Write plate_cell_qc_table for portal use
+plate_cell_qc_table_outpath_external <- paste0(args$out, "/qc_tables/plate_cell_qc_table.csv")
+print(paste0("Writing out external plate_cell_qc_table to ", plate_cell_qc_table_outpath_external))
 write.csv(
-  x = plate_cell_table %>%
-    dplyr::select(
-      c("pool_id", "depmap_id", "lua", "pcr_plate",
-        "pert_plate",
-        "error_rate", "lfc_trt_poscon",
-        "median_raw_ctl_vehicle", "mad_log_normalized_ctl_vehicle",
-        "median_log_normalized_ctl_vehicle",
-        "n_replicates_ctl_vehicle", "n_replicates_trt_poscon",
-        "viability_trt_poscon", "qc_pass", "qc_pass_pert_plate")),
-  file = plate_cell_outpath, row.names = FALSE,
-  quote = FALSE
+  x = plate_cell_table %>%
+    dplyr::select(
+      c(
+        "pool_id", "depmap_id", "lua", "pcr_plate",
+        "pert_plate", "project_code",
+        "error_rate", "lfc_trt_poscon",
+        "median_raw_ctl_vehicle", "mad_log_normalized_ctl_vehicle",
+        "median_log_normalized_ctl_vehicle",
+        "n_replicates_ctl_vehicle", "n_replicates_trt_poscon",
+        "viability_trt_poscon", "qc_pass", "qc_pass_pert_plate"
+      )
+    ),
+  file = plate_cell_qc_table_outpath_external, row.names = FALSE,
+  quote = FALSE
 )
-check_file_exists(plate_cell_outpath)
+check_file_exists(plate_cell_qc_table_outpath_external)
 
+# Write cl_pertplate_pass_rate_qc_table for portal
+pertplate_cell_pass_rate_outpath <- paste0(args$out, "/qc_tables/pertplate_cell_pass_rate.csv")
+print(paste0("Writing out pertplate_cell_pass_rate to ", pertplate_cell_pass_rate_outpath))
+pertplate_cell_pass_rate <- plate_cell_table %>%
+  dplyr::distinct(pert_plate, project_code, qc_pass_pert_plate, depmap_id, lua, cell_set) %>%
+  dplyr::group_by(pert_plate, project_code) %>%
+  dplyr::summarise(num_cl_pass = sum(qc_pass_pert_plate),
+                   num_cl_failed = sum(!qc_pass_pert_plate)) %>%
+  dplyr::ungroup()
+write.csv(pertplate_cell_pass_rate,
+          pertplate_cell_pass_rate_outpath,
+          row.names = FALSE, quote = FALSE)
+check_file_exists(pertplate_cell_pass_rate_outpath)
 
-# BY WELL (PCR_PLATE, PCR_WELL) ----------
-id_cols_table <- generate_id_cols_table(
-  normalized_counts = normalized_counts, annotated_counts = annotated_counts, unknown_counts = unknown_counts,
-  cell_set_meta = cell_set_meta, id_cols_list = id_cols_list, cell_line_cols = cell_line_cols_list,
-  count_threshold = count_threshold, cb_meta = cb_meta, pseudocount = pseudocount
+# Write plate_cell_qc_flags table
+plate_cell_qc_flags_outpath <- paste0(args$out, "/qc_tables/plate_cell_qc_flags.csv")
+print(paste0("Writing out plate_cell_qc_flags to ", plate_cell_qc_flags_outpath))
+write.csv(
+  x = plate_cell_qc_flags_table, file = plate_cell_qc_flags_outpath, row.names = FALSE,
+  quote = FALSE
 )
-check_file_exists(plate_cell_outpath)
+check_file_exists(plate_cell_qc_flags_outpath)
 
 # Write pool_well_qc_table ----------
 pool_well_qc_table_outpath <- paste0(args$out, "/qc_tables/pool_well_qc_table.csv")
 print(paste0("Writing out pool_well_qc_table to ", pool_well_qc_table_outpath))
 write.csv(
-  x = pool_well_qc_table, file = pool_well_qc_table_outpath, row.names = FALSE,
-  quote = FALSE
+  x = pool_well_table, file = pool_well_qc_table_outpath, row.names = FALSE,
+  quote = FALSE
 )
+check_file_exists(pool_well_qc_table_outpath)
 
 # Write pool_well_qc_flags table ----------
 pool_well_qc_flags_outpath <- paste0(args$out, "/qc_tables/pool_well_qc_flags.csv")
 print(paste0("Writing out pool_well_qc_flags to ", pool_well_qc_flags_outpath))
 write.csv(
-  x = pool_well_qc_flags_table, file = pool_well_qc_flags_outpath, row.names = FALSE,
-  quote = FALSE
+  x = pool_well_qc_flags_table, file = pool_well_qc_flags_outpath, row.names = FALSE,
+  quote = FALSE
 )
 check_file_exists(pool_well_qc_flags_outpath)
@@ -179,7 +278,7 @@
 id_cols_outpath <- paste0(args$out, "/qc_tables/id_cols_qc_table.csv")
 print(paste0("Writing out id_cols_qc_table to ", id_cols_outpath))
 write.csv(
-  x = id_cols_table, file = id_cols_outpath, row.names = FALSE, quote = FALSE
+  x = id_cols_table, file = id_cols_outpath, row.names = FALSE, quote = FALSE
 )
 check_file_exists(id_cols_outpath)
@@ -187,32 +286,33 @@
 id_cols_qc_flags_outpath <- paste0(args$out, "/qc_tables/id_cols_qc_flags.csv")
 print(paste0("Writing out id_cols_qc_flags to ", id_cols_qc_flags_outpath))
 write.csv(
-  x = id_cols_qc_flags_table, file = id_cols_qc_flags_outpath, row.names = FALSE,
-  quote = FALSE
+  x = id_cols_qc_flags_table, file = id_cols_qc_flags_outpath, row.names = FALSE,
+  quote = FALSE
 )
 check_file_exists(id_cols_qc_flags_outpath)
 
 if (args$filter_qc_flags) {
-  # Filter out wells with QC flags
-  print("Filtering out wells with QC flags")
-  # Write original normalized counts ----------
-  normalized_counts_original_outpath <- paste0(args$out, "/normalized_counts_original.csv")
-  print(paste0("Writing unfiltered normalized_counts to ", normalized_counts_original_outpath))
-  write.csv(
-    x = normalized_counts, file = normalized_counts_original_outpath, row.names = FALSE,
-    quote = FALSE)
-  check_file_exists(normalized_counts_original_outpath)
-
-  # Write filtered normalized counts ----------
"/normalized_counts.csv") - print(paste0("Writing filtered normalized_counts to ", filtered_normalized_counts_outpath)) - write.csv( - x = filtered_normalized_counts, file = filtered_normalized_counts_outpath, row.names = FALSE, - quote = FALSE) - check_file_exists(filtered_normalized_counts_outpath) - } else { - print("Nomalized counts not filtered for qc_flags.") - } + # Filter out wells with QC flags + print("Filtering out wells with QC flags") + # Write original normalized counts ---------- + normalized_counts_original_outpath <- paste0(args$out, "/normalized_counts_original.csv") + print(paste0("Writing unfiltered normalized_counts to ", normalized_counts_original_outpath)) + write.csv( + x = normalized_counts, file = normalized_counts_original_outpath, row.names = FALSE, + quote = FALSE + ) + check_file_exists(normalized_counts_original_outpath) + # Write filtered normalized counts ---------- + filtered_normalized_counts_outpath <- paste0(args$out, "/normalized_counts.csv") + print(paste0("Writing filtered normalized_counts to ", filtered_normalized_counts_outpath)) + write.csv( + x = final_filtered_normalized_counts, file = filtered_normalized_counts_outpath, row.names = FALSE, + quote = FALSE + ) + check_file_exists(filtered_normalized_counts_outpath) +} else { + print("Nomalized counts not filtered for qc_flags.") +} paste0("QC module completed.") diff --git a/scripts/src/cellDB_metadata.R b/scripts/src/cellDB_metadata.R index 81885b6..6741573 100644 --- a/scripts/src/cellDB_metadata.R +++ b/scripts/src/cellDB_metadata.R @@ -44,7 +44,7 @@ get_cell_api_info = function(api_url, api_key, filter = NULL) { get_LUAs_from_sets <- function(cell_set_name) { - cell_set_definition_api_url <- "https://api.clue.io/api/cell_set_definition_files" + cell_set_definition_api_url <- "https://api.clue.io/api/e_cell_set_definition_files" # Construct the filter object as a JSON string filter <- list(where = list(davepool_id = cell_set_name), fields = c("davepool_id","depmap_id") @@ -55,7 +55,7 @@ get_LUAs_from_sets <- function(cell_set_name) { } get_LUAs_from_pools <- function(cell_pool_name) { - v_assay_pool_api_url <- "https://api.clue.io/api/v_assay_pools" + v_assay_pool_api_url <- "https://api.clue.io/api/v_e_assay_pools" filter <- list(where = list(assay_pool = cell_pool_name), fields = c("assay_pool","depmap_id")) cell_pool_DepMap_df <- get_cell_api_info(v_assay_pool_api_url, api_key, filter) depmap_ids <- list(cell_pool_DepMap_df$depmap_id) diff --git a/scripts/src/collate_fastq_reads.R b/scripts/src/collate_fastq_reads.R index 264ceb6..69499cf 100644 --- a/scripts/src/collate_fastq_reads.R +++ b/scripts/src/collate_fastq_reads.R @@ -53,7 +53,8 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, known_barcodes, reverse_index2= FALSE, barcode_col= 'forward_read_barcode', - low_abundance_threshold= 20) { + low_abundance_threshold= 20, + out_dir) { require(tidyverse) require(data.table) @@ -134,6 +135,8 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta, flowcell_lane= base::strsplit(flowcell_lanes, split='[,;:]', fixed=F)) %>% tidyr::unnest(cols= flowcell_name) %>% tidyr::unnest(cols= flowcell_lane) %>% dplyr::mutate(flowcell_lane= as.numeric(flowcell_lane)) + print(paste0('Expected ', nrow(expected_flowcells), ' unique flowcell + lane combos in the sample meta:')) + print(expected_flowcells) # Note: This code uses base::strsplit and tidyr::unnest from an older version of tidyverse. 
@@ -194,3 +197,20 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta,
   return(list(prism_barcode_counts= summed_reads[summed_reads[[barcode_col]] %chin% known_barcodes,],
               unknown_barcode_counts= summed_reads[!summed_reads[[barcode_col]] %chin% known_barcodes,]))
 }
+
+# Record sample_meta rows where either flowcell_names or flowcell_lanes are NA.
+check_flowcell_names_in_sample_meta <- function(sample_meta, out_dir) {
+  # Check for rows in flowcell_names that equate to empty - NA, "NA", "", " "
+  # Warn via the critical output log if the flowcell_names are not filled out in the sample meta.
+  missing_flowcell_names= sample_meta %>% dplyr::filter(if_any(all_of(c('flowcell_names')), ~ . %in% c(NA, 'NA', '', ' ')))
+  if(nrow(missing_flowcell_names) > 0) {
+    append_critical_output('The following rows in the sample meta are not filled out for flowcell_names.', out_dir)
+    append_critical_output(missing_flowcell_names, out_dir) # show the empty rows
+    append_critical_output('This means that these samples will not have any data in subsequent outputs. Ensure this is intentional.', out_dir)
+    append_critical_output('These samples will be saved in the missing_flowcells.csv file.', out_dir)
+    # Write out missing flowcell names
+    write.csv(missing_flowcell_names, paste0(out_dir, '/missing_flowcells.csv'), row.names= FALSE, quote= FALSE)
+  } else {
+    append_critical_output('All flowcell_names and lanes are filled out in the sample meta.', out_dir)
+  }
+}
\ No newline at end of file
diff --git a/scripts/src/kitchen_utensils.R b/scripts/src/kitchen_utensils.R
index a6340ec..53ba3de 100644
--- a/scripts/src/kitchen_utensils.R
+++ b/scripts/src/kitchen_utensils.R
@@ -32,7 +32,11 @@ process_in_chunks= function(large_file_path, chunk_size= 10^6, action, ...) {
     print(paste('Working on chunk', chunk_idx, 'with', current_chunk_size, 'rows.', sep= ' '))
 
     # Call the action over the chunk
-    chunk_collector[[chunk_idx]]= do.call(action, list(current_chunk, ...))
+    chunk_collector[[chunk_idx]] <- do.call(
+      action,
+      c(list(uncollapsed_raw_counts = current_chunk), list(...))
+    )
+
     chunk_idx= chunk_idx + 1
   }
 
@@ -122,3 +126,24 @@ check_file_exists <- function(file_path) {
 filter_control_barcodes <- function(df) {
   df %>% dplyr::filter(tryCatch(is.na(cb_name), error = function(e) TRUE))
 }
+
+#' Append a print statement to a file to track critical console output
+#'
+#' This function, when applied to a given print statement, will append the statement to a file in order to
+#' track critical console outputs for human review. It will also write the statement to the console as usual.
+#'
+#' @param statement A string, data frame, or list to print and append to the log file
+#' @param out The output directory; statements are appended to out/logs/critical_output.txt
+append_critical_output <- function(statement, out) {
+  # If the out/logs directory does not exist, create it
+  if (!dir.exists(paste0(out, "/logs"))) {
+    dir.create(paste0(out, "/logs"), recursive = TRUE)
+  }
+  # Convert data frames or lists into a string if necessary
+  if (!is.character(statement)) {
+    statement <- capture.output(print(statement))
+  }
+  # Print to console
+  cat(statement, sep = "\n")
+  # Append to file
+  cat(statement, file = paste0(out, "/logs/critical_output.txt"), append = TRUE, sep = "\n")
+}
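qc_tables.R above calls load_thresholds_from_json(), which does not appear in this patch. Below is a minimal sketch of what such a helper needs to do, assuming it lives with the utilities in kitchen_utensils.R; the body is an assumption, only the name and call site come from the patch.

# Sketch only: read qc_params.json and coerce each threshold to numeric,
# since the Jenkins stage writes the parameter values as strings
load_thresholds_from_json <- function(qc_params_path) {
  stopifnot(file.exists(qc_params_path))
  thresholds <- jsonlite::fromJSON(qc_params_path)
  lapply(thresholds, function(x) suppressWarnings(as.numeric(x)))
}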
diff --git a/scripts/src/qc_table_functions.R b/scripts/src/qc_table_functions.R
index 7b24107..9b95b91 100644
--- a/scripts/src/qc_table_functions.R
+++ b/scripts/src/qc_table_functions.R
@@ -2,7 +2,7 @@ options(cli.unicode = FALSE)
 library(tidyverse)
 library(data.table)
 library(magrittr)
-library (PRROC)
+library(PRROC)
 
 # BY ID_COLS (PCR_PLATE, PCR_WELL) ----------
 
@@ -19,19 +19,19 @@
 #' @return A data frame with one row per group and a column `skew` containing the computed skew (auc).
 #'
 #' @import dplyr
-compute_skew <- function(df, group_cols = c("pcr_plate","pcr_well","pert_plate"), metric = "n") {
+compute_skew <- function(df, group_cols = c("pcr_plate", "pcr_well", "pert_plate"), metric = "n") {
   result <- df %>%
     dplyr::group_by(across(all_of(group_cols))) %>%
-    arrange(desc(.data[[metric]])) %>% # Sort by metric in descending order
+    arrange(desc(.data[[metric]])) %>% # Sort by metric in descending order
     mutate(
-      rank_fraction = row_number() / n(), # Calculate rank fraction of each cell line
+      rank_fraction = row_number() / n(), # Calculate rank fraction of each cell line
       cumulative_fraction_reads = cumsum(.data[[metric]]) / sum(.data[[metric]], na.rm = TRUE) # Calculate cumulative fraction of reads
     ) %>%
     summarise(
       skew = if (n() > 1) {
         # Compute auc
         sum((rank_fraction[-1] - rank_fraction[-n()]) *
-          (cumulative_fraction_reads[-1] + cumulative_fraction_reads[-n()]) / 2, na.rm = TRUE)
+              (cumulative_fraction_reads[-1] + cumulative_fraction_reads[-n()]) / 2, na.rm = TRUE)
       }
     ) %>%
     dplyr::ungroup()
@@ -49,19 +49,13 @@
 #'
 #' @import dplyr
 compute_expected_lines <- function(cell_set_meta, cell_line_cols) {
-  # Get number of expected cell lines for each cell_set
-  result <- cell_set_meta %>%
-    dplyr::group_by(across(all_of(cell_line_cols))) %>%
-    dplyr::filter(n() == 1) %>%
-    dplyr::ungroup() %>%
+  cell_set_meta %>%
+    dplyr::distinct(cell_set, across(all_of(cell_line_cols))) %>%
     dplyr::group_by(cell_set) %>%
-    dplyr::summarise(
-      n_expected_lines = dplyr::n_distinct(across(all_of(cell_line_cols))), # Count unique cell_lines for each cell_set
-    ) %>%
-    dplyr::ungroup()
-  return(result)
+    dplyr::summarise(n_expected_lines = n(), .groups = "drop")
 }
 
+
 #' Compute read stats
 #'
 #' This function calculates summary statistics related to reads and cell line recovery for annotated counts data,
@@ -78,23 +72,25 @@
 #' recovered cell lines, and their fractions.
#' #' @import dplyr -compute_read_stats <- function(annotated_counts, cell_set_meta, unknown_counts, group_cols = c("pcr_plate","pcr_well","pert_type","pert_plate"), - metric = "n", cell_line_cols = c("depmap_id", "pool_id"), count_threshold = 40) { +compute_read_stats <- function(annotated_counts, cell_set_meta, unknown_counts, cb_metrics, group_cols = c("pcr_plate", "pcr_well", "pert_type", "pert_plate"), + metric = "n", cell_line_cols = c("depmap_id", "pool_id"), count_threshold = 40, + expected_reads_threshold = 0.8, cb_threshold = 100, cb_spearman_threshold = 0.8, cb_mae_threshold = 1) { # Compute expected lines from cell_set_meta expected_lines <- compute_expected_lines(cell_set_meta, cell_line_cols) # Group unknown_counts by group_cols unknown_counts <- unknown_counts %>% - dplyr::left_join(unique(annotated_counts %>% select(pcr_plate, pcr_well, pert_type, pert_plate)), - by = c("pcr_plate","pcr_well")) %>% - dplyr::group_by(across(all_of(group_cols))) %>% - dplyr::summarise( + dplyr::left_join(unique(annotated_counts %>% select(pcr_plate, pcr_well, pert_type, pert_plate)), + by = c("pcr_plate", "pcr_well") + ) %>% + dplyr::group_by(across(all_of(group_cols))) %>% + dplyr::summarise( n = sum(.data[[metric]], na.rm = TRUE), expected_read = FALSE - ) + ) plate_well <- annotated_counts %>% - dplyr::left_join(expected_lines, by = "cell_set") %>% # Add n_expected_lines from lookup + dplyr::left_join(expected_lines, by = "cell_set") %>% # Append unknown_counts, filling NA for any column not present in unknown_counts dplyr::bind_rows(unknown_counts) %>% dplyr::group_by(across(all_of(group_cols))) %>% @@ -110,24 +106,28 @@ compute_read_stats <- function(annotated_counts, cell_set_meta, unknown_counts, # Fraction of reads mapping to cell lines fraction_expected_reads = n_expected_reads / n_total_reads, # Number of cell lines with coverage above 40 reads - n_lines_recovered = sum(.data[[metric]] >= count_threshold & (is.na(cb_name) | cb_name == ""), na.rm = TRUE), + n_lines_recovered = sum(.data[[metric]] >= count_threshold & (is.na(cb_name) | cb_name == "") & expected_read == TRUE, na.rm = TRUE), # Number of expected lines based on metadata n_expected_lines = max(n_expected_lines, na.rm = TRUE), # Bring forward from join # Fraction of cell lines with coverage above count threshold fraction_cl_recovered = n_lines_recovered / max(n_expected_lines, na.rm = TRUE), # Ratio of control barcode reads to cell line reads - cb_cl_ratio_well = n_cb_reads / n_expected_reads, + cb_cl_ratio_well = n_cb_reads / n_expected_reads ) %>% dplyr::ungroup() plate_pert_type <- plate_well %>% + dplyr::left_join(cb_metrics %>% select(pcr_plate, pcr_well, cb_mae, cb_spearman), by = c("pcr_plate", "pcr_well")) %>% + dplyr::filter(median_cb_reads > cb_threshold) %>% + dplyr::filter(fraction_expected_reads > expected_reads_threshold) %>% + dplyr::filter(cb_spearman > cb_spearman_threshold & cb_mae < cb_mae_threshold) %>% dplyr::group_by(pcr_plate, pert_type) %>% dplyr::summarise( cb_cl_ratio_plate = median(cb_cl_ratio_well, na.rm = TRUE), ) %>% dplyr::ungroup() - # Combine plate_well and plate_pert_type + # Combine plate_well and plate_pert_type result <- plate_well %>% dplyr::left_join(plate_pert_type, by = c("pcr_plate", "pert_type")) @@ -146,22 +146,29 @@ compute_read_stats <- function(annotated_counts, cell_set_meta, unknown_counts, #' #' @import dplyr #' -calculate_cb_metrics <- function(normalized_counts,cb_meta, group_cols = c("pcr_plate", "pcr_well", "pert_plate"), pseudocount = 20) { - valid_profiles= 
normalized_counts %>% dplyr::filter(!pert_type %in% c(NA, "empty", "", "CB_only"), n != 0, - cb_ladder %in% unique(cb_meta$cb_ladder), - cb_name %in% unique(cb_meta$cb_name)) %>% - dplyr::group_by(across(all_of(group_cols))) %>% dplyr::filter(dplyr::n() > 4) %>% dplyr::ungroup() - fit_stats= valid_profiles %>% - dplyr::group_by(across(all_of(group_cols))) %>% - dplyr::mutate(log2_normalized_n= log2(n+pseudocount) + cb_intercept, - cb_mae= median(abs(cb_log2_dose- log2_normalized_n)), - mean_y= mean(cb_log2_dose), - residual2= (cb_log2_dose- log2_normalized_n)^2, - squares2= (cb_log2_dose- mean_y)^2, - cb_r2= 1- sum(residual2)/sum(squares2), - cb_spearman= cor(cb_log2_dose, log2(n+pseudocount), method= 'spearman', use= 'pairwise.complete.obs')) %>% +calculate_cb_metrics <- function(normalized_counts, cb_meta, group_cols = c("pcr_plate", "pcr_well", "pert_plate"), pseudocount = 20) { + valid_profiles <- normalized_counts %>% + dplyr::filter( + !pert_type %in% c(NA, "empty", "", "CB_only"), n != 0, + cb_ladder %in% unique(cb_meta$cb_ladder), + cb_name %in% unique(cb_meta$cb_name) + ) %>% + dplyr::group_by(across(all_of(group_cols))) %>% + dplyr::filter(dplyr::n() > 4) %>% + dplyr::ungroup() + fit_stats <- valid_profiles %>% + dplyr::group_by(across(all_of(group_cols))) %>% + dplyr::mutate( + log2_normalized_n = log2(n + pseudocount) + cb_intercept, + cb_mae = median(abs(cb_log2_dose - log2_normalized_n)), + mean_y = mean(cb_log2_dose), + residual2 = (cb_log2_dose - log2_normalized_n)^2, + squares2 = (cb_log2_dose - mean_y)^2, + cb_r2 = 1 - sum(residual2) / sum(squares2), + cb_spearman = cor(cb_log2_dose, log2(n + pseudocount), method = "spearman", use = "pairwise.complete.obs") + ) %>% dplyr::ungroup() %>% - dplyr::distinct(across(all_of(c(group_cols, 'cb_mae', 'cb_r2', 'cb_spearman', 'cb_intercept')))) + dplyr::distinct(across(all_of(c(group_cols, "cb_mae", "cb_r2", "cb_spearman", "cb_intercept")))) return(fit_stats) } @@ -183,22 +190,25 @@ calculate_cb_metrics <- function(normalized_counts,cb_meta, group_cols = c("pcr_ #' #' @import dplyr generate_id_cols_table <- function(annotated_counts, normalized_counts, unknown_counts, cell_set_meta, cb_meta, id_cols_list, cell_line_cols, - count_threshold= 40, pseudocount= 20) { - print(paste0("Computing QC metrics grouping by ", paste0(id_cols_list, collapse = ","), ".....")) + count_threshold = 40, pseudocount = 20) { + print(paste0("Computing id_cols QC metrics grouping by ", paste0(id_cols_list, collapse = ","), ".....")) read_stats_grouping_cols <- c(id_cols_list, "pert_type", "pert_plate") - read_stats <- compute_read_stats(annotated_counts = annotated_counts, unknown_counts = unknown_counts, group_cols = read_stats_grouping_cols, - cell_set_meta= cell_set_meta, metric = "n", cell_line_cols = cell_line_cols, - count_threshold = count_threshold) + cb_metrics <- calculate_cb_metrics(normalized_counts, cb_meta, group_cols = c(id_cols_list, "pert_plate"), pseudocount = pseudocount) + + read_stats <- compute_read_stats( + annotated_counts = annotated_counts, unknown_counts = unknown_counts, cb_metrics = cb_metrics, group_cols = read_stats_grouping_cols, + cell_set_meta = cell_set_meta, metric = "n", cell_line_cols = cell_line_cols, + count_threshold = count_threshold + ) skew <- compute_skew(annotated_counts, group_cols = c(id_cols_list, "pert_plate"), metric = "n") - cb_metrics <- calculate_cb_metrics(normalized_counts, cb_meta, group_cols = c(id_cols_list, "pert_plate"), pseudocount = pseudocount) id_cols_table <- read_stats %>% 
    dplyr::left_join(skew, by = c(id_cols_list, "pert_plate")) %>%
-    dplyr::left_join(cb_metrics, by = id_cols_list, "pert_plate")
+    dplyr::left_join(cb_metrics, by = c(id_cols_list, "pert_plate"))
 
   return(id_cols_table)
 }
@@ -233,7 +243,7 @@
 #'
 #' @import dplyr
 #' @import PRROC
-compute_error_rate <- function(df, metric = 'log2_normalized_n', group_cols = c("depmap_id", "pcr_plate", "pert_plate"),
+compute_error_rate <- function(df, metric = "log2_normalized_n", group_cols = c("depmap_id", "pcr_plate", "pert_plate"),
                                negcon = "ctl_vehicle", poscon = "trt_poscon") {
   print(paste0("Computing error rate using ", negcon, " and ", poscon, "....."))
   print(paste0("Grouping by ", paste0(group_cols, collapse = ","), "....."))
@@ -277,7 +287,7 @@
 #' @importFrom tidyr pivot_wider
 #' @importFrom stats mad pnorm
 compute_ctl_medians_and_mad <- function(df, group_cols = c("depmap_id", "pcr_plate", "pert_plate"),
-                                         negcon = "ctl_vehicle", poscon = "trt_poscon", pseudocount = 20) {
+                                        negcon = "ctl_vehicle", poscon = "trt_poscon", pseudocount = 20) {
   print(paste0("Adding control median and MAD values for ", negcon, " and ", poscon, "....."))
   print(paste0("Computing false sensitivity probability for ", negcon, "....."))
   # Group and compute medians/MADs
@@ -285,11 +295,11 @@ compute_ctl_medians_and_mad <- function(df, group_cols = c("depmap_id", "pcr_pla
   result <- df %>%
     dplyr::filter(pert_type %in% c(negcon, poscon)) %>%
     dplyr::group_by(across(all_of(c(group_cols, "pert_type")))) %>%
     dplyr::summarise(
-      median_log_normalized = median(log2_normalized_n, na.rm = TRUE),
+      median_log_normalized = median(log2_normalized_n),
       n_replicates = n(),
-      mad_log_normalized = mad(log2_normalized_n, na.rm = TRUE),
-      median_raw = median(log2(n+pseudocount), na.rm = TRUE),
-      mad_raw = mad(log2(n+pseudocount), na.rm = TRUE)
+      mad_log_normalized = mad(log2_normalized_n),
+      median_raw = median(log2(n + pseudocount)),
+      mad_raw = mad(log2(n + pseudocount))
     ) %>%
     dplyr::ungroup() %>%
     pivot_wider(
@@ -301,11 +311,11 @@ compute_ctl_medians_and_mad <- function(df, group_cols = c("depmap_id", "pcr_pla
     dplyr::mutate(
       false_sensitivity_probability_50 = pnorm(
         -1,
-        sd = .data[[paste0('mad_log_normalized_',negcon)]] * sqrt((1/.data[[paste0('n_replicates_',negcon)]] * pi/2) + 1)
+        sd = .data[[paste0("mad_log_normalized_", negcon)]] * sqrt((1 / .data[[paste0("n_replicates_", negcon)]] * pi / 2) + 1)
       ),
       false_sensitivity_probability_25 = pnorm(
         -2,
-        sd = .data[[paste0('mad_log_normalized_',negcon)]] * sqrt((1/.data[[paste0('n_replicates_',negcon)]] * pi/2) + 1)
+        sd = .data[[paste0("mad_log_normalized_", negcon)]] * sqrt((1 / .data[[paste0("n_replicates_", negcon)]] * pi / 2) + 1)
       )
     )
   return(result)
 }
@@ -330,12 +340,12 @@ compute_control_lfc <- function(df, negcon = "ctl_vehicle", poscon = "trt_poscon
   result <- df %>%
     dplyr::mutate(
       lfc_trt_poscon = .data[[paste0("median_log_normalized_", poscon)]] -
-      .data[[paste0("median_log_normalized_", negcon)]],
+        .data[[paste0("median_log_normalized_", negcon)]],
       lfc_raw_trt_poscon = .data[[paste0("median_raw_", poscon)]] -
-      .data[[paste0("median_raw_", negcon)]]
+        .data[[paste0("median_raw_", negcon)]]
     ) %>%
-    dplyr::select(all_of(grouping_cols), lfc_trt_poscon, lfc_raw_trt_poscon) %>%
-    dplyr::mutate(viability_trt_poscon=2^lfc_trt_poscon)
+    dplyr::select(all_of(grouping_cols), lfc_trt_poscon, lfc_raw_trt_poscon) %>%
+    dplyr::mutate(viability_trt_poscon = 2^lfc_trt_poscon)
   return(result)
 }
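Worked example of the derivation above: lfc_trt_poscon is on the log2 scale, so a positive control whose median log2-normalized signal sits 2 units below the vehicle median yields a viability of exactly the default pc_viability_threshold.

lfc_trt_poscon <- -2
viability_trt_poscon <- 2^lfc_trt_poscon  # 0.25, the default pc_viability_threshold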
@@ -357,16 +367,14 @@ compute_cl_fractions <- function(df, metric = "n", grouping_cols = c("pcr_plate"
   result <- df %>%
     dplyr::group_by(across(all_of(grouping_cols))) %>%
     dplyr::summarise(
-      total_reads = sum(.data[[metric]], na.rm = TRUE), # Total reads per group
-      fraction_of_reads = sum(.data[[metric]], na.rm = TRUE) / sum(.data[[metric]], na.rm = TRUE) # Fraction of reads for each entry
+      total_reads = sum(.data[[metric]], na.rm = TRUE), # Total reads per group
+      fraction_of_reads = sum(.data[[metric]], na.rm = TRUE) / sum(.data[[metric]], na.rm = TRUE) # Fraction of reads for each entry
     ) %>%
     dplyr::ungroup() %>%
     dplyr::select(all_of(grouping_cols), total_reads, fraction_of_reads)
   return(result)
 }
 
-# TABLE GENERATION FUNCTION ----------
-
 #' Generate cell plate table
 #'
 #' This function generates a comprehensive QC table for cell_lines + pcr plates by computing and merging various QC metrics, including medians, MADs, error rates, log fold changes (LFC), and cell line fractions.
@@ -384,8 +392,8 @@ compute_cl_fractions <- function(df, metric = "n", grouping_cols = c("pcr_plate"
 #' @import dplyr
 generate_cell_plate_table <- function(normalized_counts, filtered_counts, cell_line_cols, pseudocount = 20) {
   cell_line_list <- strsplit(cell_line_cols, ",")[[1]]
-  cell_line_plate_grouping <- c(cell_line_list,"pcr_plate","pert_plate") # Define columns to group by
-  print(paste0("Computing QC metrics grouping by ", paste0(cell_line_plate_grouping, collapse = ","), "....."))
+  cell_line_plate_grouping <- c(cell_line_list, "pcr_plate", "pert_plate", "project_code") # Define columns to group by
+  print(paste0("Computing cell + plate QC metrics grouping by ", paste0(cell_line_plate_grouping, collapse = ","), "....."))
 
   # Compute control medians and MAD
   medians_and_mad <- compute_ctl_medians_and_mad(
@@ -419,6 +427,7 @@ generate_cell_plate_table <- function(normalized_counts, filtered_counts, cell_l
     grouping_cols = cell_line_plate_grouping
   )
 
+
   # Merge all tables together
   print(paste0("Merging ", paste0(cell_line_plate_grouping, collapse = ","), " QC tables together....."))
   print("medians_and_mad")
@@ -435,202 +444,154 @@ generate_cell_plate_table <- function(normalized_counts, filtered_counts, cell_l
     dplyr::left_join(cell_line_fractions, by = cell_line_plate_grouping)
 
   # QC pass criteria, currently with hardcoded pert_types
   plate_cell_table <- plate_cell_table %>%
-    dplyr::mutate(qc_pass= error_rate < 0.05 & viability_trt_poscon < 0.25 &
-                    median_raw_ctl_vehicle > 40 & mad_log_normalized_ctl_vehicle < 1 ) %>%
-    dplyr::group_by(across(all_of(c(cell_line_list, "pert_plate")))) %>%
-    dplyr::mutate(n_passing_plates=sum(qc_pass)) %>%
-    dplyr::mutate(qc_pass_pert_plate= n_passing_plates>1) %>%
-    dplyr::ungroup() %>%
+    dplyr::mutate(qc_pass = error_rate < 0.05 & viability_trt_poscon < 0.25 &
+                    median_raw_ctl_vehicle > log2(40) & mad_log_normalized_ctl_vehicle < 1) %>%
+    dplyr::group_by(across(all_of(c(cell_line_list, "pert_plate")))) %>%
+    dplyr::mutate(n_passing_plates = sum(qc_pass)) %>%
+    dplyr::mutate(qc_pass_pert_plate = n_passing_plates > 1) %>%
+    dplyr::ungroup() %>%
     dplyr::select(-n_passing_plates)
 
+  # Add the n_expected_controls values (n_expected_controls is computed in
+  # qc_tables.R and is found through the function's enclosing global environment)
+  plate_cell_table <- plate_cell_table %>%
+    dplyr::left_join(
+      n_expected_controls,
+      by = c("pcr_plate", "pert_plate")
+    ) %>%
+    dplyr::mutate(
+      fraction_expected_poscon = n_replicates_trt_poscon / n_expected_trt_poscon,
+      fraction_expected_negcon = n_replicates_ctl_vehicle / n_expected_ctl_vehicle
+    )
+
   return(plate_cell_table)
 }
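Scale check for the qc_pass cutoff above: median_raw_ctl_vehicle is produced by compute_ctl_medians_and_mad() as median(log2(n + pseudocount)), so the raw-count floor is compared on the log2 scale.

log2(40)       # ~5.32, the qc_pass floor applied to median_raw_ctl_vehicle
log2(40 + 20)  # ~5.91, what a well at exactly 40 reads (pseudocount 20) reports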
-# QC FLAG FUNCTIONS ----------
-id_cols_qc_flags <- function(annotated_counts,
-                             normalized_counts,
-                             unknown_counts,
-                             cb_meta,
-                             group_cols = c("pcr_plate","pcr_well","pert_type"),
-                             metric = "n",
-                             pseudocount = 20,
-                             contamination_threshold = 0.8,
-                             cb_mae_threshold = 1,
-                             cb_spearman_threshold = 0.88,
-                             cb_cl_ratio_low_negcon = 0.5,
-                             cb_cl_ratio_high_negcon = 2,
-                             cb_cl_ratio_low_poscon = 0.5,
-                             cb_cl_ratio_high_poscon = 2,
-                             well_reads_threshold = 400) {
-
-  # Calculate cb metrics from normalized counts
-  cb_metrics <- calculate_cb_metrics(normalized_counts, cb_meta, group_cols = c("pcr_plate", "pcr_well", "pert_type", "pert_plate"), pseudocount = pseudocount)
-
-  # Group unknown_counts by group_cols
-  unknown_counts_proc <- unknown_counts %>%
-    # Add pert_type from annotated_counts
-    dplyr::left_join(unique(annotated_counts %>% select(pcr_plate, pcr_well, pert_type)),
-                     by = c("pcr_plate","pcr_well")) %>%
-    dplyr::group_by(across(all_of(group_cols))) %>%
-    dplyr::summarise(
-      n = sum(.data[[metric]], na.rm = TRUE),
-      expected_read = FALSE
-    )
-
-  # Combine annotated_counts (after adding expected_lines) with unknown_counts and cb_metrics
-  combined_data <- annotated_counts %>%
-    # Add uknown barcode reads
-    bind_rows(unknown_counts_proc) %>%
-    # Add cb_metrics
-    left_join(cb_metrics, by = group_cols)
-
-  # Initialize a container to record flagged wells.
-  flagged_all <- tibble()
-
-  # Working dataset: we will progressively filter out flagged wells.
-  working <- combined_data
-
-  ### WELL_READS
-  ## Flag wells based on the median count of control barcodes in each well.
+# QC FLAG FUNCTIONS ----------
+
+#' Compute QC Flags for Plate Cell Data
+#'
+#' This function processes a plate cell table by applying a series of quality control
+#' thresholds to assign a flag to each well. The flag indicates the first QC criterion that
+#' a well fails, based on the provided thresholds.
+#'
+#' @param plate_cell_table A data frame containing plate cell data. It must include the columns:
+#'   \code{mad_log_normalized_ctl_vehicle}, \code{error_rate}, \code{viability_trt_poscon}, and
+#'   \code{median_raw_ctl_vehicle}.
+#' @param nc_variability_threshold A numeric threshold for negative control variability
+#'   (default: 1).
+#' @param error_rate_threshold A numeric threshold for the error rate (default: 0.05).
+#' @param pc_viability_threshold A numeric threshold for positive control viability
+#'   (default: 0.25).
+#' @param nc_raw_count_threshold A numeric threshold for negative control raw counts, where the
+#'   median raw count is compared to the log of this value (default: 40).
+#'
+#' @return A data frame identical to \code{plate_cell_table} with an additional column \code{qc_flag}
+#'   that indicates the first QC flag applicable for each well.
+#'
+#' @import dplyr
+plate_cell_qc_flags <- function(plate_cell_table,
+                                nc_variability_threshold = 1,
+                                error_rate_threshold = 0.05,
+                                pc_viability_threshold = 0.25,
+                                nc_raw_count_threshold = 40) {
+  # Add a qc_flag column using case_when (conditions are checked in order)
+  qc_table <- plate_cell_table %>%
+    mutate(qc_flag = case_when(
+      mad_log_normalized_ctl_vehicle > nc_variability_threshold ~ "nc_variability",
+      error_rate > error_rate_threshold ~ "error_rate",
+      viability_trt_poscon > pc_viability_threshold ~ "pc_viability",
+      median_raw_ctl_vehicle < log(nc_raw_count_threshold) ~ "nc_raw_count",
+      TRUE ~ NA_character_
+    ))
+  return(qc_table)
+}
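Because case_when() checks its conditions in order, a well that fails several criteria receives only the first matching flag. A toy illustration with fabricated data (not pipeline output):

    library(dplyr)
    library(tibble)

    toy <- tribble(
      ~pcr_plate, ~mad_log_normalized_ctl_vehicle, ~error_rate, ~viability_trt_poscon, ~median_raw_ctl_vehicle,
      "P1",       1.4,                             0.10,        0.10,                  log(100),
      "P2",       0.3,                             0.01,        0.05,                  log(10)
    )

    plate_cell_qc_flags(toy)
    # P1 fails both the variability and error-rate checks but is flagged "nc_variability" only;
    # P2 passes everything except the raw-count check and is flagged "nc_raw_count".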
-  # Calculate the median control barcode reads for each well.
-  well_reads <- working %>%
-    group_by(across(all_of(group_cols))) %>%
-    summarise(
-      n_cb_reads = sum(.data[[metric]][cb_name != ""], na.rm = TRUE),
-      median_cb_reads = median(.data[[metric]][cb_name != ""], na.rm = TRUE),
-      .groups = "drop"
-    ) %>%
-    # Flag wells where median_cb_reads is less than 20 * pseudocount.
-    mutate(qc_flag = if_else(median_cb_reads < (well_reads_threshold), "well_reads", NA_character_))
-
-  # Record flagged wells and remove them from the working dataset.
-  flagged_well_reads <- well_reads %>%
-    filter(!is.na(qc_flag)) %>%
-    select(all_of(group_cols), qc_flag)
-  flagged_all <- bind_rows(flagged_all, flagged_well_reads)
-
-  working <- working %>% anti_join(flagged_well_reads, by = group_cols)
-
-  ### CONTAMINATION
-  ## Flag wells based on the fraction of reads mapping to expected cell lines or control barcodes.
-
-  # Calculate the total and expected reads for each well.
-  containation <- working %>%
-    group_by(across(all_of(group_cols))) %>%
-    summarise(
-      n_total_reads = sum(.data[[metric]], na.rm = TRUE),
-      n_expected_reads = sum(.data[[metric]][expected_read], na.rm = TRUE),
-      .groups = "drop"
-    ) %>%
-    mutate(fraction_expected_reads = n_expected_reads / n_total_reads,
-           qc_flag = if_else(fraction_expected_reads < contamination_threshold, "contamination", NA_character_))
-
-  # Record flagged wells and remove them from the working dataset.
-  flagged_contamination <- containation %>%
-    filter(qc_flag == "contamination") %>%
-    select(all_of(group_cols), qc_flag)
-  flagged_all <- bind_rows(flagged_all, flagged_contamination)
-
-  working <- working %>% anti_join(flagged_contamination, by = group_cols)
-
-  ### CB_LINEARITY
-  ## Flag wells based on the linearity of the control barcodes in each well defined by the median absolute error (MAE) and Spearman correlation coefficient.
-
-  # Calculate the MAE and Spearman correlation
-  cb_linearity <- working %>%
-    group_by(across(all_of(group_cols))) %>%
-    summarise(
-      cb_mae = median(.data[["cb_mae"]], na.rm = TRUE),
-      cb_spearman = median(.data[["cb_spearman"]], na.rm = TRUE),
-      .groups = "drop"
-    ) %>%
-    mutate(qc_flag = if_else(cb_mae > cb_mae_threshold | cb_spearman < cb_spearman_threshold, "cb_linearity", NA_character_))
-
-  # Record flagged wells and remove them from the working dataset.
-  flagged_cb_linearity <- cb_linearity %>%
-    filter(qc_flag == "cb_linearity") %>%
-    select(all_of(group_cols), qc_flag)
-  flagged_all <- bind_rows(flagged_all, flagged_cb_linearity)
-  working <- working %>% anti_join(flagged_cb_linearity, by = group_cols)
-
-  ### CB_CL_RATIO
-  ## POSITIVE CONTROL
-  ## Flag wells based on the ratio of control barcode reads to cell line reads.
-  cb_cl_ratio_poscon = working %>%
-    filter(pert_type == "trt_poscon") %>%
-    group_by(across(all_of(group_cols))) %>%
-    summarise(
-      n_expected_reads = sum(.data[[metric]][expected_read], na.rm = TRUE),
-      n_cb_reads = sum(.data[[metric]][cb_name != ""], na.rm = TRUE),
-      cb_cl_ratio_well = n_cb_reads / n_expected_reads,
-      .groups = "drop"
-    ) %>%
-    group_by(pcr_plate, pert_type) %>%
-    mutate(cb_cl_ratio_plate = median(cb_cl_ratio_well, na.rm = TRUE)) %>%
-    ungroup() %>%
-    mutate(qc_flag = if_else(cb_cl_ratio_well < cb_cl_ratio_low_poscon * cb_cl_ratio_plate | cb_cl_ratio_well > cb_cl_ratio_high_poscon * cb_cl_ratio_plate, "cb_cl_ratio", NA_character_))
-
-  # Record flagged wells and remove them from the working dataset.
-  flagged_cb_cl_ratio_poscon <- cb_cl_ratio_poscon %>%
-    filter(qc_flag == "cb_cl_ratio") %>%
-    select(all_of(group_cols), qc_flag)
-  flagged_all <- bind_rows(flagged_all, flagged_cb_cl_ratio_poscon)
-  working <- working %>% anti_join(flagged_cb_cl_ratio_poscon, by = group_cols)
-
-  ## NEGATIVE CONTROL
-  ## Flag wells based on the ratio of control barcode reads to cell line reads.
-  cb_cl_ratio_negcon = working %>%
-    filter(pert_type == "ctl_vehicle") %>%
-    group_by(across(all_of(group_cols))) %>%
-    summarise(
-      n_expected_reads = sum(.data[[metric]][expected_read], na.rm = TRUE),
-      n_cb_reads = sum(.data[[metric]][cb_name != ""], na.rm = TRUE),
-      cb_cl_ratio_well = n_cb_reads / n_expected_reads,
-      .groups = "drop"
-    ) %>%
-    group_by(pcr_plate, pert_type) %>%
-    mutate(cb_cl_ratio_plate = median(cb_cl_ratio_well, na.rm = TRUE)) %>%
-    ungroup() %>%
-    mutate(qc_flag = if_else(cb_cl_ratio_well < cb_cl_ratio_low_negcon * cb_cl_ratio_plate | cb_cl_ratio_well > cb_cl_ratio_high_negcon * cb_cl_ratio_plate, "cb_cl_ratio", NA_character_))
-
-  # Record flagged wells and remove them from the working dataset.
-  flagged_cb_cl_ratio_negcon <- cb_cl_ratio_negcon %>%
-    filter(qc_flag == "cb_cl_ratio") %>%
-    select(all_of(group_cols), qc_flag)
-  flagged_all <- bind_rows(flagged_all, flagged_cb_cl_ratio_negcon)
-
-  ### RETURN RESULTS
-  # Filter normalized_counts for only the wells that were not flagged
-  normalized_filtered <- flagged_all %>%
-    select(pcr_plate, pcr_well, pert_type) %>%
-    unique() %>%
-    dplyr::left_join(normalized_counts, by = group_cols)
-
-  # Return a list containing both the filtered normalized_counts and a record of all flagged wells.
-  list(result = normalized_filtered, well_flags = flagged_all)
-}
+#' Generate QC Flags for ID Columns in Plate Cell Data
+#'
+#' This function processes a data frame containing plate cell measurements and assigns quality control (QC) flags
+#' based on several thresholds. The flags are determined in a sequential order (i.e., only the first applicable
+#' flag per well is recorded). The function returns a data frame with a unique record of flagged wells, identified
+#' by the specified grouping columns.
+#'
+#' @param id_cols_table A data frame containing plate cell measurements. It must include the following columns:
+#'   \code{median_cb_reads}, \code{fraction_expected_reads}, \code{cb_mae}, \code{cb_spearman},
+#'   \code{cb_cl_ratio_well}, and \code{pert_type}.
+#' @param group_cols A character vector specifying the columns used to uniquely identify each well. Default is
+#'   \code{c("pcr_plate", "pcr_well", "pert_type")}.
+#' @param contamination_threshold Numeric threshold for contamination based on the fraction of expected reads.
+#'   Default is 0.8.
+#' @param cb_mae_threshold Numeric threshold for the CB mean absolute error (MAE). Default is 1.
+#' @param cb_spearman_threshold Numeric threshold for the CB Spearman correlation. Default is 0.8.
+#' @param cb_cl_ratio_low_negcon Numeric threshold for the lower bound of the CB/CL ratio in negative control wells.
+#'   Default is 0.
+#' @param cb_cl_ratio_high_negcon Numeric threshold for the upper bound of the CB/CL ratio in negative control wells.
+#'   Default is 2.
+#' @param cb_cl_ratio_low_poscon Numeric threshold for the lower bound of the CB/CL ratio in positive control wells.
+#'   Default is 0.5.
+#' @param cb_cl_ratio_high_poscon Numeric threshold for the upper bound of the CB/CL ratio in positive control wells.
+#'   Default is 2.
+#' @param well_reads_threshold Numeric threshold for well reads; wells with \code{median_cb_reads} below this value
+#'   are flagged as "well_reads". Default is 100.
+#'
+#' @return A data frame containing the unique flagged wells. The returned data frame includes the columns specified
+#'   in \code{group_cols} and a \code{qc_flag} column that indicates the first QC flag applied to each well.
+#'
+#' @import dplyr
+id_cols_qc_flags <- function(id_cols_table,
+                             group_cols = c("pcr_plate", "pcr_well", "pert_type"),
+                             contamination_threshold = 0.8,
+                             cb_mae_threshold = 1,
+                             cb_spearman_threshold = 0.8,
+                             cb_cl_ratio_low_negcon = 0,
+                             cb_cl_ratio_high_negcon = 2,
+                             cb_cl_ratio_low_poscon = 0.5,
+                             cb_cl_ratio_high_poscon = 2,
+                             well_reads_threshold = 100) {
+  # Add a qc_flag column using case_when (conditions are checked in order)
+  qc_table <- id_cols_table %>%
+    mutate(qc_flag = case_when(
+      median_cb_reads < well_reads_threshold ~ "well_reads",
+      fraction_expected_reads < contamination_threshold ~ "contamination",
+      cb_mae > cb_mae_threshold | cb_spearman < cb_spearman_threshold ~ "cb_linearity",
+      pert_type == "trt_poscon" & (cb_cl_ratio_well < cb_cl_ratio_low_poscon |
+                                     cb_cl_ratio_well > cb_cl_ratio_high_poscon) ~ "cb_cl_ratio",
+      pert_type == "ctl_vehicle" & (cb_cl_ratio_well < cb_cl_ratio_low_negcon |
+                                      cb_cl_ratio_well > cb_cl_ratio_high_negcon) ~ "cb_cl_ratio",
+      TRUE ~ NA_character_
+    ))
+
+  # Extract flagged wells (keeping only the first flag per well)
+  flagged_all <- qc_table %>%
+    filter(!is.na(qc_flag)) %>%
+    select(all_of(group_cols), qc_flag) %>%
+    unique()
+
+  # Return the flagged wells
+  return(flagged_all)
+}
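Note the inversion of the old control flow: rather than computing per-well metrics and progressively filtering, id_cols_qc_flags() now expects a precomputed per-well metrics table and returns only the flagged wells. A sketch of the intended calling pattern; the construction of id_cols_table happens elsewhere in the script, so its name and contents here are assumptions:

    # id_cols_table: one row per well with median_cb_reads, fraction_expected_reads,
    # cb_mae, cb_spearman, cb_cl_ratio_well, and pert_type already computed.
    flags <- id_cols_qc_flags(id_cols_table,
                              contamination_threshold = 0.8,
                              well_reads_threshold = 100)

    # Unflagged wells can then be recovered with an anti-join on the grouping columns.
    clean_wells <- dplyr::anti_join(id_cols_table, flags,
                                    by = c("pcr_plate", "pcr_well", "pert_type"))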
-## POOL WELL QC FLAGS
-pool_well_qc_flags <- function(normalized_counts, pool_well_qc) {
-  ### POOL_WELL_OUTLIERS
-  ## Flag pool/well combinations based on the fraction of cell lines in a pool + well that are some distance from the pool + well median.
-  ## This is in control wells only
-  flagged_all <- pool_well_qc %>%
-    filter(!is.na(qc_flag))
-
-  ### RETURN RESULTS
-  # Filter normalized_counts for only the wells that were not flagged
-  normalized_filtered <- normalized_counts %>%
-    dplyr::left_join(flagged_all %>%
-                       select(pcr_plate, pcr_well, pert_type, pool_id),
-                     by = c("pool_id", "pcr_plate", "pcr_well", "pert_type"))
-
-  # Return a list containing both the filtered normalized_counts and a record of all flagged wells.
-  list(result = normalized_filtered, pool_well_flags = flagged_all)
-}
+#' Generate Pool Well QC Table
+#'
+#' This function flags pool/well combinations in control wells based on the variability of cell line
+#' measurements relative to the pool/well median. It calculates the median normalized count for each
+#' pool/well group, computes the absolute difference between each cell line's normalized count and the
+#' group median, and then determines the fraction of outliers. Wells with a fraction of outliers exceeding
+#' the specified threshold are flagged as having "pool_well_outliers".
+#'
+#' @param normalized_counts A data frame containing normalized count data. Required columns include:
+#'   \code{cb_name}, \code{pcr_plate}, \code{pcr_well}, \code{pert_type}, \code{pool_id},
+#'   \code{log2_normalized_n}, \code{lua}, \code{depmap_id}, and \code{cell_set}.
+#' @param pool_well_delta_threshold A numeric threshold specifying the minimum absolute difference from
+#'   the pool/well median for a cell line to be considered an outlier (default: 5).
+#' @param pool_well_fraction_threshold A numeric threshold specifying the minimum fraction of outlier cell
+#'   lines required for the well to be flagged (default: 0.4).
+#'
+#' @return A data frame with an added \code{qc_flag} column that indicates pool/well combinations flagged
+#'   as having excessive outliers.
+#'
+#' @import dplyr
 generate_pool_well_qc_table <- function(normalized_counts, pool_well_delta_threshold = 5, pool_well_fraction_threshold = 0.4) {
   ### POOL_WELL_OUTLIERS
   ## Flag pool/well combinations based on the fraction of cell lines in a pool + well that are some distance from the pool + well median.
@@ -650,9 +611,42 @@ generate_pool_well_qc_table <- function(normalized_counts, pool_well_delta_thres
     group_by(pcr_plate, pcr_well, pert_type, pool_id) %>%
     reframe(
       n_outliers = sum(delta_from_pool_well_median > pool_well_delta_threshold, na.rm = TRUE),
-      fraction_outliers = n_outliers / n_cell_lines
+      n_cell_lines = max(n_cell_lines)
     ) %>%
     ungroup() %>%
-    mutate(qc_flag = if_else(fraction_outliers > pool_well_fraction_threshold, "pool_well_outliers", NA_character_))
+    mutate(fraction_outliers = n_outliers / n_cell_lines,
+           qc_flag = if_else(fraction_outliers > pool_well_fraction_threshold, "pool_well_outliers", NA_character_))
 }
 
+# Get the qc parameters from the qc_params json file
+load_thresholds_from_json <- function(json_file_path) {
+  # Load required package
+  if (!requireNamespace("jsonlite", quietly = TRUE)) {
+    stop("Package 'jsonlite' is required but not installed.")
+  }
+
+  # Read JSON file into a list
+  params <- jsonlite::fromJSON(json_file_path)
+
+  # Convert each value to numeric
+  numeric_params <- lapply(params, as.numeric)
+
+  return(numeric_params)
+}
+
+# PCR PLATE FLAGS
+generate_pcr_plate_qc_flags_table <- function(plate_cell_table, fraction_expected_controls) {
+  # Add a qc_flag when either fraction_expected_poscon or fraction_expected_negcon is below the threshold
+  table <- plate_cell_table %>%
+    dplyr::select(fraction_expected_poscon, fraction_expected_negcon, pcr_plate, pert_plate) %>%
+    unique() %>%
+    dplyr::mutate(
+      qc_flag = dplyr::case_when(
+        fraction_expected_poscon < fraction_expected_controls ~ "fraction_expected_controls",
+        fraction_expected_negcon < fraction_expected_controls ~ "fraction_expected_controls",
+        TRUE ~ NA_character_
+      )
+    ) %>%
+    dplyr::filter(!is.na(qc_flag))
+  return(table)
+}
\ No newline at end of file
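A plausible wiring of these pieces in the driver portion of the script, with the QC_PARAMS JSON passed through from generate_qc_tables.sh; the driver code itself is not in this diff, so the argument plumbing below (args$qc_params, the parameter names in the JSON) is an assumption:

    # Hypothetical qc_params.json:
    # {"nc_variability_threshold": 1, "error_rate_threshold": 0.05,
    #  "pc_viability_threshold": 0.25, "nc_raw_count_threshold": 40}
    params <- load_thresholds_from_json(args$qc_params)

    # Keep only the parameters each function understands, then splat them in.
    plate_args <- params[names(params) %in% names(formals(plate_cell_qc_flags))]
    plate_cell_table <- do.call(plate_cell_qc_flags,
                                c(list(plate_cell_table = plate_cell_table), plate_args))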