Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

if flowcell_lanes is a column in sample metadata, forces it to be a string… #100

Merged
merged 6 commits into from
Mar 27, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions scripts/collate_fastq_reads.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,12 @@ parser <- ArgumentParser()
# specify desired options
parser$add_argument("-v", "--verbose", action="store_true", default=TRUE, help="Print extra output [default]")
parser$add_argument("-q", "--quietly", action="store_false", dest="verbose", help="Print little output")
parser$add_argument('--raw_counts_uncollapsed', default= "raw_counts_uncollapsed.csv",
parser$add_argument('--raw_counts_uncollapsed', default= "raw_counts_uncollapsed.csv.gz",
help= "path to file containing uncollapsed raw counts file")
parser$add_argument("--sample_meta", default="sample_meta.csv", help= "Sample metadata")
parser$add_argument("--cell_line_meta", default="cell_line_meta.csv", help= "Cell line metadata")
parser$add_argument("--CB_meta", default= "CB_meta.csv", help= "Control Barcode metadata")
parser$add_argument('--sequencing_index_cols', default= 'index_1,index_2',
parser$add_argument('--sequencing_index_cols', default= 'flowcell_names,index_1,index_2',
help= 'List of sequencing columns in the sample meta.')
parser$add_argument("--id_cols", default= "pcr_plate,pcr_well",
help = "Columns that identify a unique PCR well")
Expand All @@ -38,9 +38,13 @@ if (args$out == "") {
}

# Read in metadata files as data.table objects ----
sample_meta= data.table::fread(args$sample_meta, header= TRUE, sep= ',')
cell_line_meta= data.table::fread(args$cell_line_meta, header= TRUE, sep= ',')
CB_meta= data.table::fread(args$CB_meta, header= TRUE, sep= ',')
sample_meta= data.table::fread(args$sample_meta, header= TRUE, sep= ',')
## if flowcell_lanes one lane, the integer gets read in as numeric, so force this to be a string
if(validate_columns_exist("flowcell_lanes", sample_meta)){
sample_meta %<>% mutate(flowcell_lanes = as.character(flowcell_lanes))
}

# Parse some parameters into vectors ----
sequencing_index_cols= unlist(strsplit(args$sequencing_index_cols, ","))
Expand Down
4 changes: 3 additions & 1 deletion scripts/filteredCounts_QC.R
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ parser$add_argument('--count_threshold', default= 40, help= 'Low counts theshold
parser$add_argument('--reverse_index2', type= "logical", default= FALSE,
help= 'Switch to reverse complement index_2 for some sequencers')
parser$add_argument('-o', '--out', default= '', help= 'Output path, defaults to working directory')
parser$add_argument("--pseudocount", default=20, help = "pseudo raw count used in normalization")

# get command line options, if help option encountered print help and exit
args <- parser$parse_args()
Expand Down Expand Up @@ -85,4 +86,5 @@ QC_images(raw_counts_uncollapsed_path= args$raw_counts_uncollapsed,
control_type= args$control_type,
count_threshold= as.numeric(args$count_threshold),
reverse_index2= args$reverse_index2,
out= args$out)
out= args$out,
pseudocount = as.numeric(args$pseudocount))
2 changes: 2 additions & 0 deletions scripts/filteredCounts_QC.sh
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ echo COUNT_THRESHOLD is: $COUNT_THRESHOLD
echo RAW_COUNTS_UNCOLLAPSED is: $RAW_COUNTS_UNCOLLAPSED
echo LFC is: $LFC
echo REVERSE_INDEX2 is: $REVERSE_INDEX2
echo PSEUDOCOUNT is: $PSEUDOCOUNT

args=(
--sample_meta "$SAMPLE_META"
Expand All @@ -117,6 +118,7 @@ args=(
--reverse_index2 "$REVERSE_INDEX2"
--chunk_size "$CHUNK_SIZE"
--cell_line_cols "$CELL_LINE_COLS"
--pseudocount "$PSEUDOCOUNT"
)

echo Rscript filteredCounts_QC.R "${args[@]}"
Expand Down
19 changes: 13 additions & 6 deletions scripts/src/QC_images.R
Original file line number Diff line number Diff line change
Expand Up @@ -286,10 +286,15 @@ create_cdf_plot= function(input_df, id_cols, counts_col= 'n', mark1= 0.5, mark2=
#' @param normalized_counts Dataframe output from the normalize module.
#' @param id_cols Vector of column names that identify every PCR well.
#' @param value_col Name of the column that contains the values.
#' @param pseudocount a pseudo raw count used for missing cell line counts, also used when computing log2 on raw counts
#' @returns Returns a ggplot object.
create_ctrlBC_scatterplots= function(normalized_counts, id_cols, value_col= 'log2_n') {
create_ctrlBC_scatterplots= function(normalized_counts, id_cols, value_col= 'log2_n', pseudocount = 20) {
# Validation: Check that id_cols and value_col exist in filtered counts.
if(!validate_columns_exist(c(id_cols, value_col), normalized_counts)) {
if(value_col == "log2_n" & validate_columns_exist(c(id_cols, "n"), normalized_counts) &
!(validate_columns_exist(c("log2_n"), normalized_counts))){
print("Calculating log2_n")
normalized_counts %<>% mutate(log2_n = log2(n+1))
}else if(!validate_columns_exist(c(id_cols, value_col), normalized_counts)) {
stop('Some input columns were not detected in normalized counts.')
}

Expand All @@ -306,7 +311,7 @@ create_ctrlBC_scatterplots= function(normalized_counts, id_cols, value_col= 'log
residual2= (cb_log2_dose - log2_n)^2,
squares2= (cb_log2_dose - mean_y)^2,
norm_r2= 1 - sum(residual2) / sum(squares2),
norm_mae= median(abs(cb_log2_dose- log2_n))) %>% ungroup()
norm_mae= median(abs(cb_log2_dose - log2_n))) %>% ungroup()
}

# Filter for just the control barcodes, create a profile_id for faceting,
Expand Down Expand Up @@ -466,17 +471,18 @@ create_replicate_scatterplots= function(input_df, cell_line_cols, replicate_grou
#' @param count_threshold Threshold for low read counts.
#' @param reverse_index2 Boolean set to TRUE if the sequencing involved the reverse complement workflow.
#' @param out Path to the directory to save the QC images.
#' @param pseudocount a pseudo raw count used for missing cell line counts, also used when computing log2 on raw counts
#' @returns NA. QC images are written out to the specified folder.
QC_images= function(raw_counts_uncollapsed_path,
prism_barcode_counts, unknown_barcode_counts,
annotated_counts, normalized_counts= NA, l2fc,
sample_meta,
barcode_col= 'forward_read_barcode',
id_cols= c('pcr_plate', 'pcr_well'),
cell_line_cols= c('depmap_id'),
cell_line_cols= c('depmap_id', 'lua'),
sig_cols,
control_type= 'ctl_vehicle', count_threshold= 40,
chunk_size= 10^6,
chunk_size= 10^6, pseudocount = 20,
reverse_index2= FALSE, out= NA) {

# Required packages ----
Expand Down Expand Up @@ -693,7 +699,8 @@ QC_images= function(raw_counts_uncollapsed_path,
print('8. Generating control_barcode_trend image')
potential_error= base::tryCatch({
trend_sc= create_ctrlBC_scatterplots(normalized_counts %>% dplyr::filter(!cb_ladder %in% c(NA, FALSE, 'none', '')),
id_cols, value_col= 'log2_n')
id_cols, value_col= 'log2_n',
pseudocount = pseudocount)

pdf(file=paste(out, "control_barcode_trend.pdf", sep="/"),
width= sqrt(num_profiles) * 2, height= sqrt(num_profiles) * 2)
Expand Down
5 changes: 5 additions & 0 deletions scripts/src/collate_fastq_reads.R
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,11 @@ collate_fastq_reads= function(uncollapsed_raw_counts, sample_meta,
# Calculate index purity ----
# This is only accurate if the Nori input file is small enough to fit into a chunk.
index_purity= sum(summed_reads$n) / sum(uncollapsed_raw_counts$n)
# if index_purity is NA, make it 0
if(is.na(index_purity)){
index_purity = 0
}

# Throw an error if the purity is greater than 1.
# Throw a warning if the purity is below 0.5.
print(paste0('Index purity in chunk: ', round(index_purity, 4)))
Expand Down
2 changes: 1 addition & 1 deletion scripts/src/qc_table_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ options(cli.unicode = FALSE)
library(tidyverse)
library(data.table)
library(magrittr)
library (PRROC)
library(PRROC)

# BY ID_COLS (PCR_PLATE, PCR_WELL) ----------

Expand Down