02_process_pipeline.Rmd

---
author: "Systems Biomedicine Team"
date: "Last compiled on `r format(Sys.time(), '%d %B, %Y')`"
output:
    rmdformats::material:
        use_bookdown: true
        thumbnails: false
        df_print: kable
        code_folding: hide
        number_sections: yes
    pdf_document:
        number_sections: yes
        toc: yes
        toc_depth: 3
        keep_tex: no
---

```{r, echo=FALSE}
# Build the report title for the pipeline step being rendered. The value is
# injected into the second YAML header of this document through inline R.
# Each literal embeds a line break followed by four spaces and a pipe; keep
# them byte-identical, the title layout depends on them.
title_var <- if (goodQ) {
    # second pass: good-quality cells of one scenario
    paste0("|
    | Step 5: Preprocessing cleaned data on scenario ", scenario, " -- ", DATASET)
} else if (combinedD) {
    # combined data: the wording depends on the combination method
    combined_label <- ifelse(combine_meth=="merge", "merged", "integrated")
    paste0("|
    | Step 8: Preprocessing the ", combined_label, " data on scenario ", scenario, " -- ", DATASET)
} else {
    # first pass on the data selected by FILTER
    paste0("|
    | Step 2: Preprocessing ", FILTER, " data -- ", DATASET)
}
```

---
title: `r title_var`
---

<!-- Javascript for zooming on figures (adapted from: https://stackoverflow.com/questions/40401680) -->

<!-- Jquery import conflicts with DT::datatable so needs to be commented here -->
<!-- <script src = "https://ajax.googleapis.com/ajax/libs/jquery/3.1.1/jquery.min.js"></script> -->
<!-- 
<style>
.zoomDiv {
  display: none;
  position: fixed;
  top: 50%;
  left: 50%;
  z-index: 50;
  transform: translate(-50%, -50%);
  background-color: #FFFFFF;
  box-shadow: 0px 0px 50px #888888;
  width: fit-content;
  max-width: 90%;
  max-height: 90%;
  overflow: auto;
}

.zoomImg {
  width: 150%;
}
</style>

<script type = "text/javascript">
  $(document).ready(function() {
    $("body").prepend("<div class = \"zoomDiv\"><img src = \"\" class = \"zoomImg\"></div>");
    // onClick for all img except the zoomed one and link ones (filter)
    // use "img.zoom" and out.extra = "class = \"zoom\"" in chunk to specify manually which chunk images can be zoomed
    $("img:not(.zoomImg)").filter(":not(a *)").click(function() {
      $(".zoomImg").attr("src", $(this).attr("src"));
      $(".zoomDiv").show();
    })
    // onClick function for hiding div
    $("img.zoomImg").click(function() {
      $(".zoomDiv").hide();
    })
  })
</script> -->


```{r setup, include = FALSE}
# Also extract inline R expressions when the document is purl()-ed.
options(knitr.purl.inline = TRUE)

# Prefix of every figure file written by this report.
# NOTE(review): the combined-data run (combinedD with goodQ FALSE) also gets
# the "/02_process_" prefix -- confirm PATH_OUT_FIG differs for that run,
# otherwise its figures share file names with the step-2 ones.
fig_prefix <- ifelse(!goodQ, "/02_process_", paste0("/05_process_scenario_", scenario, "_"))

knitr::opts_chunk$set(
  # evaluation and display of the code
  eval = TRUE,
  echo = TRUE,
  tidy.opts = list(width.cutoff = 90),

  # printed output
  results = "hold",
  comment = "",
  attr.output = ".numberLines",
  strip.white = TRUE,
  message = FALSE,
  warning = FALSE,
  error = FALSE,

  # figures
  dev = c("png", "pdf"),
  fig.path = paste0(PATH_OUT_FIG, fig_prefix),
  fig.width = 12,
  fig.height = 12,
  out.width = "50%",
  fig.align = "center",   # should be tuned to default in multiple plots chunk
  fig.show = "asis"       # tuned to "hold" in multiple plots chunk
)
```

```{r load-libraries, include = FALSE}
library(kableExtra)
library(dplyr)
library(Seurat)
library(stringr)
library(ggplot2)
library(tibble)

source("./data_management.R", local = TRUE)
source("./checkDirHierarchy.R", local = TRUE)
```

```{r other-settings, include = FALSE}
# Seed R's random number generator so that the report is reproducible.
# `general_seed` is not set in this document; it is expected to exist in the
# environment the report is rendered in.
set.seed(general_seed)
```

```{r dir-managment, include = FALSE}
# PATHS
# The output directory depends on which pipeline step is being rendered.
analysis_dir_name <- if (goodQ) {
    "06_process"
} else if (combinedD) {
    "08_combineData"
} else {
    "04_process"
}
PATH_OUT_ANALYSIS <- file.path(PATH_ROOT, analysis_dir_name)

# Create the output directory on first use, then run the directory hierarchy
# script again now that it exists.
if (!dir.exists(PATH_OUT_ANALYSIS)) {
    dir.create(PATH_OUT_ANALYSIS)
}
source("./checkDirHierarchy.R", local = TRUE)

# The good-quality run needs the clustering table written under 04_process.
# checkPath() is defined outside this document; presumably it validates that
# the file exists -- confirm.
if (goodQ) {
    PATH_CLUSTER_TABLE <- paste0(PATH_ROOT, "/04_process/clustering_", DATASET, "_filtered_meth.", clust_meth, "_res.", clust_res, ".csv")
    checkPath(PATH_CLUSTER_TABLE)
}
```

# Load `r DATASET` data

```{r load-data-1, eval=!goodQ && !combinedD, echo=!goodQ && !combinedD}
# First-pass run: load the object saved under the "01_qc_" prefix for this
# dataset and filter.
qc_rds_file <- paste0(PATH_RDS_OBJECTS, "/01_qc_", DATASET, "_", FILTER, ".rds")
SO <- readRDS(qc_rds_file)
```

```{r load-data-2, eval=goodQ, echo=goodQ}
# Good-quality run: load the good-quality cells of the current scenario.
good_cells_rds_file <- paste0(PATH_RDS_OBJECTS, "/04_", scenario, "_goodQualityCells_", DATASET, ".rds")
SO <- readRDS(good_cells_rds_file)

# Clustering table, with rows named after the "cellIDs" column.
clusters <- read.table(PATH_CLUSTER_TABLE, sep = ",", header = TRUE, row.names = "cellIDs")

# Name(s) of the column(s) holding the clustering at the requested resolution.
# NOTE(review): the pattern is an unanchored regular expression, so a
# resolution of 0.1 would also match a "RNA_snn_res.0.15" column -- confirm
# that only one column can match.
cluster_col_pattern <- paste0("RNA_snn_res.", clust_res)
clustOfInterest <- grep(cluster_col_pattern, names(clusters), value = TRUE)
```

```{r load-data-3, eval=combinedD, echo=combinedD}
# Combined-data run: load the object saved under the "06_combine_" prefix for
# this scenario and combination method.
combined_rds_file <- paste0(PATH_RDS_OBJECTS, "/06_combine_", DATASET, "_scenario.", scenario, "_method.", combine_meth, ".rds")
SO <- readRDS(combined_rds_file)
```
```{r remove-clusters, eval=goodQ && !all(is.na(rm_clust)), results="asis"}
cat("# Removal of the selected clusters\n")

cat(paste0("Removing the following clusters: ", paste(rm_clust, collapse = ', '), "\n"))

# Drop from a Seurat object every cell assigned to one of the clusters listed
# in `rm_clust`.
#
# SO:       Seurat object to filter.
# scenario: scenario label, only used in the printed summary.
# Returns the Seurat object restricted to the cells that are kept.
#
# Reads `clusters`, `clustOfInterest` and `rm_clust` from the enclosing
# environment. `clustOfInterest` must name exactly one column of `clusters`.
remove_clusters <- function(SO, scenario){
    cell_ids <- Cells(SO)

    # Look up the cluster of each cell by exact cell name. The object's
    # metadata is deliberately not merge()-d with the table: merge() re-sorts
    # rows by name, which would desynchronise the metadata from the cells and
    # make positional subsetting drop the wrong cells.
    cell_clusters <- clusters[[clustOfInterest]][match(cell_ids, rownames(clusters))]

    # If a non existent cluster name is in the list, it will just be silently ignored.
    # Cells missing from the clustering table have an NA cluster here.
    cells_to_keep <- cell_ids[!cell_clusters %in% rm_clust]

    cat(paste0("\nRemoved ", length(cell_ids) - length(cells_to_keep), " cells from scenario ", scenario, "\n"))

    # Subset by cell name rather than by position.
    SO[, cells_to_keep]
}

# This document works on a single object `SO`; filter it in place so that the
# following chunks actually see the reduced data.
SO <- remove_clusters(SO, scenario)
```

# Preprocessing workflow

The preprocessing workflow consists of the following steps:

- data normalization

- data centering

- highly variable genes identification

- principal components analysis (PCA)

- non-linear dimension reduction for visualization, and

- clustering.

```{r preprocess-1, eval=hvg_meth != "mvp", echo=hvg_meth != "mvp", message=TRUE}
# Normalization works cell by cell, so it is run only once, on the individual
# datasets of the first pass. Combined data are only passed through ScaleData.
# ScaleData is used for its centering; whether the data are also scaled is
# controlled by do_scale. The good-quality run skips both steps.
if (!combinedD && !goodQ) {
    SO <- NormalizeData(SO, normalization.method = norm_meth, verbose=FALSE)
}
if (combinedD || !goodQ) {
    SO <- ScaleData(SO, features=rownames(SO), do.scale=do_scale, verbose=FALSE)
}

# Highly variable genes, then PCA and UMAP on the first top_pcs components.
SO <- FindVariableFeatures(SO, nfeatures=hvg_num, selection.method = hvg_meth, verbose=FALSE)

SO <- RunPCA(SO, npcs = pca_npcs, nfeatures.print = pca_print, seed.use = general_seed, verbose=TRUE)
SO <- RunUMAP(SO, dims = 1:top_pcs, seed.use = general_seed, verbose = FALSE)

# Save the preprocessed object under a name specific to the pipeline step.
preprocessed_rds_name <- if (goodQ) {
    paste0("05_preprocessed_", DATASET, "_scenario.", scenario, ".rds")
} else if (combinedD) {
    paste0("07_preprocessed_", DATASET, "_scenario.", scenario, ".rds")
} else {
    paste0("02_preprocessed_", DATASET, "_", FILTER, ".rds")
}
saveRDS(SO, file=file.path(PATH_RDS_OBJECTS, preprocessed_rds_name))

# Kept for the export chunk further down.
var.feats.list <- VariableFeatures(SO)
```

```{r preprocess-2, eval=hvg_meth == "mvp", echo=hvg_meth == "mvp", message=TRUE}
# Variant of the preprocessing used when hvg_meth is "mvp": the number of
# variable features is not fixed in advance, so it is reported in a table.

# Combined data are only passed through ScaleData; first-pass data are
# normalized first. The good-quality run skips both steps.
if (combinedD) {
    SO <- ScaleData(SO, features=rownames(SO), do.scale=do_scale, verbose=FALSE)
} else if (!goodQ) {
    SO <- NormalizeData(SO, normalization.method = norm_meth, verbose=FALSE)
    SO <- ScaleData(SO, features=rownames(SO), do.scale=do_scale, verbose=FALSE)
}

SO <- FindVariableFeatures(SO, selection.method = hvg_meth, verbose=FALSE)

# Counts of the values of the mvp.variable flag, shown as a two-row table.
t(as.data.frame(table(SO@assays[["RNA"]]@meta.features$mvp.variable))) %>%
    knitr::kable(caption = "Number of features selected as highly variable genes (HVG)",
                    row.names = FALSE,
                    escape = FALSE,
                    align = "cc") %>%
    kable_styling(bootstrap_options = c("striped", "hover"))

# PCA, then UMAP on the first top_pcs components.
SO <- RunPCA(SO, npcs = pca_npcs, nfeatures.print = pca_print, seed.use = general_seed, verbose=TRUE)
SO <- RunUMAP(SO, dims = 1:top_pcs, seed.use = general_seed, verbose = FALSE)

# Save the preprocessed object under a name specific to the pipeline step.
if (goodQ) {
    saveRDS(SO, file=file.path(PATH_RDS_OBJECTS, paste0("05_preprocessed_", DATASET, "_scenario.", scenario, ".rds")))
} else if (combinedD) {
    saveRDS(SO, file=file.path(PATH_RDS_OBJECTS, paste0("07_preprocessed_", DATASET, "_scenario.", scenario, ".rds")))
} else {
    saveRDS(SO, file=file.path(PATH_RDS_OBJECTS, paste0("02_preprocessed_", DATASET, "_", FILTER, ".rds")))
}

# Kept for the export chunk further down.
var.feats.list <- VariableFeatures(SO)
```

```{r clustering-process, out.width='50%', results='asis', warning=FALSE}
# Build the neighbour graph on the first top_pcs principal components.
SO <- FindNeighbors(SO, dims = 1:top_pcs, verbose=FALSE)

# Value passed as FindClusters' `method` argument: "matrix" up to 40000 cells,
# "igraph" above that.
n_cells <- length(colnames(SO))
cluster_backend <- if (n_cells <= 40000) "matrix" else "igraph"

# Cluster the cells at the requested resolution with the requested algorithm.
SO <- FindClusters(
    SO,
    resolution = clust_res,
    algorithm = clust_meth,
    method = cluster_backend,
    random.seed = general_seed,
    verbose = FALSE
)
```

# Visualization

```{r dimplot-cluster-cplt, eval=FILTER == "complete", echo=FILTER == "complete"}
# Use the clustering result as cell identities and plot them.
Idents(SO) <- SO$seurat_clusters
cluster_plot <- DimPlot(SO)
print(cluster_plot)

```

```{r dimplot-cluster-fltd, eval=FILTER == "filtered", echo=FILTER == "filtered", fig.align='default', results='asis'}

# First plot: cells coloured by cluster.
Idents(SO) <- SO$seurat_clusters
print(DimPlot(SO))

if (goodQ) {
    cat("In the second plot, colors correspond to cell cycle phases.")

    # Load cell cycle scores and phase table
    phaseTable <- read.table(paste0("../05_additionalControls/cellCycleScoresAssignation_", DATASET, "_scenario_", scenario, ".csv"), sep = ",", header = TRUE)
    phaseTable <- column_to_rownames(phaseTable, var = "cellIDs")

    # Add the phase of each cell to the metadata by matching on cell names.
    # merge(..., by = "row.names") is avoided on purpose: it sorts the rows by
    # name and drops unmatched cells, which leaves the metadata (and the
    # identities set from it below) out of step with the cells of the object.
    # Cells missing from the phase table get an NA phase.
    phase_rows <- match(rownames(SO@meta.data), rownames(phaseTable))
    SO@meta.data$Phase <- phaseTable[["Phase"]][phase_rows]

    # Second plot: cells coloured by cell cycle phase.
    Idents(SO) <- SO@meta.data$Phase
    print(DimPlot(SO, shuffle = TRUE))
}
```

# Exports

We export the list of highly variable features and the PCA and UMAP coordinates
of the dataset. We also save the clustering information as a CSV file.

```{r exports}
# Tag identifying the data variant in every exported file name: the filter
# name for the first-pass run, "scenario.<scenario>" otherwise.
export_tag <- ifelse(!goodQ && !combinedD, FILTER, paste0("scenario.", scenario))

# Highly variable features (not exported for the good-quality run)
if (!goodQ) {
    var_feats_file <- paste0("varFeatures_meth.", hvg_meth, "_nb.", hvg_num, "_", DATASET, "_", export_tag, ".txt")
    write.table(
        var.feats.list,
        file = file.path(PATH_OUT_ANALYSIS, var_feats_file),
        row.names = FALSE,
        col.names = FALSE
    )
}

# PCA and UMAP tables, one file per reduction. export_dimred() is defined
# outside this document.
invisible(lapply(c("pca", "umap"), function(reduction) {
    export_dimred(SO, reduction, paste0("DR_", reduction, "_", DATASET, "_", export_tag, ".csv"))
}))

# Clustering
# defaultMDnames is not used in this chunk; presumably it is read by the
# child document included below -- confirm before removing.
defaultMDnames <- c("orig.ident", "nCount_RNA", "nFeature_RNA")
if (goodQ) {
    defaultMDnames <- c(defaultMDnames, "celltype_DF")
}

# Metadata table, which now holds the clustering columns. export_metadata()
# is defined outside this document.
clustering_file <- paste0(
    "clustering_", DATASET, "_",
    export_tag,
    "_meth.", clust_meth,
    "_res.", clust_res,
    ".csv")
export_metadata(SO@meta.data, clustering_file)

# Trigger a garbage collection before the child document is rendered.
invisible(gc())
```


```{r metadata, echo=FALSE, child="61_show_metadata.Rmd"}
```