07_meld_pipeline.Rmd

---
author: "Systems Biomedicine Team"
date: "Last compiled on `r format(Sys.time(), '%d %B, %Y')`"
output:
    rmdformats::material:
        use_bookdown: true
        thumbnails: false
        df_print: kable
        code_folding: hide
        number_sections: yes
    pdf_document:
        number_sections: yes
        toc: yes
        toc_depth: 3
        keep_tex: no
title: |
  | Step 10: Differential abundance analysis -- `r DATASET` 
---

<!-- Javascript for zooming on figures (adapted from: https://stackoverflow.com/questions/40401680) -->

<!-- Jquery import conflicts with DT::datatable so needs to be commented here -->
<!-- <script src = "https://ajax.googleapis.com/ajax/libs/jquery/3.1.1/jquery.min.js"></script> -->
<!-- 
<style>
.zoomDiv {
  display: none;
  position: fixed;
  top: 50%;
  left: 50%;
  z-index: 50;
  transform: translate(-50%, -50%);
  background-color: #FFFFFF;
  box-shadow: 0px 0px 50px #888888;
  width: fit-content;
  max-width: 90%;
  max-height: 90%;
  overflow: auto;
}

.zoomImg {
  width: 150%;
}
</style>

<script type = "text/javascript">
  $(document).ready(function() {
    $("body").prepend("<div class = \"zoomDiv\"><img src = \"\" class = \"zoomImg\"></div>");
    // onClick for all img except the zoomed one and link ones (filter)
    // use "img.zoom" and out.extra = "class = \"zoom\"" in chunk to specify manually which chunk images can be zoomed
    $("img:not(.zoomImg)").filter(":not(a *)").click(function() {
      $(".zoomImg").attr("src", $(this).attr("src"));
      $(".zoomDiv").show();
    })
    // onClick function for hiding div
    $("img.zoomImg").click(function() {
      $(".zoomDiv").hide();
    })
  })
</script> -->


```{r setup, include = FALSE}
library(knitr)
library(reticulate)
# we force the use of our env's python. We faced situation where reticulates were using the banse env's python
reticulate::use_python(system("which python", intern = TRUE))
knitr::knit_engines$set(python = reticulate::eng_python)
options(knitr.purl.inline = TRUE)
knitr::opts_chunk$set(
  # code evaluation
  eval = TRUE,

  # text output
  echo = TRUE,
  results = "hold",
  warning = FALSE,
  error = FALSE,
  message = FALSE,
  strip.white = TRUE,

  # code decoration
  tidy.opts = list(width.cutoff = 90),
  comment = "",
  attr.output = ".numberLines",

  # plots
  fig.path = paste0(PATH_OUT_FIG, "/07_MELD_scenario_", scenario, "_"),      # is set later, in chunk setup2
  fig.show = "hold",         # tuned to "hold" in multiple plots chunk
  dev = "png",
  out.width = "50%",
  fig.width = 12,
  fig.height = 12,
  fig.align = "center"   # should be tuned to default in multiple plots chunk
)
```

# The MELD method

To perform differential abundance analysis, the MELD method calculates the relative likelihood that each cell would be observed in either a condition or the other over a manifold approximated from all cells of all conditions.
In other words, it gives for each cell, how likely it is to be observed in one condition. Using the calculated ratio, it is then possible to identify which cell population react differently to the conditions.

<a href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8122059/">Original paper available here.</a>
```{r load-r-libraries, include=FALSE}
source("./conversions.R")
source("./checkDirHierarchy.R", local = TRUE)
```

```{r dir-management, include=FALSE}
# PATHS
PATH_CLUSTER_TABLE <- paste0(PATH_ROOT, "/08_combineData/clustering_", DATASET, "_scenario.", scenario, "_meth.", CLUST_METH, "_res.", CLUST_RES, ".csv")
PATH_RDS_FILE <- file.path(PATH_RDS_OBJECTS, paste0("07_preprocessed_", DATASET, "_scenario.", scenario, ".rds"))
PATH_OUT_ANALYSIS <- file.path(PATH_ROOT, "05_additionalControls")
if (!dir.exists(PATH_OUT_ANALYSIS)) { dir.create(PATH_OUT_ANALYSIS) }

# check file existence
checkPath(PATH_CLUSTER_TABLE)
checkPath(PATH_RDS_FILE)

if (atlas) {
    PATH_LABEL_TRANSFER <- paste0(PATH_ROOT, "/08_combineData/transferAnnotation_combinedData_", DATASET, "_scenario_", scenario, ".csv")
    
    checkPath(PATH_LABEL_TRANSFER)
    checkPath(file.path(PATH_ATLAS, "colorsSheet.csv"))
}

if (manAnn) {
    checkPath(PATH_MANUAL_ANNOTATION)
}
```

# Load `r DATASET` data for scenario `r scenario` and compute MELD

```{r convert-to-h5ad, include=FALSE}
converted_file <- gsub("\\.rds", ".h5ad", PATH_RDS_FILE)
# Run the conversion if the converted file does not exist or the input file is newer
if(!(file.exists(converted_file) && file.info(PATH_RDS_FILE)$mtime < file.info(converted_file)$mtime)){
  seurat_to_h5ad(PATH_RDS_FILE)
}
```

```{python load-py-libraries, include=FALSE}
import pandas as pd
import numpy as np
import meld
import scanpy as sc
import matplotlib as mpl
import matplotlib.pyplot as plt
import scprep
import sklearn
from pathlib import Path
np.random.seed(42)
```

```{python mpl-init, include=FALSE}
mpl.use("agg", force = True)
plt.rc("axes", titlesize=20, labelsize=20)
plt.rc("ytick", labelsize=18)
plt.rc("xtick", labelsize=18)
plt.rc("figure", titlesize=26)
plt.rc("legend", fontsize=18)
```

```{python load-data, include = FALSE}
adata = sc.read_h5ad(Path(r.PATH_RDS_FILE).with_suffix(".h5ad"))

# Get the loadings of the different reductions, the metadata and raw data as a dataframe. We will mostly be using the umap to project the data.
umap = adata.obsm["X_umap"]
pca = adata.obsm["X_pca"]
metadata = adata.obs
data = adata.to_df()

conditions = np.unique(metadata["orig.ident"])
experimental_samples = np.unique(metadata[metadata["orig.ident"] == conditions[0], "replicates"]) if "replicates" in metadata else [conditions[0]]
replicate_markers = np.unique(metadata["replicate_markers"]) if "replicate_markers" in metadata else None
del adata
```

```{python compute-meld}
# create the affinity graph and perform the MELD algorithm on it. The beta and knn values come from the paper and were determined optimals after a benchmark by the authors.
meld_op = meld.MELD(beta = 67, knn=7)
sample_densities = meld_op.fit_transform(pca, sample_labels=metadata["orig.ident"])
```

```{python sample-likelihood, include=FALSE}
def replicate_normalize_densities(sample_densities, replicates):
    # Get the unique replicates
    sample_likelihoods = sample_densities.copy()
    
    # If there is no replicate, we normalize across the complete conditions
    if replicates is None:
      sample_likelihoods[conditions] = sklearn.preprocessing.normalize(sample_densities[conditions], norm="l1")
    else:
      for rep in replicates:
          # Select the columns of `sample_densities` for that replicate
          curr_cols = sample_densities.columns[[rep in col for col in sample_densities.columns]]
          # Apply L1 normalization between the same technical replicate of the conditions. If there is no replicate, apply L1 normalization on the conditions
          sample_likelihoods[curr_cols] = sklearn.preprocessing.normalize(sample_densities[curr_cols], norm='l1')

    return sample_likelihoods

sample_likelihoods = replicate_normalize_densities(sample_densities, replicate_markers)
```

```{python metadata-prep, include=FALSE}
metadata['cond_likelihood'] = sample_likelihoods[experimental_samples].mean(axis=1).values # store the mean of the condition's replicate

# Clusters
clusters = pd.read_csv(r.PATH_CLUSTER_TABLE, header=0)
metadata["clusters"] = clusters["integrated_snn_res.1"].values
cluster_colors = mpl.colormaps["tab20"].resampled(len(np.unique(metadata["clusters"])))

# Manual annotation
if r.manAnn:
    # load file
    file = pd.read_csv(Path(r.PATH_MANUAL_ANNOTATION,
    f"manualAnnotation_{r.DATASET}_scenario.{r.scenario}.csv"), header = 0)
    # fill metadata table
    metadata["clusters"] = file["integrated_snn_res.1"].values
    metadata["manualAnnot"] = file["fine_annotation"].values
    # get colors
    cluster_colors = mpl.colormaps["tab20"].resampled(len(np.unique(metadata["clusters"])))
    annotation_colors = mpl.colormaps["tab20"].resampled(len(np.unique(metadata["manualAnnot"])))

# Label tranfer annotations from Atlas
if r.atlas:
  pred = pd.read_csv(r.PATH_LABEL_TRANSFER, header=0)
  colortable = pd.read_csv(Path(r.PATH_ATLAS, "colorsSheet.csv"), header=0)
  metadata["prediction"] = pred["predicted.id"].values
  celltype_colors = dict(zip(colortable["celltype"],colortable["celltype_color"]))
  predcol = [celltype_colors[celltype] for celltype in metadata["prediction"]]
```

# MELD visualisation

The MELD method computes the relative likelihood that each cell is in one sample or the other. The plots below display the umap projection of the likelihood manifold for the `r py$conditions[1]`condition.
The more a cell is close to 1, the higher the likelihood of it belonging to the `r py$conditions[1]` sample, and inversely. The cells with a likelihood around 0.5 are equivalent in each conditions.

Only one sample is represented here, because the value of a cell for the `r py$conditions[2]` dataset is simply 1 - `r py$conditions[1]` value.

```{python plot-likelihoods, fig.align='default'}
vmin = np.min(metadata["cond_likelihood"])
vmax = np.max(metadata["cond_likelihood"])

# Display the likelihood of cells belonging to the condition of the replicate (higher values mean higher likelihood).
# When there are only two conditions, the values of the second condition are just 1 - values of the first condition,
# which is pointless to represent.
if(len(experimental_samples) > 1):
    #fig, axes = plt.subplots(len(experimental_samples),2, figsize=(len(experimental_samples)*5,8))

    for curr_sample in experimental_samples:
        scprep.plot.scatter2d(umap, c=sample_likelihoods[curr_sample], cmap=meld.get_meld_cmap(),
                            vmin=0, vmax=1,
                            title=curr_sample, ticks=False)
        plt.tight_layout()
        plt.show()
        scprep.plot.scatter2d(umap, c=sample_likelihoods[curr_sample], cmap=meld.get_meld_cmap(),
                            vmin=vmin, vmax=vmax,
                            title=f"{curr_sample} high contrast", ticks=False)
        plt.tight_layout()
        plt.show()
else:
    scprep.plot.scatter2d(umap, c=sample_likelihoods[experimental_samples[0]], cmap=meld.get_meld_cmap(),
                            vmin=0, vmax=1,
                            title=experimental_samples[0], ticks=False)
    
    plt.tight_layout()
    plt.show()
    scprep.plot.scatter2d(umap, c=sample_likelihoods[experimental_samples[0]], cmap=meld.get_meld_cmap(),
                            vmin=vmin, vmax=vmax,
                            title=f"{experimental_samples[0]} high contrast", ticks=False)
    plt.tight_layout()
    plt.show()
plt.clf()
```

`r if(length(py$experimental_samples) > 1){ cat("## Replicates mean and standard deviation")}`

```{python plot-likelihoods-stats, eval=(length(py$experimental_samples) > 1), echo=(length(py$experimental_samples) > 1), fig.align="default"}
# Display the mean and std between replicates. Only meaningful with technical replicates in one condition.
# Low standard deviation highlight areas where the cells are similarly enriched or depleted between replicates

scprep.plot.scatter2d(umap, c=sample_likelihoods[experimental_samples].mean(axis=1), 
                    cmap=meld.get_meld_cmap(), vmin=0, vmax=1,
                    title='Mean', ticks=False)
plt.tight_layout()
plt.show()
scprep.plot.scatter2d(umap, c=sample_likelihoods[experimental_samples].std(axis=1), vmin=0, 
                    cmap='inferno', title='St. Dev.', ticks=False)

plt.tight_layout()
plt.show()
```

## Clusters vs MELD

```{python plot-clustering, fig.align='default'}
scprep.plot.scatter2d(umap, c=metadata["clusters"], cmap=cluster_colors,
                      title='Cluster visualisation', ticks=False, discrete=True, legend=True)

for cluster in metadata["clusters"]:
  plt.annotate(cluster, 
    umap[metadata['clusters']==cluster].mean(axis=0),
    horizontalalignment='center',
    verticalalignment='center',
    size=10, weight='bold',
    color="black")

plt.tight_layout()
plt.show()

if r.manAnn:
  scprep.plot.scatter2d(umap, c=metadata["manualAnnot"], cmap=annotation_colors,
                        title='Manual annotation visualisation', ticks=False, legend=False) # Legend is finicky, will either try to correctly display or delete the chunk

  for annot in metadata["manualAnnot"]:
    plt.annotate(annot, 
      umap[metadata["manualAnnot"]==annot].mean(axis=0),
      horizontalalignment='center',
      verticalalignment='center',
      size=8,
      color="black")

  plt.tight_layout()
  plt.show()

plt.clf()
```

```{r, out.width="50%"}
checkPath(paste0(PATH_OUT_FIG, "/06_annotDEAviz_scenario_", scenario, "_celltype-DimPlot-1.png"))
include_graphics(paste0(PATH_OUT_FIG, "/06_annotDEAviz_scenario_", scenario, "_celltype-DimPlot-1.png"))
```

```{python plot-confidence-interval-func, include=FALSE}
# Function adapted from https://stackoverflow.com/a/70949996

def plot_confidence_interval(y, values, z=1.96, thresholds=[0.5,0.5], colors=["green", 'blue', "red"], vertical_line_width=0.35):
    mean = np.mean(values)
    stdev = np.std(values)
    confidence_interval = z * stdev #/ np.sqrt(len(values))
    if (mean < thresholds[0] and mean + confidence_interval < thresholds[0]):
        color = colors[0]
    elif (mean > thresholds[1] and mean - confidence_interval > thresholds[1]):
        color = colors[1]
    else:
        color = colors[2]
        
    top = y - vertical_line_width / 2
    left = mean - confidence_interval
    bottom = y + vertical_line_width / 2
    right = mean + confidence_interval
    plt.plot([left, right], [y, y], color=color)
    plt.plot([left, left], [top, bottom], color=color)
    plt.plot([right, right], [top, bottom], color=color)
    plt.plot(mean, y, "o", color="black")

    return {y: color}
```

# Confidence interval of the likelihood
```{python likelihood-plot-prep, include=FALSE}
#Sort the index of each cluster from lowest to highest average chd likelihood value
metadata[f"clustersID"] = scprep.utils.sort_clusters_by_values(metadata["clusters"], metadata["cond_likelihood"])
likelihood_cmap = {"clusters":{}}
if r.atlas:
    metadata[f"predictionID"] = scprep.utils.sort_clusters_by_values(metadata["prediction"], metadata["cond_likelihood"])
    likelihood_cmap["prediction"] = dict()
if r.manAnn:
    metadata[f"manualAnnotID"] = scprep.utils.sort_clusters_by_values(metadata["manualAnnot"], metadata["cond_likelihood"])
    likelihood_cmap["manualAnnot"] = dict()

likelihood_colors = ['blue', "red", "grey"]
likelihood_thresholds = [0.5,0.5]
```

The likelihood of each cluster can be plotted just like the individual cells. Here, the cells the clusters and celltypes significantly **depleted** in the `r py$conditions[1]` dataset are highlighted
in `r py$likelihood_colors[1]`, and those significantly **enriched** are highlighted in `r py$likelihood_colors[2]`. Clusters and celltypes with no significant difference are in `r py$likelihood_colors[3]`.
To be considered significantly depleted, a cluster must have a 95% credible interval below `r py$likelihood_thresholds[1]` likelihood, and above `r py$likelihood_thresholds[2]` to be significantly enriched.

Below, the clusters are colored depending on their depletion/enrichment.

```{python plot-likelihood-clusters, fig.align='default'}

legend_lines = [mpl.lines.Line2D([0], [0], marker = "o", color = likelihood_colors[0], markerfacecolor = "black"),
                mpl.lines.Line2D([0], [0], marker = "o", color = likelihood_colors[1], markerfacecolor = "black"),
                mpl.lines.Line2D([0], [0], marker = "o", color = likelihood_colors[2], markerfacecolor = "black")]
legend_scatter = [mpl.lines.Line2D([0], [0], marker="o", color = "white", markerfacecolor = likelihood_colors[0]),
                  mpl.lines.Line2D([0], [0], marker="o", color = "white", markerfacecolor = likelihood_colors[1]),
                  mpl.lines.Line2D([0], [0], marker="o", color = "white", markerfacecolor = likelihood_colors[2])]
legend_labels =  ["Significantly depleted","Significantly enriched","No significant difference"]

if r.atlas and r.manAnn:
    clustering_type = ["manualAnnot", "prediction"]
elif r.atlas:
    clustering_type = ["clusters", "prediction"]
elif r.manAnn:
    clustering_type = ["clusters", "manualAnnot"]
else:
    clustering_type = ["clusters"]
plt.figure(figsize=(18,10), dpi=120)


for clustering in clustering_type:
  clusters, counts = np.unique(metadata[clustering], return_counts=True)

  for pred in np.unique(metadata[f"{clustering}ID"]):
    clust_cmap = plot_confidence_interval(pred, metadata.loc[metadata[f"{clustering}ID"] == pred, "cond_likelihood"], thresholds = likelihood_thresholds, colors=likelihood_colors)
    likelihood_cmap[clustering].update(clust_cmap)

  plt.axvline(0.5, linestyle = '--', color='grey', zorder=0)

  plt.grid(visible=True)
  plt.xlim(0,1)

  ylabel = [f"{clust} ({counts[list(clusters).index(clust)]})" for clust in metadata.set_index(f"{clustering}ID")[clustering].drop_duplicates()]
  plt.yticks(metadata[f"{clustering}ID"].drop_duplicates(), ylabel, fontsize=14)

  plt.title(f"Likelihood of the {'cluster' if clustering == 'clusters' else 'celltype'} cells enrichment in {conditions[0]} condition")

  plt.legend(legend_lines,legend_labels, loc="center right")
  plt.tight_layout()
  plt.show()


  scprep.plot.scatter2d(umap, c=metadata[f"{clustering}ID"], cmap=likelihood_cmap[clustering],
                        title=f'{clustering} likelihoods', ticks=False, legend=False)
  for cluster in metadata[clustering]:
    plt.annotate(cluster, 
      umap[metadata[clustering]==cluster].mean(axis=0),
      horizontalalignment='center',
      verticalalignment='center',
      size=10, weight='bold',
      color="black", backgroundcolor="white")

  plt.legend(legend_scatter,legend_labels, loc="upper right",fontsize="large")
  plt.tight_layout()
  plt.show()

scprep.plot.scatter2d(umap, c=metadata["orig.ident"], cmap=mpl.colormaps["tab10"](range(len(conditions))),
                      title='Cell distribution by conditions', ticks=False, legend=True)
plt.tight_layout()
plt.show()

plt.clf()
```

```{r metadata, echo=FALSE, child="61_show_metadata.Rmd"}
```