-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path07_meld_pipeline.Rmd
450 lines (371 loc) · 17.8 KB
/
07_meld_pipeline.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
---
author: "Systems Biomedicine Team"
date: "Last compiled on `r format(Sys.time(), '%d %B, %Y')`"
output:
rmdformats::material:
use_bookdown: true
thumbnails: false
df_print: kable
code_folding: hide
number_sections: yes
pdf_document:
number_sections: yes
toc: yes
toc_depth: 3
keep_tex: no
title: |
| Step 10: Differential abundance analysis -- `r DATASET`
---
<!-- Javascript for zooming on figures (adapted from: https://stackoverflow.com/questions/40401680) -->
<!-- Jquery import conflicts with DT::datatable so needs to be commented here -->
<!-- <script src = "https://ajax.googleapis.com/ajax/libs/jquery/3.1.1/jquery.min.js"></script> -->
<!--
<style>
.zoomDiv {
display: none;
position: fixed;
top: 50%;
left: 50%;
z-index: 50;
transform: translate(-50%, -50%);
background-color: #FFFFFF;
box-shadow: 0px 0px 50px #888888;
width: fit-content;
max-width: 90%;
max-height: 90%;
overflow: auto;
}
.zoomImg {
width: 150%;
}
</style>
<script type = "text/javascript">
$(document).ready(function() {
$("body").prepend("<div class = \"zoomDiv\"><img src = \"\" class = \"zoomImg\"></div>");
// onClick for all img except the zoomed one and link ones (filter)
// use "img.zoom" and out.extra = "class = \"zoom\"" in chunk to specify manually which chunk images can be zoomed
$("img:not(.zoomImg)").filter(":not(a *)").click(function() {
$(".zoomImg").attr("src", $(this).attr("src"));
$(".zoomDiv").show();
})
// onClick function for hiding div
$("img.zoomImg").click(function() {
$(".zoomDiv").hide();
})
})
</script> -->
```{r setup, include = FALSE}
library(knitr)
library(reticulate)
# we force the use of our env's python. We faced situation where reticulates were using the banse env's python
reticulate::use_python(system("which python", intern = TRUE))
knitr::knit_engines$set(python = reticulate::eng_python)
options(knitr.purl.inline = TRUE)
knitr::opts_chunk$set(
# code evaluation
eval = TRUE,
# text output
echo = TRUE,
results = "hold",
warning = FALSE,
error = FALSE,
message = FALSE,
strip.white = TRUE,
# code decoration
tidy.opts = list(width.cutoff = 90),
comment = "",
attr.output = ".numberLines",
# plots
fig.path = paste0(PATH_OUT_FIG, "/07_MELD_scenario_", scenario, "_"), # is set later, in chunk setup2
fig.show = "hold", # tuned to "hold" in multiple plots chunk
dev = "png",
out.width = "50%",
fig.width = 12,
fig.height = 12,
fig.align = "center" # should be tuned to default in multiple plots chunk
)
```
# The MELD method
To perform differential abundance analysis, the MELD method calculates the relative likelihood that each cell would be observed in either a condition or the other over a manifold approximated from all cells of all conditions.
In other words, it gives for each cell, how likely it is to be observed in one condition. Using the calculated ratio, it is then possible to identify which cell population react differently to the conditions.
<a href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8122059/">Original paper available here.</a>
```{r load-r-libraries, include=FALSE}
source("./conversions.R")
source("./checkDirHierarchy.R", local = TRUE)
```
```{r dir-management, include=FALSE}
# PATHS
PATH_CLUSTER_TABLE <- paste0(PATH_ROOT, "/08_combineData/clustering_", DATASET, "_scenario.", scenario, "_meth.", CLUST_METH, "_res.", CLUST_RES, ".csv")
PATH_RDS_FILE <- file.path(PATH_RDS_OBJECTS, paste0("07_preprocessed_", DATASET, "_scenario.", scenario, ".rds"))
PATH_OUT_ANALYSIS <- file.path(PATH_ROOT, "05_additionalControls")
if (!dir.exists(PATH_OUT_ANALYSIS)) { dir.create(PATH_OUT_ANALYSIS) }
# check file existence
checkPath(PATH_CLUSTER_TABLE)
checkPath(PATH_RDS_FILE)
if (atlas) {
PATH_LABEL_TRANSFER <- paste0(PATH_ROOT, "/08_combineData/transferAnnotation_combinedData_", DATASET, "_scenario_", scenario, ".csv")
checkPath(PATH_LABEL_TRANSFER)
checkPath(file.path(PATH_ATLAS, "colorsSheet.csv"))
}
if (manAnn) {
checkPath(PATH_MANUAL_ANNOTATION)
}
```
# Load `r DATASET` data for scenario `r scenario` and compute MELD
```{r convert-to-h5ad, include=FALSE}
converted_file <- gsub("\\.rds", ".h5ad", PATH_RDS_FILE)
# Run the conversion if the converted file does not exist or the input file is newer
if(!(file.exists(converted_file) && file.info(PATH_RDS_FILE)$mtime < file.info(converted_file)$mtime)){
seurat_to_h5ad(PATH_RDS_FILE)
}
```
```{python load-py-libraries, include=FALSE}
import pandas as pd
import numpy as np
import meld
import scanpy as sc
import matplotlib as mpl
import matplotlib.pyplot as plt
import scprep
import sklearn
from pathlib import Path
np.random.seed(42)
```
```{python mpl-init, include=FALSE}
mpl.use("agg", force = True)
plt.rc("axes", titlesize=20, labelsize=20)
plt.rc("ytick", labelsize=18)
plt.rc("xtick", labelsize=18)
plt.rc("figure", titlesize=26)
plt.rc("legend", fontsize=18)
```
```{python load-data, include = FALSE}
adata = sc.read_h5ad(Path(r.PATH_RDS_FILE).with_suffix(".h5ad"))
# Get the loadings of the different reductions, the metadata and raw data as a dataframe. We will mostly be using the umap to project the data.
umap = adata.obsm["X_umap"]
pca = adata.obsm["X_pca"]
metadata = adata.obs
data = adata.to_df()
conditions = np.unique(metadata["orig.ident"])
experimental_samples = np.unique(metadata[metadata["orig.ident"] == conditions[0], "replicates"]) if "replicates" in metadata else [conditions[0]]
replicate_markers = np.unique(metadata["replicate_markers"]) if "replicate_markers" in metadata else None
del adata
```
```{python compute-meld}
# create the affinity graph and perform the MELD algorithm on it. The beta and knn values come from the paper and were determined optimals after a benchmark by the authors.
meld_op = meld.MELD(beta = 67, knn=7)
sample_densities = meld_op.fit_transform(pca, sample_labels=metadata["orig.ident"])
```
```{python sample-likelihood, include=FALSE}
def replicate_normalize_densities(sample_densities, replicates):
# Get the unique replicates
sample_likelihoods = sample_densities.copy()
# If there is no replicate, we normalize across the complete conditions
if replicates is None:
sample_likelihoods[conditions] = sklearn.preprocessing.normalize(sample_densities[conditions], norm="l1")
else:
for rep in replicates:
# Select the columns of `sample_densities` for that replicate
curr_cols = sample_densities.columns[[rep in col for col in sample_densities.columns]]
# Apply L1 normalization between the same technical replicate of the conditions. If there is no replicate, apply L1 normalization on the conditions
sample_likelihoods[curr_cols] = sklearn.preprocessing.normalize(sample_densities[curr_cols], norm='l1')
return sample_likelihoods
sample_likelihoods = replicate_normalize_densities(sample_densities, replicate_markers)
```
```{python metadata-prep, include=FALSE}
metadata['cond_likelihood'] = sample_likelihoods[experimental_samples].mean(axis=1).values # store the mean of the condition's replicate
# Clusters
clusters = pd.read_csv(r.PATH_CLUSTER_TABLE, header=0)
metadata["clusters"] = clusters["integrated_snn_res.1"].values
cluster_colors = mpl.colormaps["tab20"].resampled(len(np.unique(metadata["clusters"])))
# Manual annotation
if r.manAnn:
# load file
file = pd.read_csv(Path(r.PATH_MANUAL_ANNOTATION,
f"manualAnnotation_{r.DATASET}_scenario.{r.scenario}.csv"), header = 0)
# fill metadata table
metadata["clusters"] = file["integrated_snn_res.1"].values
metadata["manualAnnot"] = file["fine_annotation"].values
# get colors
cluster_colors = mpl.colormaps["tab20"].resampled(len(np.unique(metadata["clusters"])))
annotation_colors = mpl.colormaps["tab20"].resampled(len(np.unique(metadata["manualAnnot"])))
# Label tranfer annotations from Atlas
if r.atlas:
pred = pd.read_csv(r.PATH_LABEL_TRANSFER, header=0)
colortable = pd.read_csv(Path(r.PATH_ATLAS, "colorsSheet.csv"), header=0)
metadata["prediction"] = pred["predicted.id"].values
celltype_colors = dict(zip(colortable["celltype"],colortable["celltype_color"]))
predcol = [celltype_colors[celltype] for celltype in metadata["prediction"]]
```
# MELD visualisation
The MELD method computes the relative likelihood that each cell is in one sample or the other. The plots below display the umap projection of the likelihood manifold for the `r py$conditions[1]`condition.
The more a cell is close to 1, the higher the likelihood of it belonging to the `r py$conditions[1]` sample, and inversely. The cells with a likelihood around 0.5 are equivalent in each conditions.
Only one sample is represented here, because the value of a cell for the `r py$conditions[2]` dataset is simply 1 - `r py$conditions[1]` value.
```{python plot-likelihoods, fig.align='default'}
vmin = np.min(metadata["cond_likelihood"])
vmax = np.max(metadata["cond_likelihood"])
# Display the likelihood of cells belonging to the condition of the replicate (higher values mean higher likelihood).
# When there are only two conditions, the values of the second condition are just 1 - values of the first condition,
# which is pointless to represent.
if(len(experimental_samples) > 1):
#fig, axes = plt.subplots(len(experimental_samples),2, figsize=(len(experimental_samples)*5,8))
for curr_sample in experimental_samples:
scprep.plot.scatter2d(umap, c=sample_likelihoods[curr_sample], cmap=meld.get_meld_cmap(),
vmin=0, vmax=1,
title=curr_sample, ticks=False)
plt.tight_layout()
plt.show()
scprep.plot.scatter2d(umap, c=sample_likelihoods[curr_sample], cmap=meld.get_meld_cmap(),
vmin=vmin, vmax=vmax,
title=f"{curr_sample} high contrast", ticks=False)
plt.tight_layout()
plt.show()
else:
scprep.plot.scatter2d(umap, c=sample_likelihoods[experimental_samples[0]], cmap=meld.get_meld_cmap(),
vmin=0, vmax=1,
title=experimental_samples[0], ticks=False)
plt.tight_layout()
plt.show()
scprep.plot.scatter2d(umap, c=sample_likelihoods[experimental_samples[0]], cmap=meld.get_meld_cmap(),
vmin=vmin, vmax=vmax,
title=f"{experimental_samples[0]} high contrast", ticks=False)
plt.tight_layout()
plt.show()
plt.clf()
```
`r if(length(py$experimental_samples) > 1){ cat("## Replicates mean and standard deviation")}`
```{python plot-likelihoods-stats, eval=(length(py$experimental_samples) > 1), echo=(length(py$experimental_samples) > 1), fig.align="default"}
# Display the mean and std between replicates. Only meaningful with technical replicates in one condition.
# Low standard deviation highlight areas where the cells are similarly enriched or depleted between replicates
scprep.plot.scatter2d(umap, c=sample_likelihoods[experimental_samples].mean(axis=1),
cmap=meld.get_meld_cmap(), vmin=0, vmax=1,
title='Mean', ticks=False)
plt.tight_layout()
plt.show()
scprep.plot.scatter2d(umap, c=sample_likelihoods[experimental_samples].std(axis=1), vmin=0,
cmap='inferno', title='St. Dev.', ticks=False)
plt.tight_layout()
plt.show()
```
## Clusters vs MELD
```{python plot-clustering, fig.align='default'}
scprep.plot.scatter2d(umap, c=metadata["clusters"], cmap=cluster_colors,
title='Cluster visualisation', ticks=False, discrete=True, legend=True)
for cluster in metadata["clusters"]:
plt.annotate(cluster,
umap[metadata['clusters']==cluster].mean(axis=0),
horizontalalignment='center',
verticalalignment='center',
size=10, weight='bold',
color="black")
plt.tight_layout()
plt.show()
if r.manAnn:
scprep.plot.scatter2d(umap, c=metadata["manualAnnot"], cmap=annotation_colors,
title='Manual annotation visualisation', ticks=False, legend=False) # Legend is finicky, will either try to correctly display or delete the chunk
for annot in metadata["manualAnnot"]:
plt.annotate(annot,
umap[metadata["manualAnnot"]==annot].mean(axis=0),
horizontalalignment='center',
verticalalignment='center',
size=8,
color="black")
plt.tight_layout()
plt.show()
plt.clf()
```
```{r, out.width="50%"}
checkPath(paste0(PATH_OUT_FIG, "/06_annotDEAviz_scenario_", scenario, "_celltype-DimPlot-1.png"))
include_graphics(paste0(PATH_OUT_FIG, "/06_annotDEAviz_scenario_", scenario, "_celltype-DimPlot-1.png"))
```
```{python plot-confidence-interval-func, include=FALSE}
# Function adapted from https://stackoverflow.com/a/70949996
def plot_confidence_interval(y, values, z=1.96, thresholds=[0.5,0.5], colors=["green", 'blue', "red"], vertical_line_width=0.35):
mean = np.mean(values)
stdev = np.std(values)
confidence_interval = z * stdev #/ np.sqrt(len(values))
if (mean < thresholds[0] and mean + confidence_interval < thresholds[0]):
color = colors[0]
elif (mean > thresholds[1] and mean - confidence_interval > thresholds[1]):
color = colors[1]
else:
color = colors[2]
top = y - vertical_line_width / 2
left = mean - confidence_interval
bottom = y + vertical_line_width / 2
right = mean + confidence_interval
plt.plot([left, right], [y, y], color=color)
plt.plot([left, left], [top, bottom], color=color)
plt.plot([right, right], [top, bottom], color=color)
plt.plot(mean, y, "o", color="black")
return {y: color}
```
# Confidence interval of the likelihood
```{python likelihood-plot-prep, include=FALSE}
#Sort the index of each cluster from lowest to highest average chd likelihood value
metadata[f"clustersID"] = scprep.utils.sort_clusters_by_values(metadata["clusters"], metadata["cond_likelihood"])
likelihood_cmap = {"clusters":{}}
if r.atlas:
metadata[f"predictionID"] = scprep.utils.sort_clusters_by_values(metadata["prediction"], metadata["cond_likelihood"])
likelihood_cmap["prediction"] = dict()
if r.manAnn:
metadata[f"manualAnnotID"] = scprep.utils.sort_clusters_by_values(metadata["manualAnnot"], metadata["cond_likelihood"])
likelihood_cmap["manualAnnot"] = dict()
likelihood_colors = ['blue', "red", "grey"]
likelihood_thresholds = [0.5,0.5]
```
The likelihood of each cluster can be plotted just like the individual cells. Here, the cells the clusters and celltypes significantly **depleted** in the `r py$conditions[1]` dataset are highlighted
in `r py$likelihood_colors[1]`, and those significantly **enriched** are highlighted in `r py$likelihood_colors[2]`. Clusters and celltypes with no significant difference are in `r py$likelihood_colors[3]`.
To be considered significantly depleted, a cluster must have a 95% credible interval below `r py$likelihood_thresholds[1]` likelihood, and above `r py$likelihood_thresholds[2]` to be significantly enriched.
Below, the clusters are colored depending on their depletion/enrichment.
```{python plot-likelihood-clusters, fig.align='default'}
legend_lines = [mpl.lines.Line2D([0], [0], marker = "o", color = likelihood_colors[0], markerfacecolor = "black"),
mpl.lines.Line2D([0], [0], marker = "o", color = likelihood_colors[1], markerfacecolor = "black"),
mpl.lines.Line2D([0], [0], marker = "o", color = likelihood_colors[2], markerfacecolor = "black")]
legend_scatter = [mpl.lines.Line2D([0], [0], marker="o", color = "white", markerfacecolor = likelihood_colors[0]),
mpl.lines.Line2D([0], [0], marker="o", color = "white", markerfacecolor = likelihood_colors[1]),
mpl.lines.Line2D([0], [0], marker="o", color = "white", markerfacecolor = likelihood_colors[2])]
legend_labels = ["Significantly depleted","Significantly enriched","No significant difference"]
if r.atlas and r.manAnn:
clustering_type = ["manualAnnot", "prediction"]
elif r.atlas:
clustering_type = ["clusters", "prediction"]
elif r.manAnn:
clustering_type = ["clusters", "manualAnnot"]
else:
clustering_type = ["clusters"]
plt.figure(figsize=(18,10), dpi=120)
for clustering in clustering_type:
clusters, counts = np.unique(metadata[clustering], return_counts=True)
for pred in np.unique(metadata[f"{clustering}ID"]):
clust_cmap = plot_confidence_interval(pred, metadata.loc[metadata[f"{clustering}ID"] == pred, "cond_likelihood"], thresholds = likelihood_thresholds, colors=likelihood_colors)
likelihood_cmap[clustering].update(clust_cmap)
plt.axvline(0.5, linestyle = '--', color='grey', zorder=0)
plt.grid(visible=True)
plt.xlim(0,1)
ylabel = [f"{clust} ({counts[list(clusters).index(clust)]})" for clust in metadata.set_index(f"{clustering}ID")[clustering].drop_duplicates()]
plt.yticks(metadata[f"{clustering}ID"].drop_duplicates(), ylabel, fontsize=14)
plt.title(f"Likelihood of the {'cluster' if clustering == 'clusters' else 'celltype'} cells enrichment in {conditions[0]} condition")
plt.legend(legend_lines,legend_labels, loc="center right")
plt.tight_layout()
plt.show()
scprep.plot.scatter2d(umap, c=metadata[f"{clustering}ID"], cmap=likelihood_cmap[clustering],
title=f'{clustering} likelihoods', ticks=False, legend=False)
for cluster in metadata[clustering]:
plt.annotate(cluster,
umap[metadata[clustering]==cluster].mean(axis=0),
horizontalalignment='center',
verticalalignment='center',
size=10, weight='bold',
color="black", backgroundcolor="white")
plt.legend(legend_scatter,legend_labels, loc="upper right",fontsize="large")
plt.tight_layout()
plt.show()
scprep.plot.scatter2d(umap, c=metadata["orig.ident"], cmap=mpl.colormaps["tab10"](range(len(conditions))),
title='Cell distribution by conditions', ticks=False, legend=True)
plt.tight_layout()
plt.show()
plt.clf()
```
```{r metadata, echo=FALSE, child="61_show_metadata.Rmd"}
```