Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ jobs:
strategy:
matrix:
os: [ubuntu-latest]
python-version: ['3.9', '3.10', '3.11', '3.12', '3.13']
python-version: ['3.10', '3.11', '3.12', '3.13', '3.14']

steps:
- uses: actions/checkout@v2
Expand Down
88 changes: 50 additions & 38 deletions pegasus/annotate_cluster/annotate_cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from io import IOBase

import logging

logger = logging.getLogger(__name__)

from pegasusio import timer, MultimodalData, UnimodalData
Expand All @@ -30,8 +31,7 @@ def evaluate(
de_down: pd.DataFrame,
thre: float,
):
""" Calculate score for matching a cluster with a putative cell type.
"""
"""Calculate score for matching a cluster with a putative cell type."""
self.score = self.avgp = 0.0
self.weak_support = []
self.strong_support = []
Expand All @@ -56,14 +56,10 @@ def evaluate(

if fc >= thre:
numer += 2.0
self.strong_support.append(
(marker, f"{percent:.2f}%")
)
self.strong_support.append((marker, f"{percent:.2f}%"))
else:
numer += 1.0 + (fc - 1.0) / (thre - 1.0)
self.weak_support.append(
(marker, f"{percent:.2f}%")
)
self.weak_support.append((marker, f"{percent:.2f}%"))
else:
assert sign == "-"
if gsym not in de_up.index:
Expand All @@ -76,14 +72,10 @@ def evaluate(
percent = de_down.at[gsym, "percent"]
if fc >= thre:
numer += 2.0
self.strong_support.append(
(marker, f"{percent:.2f}%")
)
self.strong_support.append((marker, f"{percent:.2f}%"))
else:
numer += 1.0 + (fc - 1.0) / (thre - 1.0)
self.weak_support.append(
(marker, f"{percent:.2f}%")
)
self.weak_support.append((marker, f"{percent:.2f}%"))
elif not self.ignore_nonde:
numer += 1.0
self.weak_support.append((marker, "N/A"))
Expand Down Expand Up @@ -116,8 +108,7 @@ def __init__(self, markers: Dict, genes: List[str]) -> None:
self.recalibrate(self.object, genes)

def recalibrate(self, obj: dict, genes: List[str]) -> None:
""" Remove markers that are not expressed (not in genes) and calculate partial weights for existing genes.
"""
"""Remove markers that are not expressed (not in genes) and calculate partial weights for existing genes."""
for celltype in obj["cell_types"]:
denom = 0.0
for marker_set in celltype["markers"]:
Expand All @@ -141,8 +132,7 @@ def evaluate(
ignore_nonde: bool = False,
obj: dict = None,
):
""" Evaluate a cluster to determine its putative cell type.
"""
"""Evaluate a cluster to determine its putative cell type."""
if obj is None:
obj = self.object

Expand Down Expand Up @@ -172,16 +162,17 @@ def report(
ct_list: List["CellType"],
space: int = 4,
) -> None:
""" Write putative cell type reports to fout.
"""
"""Write putative cell type reports to fout."""
for ct in ct_list:
fout.write(" " * space + str(ct) + "\n")
if ct.subtypes is not None:
self.report(fout, ct.subtypes, space + 4)


def infer_cluster_names(
cell_type_dict: Dict[str, List["CellType"]], threshold: float = 0.5, is_human_immune: bool = False
cell_type_dict: Dict[str, List["CellType"]],
threshold: float = 0.5,
is_human_immune: bool = False,
) -> List[str]:
"""Decide cluster names based on cell types automatically.

Expand Down Expand Up @@ -222,25 +213,38 @@ def infer_cluster_names(
subname = None
has_naive_t = False
for subt in ct.subtypes:
if subt.score >= threshold and (subt.name != "T regulatory cell" or subt.avgp > 0.5):
if subt.score >= threshold and (
subt.name != "T regulatory cell" or subt.avgp > 0.5
):
if subt.name == "Naive T cell" and subt.score >= 0.6:
has_naive_t = True
elif subname is None:
subname = subt.name
if subname is None:
cell_name = "Naive T cell" if has_naive_t else "T cell"
elif has_naive_t and (subname in ["T helper cell", "Cytotoxic T cell"]):
cell_name = "CD4+ Naive T cell" if subname == "T helper cell" else "CD8+ Naive T cell"
cell_name = (
"CD4+ Naive T cell"
if subname == "T helper cell"
else "CD8+ Naive T cell"
)
else:
cell_name = subname
elif is_human_immune and ct.name == "CD1C+ dendritic cell":
cell_name = ct.name
for ctype in ct_list[1:]:
if ctype.score >= threshold and ctype.name == "CLEC9A+ dendritic cell":
if (
ctype.score >= threshold
and ctype.name == "CLEC9A+ dendritic cell"
):
cell_name = "Conventional dendritic cell (CD1C+/CLEC9A+)"
break
else:
while ct.subtypes is not None and len(ct.subtypes) > 0 and ct.subtypes[0].score >= threshold:
while (
ct.subtypes is not None
and len(ct.subtypes) > 0
and ct.subtypes[0].score >= threshold
):
ct = ct.subtypes[0]
cell_name = ct.name

Expand Down Expand Up @@ -315,7 +319,8 @@ def infer_cell_types(
if output_file is not None:
fout = open(output_file, "w")

import pkg_resources
from importlib import resources

predefined_markers = dict(
human_immune="human_immune_cell_markers.json",
mouse_immune="mouse_immune_cell_markers.json",
Expand All @@ -327,12 +332,13 @@ def infer_cell_types(
)

if isinstance(markers, str):
tokens = markers.split(',')
tokens = markers.split(",")
markers = None
for token in tokens:
if token in predefined_markers:
token = pkg_resources.resource_filename(
"pegasus.annotate_cluster", predefined_markers[token]
token = str(
resources.files("pegasus.annotate_cluster")
/ predefined_markers[token]
)
with open(token) as fin:
tmp_dict = json.load(fin)
Expand Down Expand Up @@ -379,7 +385,9 @@ def infer_cell_types(
de_up["fc"] = 2.0 ** de_up["fc"]
de_down["fc"] = 2.0 ** de_down["fc"]

results = anno.evaluate(de_up, de_down, threshold=threshold, ignore_nonde=ignore_nonde)
results = anno.evaluate(
de_up, de_down, threshold=threshold, ignore_nonde=ignore_nonde
)

if output_file is not None:
fout.write(f"Cluster {clust_id}:\n")
Expand All @@ -394,7 +402,7 @@ def infer_cell_types(


def annotate(
data: Union[MultimodalData, UnimodalData,AnnData],
data: Union[MultimodalData, UnimodalData, AnnData],
name: str,
based_on: str,
anno_dict: Union[Dict[str, str], List[str]],
Expand Down Expand Up @@ -425,10 +433,15 @@ def annotate(
>>> pg.annotate(data, 'anno', 'louvain_labels', ['T cell', 'B cell'])
"""
if isinstance(anno_dict, list):
cluster_ids = data.obs[based_on].cat.categories.values.astype('str')
cluster_ids = data.obs[based_on].cat.categories.values.astype("str")
anno_dict = dict(zip(cluster_ids, anno_dict))
from natsort import natsorted
data.obs[name] = pd.Categorical([anno_dict[x] for x in data.obs[based_on]], categories = natsorted(np.unique(list(anno_dict.values()))))
from natsort import natsorted

data.obs[name] = pd.Categorical(
[anno_dict[x] for x in data.obs[based_on]],
categories=natsorted(np.unique(list(anno_dict.values()))),
)


@timer(logger=logger)
def run_annotate_cluster(
Expand All @@ -441,8 +454,7 @@ def run_annotate_cluster(
threshold: float = 0.5,
ignore_nonde: bool = False,
) -> None:
""" For command line use.
"""
"""For command line use."""
from pegasusio import read_input

data = read_input(input_file, mode="r")
Expand All @@ -459,8 +471,8 @@ def run_annotate_cluster(


def annotate_data_object(input_file: str, annotation: str) -> None:
""" For command line use.
annotation: anno_name:clust_name:cell_type1;...cell_typen
"""For command line use.
annotation: anno_name:clust_name:cell_type1;...cell_typen
"""
from pegasusio import read_input, write_output

Expand Down
45 changes: 33 additions & 12 deletions pegasus/check_sample_indexes/check_sample_indexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,13 @@
from sys import exit

import json
import pkg_resources

from importlib import resources
from typing import List, Dict, Tuple

import logging
logger = logging.getLogger(__name__)

logger = logging.getLogger(__name__)


def load_json_index(input_file: str) -> Dict[str, List[str]]:
Expand All @@ -23,23 +23,35 @@ def load_json_index(input_file: str) -> Dict[str, List[str]]:

def load_chromium_indexes() -> Tuple[dict, dict]:
    """Load the two Chromium sample-index sets bundled with this package.

    Both JSON files are package resources of ``pegasus.check_sample_indexes``
    and are parsed with ``load_json_index``.

    Returns:
        A ``(GA_indexes, NA_indexes)`` tuple, where ``GA_indexes`` comes from
        ``chromium-shared-sample-indexes-plate.json`` and ``NA_indexes`` from
        ``Chromium-i7-Multiplex-Kit-N-Set-A-sample-indexes-plate.json``.
    """
    package_files = resources.files("pegasus.check_sample_indexes")

    def _load(filename: str) -> dict:
        # ``str()`` of a Traversable is only a usable path when the package
        # lives on a regular filesystem.  ``as_file`` yields a real path in
        # every case (extracting to a temporary file if necessary).
        # NOTE(review): assumes load_json_index finishes reading the file
        # before it returns, since the path may vanish after the ``with``.
        with resources.as_file(package_files / filename) as json_path:
            return load_json_index(str(json_path))

    GA_indexes = _load("chromium-shared-sample-indexes-plate.json")
    NA_indexes = _load("Chromium-i7-Multiplex-Kit-N-Set-A-sample-indexes-plate.json")
    return GA_indexes, NA_indexes


def load_index_file(
    index_file: str, GA_indexes: Dict[str, List[str]], NA_indexes: Dict[str, List[str]]
) -> List[Tuple[str, str]]:
    """Read an index file and expand known index-set names into their members.

    Only the first comma-separated field of each line is used.  If that field
    is a key of ``GA_indexes`` or ``NA_indexes`` (checked in that order), one
    entry is produced per member of the set; otherwise the field itself is
    kept as a single entry.

    Args:
        index_file: Path to a text/CSV file with one index per line.
        GA_indexes: Mapping from index-set name to its member sequences.
        NA_indexes: Second mapping with the same layout, consulted after
            ``GA_indexes``.

    Returns:
        A list of ``(sequence, label)`` tuples in file order.  ``label`` is
        the index-set name the sequence came from, or ``"orig"`` when the
        field was not a known set name.
    """
    index_arr = []
    with open(index_file) as fin:
        for line in fin:
            stripped = line.strip()
            if not stripped:
                # A blank line would otherwise be recorded as an empty barcode.
                continue
            index = stripped.split(",")[0]
            if index in GA_indexes:
                index_arr.extend((x, index) for x in GA_indexes[index])
            elif index in NA_indexes:
                index_arr.extend((x, index) for x in NA_indexes[index])
            else:
                index_arr.append((index, "orig"))
    return index_arr


Expand Down Expand Up @@ -79,12 +91,21 @@ def run_check_sample_indexes(index_file, n_mis=1, n_report=-1):
min_hd, min_i, min_j = calc_min_hamming_dist(index_arr)

n_mismatch = (min_hd - 1) // 2
barcode1 = index_arr[min_i][0] if index_arr[min_i][1] == 'orig' else f"{index_arr[min_i][1]}({index_arr[min_i][0]})"
barcode2 = index_arr[min_j][0] if index_arr[min_j][1] == 'orig' else f"{index_arr[min_j][1]}({index_arr[min_j][0]})"

logger.info(f"Minimum hamming distance is {min_hd}, achieved between {barcode1} and {barcode2}. A n_mis = {n_mismatch} can be set.")
barcode1 = (
index_arr[min_i][0]
if index_arr[min_i][1] == "orig"
else f"{index_arr[min_i][1]}({index_arr[min_i][0]})"
)
barcode2 = (
index_arr[min_j][0]
if index_arr[min_j][1] == "orig"
else f"{index_arr[min_j][1]}({index_arr[min_j][0]})"
)

logger.info(
f"Minimum hamming distance is {min_hd}, achieved between {barcode1} and {barcode2}. A n_mis = {n_mismatch} can be set."
)


if n_mismatch < n_mis:
logger.error(f"Index collision detected in {index_file} with n_mis = {n_mis}!")
elif n_report > 0:
Expand Down
38 changes: 20 additions & 18 deletions pegasus/tools/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,29 +169,31 @@ def check_batch_key(data: Union[MultimodalData, UnimodalData], batch: Union[str,



import pkg_resources
from importlib import resources

# Location of the data files bundled with the ``pegasus`` package.
data_path = resources.files("pegasus") / "data_files"

# Built-in gene signature sets: signature name -> path of its GMT file.
# Every signature is stored as ``<name>.gmt`` under ``data_files``, so the
# mapping is derived from the names instead of being spelled out entry by
# entry.
predefined_signatures = {
    name: str(data_path / f"{name}.gmt")
    for name in (
        "cell_cycle_human",
        "cell_cycle_mouse",
        "gender_human",
        "gender_mouse",
        "mitochondrial_genes_human",
        "mitochondrial_genes_mouse",
        "ribosomal_genes_human",
        "ribosomal_genes_mouse",
        "apoptosis_human",
        "apoptosis_mouse",
        "human_lung",
        "mouse_lung",
        "mouse_brain",
        "mouse_liver",
        "emt_human",
    )
}

# Built-in pathway collections: collection name -> path of its GMT file.
# These file names do not follow the ``<name>.gmt`` pattern, so they are
# listed explicitly.
predefined_pathways = dict(
    hallmark=str(data_path / "h.all.v7.5.1.symbols.gmt"),
    canonical_pathways=str(data_path / "c2.cp.v7.5.1.symbols.gmt"),
)

def load_signatures_from_file(input_file: str) -> Dict[str, List[str]]:
Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,11 @@
"Topic :: Software Development :: Build Tools",
"Topic :: Scientific/Engineering :: Bio-Informatics",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
"Programming Language :: Python :: 3.14",
],
keywords="single cell/nucleus genomics analysis",
packages=find_packages(),
Expand All @@ -61,7 +61,7 @@
pseudobulk=["pydeseq2", "gseapy"],
all=["fitsne", "louvain", "scanorama", "torch", "harmony-pytorch", "nmf-torch", "rpy2", "forceatlas2-python", "scvi-tools", "pydeseq2", "gseapy"]
),
python_requires="~=3.9",
python_requires="~=3.10",
package_data={
"pegasus.annotate_cluster": [
"human_immune_cell_markers.json",
Expand Down
2 changes: 1 addition & 1 deletion tests/run_inmf.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
pegasus aggregate_matrix tests/data/count_matrix.csv tests/aggr

if [ -f "tests/aggr.zarr.zip" ]; then
pegasus cluster -p 2 --output-h5ad --output-loom --correct-batch-effect --correction-method inmf --louvain --umap tests/aggr.zarr.zip tests/inmf_result
pegasus cluster -p 2 --output-h5ad --output-loom --correct-batch-effect --correction-method inmf --leiden --umap tests/aggr.zarr.zip tests/inmf_result
fi
2 changes: 1 addition & 1 deletion tests/run_one_sample.sh
Original file line number Diff line number Diff line change
@@ -1 +1 @@
pegasus cluster -p 2 --min-genes 500 --max-genes 6000 --mito-prefix mt- --percent-mito 20.0 --output-filtration-results --output-h5ad --output-loom --plot-filtration-results --plot-hvf --louvain --leiden --umap --fle tests/data/heart_1k_v3/filtered_feature_bc_matrix.h5 tests/one_sample_result
pegasus cluster -p 2 --min-genes 500 --max-genes 6000 --mito-prefix mt- --percent-mito 20.0 --output-filtration-results --output-h5ad --output-loom --plot-filtration-results --plot-hvf --leiden --umap --fle tests/data/heart_1k_v3/filtered_feature_bc_matrix.h5 tests/one_sample_result