Skip to content

Commit ea5d8ed

Browse files
committed
feat add chr lookup
1 parent d27fd97 commit ea5d8ed

File tree

7 files changed

+183
-204
lines changed

7 files changed

+183
-204
lines changed

src/agct/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
"""Provide fast liftover in Python via the ``chainfile`` crate."""
22

3-
from agct.assembly_registry import Assembly, get_assembly_from_refget_id
43
from agct.converter import Converter, LiftoverResult, Strand, get_converter
4+
from agct.seqref_registry import Assembly, get_seqinfo_from_refget_id
55

66
__all__ = [
77
"Assembly",
88
"Converter",
99
"LiftoverResult",
1010
"Strand",
11-
"get_assembly_from_refget_id",
1211
"get_converter",
12+
"get_seqinfo_from_refget_id",
1313
]

src/agct/assembly_registry.py

Lines changed: 0 additions & 160 deletions
This file was deleted.

src/agct/converter.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from wags_tails.utils.storage import get_data_dir
1313

1414
import agct._core as _core
15-
from agct.assembly_registry import Assembly
15+
from agct.seqref_registry import Assembly
1616

1717
_logger = logging.getLogger(__name__)
1818

@@ -73,7 +73,7 @@ def __init__(
7373
try:
7474
file_prefix = f"chainfile_{from_assembly.value}_to_{to_assembly.value}"
7575
except AttributeError as e:
76-
msg = f"Assembly args must be instance of `agct.assembly_registry.Genome`, instead got from_assembly={from_assembly} and to_assembly={to_assembly}"
76+
msg = f"Assembly args must be instance of `agct.seqref_registry.Genome`, instead got from_assembly={from_assembly} and to_assembly={to_assembly}"
7777
_logger.exception(msg)
7878
raise ValueError(msg) from e
7979

src/agct/seqref_registry.py

Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
"""Sequence reference registry.
2+
3+
Maps refget accessions (``SQ.*``) to a tuple of (:class:`Assembly`, :class:`Chromosome`)
4+
and exposes helpers to look up the assembly/chromosome for a given ID. The registry is
5+
curated for internally-used builds (currently ``hg19``/``hg38``); extend as needed.
6+
7+
"""
8+
9+
import logging
10+
import re
11+
from enum import StrEnum
12+
13+
_logger = logging.getLogger(__name__)
14+
15+
16+
class Assembly(StrEnum):
17+
"""Constrain reference genome assembly values.
18+
19+
We could conceivably support every UCSC chainfile offering, but for now, we'll
20+
stick with internal use cases only.
21+
"""
22+
23+
HG38 = "hg38"
24+
HG19 = "hg19"
25+
26+
27+
class Chromosome(StrEnum):
28+
"""Constrain chromosome values to UCSC-style names.
29+
30+
This class should NOT be used to type-constrain input in the converter
31+
module, because in practice, chainfiles can use any name for an accession. In practice,
32+
though, we're mostly interested in UCSC chainfiles, so this class is provided as a
33+
utility for likely-relevant chromosome names.
34+
"""
35+
36+
CHR1 = "chr1"
37+
CHR2 = "chr2"
38+
CHR3 = "chr3"
39+
CHR4 = "chr4"
40+
CHR5 = "chr5"
41+
CHR6 = "chr6"
42+
CHR7 = "chr7"
43+
CHR8 = "chr8"
44+
CHR9 = "chr9"
45+
CHR10 = "chr10"
46+
CHR11 = "chr11"
47+
CHR12 = "chr12"
48+
CHR13 = "chr13"
49+
CHR14 = "chr14"
50+
CHR15 = "chr15"
51+
CHR16 = "chr16"
52+
CHR17 = "chr17"
53+
CHR18 = "chr18"
54+
CHR19 = "chr19"
55+
CHR20 = "chr20"
56+
CHR21 = "chr21"
57+
CHR22 = "chr22"
58+
CHRX = "chrX"
59+
CHRY = "chrY"
60+
61+
62+
# Registry of known refget accessions. Each key is an ``SQ.``-prefixed accession
# string and each value is the (Assembly, Chromosome) pair it identifies.
# Entries are grouped by assembly (hg38 first, then hg19) and listed in
# chr1..chr22, chrX, chrY order within each group.
REFGET_ID_INFO = {
63+
# --- hg38 ---
"SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO": (Assembly.HG38, Chromosome.CHR1),
64+
"SQ.pnAqCRBrTsUoBghSD1yp_jXWSmlbdh4g": (Assembly.HG38, Chromosome.CHR2),
65+
"SQ.Zu7h9AggXxhTaGVsy7h_EZSChSZGcmgX": (Assembly.HG38, Chromosome.CHR3),
66+
"SQ.HxuclGHh0XCDuF8x6yQrpHUBL7ZntAHc": (Assembly.HG38, Chromosome.CHR4),
67+
"SQ.aUiQCzCPZ2d0csHbMSbh2NzInhonSXwI": (Assembly.HG38, Chromosome.CHR5),
68+
"SQ.0iKlIQk2oZLoeOG9P1riRU6hvL5Ux8TV": (Assembly.HG38, Chromosome.CHR6),
69+
"SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul": (Assembly.HG38, Chromosome.CHR7),
70+
"SQ.209Z7zJ-mFypBEWLk4rNC6S_OxY5p7bs": (Assembly.HG38, Chromosome.CHR8),
71+
"SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI": (Assembly.HG38, Chromosome.CHR9),
72+
"SQ.ss8r_wB0-b9r44TQTMmVTI92884QvBiB": (Assembly.HG38, Chromosome.CHR10),
73+
"SQ.2NkFm8HK88MqeNkCgj78KidCAXgnsfV1": (Assembly.HG38, Chromosome.CHR11),
74+
"SQ.6wlJpONE3oNb4D69ULmEXhqyDZ4vwNfl": (Assembly.HG38, Chromosome.CHR12),
75+
"SQ._0wi-qoDrvram155UmcSC-zA5ZK4fpLT": (Assembly.HG38, Chromosome.CHR13),
76+
"SQ.eK4D2MosgK_ivBkgi6FVPg5UXs1bYESm": (Assembly.HG38, Chromosome.CHR14),
77+
"SQ.AsXvWL1-2i5U_buw6_niVIxD6zTbAuS6": (Assembly.HG38, Chromosome.CHR15),
78+
"SQ.yC_0RBj3fgBlvgyAuycbzdubtLxq-rE0": (Assembly.HG38, Chromosome.CHR16),
79+
"SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7": (Assembly.HG38, Chromosome.CHR17),
80+
"SQ.vWwFhJ5lQDMhh-czg06YtlWqu0lvFAZV": (Assembly.HG38, Chromosome.CHR18),
81+
"SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl": (Assembly.HG38, Chromosome.CHR19),
82+
"SQ.-A1QmD_MatoqxvgVxBLZTONHz9-c7nQo": (Assembly.HG38, Chromosome.CHR20),
83+
"SQ.5ZUqxCmDDgN4xTRbaSjN8LwgZironmB8": (Assembly.HG38, Chromosome.CHR21),
84+
"SQ.7B7SHsmchAR0dFcDCuSFjJAo7tX87krQ": (Assembly.HG38, Chromosome.CHR22),
85+
"SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP": (Assembly.HG38, Chromosome.CHRX),
86+
"SQ.8_liLu1aycC0tPQPFmUaGXJLDs5SbPZ5": (Assembly.HG38, Chromosome.CHRY),
87+
# --- hg19 ---
"SQ.S_KjnFVz-FE7M0W6yoaUDgYxLPc1jyWU": (Assembly.HG19, Chromosome.CHR1),
88+
"SQ.9KdcA9ZpY1Cpvxvg8bMSLYDUpsX6GDLO": (Assembly.HG19, Chromosome.CHR2),
89+
"SQ.VNBualIltAyi2AI_uXcKU7M9XUOuA7MS": (Assembly.HG19, Chromosome.CHR3),
90+
"SQ.iy7Zfceb5_VGtTQzJ-v5JpPbpeifHD_V": (Assembly.HG19, Chromosome.CHR4),
91+
"SQ.vbjOdMfHJvTjK_nqvFvpaSKhZillW0SX": (Assembly.HG19, Chromosome.CHR5),
92+
"SQ.KqaUhJMW3CDjhoVtBetdEKT1n6hM-7Ek": (Assembly.HG19, Chromosome.CHR6),
93+
"SQ.IW78mgV5Cqf6M24hy52hPjyyo5tCCd86": (Assembly.HG19, Chromosome.CHR7),
94+
"SQ.tTm7wmhz0G4lpt8wPspcNkAD_qiminj6": (Assembly.HG19, Chromosome.CHR8),
95+
"SQ.HBckYGQ4wYG9APHLpjoQ9UUe9v7NxExt": (Assembly.HG19, Chromosome.CHR9),
96+
"SQ.-BOZ8Esn8J88qDwNiSEwUr5425UXdiGX": (Assembly.HG19, Chromosome.CHR10),
97+
"SQ.XXi2_O1ly-CCOi3HP5TypAw7LtC6niFG": (Assembly.HG19, Chromosome.CHR11),
98+
"SQ.105bBysLoDFQHhajooTAUyUkNiZ8LJEH": (Assembly.HG19, Chromosome.CHR12),
99+
"SQ.Ewb9qlgTqN6e_XQiRVYpoUfZJHXeiUfH": (Assembly.HG19, Chromosome.CHR13),
100+
"SQ.5Ji6FGEKfejK1U6BMScqrdKJK8GqmIGf": (Assembly.HG19, Chromosome.CHR14),
101+
"SQ.zIMZb3Ft7RdWa5XYq0PxIlezLY2ccCgt": (Assembly.HG19, Chromosome.CHR15),
102+
"SQ.W6wLoIFOn4G7cjopxPxYNk2lcEqhLQFb": (Assembly.HG19, Chromosome.CHR16),
103+
"SQ.AjWXsI7AkTK35XW9pgd3UbjpC3MAevlz": (Assembly.HG19, Chromosome.CHR17),
104+
"SQ.BTj4BDaaHYoPhD3oY2GdwC_l0uqZ92UD": (Assembly.HG19, Chromosome.CHR18),
105+
"SQ.ItRDD47aMoioDCNW_occY5fWKZBKlxCX": (Assembly.HG19, Chromosome.CHR19),
106+
"SQ.iy_UbUrvECxFRX5LPTH_KPojdlT7BKsf": (Assembly.HG19, Chromosome.CHR20),
107+
"SQ.LpTaNW-hwuY_yARP0rtarCnpCQLkgVCg": (Assembly.HG19, Chromosome.CHR21),
108+
"SQ.XOgHwwR3Upfp5sZYk6ZKzvV25a4RBVu8": (Assembly.HG19, Chromosome.CHR22),
109+
"SQ.v7noePfnNpK8ghYXEqZ9NukMXW7YeNsm": (Assembly.HG19, Chromosome.CHRX),
110+
"SQ.BT7QyW5iXaX_1PSX-msSGYsqRdMKqkj-": (Assembly.HG19, Chromosome.CHRY),
111+
}
112+
113+
114+
# Shape of a well-formed refget accession: the ``SQ.`` prefix followed by exactly
# 32 characters drawn from letters, digits, ``_`` and ``-``. The hyphen sits last
# in the character class so it needs no escape; the previous ``\\-`` inside a raw
# string also admitted a literal backslash.
_ERROR_PATTERN = re.compile(r"^SQ\.[0-9A-Za-z_-]{32}$")


def get_seqinfo_from_refget_id(
    refget_accession: str,
) -> tuple[Assembly, Chromosome] | None:
    """Given a GA4GH SequenceReference refget accession ID, get back its reference genome and chromosome name

    .. code-block:: pycon

        >>> from agct.seqref_registry import get_seqinfo_from_refget_id
        >>> get_seqinfo_from_refget_id("SQ.pnAqCRBrTsUoBghSD1yp_jXWSmlbdh4g")
        (<Assembly.HG38: 'hg38'>, <Chromosome.CHR2: 'chr2'>)

    Use for acquiring a converter instance and calling liftover on a referenced GA4GH
    variation object.

    :param refget_accession: sequence reference (``"SQ."`` followed by 32 characters)
    :return: the ``(assembly, chromosome)`` pair registered for the accession, or
        ``None`` if the accession is well-formed but not present in the registry
    :raise ValueError: if input appears to be in an invalid format for a refget accession ID
    """
    # ``fullmatch`` requires the whole string to be consumed, so an otherwise
    # valid accession followed by a trailing newline is rejected (``match`` with
    # ``$`` would have let it through).
    if _ERROR_PATTERN.fullmatch(refget_accession) is None:
        msg = f"refget accession ID must be in format 'SQ.ABCDEFGHIJKLMNOPQRSTUVWXYZ123456'; got {refget_accession}"
        _logger.error(msg)
        raise ValueError(msg)
    # A well-formed accession that is not registered yields None rather than an error.
    return REFGET_ID_INFO.get(refget_accession)

tests/test_assembly_registry.py

Lines changed: 0 additions & 39 deletions
This file was deleted.

tests/test_converter.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ def test_invalid():
3030
with pytest.raises(
3131
ValueError,
3232
match=re.escape(
33-
"Assembly args must be instance of `agct.assembly_registry.Genome`, instead got from_assembly=hg19 and to_assembly=hg18"
33+
"Assembly args must be instance of `agct.seqref_registry.Genome`, instead got from_assembly=hg19 and to_assembly=hg18"
3434
),
3535
):
3636
Converter(Assembly.HG19, "hg18")

0 commit comments

Comments
 (0)