Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
[![Actions status](https://github.com/genomicmedlab/agct/actions/workflows/checks.yaml/badge.svg)](https://github.com/genomicmedlab/agct/actions/checks.yaml)

<!-- description -->
Drop-in replacement for the [pyliftover](https://github.com/konstantint/pyliftover) tool, using the St. Jude's [chainfile](https://docs.rs/chainfile/latest/chainfile/) crate.
Lift over positions between genomic reference assemblies, using the St. Jude's [chainfile](https://docs.rs/chainfile/latest/chainfile/) crate.
<!-- description -->

Enables significantly faster chainfile loading from cold start (see `analysis/`).
Expand All @@ -24,17 +24,17 @@ python3 -m pip install agct
Initialize a class instance:

```python3
from agct import Converter, Genome
c = Converter(Genome.HG38, Genome.HG19)
from agct import Converter, Assembly, Strand

c = Converter(Assembly.HG19, Assembly.HG38)
```

> If a chainfile is unavailable locally, it's downloaded from UCSC and saved using the `wags-tails` package -- see the [wags-tails configuration instructions](https://github.com/GenomicMedLab/wags-tails?tab=readme-ov-file#configuration) for information on how to designate a non-default storage location.

Call ``convert_coordinate()``:

```python3
c.convert_coordinate("chr7", 140453136, "+")
# [['chr7', 140152936, <Strand.POSITIVE: '+'>]]
c.convert_coordinate("chr7", 140453136, Strand.POSITIVE)
```

## Development
Expand Down
5 changes: 3 additions & 2 deletions src/agct/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Provide fast liftover in Python via the ``chainfile`` crate."""

from agct.converter import Converter, Genome, Strand
from agct.assembly_registry import Assembly, get_assembly_from_refget_id
from agct.converter import Converter, Strand

__all__ = ["Converter", "Genome", "Strand"]
__all__ = ["Assembly", "Converter", "Strand", "get_assembly_from_refget_id"]
160 changes: 160 additions & 0 deletions src/agct/assembly_registry.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
"""Assembly registry.

This module maintains a curated mapping from GA4GH VRS `SequenceReference`
refget accession identifiers (the ``"SQ."`` IDs) to human reference assemblies
and exposes a helper to look up the assembly for a given ID. Assemblies are
represented by the :class:`Assembly` enum (UCSC-style labels: ``hg19``, ``hg38``).

Use :func:`get_assembly_from_refget_id` when you have a refget accession ID and
need to determine which assembly it belongs to. The mapping is intentionally
conservative and currently reflects internal use cases; extend it as needed.
"""

import logging
import re
from enum import Enum

_logger = logging.getLogger(__name__)


class Assembly(str, Enum):
"""Constrain reference genome assembly values.

We could conceivably support every UCSC chainfile offering, but for now, we'll
stick with internal use cases only.
"""

HG38 = "hg38"
HG19 = "hg19"


ID_TO_REFERENCE_MAP = {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consider changing from set to a list. I'm thinking of the case where want to do a quick look up without SeqRepo, where we know both the assembly and chromosome so we could do something like ID_TO_REFERENID_TO_REFERENCE_MAP["hg19"][11-1]

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah that seems ok. I was wondering about whether we'd need to add more IDs in the future, but now that I think about it, I'm not sure what those would be.

Assembly.HG38: {
# chr1
"SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO",
# chr2
"SQ.pnAqCRBrTsUoBghSD1yp_jXWSmlbdh4g",
# chr3
"SQ.Zu7h9AggXxhTaGVsy7h_EZSChSZGcmgX",
# chr4
"SQ.HxuclGHh0XCDuF8x6yQrpHUBL7ZntAHc",
# chr5
"SQ.aUiQCzCPZ2d0csHbMSbh2NzInhonSXwI",
# chr6
"SQ.0iKlIQk2oZLoeOG9P1riRU6hvL5Ux8TV",
# chr7
"SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul",
# chr8
"SQ.209Z7zJ-mFypBEWLk4rNC6S_OxY5p7bs",
# chr9
"SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI",
# chr10
"SQ.ss8r_wB0-b9r44TQTMmVTI92884QvBiB",
# chr11
"SQ.2NkFm8HK88MqeNkCgj78KidCAXgnsfV1",
# chr12
"SQ.6wlJpONE3oNb4D69ULmEXhqyDZ4vwNfl",
# chr13
"SQ._0wi-qoDrvram155UmcSC-zA5ZK4fpLT",
# chr14
"SQ.eK4D2MosgK_ivBkgi6FVPg5UXs1bYESm",
# chr15
"SQ.AsXvWL1-2i5U_buw6_niVIxD6zTbAuS6",
# chr16
"SQ.yC_0RBj3fgBlvgyAuycbzdubtLxq-rE0",
# chr17
"SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7",
# chr18
"SQ.vWwFhJ5lQDMhh-czg06YtlWqu0lvFAZV",
# chr19
"SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl",
# chr20
"SQ.-A1QmD_MatoqxvgVxBLZTONHz9-c7nQo",
# chr21
"SQ.5ZUqxCmDDgN4xTRbaSjN8LwgZironmB8",
# chr22
"SQ.7B7SHsmchAR0dFcDCuSFjJAo7tX87krQ",
# chr23
"SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP",
# chr24/chrY
"SQ.8_liLu1aycC0tPQPFmUaGXJLDs5SbPZ5",
},
Assembly.HG19: {
# chr1
"SQ.S_KjnFVz-FE7M0W6yoaUDgYxLPc1jyWU",
# chr2
"SQ.9KdcA9ZpY1Cpvxvg8bMSLYDUpsX6GDLO",
# chr3
"SQ.VNBualIltAyi2AI_uXcKU7M9XUOuA7MS",
# chr4
"SQ.iy7Zfceb5_VGtTQzJ-v5JpPbpeifHD_V",
# chr5
"SQ.vbjOdMfHJvTjK_nqvFvpaSKhZillW0SX",
# chr6
"SQ.KqaUhJMW3CDjhoVtBetdEKT1n6hM-7Ek",
# chr7
"SQ.IW78mgV5Cqf6M24hy52hPjyyo5tCCd86",
# chr8
"SQ.tTm7wmhz0G4lpt8wPspcNkAD_qiminj6",
# chr9
"SQ.HBckYGQ4wYG9APHLpjoQ9UUe9v7NxExt",
# chr10
"SQ.-BOZ8Esn8J88qDwNiSEwUr5425UXdiGX",
# chr11
"SQ.XXi2_O1ly-CCOi3HP5TypAw7LtC6niFG",
# chr12
"SQ.105bBysLoDFQHhajooTAUyUkNiZ8LJEH",
# chr13
"SQ.Ewb9qlgTqN6e_XQiRVYpoUfZJHXeiUfH",
# chr14
"SQ.5Ji6FGEKfejK1U6BMScqrdKJK8GqmIGf",
# chr15
"SQ.zIMZb3Ft7RdWa5XYq0PxIlezLY2ccCgt",
# chr16
"SQ.W6wLoIFOn4G7cjopxPxYNk2lcEqhLQFb",
# chr17
"SQ.AjWXsI7AkTK35XW9pgd3UbjpC3MAevlz",
# chr18
"SQ.BTj4BDaaHYoPhD3oY2GdwC_l0uqZ92UD",
# chr19
"SQ.ItRDD47aMoioDCNW_occY5fWKZBKlxCX",
# chr20
"SQ.iy_UbUrvECxFRX5LPTH_KPojdlT7BKsf",
# chr21
"SQ.LpTaNW-hwuY_yARP0rtarCnpCQLkgVCg"
# chr22
"SQ.XOgHwwR3Upfp5sZYk6ZKzvV25a4RBVu8",
# chr23
"SQ.v7noePfnNpK8ghYXEqZ9NukMXW7YeNsm",
# chr24/chrY
"SQ.BT7QyW5iXaX_1PSX-msSGYsqRdMKqkj-",
},
}


_error_pattern = re.compile(r"^SQ.[0-9A-Za-z_\\-]{32}$")


def get_assembly_from_refget_id(refget_accession: str) -> Assembly | None:
"""Given a GA4GH SequenceReference refget accession ID, get back its corresponding reference genome

.. code-block:: pycon

>>> from agct.assembly_registry import get_assembly_from_refget_id
>>> get_assembly_from_refget_id("SQ.pnAqCRBrTsUoBghSD1yp_jXWSmlbdh4g")
Genome.HG38

Use to get an appropriate converter from a referenced GA4GH variation object.

:param refget_accession: sequence reference (must start with `"SQ."`)
:return: a reference assembly if recognized
:raise ValueError: if input appears to be in an invalid format for a refget accession ID
"""
for reference_genome, chromosome_accession_ids in ID_TO_REFERENCE_MAP.items():
if refget_accession in chromosome_accession_ids:
return reference_genome
if not re.match(_error_pattern, refget_accession):
msg = f"refget accession ID must be in format 'SQ.ABCDEFGHIJKLMNOPQRSTUVWXYZ123456'; got {refget_accession}"
_logger.error(msg)
raise ValueError(msg)
return None
74 changes: 32 additions & 42 deletions src/agct/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from wags_tails.utils.storage import get_data_dir

import agct._core as _core
from agct.assembly_registry import Assembly

_logger = logging.getLogger(__name__)

Expand All @@ -21,68 +22,55 @@ class Strand(str, Enum):
NEGATIVE = "-"


class Genome(str, Enum):
"""Constrain genome values.

We could conceivably support every UCSC chainfile offering, but for now, we'll
stick with internal use cases only.
"""

HG38 = "hg38"
HG19 = "hg19"


class Converter:
"""Chainfile-based liftover provider for a single sequence to sequence
association.
"""

def __init__(
self,
from_db: Genome | str | None = None,
to_db: Genome | str | None = None,
from_assembly: Assembly | None = None,
to_assembly: Assembly | None = None,
chainfile: str | None = None,
) -> None:
"""Initialize liftover instance.

:param from_db: database name, e.g. ``"hg19"``. Must be different than ``to_db``
If ``chainfile`` is provided, will ignore this argument
:param to_db: database name, e.g. ``"hg38"``. Must be different than ``from_db``
If ``chainfile`` is provided, will ignore this argument
Initialize with either ``from_assembly`` and ``to_assembly``, or directly with a chainfile.

* If using assembly params, both must be provided, and they must be different
* If using assembly params, the ``wags-tails`` library will be used to locate and, if
necessary, acquire the pertinent chainfile from the UCSC web server. See
the `wags-tails documentation <https://wags-tails.readthedocs.io/>`_ for more info.
* If ``chainfile`` arg is provided, all other args are ignored.

:param from_assembly: Assembly name, e.g. ``<Assembly.HG19>``
:param to_assembly: database name, e.g. ``"hg38"``.
:param chainfile: Path to chainfile
If not provided, must provide both ``from_db`` and ``to_db`` so that
``wags-tails`` can download the corresponding chainfile
:raise ValueError: if required arguments are not passed or are invalid
:raise FileNotFoundError: if unable to open corresponding chainfile
:raise _core.ChainfileError: if unable to read chainfile (i.e. it's invalid)
"""
if not chainfile:
if from_db is None or to_db is None:
msg = "Must provide both `from_db` and `to_db`"
if from_assembly is None or to_assembly is None:
msg = "Must provide both `from_assembly` and `to_assembly`"
raise ValueError(msg)

if from_db == to_db:
if from_assembly == to_assembly:
msg = "Liftover must be to/from different sources."
raise ValueError(msg)

if isinstance(from_db, str):
try:
from_db = Genome(from_db)
except ValueError as e:
msg = f"Unable to coerce from_db value '{from_db}' to a known reference genome: {list(Genome)}"
raise ValueError(msg) from e
if isinstance(to_db, str):
try:
to_db = Genome(to_db)
except ValueError as e:
msg = f"Unable to coerce to_db value '{to_db}' to a known reference genome: {list(Genome)}"
raise ValueError(msg) from e
try:
file_prefix = f"chainfile_{from_assembly.value}_to_{to_assembly.value}"
except AttributeError as e:
msg = f"Assembly args must be instance of `agct.assembly_registry.Genome`, instead got from_assembly={from_assembly} and to_assembly={to_assembly}"
_logger.exception(msg)
raise ValueError(msg) from e

data_handler = CustomData(
f"chainfile_{from_db.value}_to_{to_db.value}",
file_prefix,
"chain",
lambda: "",
self._download_function_builder(from_db, to_db),
self._download_function_builder(from_assembly, to_assembly),
data_dir=get_data_dir() / "ucsc-chainfile",
)
file, _ = data_handler.get_latest()
Expand All @@ -98,15 +86,17 @@ def __init__(
raise

@staticmethod
def _download_function_builder(from_db: Genome, to_db: Genome) -> Callable:
def _download_function_builder(
from_assembly: Assembly, to_assembly: Assembly
) -> Callable:
"""Build downloader function for chainfile corresponding to source/destination
params.

Wags-Tails' custom data handler takes a downloader callback function. We
construct it here, curried with from/to values in the download URL.

:param from_db: genome lifting from
:param to_db: genome lifting to
:param from_assembly: genome lifting from
:param to_assembly: genome lifting to
:return: Function that downloads appropriate chainfile from UCSC
"""

Expand All @@ -116,7 +106,7 @@ def _download_data(version: str, file: Path) -> None: # noqa: ARG001
:param version: not used
:param file: path to save file to
"""
url = f"https://hgdownload.soe.ucsc.edu/goldenPath/{from_db.value}/liftOver/{from_db.value}To{to_db.value.title()}.over.chain.gz"
url = f"https://hgdownload.soe.ucsc.edu/goldenPath/{from_assembly.value}/liftOver/{from_assembly.value}To{to_assembly.value.title()}.over.chain.gz"
download_http(url, file, handler=handle_gzip)

return _download_data
Expand All @@ -130,9 +120,9 @@ def convert_coordinate(

.. code-block:: python

from agct import Converter, Strand
from agct import Converter, Strand, Assembly

c = Converter("hg19", "hg38")
c = Converter(Assembly.HG19, Assembly.HG38)
c.convert_coordinate("chr7", 140453136, Strand.POSITIVE)
# returns [['chr7', 140753336, '+']]

Expand Down
35 changes: 35 additions & 0 deletions tests/test_assembly_registry.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import pytest

from agct.assembly_registry import Assembly, get_assembly_from_refget_id


def test_assembly_fetcher():
assert (
get_assembly_from_refget_id("SQ.ss8r_wB0-b9r44TQTMmVTI92884QvBiB")
== Assembly.HG38
)
assert (
get_assembly_from_refget_id("SQ.-BOZ8Esn8J88qDwNiSEwUr5425UXdiGX")
== Assembly.HG19
)
assert (
get_assembly_from_refget_id("SQ.0iKlIQk2oZLoeOG9P1riRU6hvL5Ux8TV")
== Assembly.HG38
)
assert (
get_assembly_from_refget_id("SQ.KqaUhJMW3CDjhoVtBetdEKT1n6hM-7Ek")
== Assembly.HG19
)


def test_unknown_sequences():
assert get_assembly_from_refget_id("SQ.KqX7SGMq4GLY21TCAHPMWfNy-cFdT44h") is None


def test_invalid_input():
input_string = "ga4gh:SQ.KqaUhJMW3CDjhoVtBetdEKT1n6hM-7Ek"
with pytest.raises(
ValueError,
match=f"refget accession ID must be in format 'SQ.ABCDEFGHIJKLMNOPQRSTUVWXYZ123456'; got {input_string}",
):
get_assembly_from_refget_id(input_string)
Loading