From 5f6f52a184308563a441195a1c53b36b7e1f0383 Mon Sep 17 00:00:00 2001 From: Momir Milutinovic Date: Tue, 21 Apr 2026 11:42:05 +0200 Subject: [PATCH 1/2] fix: Reduce the number of ELink rate limit errors --- pyproject.toml | 1 + src/db/linkers/elink_dataset_linker.py | 6 ++++++ uv.lock | 11 +++++++++++ 3 files changed, 18 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 9e9be64..8b78c1e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,4 +25,5 @@ dependencies = [ "spacy>=3.8.11", "en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl", "flask-sqlalchemy>=3.1.1", + "pyrate-limiter>=4.1.0", ] diff --git a/src/db/linkers/elink_dataset_linker.py b/src/db/linkers/elink_dataset_linker.py index 3f81f1e..b3f5705 100644 --- a/src/db/linkers/elink_dataset_linker.py +++ b/src/db/linkers/elink_dataset_linker.py @@ -1,13 +1,17 @@ import logging from typing import List, Dict import re + import requests import tenacity +from pyrate_limiter import Limiter, Rate, Duration +from pyrate_limiter.limiter_factory import create_inmemory_limiter from src.db.linkers.paper_dataset_linker import PaperDatasetLinker from src.exception.entrez_error import EntrezError logger = logging.getLogger(__name__) +eutilities_rate_limiter = create_inmemory_limiter() class ELinkDatasetLinker(PaperDatasetLinker): @@ -53,6 +57,7 @@ def link_to_datasets_mapped(self, pubmed_ids: List[str]) -> Dict[str, List[str]] @tenacity.retry(wait=tenacity.wait_exponential(max=10), stop=tenacity.stop_after_attempt(NUMBER_OF_RETRIES), before_sleep=tenacity.before_sleep_log(logger, logging.WARNING), reraise=True) + @eutilities_rate_limiter.as_decorator(name="e-utilities", weight=1) def _fetch_geo_ids(self, pubmed_ids: List[str]) -> List[str]: """ Fetches GEO dataset ids for papers with the specified PubMed IDs. @@ -95,6 +100,7 @@ def _fetch_geo_ids(self, pubmed_ids: List[str]) -> List[str]: @tenacity.retry(wait=tenacity.wait_exponential(max=10), stop=tenacity.stop_after_attempt(NUMBER_OF_RETRIES), before_sleep=tenacity.before_sleep_log(logger, logging.WARNING), reraise=True) + @eutilities_rate_limiter.as_decorator(name="e-utilities", weight=1) def _fetch_geo_accessions(self, geo_ids: List[str]) -> List[str]: """ Fetches GEO accessions for the given GEO IDs from the NCBI E-Utilities. diff --git a/uv.lock b/uv.lock index bd76d81..c097acc 100644 --- a/uv.lock +++ b/uv.lock @@ -1179,6 +1179,7 @@ dependencies = [ { name = "pandas-stubs" }, { name = "parameterized" }, { name = "pip" }, + { name = "pyrate-limiter" }, { name = "requests" }, { name = "spacy" }, { name = "sqlacodegen" }, @@ -1203,6 +1204,7 @@ requires-dist = [ { name = "pandas-stubs", specifier = ">=2.3.3.251219" }, { name = "parameterized", specifier = "==0.9.0" }, { name = "pip", specifier = "==25.3" }, + { name = "pyrate-limiter", specifier = ">=4.1.0" }, { name = "requests", specifier = "==2.32.5" }, { name = "spacy", specifier = ">=3.8.11" }, { name = "sqlacodegen", specifier = ">=3.2.0" }, @@ -1290,6 +1292,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f4/7e/a72dd26f3b0f4f2bf1dd8923c85f7ceb43172af56d63c7383eb62b332364/pygments-2.20.0-py3-none-any.whl", hash = "sha256:81a9e26dd42fd28a23a2d169d86d7ac03b46e2f8b59ed4698fb4785f946d0176", size = 1231151, upload-time = "2026-03-29T13:29:30.038Z" }, ] +[[package]] +name = "pyrate-limiter" +version = "4.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/33/0c/6e78218e6ef726be35a4c0a5e2e281e36ddd940566800219e96d13de99ad/pyrate_limiter-4.1.0.tar.gz", hash = "sha256:be1ac413a263aa410b98757d1b01a880650948a1fc3a959512f15865eb58dbf3", size = 306136, upload-time = "2026-03-22T14:43:03.739Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c7/fd/57181fafae08385d00ea2702be246ab8035352a0a8e1f63391c2bcad74d4/pyrate_limiter-4.1.0-py3-none-any.whl", hash = "sha256:2696b4e4a6cffb3d40fc76662baccb766697893f0979e12bebbfc7d3b6b19603", size = 38197, upload-time = "2026-03-22T14:43:01.975Z" }, +] + [[package]] name = "python-dateutil" version = "2.9.0.post0" From 080f44643fe19e7aacdfe10a593dfa0fea8064c4 Mon Sep 17 00:00:00 2001 From: Momir Milutinovic Date: Tue, 21 Apr 2026 11:54:21 +0200 Subject: [PATCH 2/2] fix: Enforce rate limit in EuropePMCDatasetLinker --- src/db/linkers/elink_dataset_linker.py | 3 +-- src/db/linkers/europepmc_dataset_linker.py | 6 ++++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/db/linkers/elink_dataset_linker.py b/src/db/linkers/elink_dataset_linker.py index b3f5705..041af0f 100644 --- a/src/db/linkers/elink_dataset_linker.py +++ b/src/db/linkers/elink_dataset_linker.py @@ -1,10 +1,9 @@ import logging -from typing import List, Dict import re +from typing import List, Dict import requests import tenacity -from pyrate_limiter import Limiter, Rate, Duration from pyrate_limiter.limiter_factory import create_inmemory_limiter from src.db.linkers.paper_dataset_linker import PaperDatasetLinker diff --git a/src/db/linkers/europepmc_dataset_linker.py b/src/db/linkers/europepmc_dataset_linker.py index fe9fbb1..beb6640 100644 --- a/src/db/linkers/europepmc_dataset_linker.py +++ b/src/db/linkers/europepmc_dataset_linker.py @@ -1,8 +1,13 @@ from typing import List, Dict import requests +from pyrate_limiter import Duration +from pyrate_limiter.limiter_factory import create_inmemory_limiter + from src.exception.europepmc_error import EuropePMCError from src.db.linkers.paper_dataset_linker import PaperDatasetLinker +europepmc_rate_limiter = create_inmemory_limiter(10, Duration.SECOND) + class EuropePMCDatasetLinker(PaperDatasetLinker): EUROPEPMC_URL = ( @@ -68,6 +73,7 @@ def link_to_datasets_mapped(self, pubmed_ids: List[str]) -> Dict[str, List[str]] return result + @europepmc_rate_limiter.as_decorator(name="EuropePMC", weight=1) def _fetch_geo_accession_batch_mapped(self, pubmed_ids: List[str]) -> Dict[str, List[str]]: """ Fetches GEO references in a list of papers (max 8 papers) from EuropePMC's