Add referencer utilities and constants for Enterobacteriaceae #201

Open

wants to merge 30 commits into base: rc4.2.0 from fix-199-incorrect-database-used

Changes from all commits (30 commits)
326bc21
Add referencer utilities and constants for Escherichia and Klebsiella
karlnyr Mar 12, 2025
a8517ac
Refactor referencer utilities to improve species identification and r…
karlnyr Mar 12, 2025
4ee1e95
Refactor import statements for Referencer to improve module structure
karlnyr Mar 12, 2025
262fd85
Add unit tests for referencer utilities to validate species identific…
karlnyr Mar 12, 2025
4138cc3
address code review
karlnyr Mar 18, 2025
a703f58
fix logic
karlnyr Mar 18, 2025
09f3398
fix test
karlnyr Mar 19, 2025
d7b0ab3
Merge branch 'rc4.2.0' into fix-199-incorrect-database-used
karlnyr Mar 19, 2025
3afe747
Apply suggestions from code review
karlnyr Mar 24, 2025
39ffc5d
Merge branch 'rc4.2.0' into fix-199-incorrect-database-used
karlnyr Mar 26, 2025
62c1966
Refactor organism name parsing to use underscore as delimiter and upd…
karlnyr Mar 26, 2025
cea7711
Integrate reference retrieval for Enterobacteriaceae in MLST profile …
karlnyr Mar 27, 2025
c11d8a0
Enhance logging and update external reference fetching to use authent…
karlnyr Mar 27, 2025
9dbf571
Remove unused reference fetching for Enterobacteriaceae and update re…
karlnyr Mar 28, 2025
bdc78c3
Add command to update all external references in CLI
karlnyr Mar 28, 2025
514176a
Add force update option to reference update command
karlnyr Mar 28, 2025
4a4c593
Refactor referencer to improve MLST profile downloading and streamlin…
karlnyr Mar 28, 2025
fd9fd91
Remove debug logging and add print statements for scheme retrieval in…
karlnyr Mar 28, 2025
deeb16d
add print statements to debug
karlnyr Mar 28, 2025
6d1093e
adding type checks
karlnyr Mar 28, 2025
d652792
is this supposed to be a json handler?
karlnyr Mar 28, 2025
66a60f1
remove unwanted argument, adding updater only from static
karlnyr Mar 28, 2025
4b2ea0f
Refactor MLST profile retrieval and downloading process
karlnyr Mar 28, 2025
12540b1
Update profile version retrieval to use external versioning method
karlnyr Mar 28, 2025
0d2feb4
adding some prints for debug
karlnyr Mar 28, 2025
2a408c3
Update profile version assignment to use last updated timestamp from …
karlnyr Mar 28, 2025
37806c8
Enhance profile update logic to check last updated timestamp before d…
karlnyr Mar 28, 2025
481573f
move extraction of db to before variable call
karlnyr Mar 28, 2025
e62b4c0
Add error handling for invalid URLs in parse_pubmlst_url method
karlnyr Mar 28, 2025
1bc2676
Refactor referencer imports and remove unused referencer module and c…
karlnyr Mar 29, 2025
4 changes: 2 additions & 2 deletions microSALT/__init__.py
@@ -80,9 +80,9 @@

# Initialize logger
logger = logging.getLogger("main_logger")
logger.setLevel(logging.INFO)
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
ch.setLevel(logging.DEBUG)
ch.setFormatter(logging.Formatter("%(levelname)s - %(message)s"))
logger.addHandler(ch)

64 changes: 31 additions & 33 deletions microSALT/cli.py
@@ -118,18 +118,14 @@ def root(ctx):
default=preset_config["regex"]["mail_recipient"],
help="Forced e-mail recipient",
)
@click.option(
"--skip_update", default=False, help="Skips downloading of references", is_flag=True
)
@click.option("--skip_update", default=False, help="Skips downloading of references", is_flag=True)
@click.option(
"--force_update",
default=False,
help="Forces downloading of pubMLST references",
is_flag=True,
)
@click.option(
"--untrimmed", help="Use untrimmed input data", default=False, is_flag=True
)
@click.option("--untrimmed", help="Use untrimmed input data", default=False, is_flag=True)
@click.pass_context
def analyse(
ctx,
@@ -235,21 +231,15 @@ def refer(ctx):
default=preset_config["regex"]["mail_recipient"],
help="Forced e-mail recipient",
)
@click.option(
"--skip_update", default=False, help="Skips downloading of references", is_flag=True
)
@click.option("--skip_update", default=False, help="Skips downloading of references", is_flag=True)
@click.option(
"--report",
default="default",
type=click.Choice(
["default", "typing", "motif_overview", "qc", "json_dump", "st_update"]
),
type=click.Choice(["default", "typing", "motif_overview", "qc", "json_dump", "st_update"]),
)
@click.option("--output", help="Report output folder", default="")
@click.pass_context
def finish(
ctx, sampleinfo_file, input, track, config, dry, email, skip_update, report, output
):
def finish(ctx, sampleinfo_file, input, track, config, dry, email, skip_update, report, output):
"""Sequence analysis, typing and resistance identification"""
# Run section
pool = []
@@ -275,9 +265,7 @@ def finish(

# Samples section
sampleinfo = review_sampleinfo(sampleinfo_file)
ext_refs = Referencer(
config=ctx.obj["config"], log=ctx.obj["log"], sampleinfo=sampleinfo
)
ext_refs = Referencer(config=ctx.obj["config"], log=ctx.obj["log"], sampleinfo=sampleinfo)
click.echo("INFO - Checking versions of references..")
try:
if not skip_update:
@@ -312,9 +300,7 @@ def finish(

@refer.command()
@click.argument("organism")
@click.option(
"--force", help="Redownloads existing organism", default=False, is_flag=True
)
@click.option("--force", help="Redownloads existing organism", default=False, is_flag=True)
@click.pass_context
def add(ctx, organism, force):
"""Adds a new internal organism from pubMLST"""
@@ -349,9 +335,7 @@ def observe(ctx):
@click.option(
"--type",
default="default",
type=click.Choice(
["default", "typing", "motif_overview", "qc", "json_dump", "st_update"]
),
type=click.Choice(["default", "typing", "motif_overview", "qc", "json_dump", "st_update"]),
)
@click.option("--output", help="Full path to output folder", default="")
@click.option("--collection", default=False, is_flag=True)
@@ -391,9 +375,7 @@ def generate(ctx, input):

pool = []
if not os.path.isdir(input):
click.echo(
"ERROR - Sequence data folder {} does not exist.".format(project_name)
)
click.echo("ERROR - Sequence data folder {} does not exist.".format(project_name))
ctx.abort()
elif input != os.getcwd():
for subfolder in os.listdir(input):
@@ -425,9 +407,7 @@ def resync(ctx):
help="Output format",
)
@click.option("--customer", default="all", help="Customer id filter")
@click.option(
"--skip_update", default=False, help="Skips downloading of references", is_flag=True
)
@click.option("--skip_update", default=False, help="Skips downloading of references", is_flag=True)
@click.option(
"--email",
default=preset_config["regex"]["mail_recipient"],
@@ -445,15 +425,33 @@ def review(ctx, type, customer, skip_update, email, output):
ext_refs.resync()
click.echo("INFO - Version check done. Generating output")
if type == "report":
codemonkey = Reporter(
config=ctx.obj["config"], log=ctx.obj["log"], output=output
)
codemonkey = Reporter(config=ctx.obj["config"], log=ctx.obj["log"], output=output)
codemonkey.report(type="st_update", customer=customer)
elif type == "list":
ext_refs.resync(type=type)
done()


@resync.command()
@click.option("--force-update", default=False, is_flag=True, help="Forces update")
@click.pass_context
def update_refs(ctx, force_update: bool):
"""Updates all references"""
ext_refs = Referencer(config=ctx.obj["config"], log=ctx.obj["log"], force=force_update)
ext_refs.update_refs()
done()


@resync.command()
@click.option("--force-update", default=False, is_flag=True, help="Forces update")
@click.pass_context
def update_from_static(ctx, force_update: bool):
"""Updates a specific organism"""
ext_refs = Referencer(config=ctx.obj["config"], log=ctx.obj["log"])
ext_refs.fetch_external(force=force_update)
done()


@resync.command()
@click.argument("sample_name")
@click.option(
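
Editor's note, not part of the diff: the two new resync subcommands reduce to the Referencer calls sketched below. The wrapper function names are invented for illustration, and config and log stand in for the objects the CLI keeps on ctx.obj.

# Illustrative sketch only: mirrors the bodies of the new `update_refs` and
# `update_from_static` commands added above. `config` and `log` stand in for
# the objects stored on ctx.obj; these wrapper functions do not exist in microSALT.
from microSALT.utils.referencer import Referencer

def update_all_references(config, log, force=False):
    # Equivalent of: resync update_refs [--force-update]
    Referencer(config=config, log=log, force=force).update_refs()

def update_from_static_schemes(config, log, force=False):
    # Equivalent of: resync update_from_static [--force-update]
    Referencer(config=config, log=log).fetch_external(force=force)
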
1 change: 0 additions & 1 deletion microSALT/utils/job_creator.py
@@ -623,7 +623,6 @@ def project_job(self, single_sample=False):
try:
sample_in = "{}/{}".format(self.indir, ldir)
sample_out = "{}/{}".format(self.finishdir, ldir)
linkedjson = None
local_sampleinfo = [p for p in self.sampleinfo if p["CG_ID_sample"] == ldir]
if local_sampleinfo == []:
raise Exception("Sample {} has no counterpart in json file".format(ldir))
6 changes: 6 additions & 0 deletions microSALT/utils/pubmlst/client.py
@@ -120,6 +120,12 @@ def download_profiles_csv(self, db: str, scheme_id: int):
RequestType.DB, HTTPMethod.GET, url, db=db, response_handler=ResponseHandler.TEXT
)

def download_profiles_csv_by_url_and_db(self, db: str, url: str):
"""Download MLST profiles in CSV format using a custom URL."""
return self._make_request(
RequestType.DB, HTTPMethod.GET, url, response_handler=ResponseHandler.TEXT
)

def retrieve_scheme_info(self, db: str, scheme_id: int):
"""Retrieve information about a specific MLST scheme."""
url = f"{BASE_API}/db/{db}/schemes/{scheme_id}"
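
Editor's note, not part of the diff: a minimal sketch of how the new helper might be driven. The database name, scheme id, and URL format below are hypothetical examples, and the function shown does not exist in the PR.

# Illustrative sketch; the db name, scheme id and URL are hypothetical examples.
from microSALT.utils.pubmlst.client import PubMLSTClient

def fetch_profiles_via_url(client: PubMLSTClient, db: str, scheme_id: int) -> str:
    # Build a profiles_csv URL by hand and fetch it through the new helper;
    # the payload matches what download_profiles_csv returns.
    url = f"https://rest.pubmlst.org/db/{db}/schemes/{scheme_id}/profiles_csv"
    return client.download_profiles_csv_by_url_and_db(db=db, url=url)

# Example call, e.g. an Escherichia seqdef database and scheme 1:
# fetch_profiles_via_url(client, "pubmlst_escherichia_seqdef", 1)
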
116 changes: 80 additions & 36 deletions microSALT/utils/referencer.py
@@ -3,18 +3,18 @@

#!/usr/bin/env python
import glob
import json
import os
import re
import shutil
import subprocess
import urllib.request
import zipfile

from microSALT.utils.pubmlst.client import PubMLSTClient

from Bio import Entrez
import xml.etree.ElementTree as ET
from microSALT.store.db_manipulator import DB_Manipulator
from microSALT.utils.pubmlst.exceptions import InvalidURLError


class Referencer:
@@ -144,34 +144,82 @@ def fetch_external(self, force=False):
# Check for newer version
currver = self.db_access.get_version("profile_{}".format(organ))
st_link = entry.find("./mlst/database/profiles/url").text
profiles_query = urllib.request.urlopen(st_link)
profile_no = profiles_query.readlines()[-1].decode("utf-8").split("\t")[0]
if organ.replace("_", " ") not in self.updated and (
int(profile_no.replace("-", "")) > int(currver.replace("-", "")) or force

# Parse the database name and scheme ID
try:
parsed_data = self.client.parse_pubmlst_url(url=st_link)
except InvalidURLError as e:
self.logger.warning(f"Invalid URL: {st_link} - {e}")
continue

scheme_id = parsed_data.get("scheme_id") # Extract scheme ID
db = parsed_data.get("db") # Extract database name

if not db or not scheme_id:
self.logger.warning(
f"Could not extract database name or scheme ID from MLST URL: {st_link}"
)
return None

scheme_info = self.client.retrieve_scheme_info(
db, scheme_id
) # Retrieve scheme info
last_updated = scheme_info.get("last_updated") # Extract last updated date
if (
int(last_updated.replace("-", "")) <= int(currver.replace("-", ""))
and not force
):
# Download MLST profiles
self.logger.info("Downloading new MLST profiles for " + species)
output = "{}/{}".format(self.config["folders"]["profiles"], organ)
urllib.request.urlretrieve(st_link, output)
# Clear existing directory and download allele files
out = "{}/{}".format(self.config["folders"]["references"], organ)
shutil.rmtree(out)
os.makedirs(out)
for locus in entry.findall("./mlst/database/loci/locus"):
locus_name = locus.text.strip()
locus_link = locus.find("./url").text
urllib.request.urlretrieve(
locus_link, "{}/{}.tfa".format(out, locus_name)
)
# Create new indexes
self.index_db(out, ".tfa")
# Update database
self.db_access.upd_rec(
{"name": "profile_{}".format(organ)},
"Versions",
{"version": profile_no},
self.logger.info(
f"Profile for {organ.replace('_', ' ').capitalize()} already at the latest version."
)
self.db_access.reload_profiletable(organ)
continue
self.logger.info(
f"pubMLST reference for {organ.replace('_', ' ').capitalize()} updated to {last_updated} from {currver}"
)

# Step 1: Download the profiles CSV
st_target = f"{self.config['folders']['profiles']}/{organ}"
profiles_csv = self.client.download_profiles_csv(db, scheme_id)

# Only write the first 8 columns, this avoids adding information such as "clonal_complex" and "species"
profiles_csv = profiles_csv.split("\n")
trimmed_profiles = []
for line in profiles_csv:
trimmed_profiles.append("\t".join(line.split("\t")[:8]))

profiles_csv = "\n".join(trimmed_profiles)

with open(st_target, "w") as profile_file:
profile_file.write(profiles_csv)

self.logger.info(f"Profiles CSV downloaded to {st_target}")

# Step 2: Fetch scheme information to get loci

loci_list = scheme_info.get("loci", [])

# Step 3: Download loci FASTA files
output = f"{self.config['folders']['references']}/{organ}"
if os.path.isdir(output):
shutil.rmtree(output)
os.makedirs(output)

for locus_uri in loci_list:
locus_name = os.path.basename(os.path.normpath(locus_uri))
loci_fasta = self.client.download_locus(db, locus_name)
with open(f"{output}/{locus_name}.tfa", "w") as fasta_file:
fasta_file.write(loci_fasta)
self.logger.info(f"Locus FASTA downloaded: {locus_name}.tfa")

# Step 4: Create new indexes
self.index_db(output, ".tfa")

self.db_access.upd_rec(
{"name": "profile_{}".format(organ)},
"Versions",
{"version": last_updated},
)
self.db_access.reload_profiletable(organ)
except Exception as e:
self.logger.warning("Unable to update pubMLST external data: {}".format(e))

@@ -257,21 +305,16 @@ def organism2reference(self, normal_organism_name):
orgs = os.listdir(self.config["folders"]["references"])
organism = re.split(r"\W+", normal_organism_name.lower())
try:
refs = 0
for target in orgs:
hit = 0
for piece in organism:
if len(piece) == 1:
if target.startswith(piece):
hit += 1
elif piece in target or piece == "pneumonsiae" and "pneumoniae" in target:
hit += 1
else:
if piece in target:
hit += 1
# For when people misspell the strain in the orderform
elif piece == "pneumonsiae" and "pneumoniae" in target:
hit += 1
else:
break
break
if hit == len(organism):
return target
except Exception as e:
@@ -519,6 +562,7 @@ def fetch_pubmlst(self, force=False):
for key, val in seqdef_url.items():
internal_ver = self.db_access.get_version("profile_{}".format(key))
external_ver = self.external_version(key, val)

if (internal_ver < external_ver) or force:
self.logger.info(
"pubMLST reference for {} updated to {} from {}".format(
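
Editor's note, not part of the diff: a standalone sketch of two pieces the rewritten fetch_external relies on, comparing pubMLST's last_updated date against the stored version by stripping dashes, and trimming the downloaded profiles CSV to its first eight columns. The helper names and the example row are invented for illustration.

# Standalone illustration; these helper functions do not exist in microSALT.

def needs_update(last_updated: str, current_version: str, force: bool = False) -> bool:
    # "2025-03-28" -> 20250328; a plain integer comparison of the two dates.
    return force or int(last_updated.replace("-", "")) > int(current_version.replace("-", ""))

def trim_profiles(profiles_csv: str, keep: int = 8) -> str:
    # Keep only the first eight tab-separated columns (ST plus seven loci),
    # dropping trailing fields such as clonal_complex or species.
    return "\n".join("\t".join(line.split("\t")[:keep]) for line in profiles_csv.split("\n"))

# Hypothetical profile row with a clonal_complex column appended.
row = "10\t10\t11\t4\t8\t8\t8\t2\tST-10 complex"
assert trim_profiles(row) == "10\t10\t11\t4\t8\t8\t8\t2"
assert needs_update("2025-03-28", "2024-11-02") is True
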
2 changes: 1 addition & 1 deletion microSALT/utils/scraper.py
@@ -202,7 +202,7 @@ def scrape_blast(self, type="", file_list=[]):
res_cols = self.db_pusher.get_columns("{}".format(type2db))

try:
old_ref = ""
# If organism is enterobacteriaceae, the reference in pubMLST now is using the "spp." suffix
for file in file_list:
filename = os.path.basename(file).rsplit(".", 1)[0] # Removes suffix
if filename == "lactam":
1 change: 0 additions & 1 deletion tests/test_reporter.py
@@ -16,7 +16,6 @@

from microSALT import preset_config, logger
from microSALT.utils.reporter import Reporter
from microSALT.utils.referencer import Referencer
from microSALT.store.db_manipulator import DB_Manipulator

def unpack_db_json(filename):