Add referencer utilities and constants for Enterobacteriaceae #201

Open

wants to merge 30 commits into base: rc4.2.0 from fix-199-incorrect-database-used

Changes from all commits (30 commits)
326bc21
Add referencer utilities and constants for Escherichia and Klebsiella
karlnyr Mar 12, 2025
a8517ac
Refactor referencer utilities to improve species identification and r…
karlnyr Mar 12, 2025
4ee1e95
Refactor import statements for Referencer to improve module structure
karlnyr Mar 12, 2025
262fd85
Add unit tests for referencer utilities to validate species identific…
karlnyr Mar 12, 2025
4138cc3
address code review
karlnyr Mar 18, 2025
a703f58
fix logic
karlnyr Mar 18, 2025
09f3398
fix test
karlnyr Mar 19, 2025
d7b0ab3
Merge branch 'rc4.2.0' into fix-199-incorrect-database-used
karlnyr Mar 19, 2025
3afe747
Apply suggestions from code review
karlnyr Mar 24, 2025
39ffc5d
Merge branch 'rc4.2.0' into fix-199-incorrect-database-used
karlnyr Mar 26, 2025
62c1966
Refactor organism name parsing to use underscore as delimiter and upd…
karlnyr Mar 26, 2025
cea7711
Integrate reference retrieval for Enterobacteriaceae in MLST profile …
karlnyr Mar 27, 2025
c11d8a0
Enhance logging and update external reference fetching to use authent…
karlnyr Mar 27, 2025
9dbf571
Remove unused reference fetching for Enterobacteriaceae and update re…
karlnyr Mar 28, 2025
bdc78c3
Add command to update all external references in CLI
karlnyr Mar 28, 2025
514176a
Add force update option to reference update command
karlnyr Mar 28, 2025
4a4c593
Refactor referencer to improve MLST profile downloading and streamlin…
karlnyr Mar 28, 2025
fd9fd91
Remove debug logging and add print statements for scheme retrieval in…
karlnyr Mar 28, 2025
deeb16d
add print statements to debug
karlnyr Mar 28, 2025
6d1093e
adding type checks
karlnyr Mar 28, 2025
d652792
is this supposed to be a json handler?
karlnyr Mar 28, 2025
66a60f1
remove unwanted argument, adding updater only from static
karlnyr Mar 28, 2025
4b2ea0f
Refactor MLST profile retrieval and downloading process
karlnyr Mar 28, 2025
12540b1
Update profile version retrieval to use external versioning method
karlnyr Mar 28, 2025
0d2feb4
adding some prints for debug
karlnyr Mar 28, 2025
2a408c3
Update profile version assignment to use last updated timestamp from …
karlnyr Mar 28, 2025
37806c8
Enhance profile update logic to check last updated timestamp before d…
karlnyr Mar 28, 2025
481573f
move extraction of db to before variable call
karlnyr Mar 28, 2025
e62b4c0
Add error handling for invalid URLs in parse_pubmlst_url method
karlnyr Mar 28, 2025
1bc2676
Refactor referencer imports and remove unused referencer module and c…
karlnyr Mar 29, 2025
4 changes: 2 additions & 2 deletions microSALT/__init__.py
@@ -80,9 +80,9 @@

# Initialize logger
logger = logging.getLogger("main_logger")
logger.setLevel(logging.INFO)
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
ch.setLevel(logging.DEBUG)
ch.setFormatter(logging.Formatter("%(levelname)s - %(message)s"))
logger.addHandler(ch)

64 changes: 31 additions & 33 deletions microSALT/cli.py
@@ -118,18 +118,14 @@ def root(ctx):
default=preset_config["regex"]["mail_recipient"],
help="Forced e-mail recipient",
)
@click.option(
"--skip_update", default=False, help="Skips downloading of references", is_flag=True
)
@click.option("--skip_update", default=False, help="Skips downloading of references", is_flag=True)
@click.option(
"--force_update",
default=False,
help="Forces downloading of pubMLST references",
is_flag=True,
)
@click.option(
"--untrimmed", help="Use untrimmed input data", default=False, is_flag=True
)
@click.option("--untrimmed", help="Use untrimmed input data", default=False, is_flag=True)
@click.pass_context
def analyse(
ctx,
@@ -235,21 +231,15 @@ def refer(ctx):
default=preset_config["regex"]["mail_recipient"],
help="Forced e-mail recipient",
)
@click.option(
"--skip_update", default=False, help="Skips downloading of references", is_flag=True
)
@click.option("--skip_update", default=False, help="Skips downloading of references", is_flag=True)
@click.option(
"--report",
default="default",
type=click.Choice(
["default", "typing", "motif_overview", "qc", "json_dump", "st_update"]
),
type=click.Choice(["default", "typing", "motif_overview", "qc", "json_dump", "st_update"]),
)
@click.option("--output", help="Report output folder", default="")
@click.pass_context
def finish(
ctx, sampleinfo_file, input, track, config, dry, email, skip_update, report, output
):
def finish(ctx, sampleinfo_file, input, track, config, dry, email, skip_update, report, output):
"""Sequence analysis, typing and resistance identification"""
# Run section
pool = []
@@ -275,9 +265,7 @@ def finish(

# Samples section
sampleinfo = review_sampleinfo(sampleinfo_file)
ext_refs = Referencer(
config=ctx.obj["config"], log=ctx.obj["log"], sampleinfo=sampleinfo
)
ext_refs = Referencer(config=ctx.obj["config"], log=ctx.obj["log"], sampleinfo=sampleinfo)
click.echo("INFO - Checking versions of references..")
try:
if not skip_update:
@@ -312,9 +300,7 @@ def finish(

@refer.command()
@click.argument("organism")
@click.option(
"--force", help="Redownloads existing organism", default=False, is_flag=True
)
@click.option("--force", help="Redownloads existing organism", default=False, is_flag=True)
@click.pass_context
def add(ctx, organism, force):
"""Adds a new internal organism from pubMLST"""
@@ -349,9 +335,7 @@ def observe(ctx):
@click.option(
"--type",
default="default",
type=click.Choice(
["default", "typing", "motif_overview", "qc", "json_dump", "st_update"]
),
type=click.Choice(["default", "typing", "motif_overview", "qc", "json_dump", "st_update"]),
)
@click.option("--output", help="Full path to output folder", default="")
@click.option("--collection", default=False, is_flag=True)
@@ -391,9 +375,7 @@ def generate(ctx, input):

pool = []
if not os.path.isdir(input):
click.echo(
"ERROR - Sequence data folder {} does not exist.".format(project_name)
)
click.echo("ERROR - Sequence data folder {} does not exist.".format(project_name))
ctx.abort()
elif input != os.getcwd():
for subfolder in os.listdir(input):
@@ -425,9 +407,7 @@ def resync(ctx):
help="Output format",
)
@click.option("--customer", default="all", help="Customer id filter")
@click.option(
"--skip_update", default=False, help="Skips downloading of references", is_flag=True
)
@click.option("--skip_update", default=False, help="Skips downloading of references", is_flag=True)
@click.option(
"--email",
default=preset_config["regex"]["mail_recipient"],
@@ -445,15 +425,33 @@ def review(ctx, type, customer, skip_update, email, output):
ext_refs.resync()
click.echo("INFO - Version check done. Generating output")
if type == "report":
codemonkey = Reporter(
config=ctx.obj["config"], log=ctx.obj["log"], output=output
)
codemonkey = Reporter(config=ctx.obj["config"], log=ctx.obj["log"], output=output)
codemonkey.report(type="st_update", customer=customer)
elif type == "list":
ext_refs.resync(type=type)
done()


@resync.command()
@click.option("--force-update", default=False, is_flag=True, help="Forces update")
@click.pass_context
def update_refs(ctx, force_update: bool):
"""Updates all references"""
ext_refs = Referencer(config=ctx.obj["config"], log=ctx.obj["log"], force=force_update)
ext_refs.update_refs()
done()


@resync.command()
@click.option("--force-update", default=False, is_flag=True, help="Forces update")
@click.pass_context
def update_from_static(ctx, force_update: bool):
"""Updates a specific organism"""
ext_refs = Referencer(config=ctx.obj["config"], log=ctx.obj["log"])
ext_refs.fetch_external(force=force_update)
done()


@resync.command()
@click.argument("sample_name")
@click.option(
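
Editor's note, not part of the diff: the two new resync subcommands reduce to the Referencer calls sketched below. The wrapper function names are invented for illustration, and config and log stand in for the objects the CLI keeps on ctx.obj.

# Illustrative sketch only: mirrors the bodies of the new `update_refs` and
# `update_from_static` commands added above. `config` and `log` stand in for
# the objects stored on ctx.obj; these wrapper functions do not exist in microSALT.
from microSALT.utils.referencer import Referencer

def update_all_references(config, log, force=False):
    # Equivalent of: resync update_refs [--force-update]
    Referencer(config=config, log=log, force=force).update_refs()

def update_from_static_schemes(config, log, force=False):
    # Equivalent of: resync update_from_static [--force-update]
    Referencer(config=config, log=log).fetch_external(force=force)
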
1 change: 0 additions & 1 deletion microSALT/utils/job_creator.py
@@ -623,7 +623,6 @@ def project_job(self, single_sample=False):
try:
sample_in = "{}/{}".format(self.indir, ldir)
sample_out = "{}/{}".format(self.finishdir, ldir)
linkedjson = None
local_sampleinfo = [p for p in self.sampleinfo if p["CG_ID_sample"] == ldir]
if local_sampleinfo == []:
raise Exception("Sample {} has no counterpart in json file".format(ldir))
6 changes: 6 additions & 0 deletions microSALT/utils/pubmlst/client.py
@@ -120,6 +120,12 @@ def download_profiles_csv(self, db: str, scheme_id: int):
RequestType.DB, HTTPMethod.GET, url, db=db, response_handler=ResponseHandler.TEXT
)

def download_profiles_csv_by_url_and_db(self, db: str, url: str):
"""Download MLST profiles in CSV format using a custom URL."""
return self._make_request(
RequestType.DB, HTTPMethod.GET, url, response_handler=ResponseHandler.TEXT
)

def retrieve_scheme_info(self, db: str, scheme_id: int):
"""Retrieve information about a specific MLST scheme."""
url = f"{BASE_API}/db/{db}/schemes/{scheme_id}"
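
Editor's note, not part of the diff: a minimal sketch of how the new helper might be driven. The database name, scheme id, and URL format below are hypothetical examples, and the function shown does not exist in the PR.

# Illustrative sketch; the db name, scheme id and URL are hypothetical examples.
from microSALT.utils.pubmlst.client import PubMLSTClient

def fetch_profiles_via_url(client: PubMLSTClient, db: str, scheme_id: int) -> str:
    # Build a profiles_csv URL by hand and fetch it through the new helper;
    # the payload matches what download_profiles_csv returns.
    url = f"https://rest.pubmlst.org/db/{db}/schemes/{scheme_id}/profiles_csv"
    return client.download_profiles_csv_by_url_and_db(db=db, url=url)

# Example call, e.g. an Escherichia seqdef database and scheme 1:
# fetch_profiles_via_url(client, "pubmlst_escherichia_seqdef", 1)
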
116 changes: 80 additions & 36 deletions microSALT/utils/referencer.py
@@ -3,18 +3,18 @@

#!/usr/bin/env python
import glob
import json
import os
import re
import shutil
import subprocess
import urllib.request
import zipfile

from microSALT.utils.pubmlst.client import PubMLSTClient

from Bio import Entrez
import xml.etree.ElementTree as ET
from microSALT.store.db_manipulator import DB_Manipulator
from microSALT.utils.pubmlst.exceptions import InvalidURLError


class Referencer:
@@ -144,34 +144,82 @@ def fetch_external(self, force=False):
# Check for newer version
currver = self.db_access.get_version("profile_{}".format(organ))
st_link = entry.find("./mlst/database/profiles/url").text
profiles_query = urllib.request.urlopen(st_link)
profile_no = profiles_query.readlines()[-1].decode("utf-8").split("\t")[0]
if organ.replace("_", " ") not in self.updated and (
int(profile_no.replace("-", "")) > int(currver.replace("-", "")) or force

# Parse the database name and scheme ID
try:
parsed_data = self.client.parse_pubmlst_url(url=st_link)
except InvalidURLError as e:
self.logger.warning(f"Invalid URL: {st_link} - {e}")
continue

scheme_id = parsed_data.get("scheme_id") # Extract scheme ID
db = parsed_data.get("db") # Extract database name

if not db or not scheme_id:
self.logger.warning(
f"Could not extract database name or scheme ID from MLST URL: {st_link}"
)
return None

scheme_info = self.client.retrieve_scheme_info(
db, scheme_id
) # Retrieve scheme info
last_updated = scheme_info.get("last_updated") # Extract last updated date
if (
int(last_updated.replace("-", "")) <= int(currver.replace("-", ""))
and not force
):
# Download MLST profiles
self.logger.info("Downloading new MLST profiles for " + species)
output = "{}/{}".format(self.config["folders"]["profiles"], organ)
urllib.request.urlretrieve(st_link, output)
# Clear existing directory and download allele files
out = "{}/{}".format(self.config["folders"]["references"], organ)
shutil.rmtree(out)
os.makedirs(out)
for locus in entry.findall("./mlst/database/loci/locus"):
locus_name = locus.text.strip()
locus_link = locus.find("./url").text
urllib.request.urlretrieve(
locus_link, "{}/{}.tfa".format(out, locus_name)
)
# Create new indexes
self.index_db(out, ".tfa")
# Update database
self.db_access.upd_rec(
{"name": "profile_{}".format(organ)},
"Versions",
{"version": profile_no},
self.logger.info(
f"Profile for {organ.replace('_', ' ').capitalize()} already at the latest version."
)
self.db_access.reload_profiletable(organ)
continue
self.logger.info(
f"pubMLST reference for {organ.replace('_', ' ').capitalize()} updated to {last_updated} from {currver}"
)

# Step 1: Download the profiles CSV
st_target = f"{self.config['folders']['profiles']}/{organ}"
profiles_csv = self.client.download_profiles_csv(db, scheme_id)

# Only write the first 8 columns, this avoids adding information such as "clonal_complex" and "species"
profiles_csv = profiles_csv.split("\n")
trimmed_profiles = []
for line in profiles_csv:
trimmed_profiles.append("\t".join(line.split("\t")[:8]))

profiles_csv = "\n".join(trimmed_profiles)

with open(st_target, "w") as profile_file:
profile_file.write(profiles_csv)

self.logger.info(f"Profiles CSV downloaded to {st_target}")

# Step 2: Fetch scheme information to get loci

loci_list = scheme_info.get("loci", [])

# Step 3: Download loci FASTA files
output = f"{self.config['folders']['references']}/{organ}"
if os.path.isdir(output):
shutil.rmtree(output)
os.makedirs(output)

for locus_uri in loci_list:
locus_name = os.path.basename(os.path.normpath(locus_uri))
loci_fasta = self.client.download_locus(db, locus_name)
with open(f"{output}/{locus_name}.tfa", "w") as fasta_file:
fasta_file.write(loci_fasta)
self.logger.info(f"Locus FASTA downloaded: {locus_name}.tfa")

# Step 4: Create new indexes
self.index_db(output, ".tfa")

self.db_access.upd_rec(
{"name": "profile_{}".format(organ)},
"Versions",
{"version": last_updated},
)
self.db_access.reload_profiletable(organ)
except Exception as e:
self.logger.warning("Unable to update pubMLST external data: {}".format(e))

@@ -257,21 +305,16 @@ def organism2reference(self, normal_organism_name):
orgs = os.listdir(self.config["folders"]["references"])
organism = re.split(r"\W+", normal_organism_name.lower())
try:
refs = 0
for target in orgs:
hit = 0
for piece in organism:
if len(piece) == 1:
if target.startswith(piece):
hit += 1
elif piece in target or piece == "pneumonsiae" and "pneumoniae" in target:
hit += 1
else:
if piece in target:
hit += 1
# For when people misspell the strain in the orderform
elif piece == "pneumonsiae" and "pneumoniae" in target:
hit += 1
else:
break
break
if hit == len(organism):
return target
except Exception as e:
@@ -519,6 +562,7 @@ def fetch_pubmlst(self, force=False):
for key, val in seqdef_url.items():
internal_ver = self.db_access.get_version("profile_{}".format(key))
external_ver = self.external_version(key, val)

if (internal_ver < external_ver) or force:
self.logger.info(
"pubMLST reference for {} updated to {} from {}".format(
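
Editor's note, not part of the diff: a standalone sketch of two pieces the rewritten fetch_external relies on, comparing pubMLST's last_updated date against the stored version by stripping dashes, and trimming the downloaded profiles CSV to its first eight columns. The helper names and the example row are invented for illustration.

# Standalone illustration; these helper functions do not exist in microSALT.

def needs_update(last_updated: str, current_version: str, force: bool = False) -> bool:
    # "2025-03-28" -> 20250328; a plain integer comparison of the two dates.
    return force or int(last_updated.replace("-", "")) > int(current_version.replace("-", ""))

def trim_profiles(profiles_csv: str, keep: int = 8) -> str:
    # Keep only the first eight tab-separated columns (ST plus seven loci),
    # dropping trailing fields such as clonal_complex or species.
    return "\n".join("\t".join(line.split("\t")[:keep]) for line in profiles_csv.split("\n"))

# Hypothetical profile row with a clonal_complex column appended.
row = "10\t10\t11\t4\t8\t8\t8\t2\tST-10 complex"
assert trim_profiles(row) == "10\t10\t11\t4\t8\t8\t8\t2"
assert needs_update("2025-03-28", "2024-11-02") is True
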
2 changes: 1 addition & 1 deletion microSALT/utils/scraper.py
@@ -202,7 +202,7 @@ def scrape_blast(self, type="", file_list=[]):
res_cols = self.db_pusher.get_columns("{}".format(type2db))

try:
old_ref = ""
# If organism is enterobacteriaceae, the reference in pubMLST now is using the "spp." suffix
for file in file_list:
filename = os.path.basename(file).rsplit(".", 1)[0] # Removes suffix
if filename == "lactam":
1 change: 0 additions & 1 deletion tests/test_reporter.py
@@ -16,7 +16,6 @@

from microSALT import preset_config, logger
from microSALT.utils.reporter import Reporter
from microSALT.utils.referencer import Referencer
from microSALT.store.db_manipulator import DB_Manipulator

def unpack_db_json(filename):