Merge pull request #534 from Proteobench/peaks_parameters

drafting parameter file parsing for peaks #520
Proteobench · Jan 20, 2025 · d463449 · d463449
2 parents cdc9b3a + 632c422
commit d463449
Show file tree

Hide file tree

Showing 33 changed files with 5,361 additions and 15 deletions.
diff --git a/.gitignore b/.gitignore
@@ -142,3 +142,6 @@ results.json
 
 # secrets
 **/.streamlit/secrets.toml
+
+# wew
+build.sh
diff --git a/proteobench/io/params/__init__.py b/proteobench/io/params/__init__.py
@@ -88,7 +88,8 @@ class ProteoBenchParameters:
     min_precursor_charge: Optional[int] = None  # precursor_charge
     max_precursor_charge: Optional[int] = None
     scan_window: Optional[int] = None  # DIA-specific
-    quantification_method: Optional[str] = None
+    quantification_method: Optional[str] = None  #
     second_pass: Optional[bool] = None  # DIANN specific
-    protein_inference: Optional[str] = None
-    predictors_library: Optional[dict] = None
+    protein_inference: Optional[str] = None  # example occams razor, proteinprophet
+    predictors_library: Optional[dict] = None  # type of model used to generate spectral library
+    abundance_normalization_ions: Optional[str] = None  # tic, median etc.
diff --git a/proteobench/io/params/alphapept.py b/proteobench/io/params/alphapept.py
@@ -66,7 +66,7 @@ def extract_params(fname: pathlib.Path) -> ProteoBenchParameters:
     params.min_precursor_charge = record["features"]["iso_charge_min"]
     params.max_precursor_charge = record["features"]["iso_charge_max"]
     params.enable_match_between_runs = record["workflow"]["match"]  # Check if matching is enabled
-
+    params.abundance_normalization_ions = None  # No normalization in AlphaPept
     return params
 
 

diff --git a/proteobench/io/params/diann.py b/proteobench/io/params/diann.py
@@ -346,7 +346,7 @@ def extract_params(fname: str) -> ProteoBenchParameters:
 
     # If scan window is not customely set, extract it from the log file
     parameters["scan_window"] = int(extract_with_regex(lines, scan_window_regex))
-
+    parameters["abundance_normalization_ions"] = None
     return ProteoBenchParameters(**parameters)
 
 

diff --git a/proteobench/io/params/fragger.py b/proteobench/io/params/fragger.py
@@ -156,10 +156,13 @@ def extract_params(file: BytesIO) -> ProteoBenchParameters:
         params.ident_fdr_protein = fragpipe_params.loc["ionquant.proteinfdr"]
         params.ident_fdr_peptide = fragpipe_params.loc["ionquant.peptidefdr"]
         params.ident_fdr_psm = fragpipe_params.loc["ionquant.ionfdr"]
+        params.abundance_normalization_ions = fragpipe_params.loc["ionquant.normalization"]
+
     elif fragpipe_params.loc["diann.run-dia-nn"] == "true":
         params.ident_fdr_protein = fragpipe_params.loc["diann.q-value"]
         params.ident_fdr_peptide = fragpipe_params.loc["diann.q-value"]
         params.ident_fdr_psm = fragpipe_params.loc["diann.q-value"]
+        params.abundance_normalization_ions = None
 
     # Precursor charge settings
     if fragpipe_params.loc["msfragger.override_charge"] == "true":

diff --git a/proteobench/io/params/peaks.py b/proteobench/io/params/peaks.py
@@ -0,0 +1,189 @@
+import re
+from pathlib import Path
+from typing import List, Optional
+import yaml
+
+import pandas as pd
+
+from proteobench.io.params import ProteoBenchParameters
+
+
+def clean_text(text: str) -> str:
+    """
+    Strip separator noise from both ends of a string.
+
+    Any run of whitespace, colons, or commas at the very start or the very end
+    of ``text`` is removed; characters in the interior are left untouched.
+
+    Args:
+        text (str): Raw text, e.g. the remainder of a "label: value" line.
+
+    Returns:
+        str: ``text`` without the leading/trailing separator characters.
+    """
+    # A single alternation covers both edges: a leading run or a trailing run.
+    edge_noise = re.compile(r"^[\s:,\t]+|[\s:,\t]+$")
+    return edge_noise.sub("", text)
+
+
+def extract_value(lines: List[str], search_term: str) -> Optional[str]:
+    """
+    Look up the text that follows a label on the first line containing it.
+
+    Args:
+        lines (List[str]): Lines to scan, in order.
+        search_term (str): Literal label to look for (substring match).
+
+    Returns:
+        Optional[str]: Whatever follows the first occurrence of the label on the
+            first matching line, passed through ``clean_text``; None when no
+            line contains the label.
+    """
+    for candidate in lines:
+        if search_term not in candidate:
+            continue
+        # Only the first hit counts. An empty line can only "match" an empty
+        # search term, and that case is reported as not found.
+        if not candidate:
+            return None
+        return clean_text(candidate.partition(search_term)[2])
+    return None
+
+
+def extract_mass_tolerance(lines: List[str], search_term: str) -> Optional[str]:
+    """
+    Look up a mass tolerance setting, substituting a fixed value for "System Default".
+
+    Args:
+        lines (List[str]): Lines to scan, in order.
+        search_term (str): Literal label to look for (substring match).
+
+    Returns:
+        Optional[str]: The cleaned text found after the label on the first
+            matching line, "40 ppm" when that text is exactly "System Default",
+            or None when no line contains the label.
+    """
+    for entry in lines:
+        if search_term in entry:
+            tolerance = clean_text(entry.split(search_term)[1])
+            break
+    else:
+        # No line mentions the label at all.
+        tolerance = None
+
+    if tolerance == "System Default":
+        return "40 ppm"
+    return tolerance
+
+
+def extract_value_regex(lines: List[str], search_term: str) -> Optional[str]:
+    """
+    Look up the text that follows a regular-expression match on the first matching line.
+
+    Args:
+        lines (List[str]): Lines to scan, in order.
+        search_term (str): Regular expression identifying the label.
+
+    Returns:
+        Optional[str]: The second element of ``re.split(search_term, line)`` for
+            the first line the pattern matches, passed through ``clean_text``;
+            None when the pattern matches no line.
+    """
+    for entry in lines:
+        if re.search(search_term, entry):
+            # Element 0 is the text before the first match, element 1 what follows it.
+            pieces = re.split(search_term, entry)
+            return clean_text(pieces[1])
+    return None
+
+
+def get_items_between(lines: list, start: str, end: str) -> list:
+    """
+    Collect the "- " bullet items listed between a start and an end marker.
+
+    Each line is compared after stripping surrounding whitespace. Collection
+    begins after the first line that starts with ``start`` and stops at the
+    next line that starts with ``end``. Lines starting with ``end`` that occur
+    before the section has opened are ignored, so an earlier, unrelated
+    occurrence of the end marker no longer yields an empty result.
+
+    Args:
+        lines (list): Lines to scan, in order.
+        start (str): Prefix of the line that opens the section.
+        end (str): Prefix of the line that closes the section.
+
+    Returns:
+        list: The bullet items inside the section, without the leading "- " and
+            without surrounding whitespace. Empty if the section is never
+            opened or contains no bullet lines.
+    """
+    items = []
+    capturing = False
+
+    for line in lines:
+        stripped = line.strip()
+
+        if stripped.startswith(start):
+            capturing = True
+            continue
+
+        if not capturing:
+            # Nothing before the section opens is relevant, including lines
+            # that happen to start with the end marker.
+            continue
+
+        if stripped.startswith(end):
+            break
+
+        if stripped.startswith("- "):
+            # Drop the dash and the whitespace around the item text.
+            items.append(stripped[2:].strip())
+
+    return items
+
+
+def _peaks_optional_int(lines: List[str], search_term: str) -> Optional[int]:
+    """Return the integer found after ``search_term``, or None if the label is absent."""
+    raw = extract_value(lines, search_term)
+    return None if raw is None else int(raw)
+
+
+def _peaks_optional_int_pair(lines: List[str], search_term: str) -> tuple:
+    """Return the first two comma-separated integers after ``search_term``, or (None, None) if absent."""
+    raw = extract_value(lines, search_term)
+    if raw is None:
+        return None, None
+    parts = raw.split(",")
+    return int(parts[0]), int(parts[1])
+
+
+def read_peaks_settings(file_path: str) -> ProteoBenchParameters:
+    """
+    Read a PEAKS parameter file, extract parameters, and return them as a `ProteoBenchParameters` object.
+
+    Labels that do not occur in the file leave the corresponding parameter as
+    None. Values that are present but cannot be converted still raise
+    ``ValueError``.
+
+    Args:
+        file_path (str): Path to the PEAKS parameter file. An object with a
+            ``read()`` method returning ``str`` or UTF-8 encoded ``bytes`` is
+            accepted as well.
+
+    Returns:
+        ProteoBenchParameters: The extracted parameters encapsulated in a `ProteoBenchParameters` object.
+
+    Raises:
+        IOError: If ``file_path`` is a path and the file cannot be opened or read.
+    """
+    if hasattr(file_path, "read"):
+        # File-like input: read the whole stream, decoding it if it is binary.
+        content = file_path.read()
+        if isinstance(content, bytes):
+            content = content.decode("utf-8")
+        lines = content.splitlines()
+    else:
+        try:
+            with open(file_path, encoding="utf-8") as f:
+                lines = f.readlines()
+        except Exception as e:
+            # Broad on purpose: every failure to read is reported as IOError,
+            # with the original exception kept as the cause.
+            raise IOError(f"Failed to open or read the file at {file_path}. Error: {e}") from e
+
+    # Remove surrounding whitespace (including newlines) from each line
+    lines = [line.strip() for line in lines]
+
+    params = ProteoBenchParameters()
+
+    params.software_name = "Peaks"
+    params.software_version = None  # not extracted from the parameter file
+    params.search_engine = "Peaks"
+    params.search_engine_version = params.software_version
+
+    params.ident_fdr_psm = None
+    fdr = extract_value(lines, "Peptide FDR:")
+    # The last character (expected to be "%") is dropped, then the percentage
+    # is converted to a fraction.
+    params.ident_fdr_peptide = None if fdr is None else float(fdr[:-1]) / 100
+    # peaks uses  Proteins -10LgP >= 15.0  instead of FDR
+    params.ident_fdr_protein = None
+    # NOTE(review): stored as the raw text from the file, not converted to a bool - confirm.
+    params.enable_match_between_runs = extract_value(lines, "Match Between Run:")
+    params.precursor_mass_tolerance = extract_mass_tolerance(lines, "Precursor Mass Error Tolerance:")
+    params.fragment_mass_tolerance = extract_mass_tolerance(lines, "Fragment Mass Error Tolerance:")
+    params.enzyme = extract_value(lines, "Enzyme:")
+    params.allowed_miscleavages = _peaks_optional_int(lines, "Max Missed Cleavage:")
+
+    params.min_peptide_length, params.max_peptide_length = _peaks_optional_int_pair(
+        lines, "Peptide Length between:"
+    )
+
+    # NOTE(review): the " ," separator (space before the comma) is kept as in
+    # the original implementation - confirm it is intended.
+    fixed = get_items_between(lines, "Fixed Modifications:", "Variable Modifications:")
+    params.fixed_mods = " ,".join(fixed)
+    varmods = get_items_between(lines, "Variable Modifications:", "Database:")
+    params.variable_mods = " ,".join(varmods)
+    params.max_mods = _peaks_optional_int(lines, "Max Variable PTM per Peptide:")
+
+    params.min_precursor_charge, params.max_precursor_charge = _peaks_optional_int_pair(
+        lines, "Precursor Charge between:"
+    )
+
+    params.scan_window = None
+
+    params.quantification_method = extract_value(
+        lines, "LFQ Method:"
+    )  # "Quantity MS Level:" or "Protein LFQ Method:" or "Quantity Type:"
+    params.second_pass = None
+    params.protein_inference = None
+    params.predictors_library = None
+    params.abundance_normalization_ions = extract_value(lines, "Normalization Method:")
+    return params
+
+
+if __name__ == "__main__":
+    # Ad-hoc driver: parse each listed PEAKS parameter file, store the parsed
+    # parameters as a CSV file next to the input, and echo them to the console.
+    settings_files = ["../../../test/params/PEAKS_parameters.txt"]
+
+    for settings_file in settings_files:
+        parsed = read_peaks_settings(settings_file)
+
+        # One row per parameter: attribute name -> extracted value
+        csv_target = Path(settings_file).with_suffix(".csv")
+        pd.Series(parsed.__dict__).to_csv(csv_target)
+
+        print(parsed)
diff --git a/proteobench/io/params/spectronaut.py b/proteobench/io/params/spectronaut.py
@@ -76,11 +76,16 @@ def read_spectronaut_settings(file_path: str) -> ProteoBenchParameters:
         ProteoBenchParameters: The extracted parameters encapsulated in a `ProteoBenchParameters` object.
     """
     # Try to read the file contents
-    try:
-        with open(file_path) as f:
-            lines = f.readlines()
-    except:
-        lines = [l for l in file_path.read().decode("utf-8").splitlines()]
+    if hasattr(file_path, "read"):
+        # Assume it behaves like a file object
+        lines = file_path.read().decode("utf-8").splitlines()
+    else:
+        try:
+            # Attempt to open and read the file
+            with open(file_path, encoding="utf-8") as f:
+                lines = f.readlines()
+        except Exception as e:
+            raise IOError(f"Failed to open or read the file at {file_path}. Error: {e}")
 
     # Remove any trailing newline characters from each line
     lines = [line.strip() for line in lines]
@@ -128,7 +133,7 @@ def read_spectronaut_settings(file_path: str) -> ProteoBenchParameters:
     params.second_pass = extract_value(lines, "directDIA Workflow:")
     params.protein_inference = extract_value(lines, "Inference Algorithm:")  # or Protein Inference Workflow:
     params.predictors_library = extract_value(lines, "Hybrid (DDA + DIA) Library").replace(":", "").strip()
-
+    params.abundance_normalization_ions = extract_value(lines, "Cross-Run Normalization:")
     return params
 
 

diff --git a/proteobench/modules/quant/quant_base/quant_base_module.py b/proteobench/modules/quant/quant_base/quant_base_module.py
@@ -33,6 +33,10 @@
 from proteobench.io.params.spectronaut import (
     read_spectronaut_settings as extract_params_spectronaut,
 )
+from proteobench.io.params.peaks import (
+    read_peaks_settings as extract_params_peaks,
+)
+
 from proteobench.io.parsing.parse_ion import load_input_file
 from proteobench.io.parsing.parse_settings import ParseSettingsBuilder
 from proteobench.score.quant.quantscores import QuantScores
@@ -71,6 +75,7 @@ class QuantModule:
         "FragPipe (DIA-NN quant)": extract_params_fragger,
         "MSAID": extract_params_msaid,
         "Spectronaut": extract_params_spectronaut,
+        "PEAKS": extract_params_peaks,
         # TODO needs to be replace with parameter extraction function
         "WOMBAT": extract_params_spectronaut,
         # TODO needs to be replace with parameter extraction function