Skip to content

Commit

Permalink
Merge pull request #534 from Proteobench/peaks_parameters
Browse files Browse the repository at this point in the history
drafting parameter file parsing for peaks #520
  • Loading branch information
RobbinBouwmeester authored Jan 20, 2025
2 parents cdc9b3a + 632c422 commit d463449
Show file tree
Hide file tree
Showing 33 changed files with 5,361 additions and 15 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -142,3 +142,6 @@ results.json

# secrets
**/.streamlit/secrets.toml

# wew
build.sh
7 changes: 4 additions & 3 deletions proteobench/io/params/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,8 @@ class ProteoBenchParameters:
min_precursor_charge: Optional[int] = None # precursor_charge
max_precursor_charge: Optional[int] = None
scan_window: Optional[int] = None # DIA-specific
quantification_method: Optional[str] = None
quantification_method: Optional[str] = None #
second_pass: Optional[bool] = None # DIANN specific
protein_inference: Optional[str] = None
predictors_library: Optional[dict] = None
protein_inference: Optional[str] = None # example occams razor, proteinprophet
predictors_library: Optional[dict] = None # type of model used to generate spectral library
abundance_normalization_ions: Optional[str] = None # tic, median etc.
2 changes: 1 addition & 1 deletion proteobench/io/params/alphapept.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def extract_params(fname: pathlib.Path) -> ProteoBenchParameters:
params.min_precursor_charge = record["features"]["iso_charge_min"]
params.max_precursor_charge = record["features"]["iso_charge_max"]
params.enable_match_between_runs = record["workflow"]["match"] # Check if matching is enabled

params.abundance_normalization_ions = None # No normalization in AlphaPept
return params


Expand Down
2 changes: 1 addition & 1 deletion proteobench/io/params/diann.py
Original file line number Diff line number Diff line change
Expand Up @@ -346,7 +346,7 @@ def extract_params(fname: str) -> ProteoBenchParameters:

# If the scan window is not set to a custom value, extract it from the log file
parameters["scan_window"] = int(extract_with_regex(lines, scan_window_regex))

parameters["abundance_normalization_ions"] = None
return ProteoBenchParameters(**parameters)


Expand Down
3 changes: 3 additions & 0 deletions proteobench/io/params/fragger.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,10 +156,13 @@ def extract_params(file: BytesIO) -> ProteoBenchParameters:
params.ident_fdr_protein = fragpipe_params.loc["ionquant.proteinfdr"]
params.ident_fdr_peptide = fragpipe_params.loc["ionquant.peptidefdr"]
params.ident_fdr_psm = fragpipe_params.loc["ionquant.ionfdr"]
params.abundance_normalization_ions = fragpipe_params.loc["ionquant.normalization"]

elif fragpipe_params.loc["diann.run-dia-nn"] == "true":
params.ident_fdr_protein = fragpipe_params.loc["diann.q-value"]
params.ident_fdr_peptide = fragpipe_params.loc["diann.q-value"]
params.ident_fdr_psm = fragpipe_params.loc["diann.q-value"]
params.abundance_normalization_ions = None

# Precursor charge settings
if fragpipe_params.loc["msfragger.override_charge"] == "true":
Expand Down
189 changes: 189 additions & 0 deletions proteobench/io/params/peaks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
import re
from pathlib import Path
from typing import List, Optional
import yaml

import pandas as pd

from proteobench.io.params import ProteoBenchParameters


def clean_text(text: str) -> str:
    """
    Strip separator characters from both ends of a settings value.

    Whitespace (including tabs), colons and commas are removed from the start
    and from the end of the string; characters in the middle are left untouched.

    Args:
        text (str): The raw text to tidy up.

    Returns:
        str: The text without leading/trailing whitespace, colons or commas.
    """
    edge_separators = r"^[\s:,\t]+|[\s:,\t]+$"
    return re.sub(edge_separators, "", text)


def extract_value(lines: List[str], search_term: str) -> Optional[str]:
    """
    Return the cleaned text that follows a label on the first line containing it.

    Args:
        lines (List[str]): The lines to scan, in order.
        search_term (str): The literal label to look for (e.g. ``"Enzyme:"``).

    Returns:
        Optional[str]: Everything after the first occurrence of the label on the
            first matching line, passed through ``clean_text``; ``None`` when no
            line contains the label.
    """
    for candidate in lines:
        if search_term in candidate:
            break
    else:
        # Loop finished without a hit: the label is absent.
        return None

    # An empty matching line (only possible with an empty label) counts as "not found".
    if not candidate:
        return None

    _, _, remainder = candidate.partition(search_term)
    return clean_text(remainder)


def extract_mass_tolerance(lines: List[str], search_term: str) -> Optional[str]:
    """
    Look up a mass tolerance setting, substituting a fixed value for "System Default".

    Args:
        lines (List[str]): The lines to scan, in order.
        search_term (str): The literal label that precedes the tolerance value.

    Returns:
        Optional[str]: The cleaned tolerance text from the first line containing
            the label, ``"40 ppm"`` when that text is ``"System Default"``, or
            ``None`` when no line contains the label.
    """
    for entry in lines:
        if search_term not in entry:
            continue

        tolerance = clean_text(entry.split(search_term)[1])
        if tolerance == "System Default":
            # The settings file does not spell out the default; 40 ppm is substituted.
            return "40 ppm"
        return tolerance

    return None


def extract_value_regex(lines: List[str], search_term: str) -> Optional[str]:
    """
    Extract the value that follows a regular-expression label.

    The first line on which the pattern matches is used, and the text after the
    end of that first match is returned. Slicing at the match end (instead of
    indexing the result of ``re.split``) keeps the result correct when the
    pattern contains capturing groups, because ``re.split`` inserts the group
    text into its result list. It also means the pattern is evaluated only once
    per line.

    Args:
        lines (List[str]): The lines to scan, in order.
        search_term (str): The regular expression that identifies the label.

    Returns:
        Optional[str]: The text after the first match on the first matching
            line, passed through ``clean_text``; ``None`` when no line matches.
    """
    for line in lines:
        match = re.search(search_term, line)
        if match is not None:
            return clean_text(line[match.end():])
    return None


def get_items_between(lines: list, start: str, end: str) -> list:
    """
    Collect the "- " bullet items listed between a start and an end header.

    Scanning begins after the first line that starts with ``start`` and stops at
    the next line that starts with ``end``. An ``end`` header that appears
    *before* the ``start`` header is ignored, so the section is still found when
    the end label also occurs earlier in the file.

    Args:
        lines (list): The lines to scan, in order.
        start (str): Prefix of the header line that opens the section
            (e.g. ``"Fixed Modifications:"``).
        end (str): Prefix of the header line that closes the section
            (e.g. ``"Variable Modifications:"``).

    Returns:
        list: The bullet items inside the section, each without the leading
            ``"- "`` and surrounding whitespace. Empty when the start header is
            missing or the section has no bullet items.
    """
    capturing = False
    items = []

    for line in lines:
        stripped = line.strip()

        if stripped.startswith(start):
            capturing = True
            continue

        if not capturing:
            # Nothing before the start header is relevant, including end headers.
            continue

        if stripped.startswith(end):
            break

        if stripped.startswith("- "):
            # Drop the dash and the space that follows it.
            items.append(stripped[2:].strip())

    return items


def _optional_int(value: Optional[str]) -> Optional[int]:
    """Convert a settings value to ``int``; a missing value (``None``) stays ``None``."""
    return None if value is None else int(value)


def _optional_int_pair(value: Optional[str]) -> List[Optional[int]]:
    """Split a ``"low,high"`` settings value into two ints; a missing value gives ``[None, None]``."""
    if value is None:
        return [None, None]
    bounds = value.split(",")
    return [int(bounds[0]), int(bounds[1])]


def read_peaks_settings(file_path: str) -> ProteoBenchParameters:
    """
    Read a PEAKS settings file, extract parameters, and return them as a `ProteoBenchParameters` object.

    Settings that are absent from the file are left as ``None`` instead of
    raising; values that are present but malformed still raise from the
    underlying conversion.

    Args:
        file_path (str): The path to the PEAKS settings file. An already opened
            file-like object (anything with a ``read`` method returning ``str``
            or UTF-8 encoded ``bytes``) is accepted as well, matching the
            Spectronaut settings reader.

    Returns:
        ProteoBenchParameters: The extracted parameters encapsulated in a `ProteoBenchParameters` object.

    Raises:
        IOError: If the file at ``file_path`` cannot be opened or read.
    """
    if hasattr(file_path, "read"):
        # Assume it behaves like a file object (e.g. an uploaded file).
        raw = file_path.read()
        text = raw.decode("utf-8") if isinstance(raw, bytes) else raw
        lines = text.splitlines()
    else:
        try:
            with open(file_path, encoding="utf-8") as f:
                lines = f.readlines()
        except Exception as e:
            # Chain the original exception so the root cause stays in the traceback.
            raise IOError(f"Failed to open or read the file at {file_path}. Error: {e}") from e

    # Remove surrounding whitespace (including the trailing newline) from each line
    lines = [line.strip() for line in lines]

    params = ProteoBenchParameters()

    params.software_name = "Peaks"
    params.software_version = None
    params.search_engine = "Peaks"
    params.search_engine_version = params.software_version

    params.ident_fdr_psm = None
    fdr = extract_value(lines, "Peptide FDR:")
    # The value is a percentage; strip an optional trailing "%" and convert to a fraction.
    params.ident_fdr_peptide = float(fdr.rstrip("%")) / 100 if fdr is not None else None
    # peaks uses Proteins -10LgP >= 15.0 instead of FDR
    params.ident_fdr_protein = None
    # NOTE(review): stored as the raw text from the file, not converted to bool.
    params.enable_match_between_runs = extract_value(lines, "Match Between Run:")
    params.precursor_mass_tolerance = extract_mass_tolerance(lines, "Precursor Mass Error Tolerance:")
    params.fragment_mass_tolerance = extract_mass_tolerance(lines, "Fragment Mass Error Tolerance:")
    params.enzyme = extract_value(lines, "Enzyme:")
    params.allowed_miscleavages = _optional_int(extract_value(lines, "Max Missed Cleavage:"))

    min_length, max_length = _optional_int_pair(extract_value(lines, "Peptide Length between:"))
    params.max_peptide_length = max_length
    params.min_peptide_length = min_length

    # NOTE(review): the " ," separator looks like a typo for ", "; it is kept
    # unchanged because previously stored outputs may depend on it.
    fixed = get_items_between(lines, "Fixed Modifications:", "Variable Modifications:")
    params.fixed_mods = " ,".join(fixed)
    varmods = get_items_between(lines, "Variable Modifications:", "Database:")
    params.variable_mods = " ,".join(varmods)
    params.max_mods = _optional_int(extract_value(lines, "Max Variable PTM per Peptide:"))

    min_charge, max_charge = _optional_int_pair(extract_value(lines, "Precursor Charge between:"))
    params.min_precursor_charge = min_charge
    params.max_precursor_charge = max_charge

    params.scan_window = None

    params.quantification_method = extract_value(
        lines, "LFQ Method:"
    )  # "Quantity MS Level:" or "Protein LFQ Method:" or "Quantity Type:"
    params.second_pass = None
    params.protein_inference = None
    params.predictors_library = None
    params.abundance_normalization_ions = extract_value(lines, "Normalization Method:")
    return params


if __name__ == "__main__":
    # Manual run: read the PEAKS settings file(s) listed below, extract the
    # parameters, and write them to a CSV file next to each input.
    # The path is relative to the current working directory.
    fnames = ["../../../test/params/PEAKS_parameters.txt"]

    for fname in fnames:
        # Extract parameters from the settings file
        parameters = read_peaks_settings(fname)

        # Convert parameters to a pandas Series and save to CSV
        actual = pd.Series(parameters.__dict__)
        actual.to_csv(Path(fname).with_suffix(".csv"))

        # Print the parameters to the console for a quick visual check
        print(parameters)
17 changes: 11 additions & 6 deletions proteobench/io/params/spectronaut.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,11 +76,16 @@ def read_spectronaut_settings(file_path: str) -> ProteoBenchParameters:
ProteoBenchParameters: The extracted parameters encapsulated in a `ProteoBenchParameters` object.
"""
# Try to read the file contents
try:
with open(file_path) as f:
lines = f.readlines()
except:
lines = [l for l in file_path.read().decode("utf-8").splitlines()]
if hasattr(file_path, "read"):
# Assume it behaves like a file object
lines = file_path.read().decode("utf-8").splitlines()
else:
try:
# Attempt to open and read the file
with open(file_path, encoding="utf-8") as f:
lines = f.readlines()
except Exception as e:
raise IOError(f"Failed to open or read the file at {file_path}. Error: {e}")

# Remove any trailing newline characters from each line
lines = [line.strip() for line in lines]
Expand Down Expand Up @@ -128,7 +133,7 @@ def read_spectronaut_settings(file_path: str) -> ProteoBenchParameters:
params.second_pass = extract_value(lines, "directDIA Workflow:")
params.protein_inference = extract_value(lines, "Inference Algorithm:") # or Protein Inference Workflow:
params.predictors_library = extract_value(lines, "Hybrid (DDA + DIA) Library").replace(":", "").strip()

params.abundance_normalization_ions = extract_value(lines, "Cross-Run Normalization:")
return params


Expand Down
5 changes: 5 additions & 0 deletions proteobench/modules/quant/quant_base/quant_base_module.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,10 @@
from proteobench.io.params.spectronaut import (
read_spectronaut_settings as extract_params_spectronaut,
)
from proteobench.io.params.peaks import (
read_peaks_settings as extract_params_peaks,
)

from proteobench.io.parsing.parse_ion import load_input_file
from proteobench.io.parsing.parse_settings import ParseSettingsBuilder
from proteobench.score.quant.quantscores import QuantScores
Expand Down Expand Up @@ -71,6 +75,7 @@ class QuantModule:
"FragPipe (DIA-NN quant)": extract_params_fragger,
"MSAID": extract_params_msaid,
"Spectronaut": extract_params_spectronaut,
"PEAKS": extract_params_peaks,
# TODO needs to be replaced with parameter extraction function
"WOMBAT": extract_params_spectronaut,
# TODO needs to be replaced with parameter extraction function
Expand Down
Loading

0 comments on commit d463449

Please sign in to comment.