diff --git a/CHANGELOG.md b/CHANGELOG.md index 3be32746..86ec0122 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -# Changelog +# Changelog - pre 0.2.10 All notable changes to this project will be documented in this file. diff --git a/docs/available-modules/2-DDA-Quantification-ion-level.md b/docs/available-modules/2-DDA-Quantification-ion-level.md index 80f910c5..a20c8d04 100644 --- a/docs/available-modules/2-DDA-Quantification-ion-level.md +++ b/docs/available-modules/2-DDA-Quantification-ion-level.md @@ -43,9 +43,10 @@ The module is flexible in terms of what workflow the participants can run. Howev When you have successfully uploaded and visualized a benchmark run, we strongly encourage you to add the result to the online repository. This way, your run will be available to the entire community and can be compared to all other uploaded benchmark runs. By doing so, your workflow outputs, parameters and calculated metrics will be stored and publicly available. -To submit your run for public usage, you need to upload the parameter file associated to your run in the field `Meta data for searches`. Currently, we accept outputs from MaxQuant, FragPipe, Proline and i2MassChroQ (see bellow for more tool-specific details). Please fill the `Comments for submission` if needed, and confirm that the metadata is correct (correspond to the benchmark run) before pressing the button `I really want to upload it`. +To submit your run for public usage, you need to upload the parameter file associated with your run in the field `Meta data for searches`. Currently, we accept outputs from MaxQuant, FragPipe, Proline and i2MassChroQ (see below for more tool-specific details). Please fill in the `Comments for submission` if needed, and confirm that the metadata is correct (i.e. that it corresponds to the benchmark run) before checking the button `I confirm that the metadata is correct`. Then the button +`I really want to upload it` will appear to trigger the submission. 
-After upload, you will get a link to the pull request associated with your data. Please copy it and save it. With this link, you can get the unique identifier of your run (for example "Proline__20240106_141919"), and follow the advancement of your submission and add comments to communicate with the ProteoBench maintainers. If everything looks good, your submission will be reviewed and accepted (it will take a few working days). Then, your benchmark run will be added to the public runs of this module and plotted alongside all other benchmark runs in the figure. +After upload, you will get a link to the pull request associated with your data. Please copy it and save it. With this link, you can get the unique identifier of your run (for example `Proline__20240106_141919`), and follow the advancement of your submission and add comments to communicate with the ProteoBench maintainers. If everything looks good, your submission will be reviewed and accepted (it will take a few working days). Then, your benchmark run will be added to the public runs of this module and plotted alongside all other benchmark runs in the figure. ## Important Tool-specific settings @@ -82,9 +83,9 @@ Some older versions of MaxQuant do not provide the option to change fasta header ### Proline Use the raw file names as sample names. In the output, it will automatically remove "LFQ_Orbitrap_". -For this module, use the excel exports. Make sure that the “Quantified peptide ions” tab contains the columns "samesets_accessions" and "subsets_accessions". The accessions in these two field are combined to determine what species a peptide sequence matches to. -The "Quantified peptide ions" tab reports validated PSMs, so precursor ion quantities (retrieved from XICs) are duplicated. This redundancy is removed before metric calculation. -For public submission, you can upload the same excel export, just make sure to have the tabs "Search settings and infos", "Import and filters", "Quant config". 
+For this module, use the Excel exports. Make sure that the `Quantified peptide ions` tab contains the columns `samesets_accessions` and `subsets_accessions`. The accessions in these two fields are combined to determine which species a peptide sequence matches. +The `Quantified peptide ions` tab reports validated PSMs, so precursor ion quantities (retrieved from XICs) are duplicated. This redundancy is removed before metric calculation. +For public submission, you can upload the same Excel export, just make sure to have the tabs `Search settings and infos`, `Import and filters`, `Quant config`. ### Sage diff --git a/docs/changelog.rst b/docs/changelog.rst index 6345db50..7800e3b2 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -1,2 +1,16 @@ +Changelog +========= + +The recent changelog can be viewed on the Releases page on GitHub: + +`GitHub Releases <https://github.com/Proteobench/ProteoBench/releases>`_ + +The old changelog is also available in the CHANGELOG.md file in the root of the repository +and is displayed below. + +Changelog - pre 0.2.10 +---------------------- + .. 
include:: ../CHANGELOG.md :parser: myst_parser.sphinx_ + :start-line: 1 diff --git a/proteobench/io/params/proline.py b/proteobench/io/params/proline.py index 9662c0f4..d379807c 100644 --- a/proteobench/io/params/proline.py +++ b/proteobench/io/params/proline.py @@ -7,6 +7,7 @@ - "Import and filters" - "Quant config" """ + import re import pandas as pd @@ -34,6 +35,14 @@ PATTERN_MIN_PEP_LENGTH = r"\[threshold_value=([0-9].*)\]" +PATTERN_CHARGE = r"[\d+]+" + + +def find_charge(string): + charges = re.findall(PATTERN_CHARGE, string) + charges = [int(c[:-1]) for c in charges] + return charges + def find_min_pep_length(string): min_length = re.findall(PATTERN_MIN_PEP_LENGTH, string)[0] @@ -64,6 +73,11 @@ def extract_params(fname) -> ProteoBenchParameters: params.precursor_mass_tolerance = sheet.loc[0, "peptide_mass_error_tolerance"] params.fragment_mass_tolerance = sheet.loc[0, "fragment_mass_error_tolerance"] + # Extract allowed minimum and maximum charge states + charges = find_charge(sheet.loc[0, "peptide_charge_states"]) + params.min_precursor_charge = min(charges) + params.max_precursor_charge = max(charges) + # ! Second sheet contains information about the import and filters sheet_name = "Import and filters" cols = use_columns[sheet_name] @@ -73,7 +87,10 @@ def extract_params(fname) -> ProteoBenchParameters: assert all(stats.loc["unique", cols] == 1), "Not all columns are unique" sheet = sheet[cols].drop_duplicates().reset_index(drop=True) # Extract - params.ident_fdr_psm = int(sheet.loc[0, "psm_filter_expected_fdr"]) / 100 + try: + params.ident_fdr_psm = int(sheet.loc[0, "psm_filter_expected_fdr"]) / 100 + except ValueError: + params.ident_fdr_psm = sheet.loc[0, "psm_filter_expected_fdr"] params.min_peptide_length = find_min_pep_length(sheet.loc[0, "psm_filter_2"]) # ! 
Third sheet only contains match between runs (MBR) information indirectly @@ -87,13 +104,14 @@ def extract_params(fname) -> ProteoBenchParameters: if __name__ == "__main__": from pathlib import Path - file = Path("../../../test/params/Proline_example_w_Mascot_wo_proteinSets.xlsx") - params = extract_params(file) - data_dict = params.__dict__ - series = pd.Series(data_dict) - series.to_csv(file.with_suffix(".csv")) - file = Path("../../../test/params/Proline_example_2.xlsx") - params = extract_params(file) - data_dict = params.__dict__ - series = pd.Series(data_dict) - series.to_csv(file.with_suffix(".csv")) + files = [ + "../../../test/params/Proline_example_w_Mascot_wo_proteinSets.xlsx", + "../../../test/params/Proline_example_2.xlsx", + "../../../test/params/ProlineStudio_withMBR.xlsx", + ] + for file in files: + file = Path(file) + params = extract_params(file) + data_dict = params.__dict__ + series = pd.Series(data_dict) + series.to_csv(file.with_suffix(".csv")) diff --git a/test/params/ProlineStudio_withMBR.csv b/test/params/ProlineStudio_withMBR.csv new file mode 100644 index 00000000..c6ce5610 --- /dev/null +++ b/test/params/ProlineStudio_withMBR.csv @@ -0,0 +1,20 @@ +,0 +software_name,Proline +software_version, +search_engine,Mascot +search_engine_version,2.8.3 +ident_fdr_psm,- +ident_fdr_peptide, +ident_fdr_protein, +enable_match_between_runs,True +precursor_mass_tolerance,10.0 ppm +fragment_mass_tolerance,0.02 Da +enzyme,Trypsin/P +allowed_miscleavages,2 +min_peptide_length,7 +max_peptide_length, +fixed_mods,Carbamidomethyl (C) +variable_mods,Acetyl (Protein N-term); Oxidation (M) +max_mods, +min_precursor_charge,2 +max_precursor_charge,3 diff --git a/test/params/ProlineStudio_withMBR.xlsx b/test/params/ProlineStudio_withMBR.xlsx new file mode 100644 index 00000000..ac18fdd2 Binary files /dev/null and b/test/params/ProlineStudio_withMBR.xlsx differ diff --git a/test/params/Proline_example_2.csv b/test/params/Proline_example_2.csv index b50ad116..83d17b6f 
100644 --- a/test/params/Proline_example_2.csv +++ b/test/params/Proline_example_2.csv @@ -16,5 +16,5 @@ max_peptide_length, fixed_mods,Carbamidomethyl (C) variable_mods,Acetyl (Protein N-term); Gln->pyro-Glu (Any N-term Q); Ammonia-loss (Any N-term C); Glu->pyro-Glu (Any N-term E); Oxidation (M) max_mods, -min_precursor_charge, -max_precursor_charge, +min_precursor_charge,1 +max_precursor_charge,4 diff --git a/test/params/Proline_example_w_Mascot_wo_proteinSets.csv b/test/params/Proline_example_w_Mascot_wo_proteinSets.csv index 4161f392..a4ec7b15 100644 --- a/test/params/Proline_example_w_Mascot_wo_proteinSets.csv +++ b/test/params/Proline_example_w_Mascot_wo_proteinSets.csv @@ -16,5 +16,5 @@ max_peptide_length, fixed_mods,Carbamidomethyl (C) variable_mods,Acetyl (Protein N-term); Oxidation (M) max_mods, -min_precursor_charge, -max_precursor_charge, +min_precursor_charge,2 +max_precursor_charge,3 diff --git a/test/test_parse_params_proline.py b/test/test_parse_params_proline.py index f0d4120c..8b88c8a7 100644 --- a/test/test_parse_params_proline.py +++ b/test/test_parse_params_proline.py @@ -11,6 +11,7 @@ fnames = [ "Proline_example_w_Mascot_wo_proteinSets.xlsx", "Proline_example_2.xlsx", + "ProlineStudio_withMBR.xlsx", ] fnames = [TESTDATA_DIR / f for f in fnames] @@ -42,3 +43,10 @@ def test_extract_params(file): actual = pd.Series(actual.__dict__) actual = pd.read_csv(io.StringIO(actual.to_csv()), index_col=0).squeeze("columns") assert expected.equals(actual) + + +def test_find_charges(): + assert proline_params.find_charge("2+ and 3+") == [2, 3] + assert proline_params.find_charge("2+") == [2] + assert proline_params.find_charge("3+") == [3] + assert proline_params.find_charge("30+ and 14+") == [30, 14]