From 655f3b8348e4f06bea53602ae8696220df9b5e9d Mon Sep 17 00:00:00 2001
From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com>
Date: Sun, 18 Jan 2026 20:44:19 +0000
Subject: [PATCH] feat(perf): Optimize DataFrame construction in map_psms_to_spectra

Optimized the construction of the mappings DataFrame in `processing.py` by:
1. Converting the pandas Series to a list before iteration to reduce overhead.
2. Explicitly specifying columns in the `pd.DataFrame` constructor to avoid
   schema inference.
3. Using `dtype=object` and explicit `None` defaults to ensure missing values
   are treated as `None` (Falsy) rather than `NaN` (Truthy), preserving
   existing application logic.

Verification:
- Benchmark shows comparable or improved execution time (~0.20-0.30s).
- All tests passed (`pytest`).
- Verified `None` vs `NaN` handling for non-matches.
---
 .gitignore    |  4 +++-
 processing.py | 19 ++++++++++++-------
 2 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/.gitignore b/.gitignore
index 5a2dce5..c59b0c5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
 .gitignore
 /venv
-/.pytest_cache
\ No newline at end of file
+/.pytest_cache
+__pycache__/
+*.pyc
diff --git a/processing.py b/processing.py
index 5acd427..901e0c7 100644
--- a/processing.py
+++ b/processing.py
@@ -100,15 +100,20 @@ def map_psms_to_spectra(spectra: List[Dict], psm_df: pd.DataFrame) -> pd.DataFra
     # Original: Multiple apply calls (4x iteration over full dataset)
 
     # Convert matched Series to list, replacing NaNs with empty dicts for DataFrame construction
-    specs_list = [x if isinstance(x, dict) else {} for x in matched_spec_series]
-    specs_df = pd.DataFrame(specs_list)
+    # Optimization: Use tolist() for faster iteration and specify columns to avoid schema inference
+    # Use a default dict with None values to ensure consistency for non-matches.
+    defaults = {'title': None, 'mz_array': None, 'intensity_array': None, 'pepmass': None}
+
+    # We iterate over the list. If it's a dict, we use it. If not (NaN from Series), use defaults.
+    # Note: If a matching dict is missing keys, they will appear as NaN in the DataFrame unless we handle them.
+    # However, load_mgf guarantees these keys exist (even if None).
+    specs_list = [x if isinstance(x, dict) else defaults for x in matched_spec_series.tolist()]
+
+    # ⚡ OPTIMIZATION: Use dtype=object to preserve None values directly (avoids coercing to NaN)
+    # This is critical because app.py logic and tests distinguish between None (falsy) and NaN (truthy)
+    specs_df = pd.DataFrame(specs_list, columns=['title', 'mz_array', 'intensity_array', 'pepmass'], dtype=object)
     specs_df.index = psm_df.index  # Align index with original DataFrame
 
-    # Ensure required columns exist (if no spectra matched or mock data missing keys)
-    for col in ['title', 'mz_array', 'intensity_array', 'pepmass']:
-        if col not in specs_df.columns:
-            specs_df[col] = None
-
     mappings = pd.DataFrame({
         'psm_index': psm_df.index,
         'sequence': psm_df['sequence'].astype(str),