Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
.gitignore
/venv
/.pytest_cache
/.pytest_cache
__pycache__/
*.pyc
19 changes: 12 additions & 7 deletions processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,15 +100,20 @@ def map_psms_to_spectra(spectra: List[Dict], psm_df: pd.DataFrame) -> pd.DataFra
# Original: Multiple apply calls (4x iteration over full dataset)

# Convert matched Series to list, replacing NaNs with empty dicts for DataFrame construction
specs_list = [x if isinstance(x, dict) else {} for x in matched_spec_series]
specs_df = pd.DataFrame(specs_list)
# Optimization: Use tolist() for faster iteration and specify columns to avoid schema inference
# Use a default dict with None values to ensure consistency for non-matches.
defaults = {'title': None, 'mz_array': None, 'intensity_array': None, 'pepmass': None}

# We iterate over the list. If it's a dict, we use it. If not (NaN from Series), use defaults.
# Note: If a matching dict is missing keys, they will appear as NaN in the DataFrame unless we handle them.
# However, load_mgf guarantees these keys exist (even if None).
specs_list = [x if isinstance(x, dict) else defaults for x in matched_spec_series.tolist()]

# ⚡ OPTIMIZATION: Use dtype=object to preserve None values directly (avoids coercing to NaN)
# This is critical because app.py logic and tests distinguish between None (falsy) and NaN (truthy)
specs_df = pd.DataFrame(specs_list, columns=['title', 'mz_array', 'intensity_array', 'pepmass'], dtype=object)
specs_df.index = psm_df.index # Align index with original DataFrame

# Ensure required columns exist (if no spectra matched or mock data missing keys)
for col in ['title', 'mz_array', 'intensity_array', 'pepmass']:
if col not in specs_df.columns:
specs_df[col] = None

mappings = pd.DataFrame({
'psm_index': psm_df.index,
'sequence': psm_df['sequence'].astype(str),
Expand Down