Skip to content
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 12 additions & 8 deletions processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,16 +99,20 @@ def map_psms_to_spectra(spectra: List[Dict], psm_df: pd.DataFrame) -> pd.DataFra
# ⚡ OPTIMIZATION: Convert list of dicts to DataFrame directly instead of repeated apply calls
# Original: Multiple apply calls (4x iteration over full dataset)

# Convert matched Series to list, replacing NaNs with empty dicts for DataFrame construction
specs_list = [x if isinstance(x, dict) else {} for x in matched_spec_series]
specs_df = pd.DataFrame(specs_list)
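# Convert matched Series to list, replacing non-dict entries (e.g. NaN for
# unmatched PSMs) with a shared dict of None defaults.
# This is a plain dict mapping each expected column to None (not a
# collections.defaultdict); it is meant to give unmatched rows explicit None
# values instead of NaN, which matters for boolean checks in app.py because
# NaN is truthy (bool(float('nan')) is True).
# NOTE(review): pandas may still coerce None to NaN in a column that is
# otherwise numeric — verify the resulting dtypes.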
expected_cols = ['title', 'mz_array', 'intensity_array', 'pepmass']
defaults = {col: None for col in expected_cols}
specs_list = [
x if isinstance(x, dict) else defaults for x in matched_spec_series
]

# ⚡ OPTIMIZATION: Explicitly specify columns to skip schema inference
# (approx 2.25x faster)
specs_df = pd.DataFrame(specs_list, columns=expected_cols)
specs_df.index = psm_df.index # Align index with original DataFrame

# Ensure required columns exist (if no spectra matched or mock data missing keys)
for col in ['title', 'mz_array', 'intensity_array', 'pepmass']:
if col not in specs_df.columns:
specs_df[col] = None

mappings = pd.DataFrame({
'psm_index': psm_df.index,
'sequence': psm_df['sequence'].astype(str),
Expand Down