erayfirat · google-labs-jules · Jan 18, 2026
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,5 @@
 .gitignore
 /venv
-/.pytest_cache
+/.pytest_cache
+__pycache__/
+*.pyc
diff --git a/processing.py b/processing.py
@@ -100,15 +100,20 @@ def map_psms_to_spectra(spectra: List[Dict], psm_df: pd.DataFrame) -> pd.DataFra
     # Original: Multiple apply calls (4x iteration over full dataset)
 
     # Convert matched Series to list, replacing NaNs with empty dicts for DataFrame construction
-    specs_list = [x if isinstance(x, dict) else {} for x in matched_spec_series]
-    specs_df = pd.DataFrame(specs_list)
+    # Optimization: iterate over tolist() and pass explicit columns to avoid schema inference.
+    # Non-matches all share one placeholder dict (a plain dict, not a defaultdict) whose values are None.
+    defaults = {'title': None, 'mz_array': None, 'intensity_array': None, 'pepmass': None}
+
+    # Walk the list: keep each element that is a dict; replace anything else (NaN from the Series) with defaults.
+    # Note: a key missing from a matched dict would appear as NaN in the DataFrame, because it is not filled in here.
+    # This is acceptable because load_mgf guarantees these keys exist (possibly with a None value).
+    specs_list = [x if isinstance(x, dict) else defaults for x in matched_spec_series.tolist()]
+
+    # NOTE: dtype=object keeps None values as None instead of letting pandas coerce them to NaN.
+    # This matters for correctness: app.py logic and the tests distinguish None (falsy) from NaN (truthy).
+    specs_df = pd.DataFrame(specs_list, columns=['title', 'mz_array', 'intensity_array', 'pepmass'], dtype=object)
     specs_df.index = psm_df.index  # Align index with original DataFrame
 
-    # Ensure required columns exist (if no spectra matched or mock data missing keys)
-    for col in ['title', 'mz_array', 'intensity_array', 'pepmass']:
-        if col not in specs_df.columns:
-            specs_df[col] = None
-
     mappings = pd.DataFrame({
         'psm_index': psm_df.index,
         'sequence': psm_df['sequence'].astype(str),