erayfirat · erayfirat · Jan 30, 2026
diff --git a/.jules/bolt.md b/.jules/bolt.md
@@ -1,3 +1,7 @@
 ## 2024-05-23 - [Regex Pre-compilation in Loops]
 **Learning:** Pre-compiling regular expressions (`re.compile`) at the module level provides a significant performance boost (measured ~1.8x speedup) when the regex is used inside a tight loop or a pandas `apply` function, compared to compiling it repeatedly or implicitly inside the loop. Vectorized string operations in Pandas are usually faster, but in complex logic cases (multiple prioritized regex groups + fallback logic), a simple pre-compiled regex with `apply` can sometimes be cleaner and sufficiently fast, or even faster if the vectorized approach requires multiple passes or expensive intermediate structures.
 **Action:** Always check for regex usage in loops or `apply` calls. If found, refactor to use module-level pre-compiled patterns. When considering vectorization, benchmark against the optimized loop version, as the overhead of complex vectorization might outweigh the benefits for moderate dataset sizes.
+
+## 2026-01-30 - [Streaming I/O with TextIOWrapper]
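+**Learning:** Wrapping a binary file stream (such as Streamlit's `UploadedFile`) in `io.TextIOWrapper` reduces peak memory usage compared to `read().decode()` for large text-based data files (like MGF/mzTab): it avoids materializing an extra full copy of the raw bytes plus a fully decoded string, because the text is decoded incrementally as the parser reads it.
+**Action:** When handling file uploads in Streamlit or other web frameworks, prefer streaming wrappers over full-read-and-decode patterns. Note that closing a `TextIOWrapper` also closes the stream it wraps; call `detach()` on the wrapper first if the underlying upload must remain usable afterwards.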
diff --git a/app.py b/app.py
@@ -31,10 +31,11 @@ def run_streamlit_app():
 
     # Process files only when both are uploaded
     if mgf_file and mztab_file:
-        # Decode uploaded file contents (Streamlit files are bytes by default)
-        # Use StringIO to create file-like objects for pyteomics parsers
-        spectra = load_mgf(io.StringIO(mgf_file.read().decode('utf-8')))
-        psm_df = load_mztab(io.StringIO(mztab_file.read().decode('utf-8')))
+        # ⚡ OPTIMIZATION: Wrap the uploaded buffers in TextIOWrapper instead of calling read().decode().
+        # This avoids an extra full copy of the raw bytes plus one huge decoded string; text is decoded as the parser reads.
+        # Streamlit's UploadedFile is a binary file-like object (it behaves like BytesIO), so TextIOWrapper can wrap it directly.
+        spectra = load_mgf(io.TextIOWrapper(mgf_file, encoding='utf-8'))
+        psm_df = load_mztab(io.TextIOWrapper(mztab_file, encoding='utf-8'))
 
         # Create mappings between PSMs and spectra
         mapped = map_psms_to_spectra(spectra, psm_df)

diff --git a/tests/test_streaming_io.py b/tests/test_streaming_io.py
@@ -0,0 +1,60 @@
+
+import io
+import pytest
+import pandas as pd
+from data_loading import load_mgf, load_mztab
+
+def test_load_mgf_streaming():
+    """Check that load_mgf parses an MGF document read via TextIOWrapper."""
+    # Minimal MGF payload holding a single spectrum.
+    mgf_text = """BEGIN IONS
+TITLE=test
+PEPMASS=450.25
+1.0 10.0
+2.0 20.0
+END IONS
+"""
+    # BytesIO stands in for the uploaded binary file; decode it as UTF-8 text.
+    raw_upload = io.BytesIO(mgf_text.encode('utf-8'))
+    stream = io.TextIOWrapper(raw_upload, encoding='utf-8')
+
+    parsed = load_mgf(stream)
+    # One spectrum is expected, exposing its title and pepmass values.
+    assert len(parsed) == 1
+    first = parsed[0]
+    assert first['title'] == 'test'
+    assert first['pepmass'] == (450.25, None)
+
+def test_load_mztab_streaming():
+    """Test loading mzTab from a streaming TextIOWrapper.
+
+    The same document is parsed twice: once through ``io.StringIO`` (the
+    plain in-memory text path) to confirm the fixture itself is valid, and
+    once through ``io.TextIOWrapper`` over ``io.BytesIO`` (simulating an
+    uploaded binary file). Both must yield a single PSM row.
+    """
+    # Use explicit newlines and no indentation for the content.
+    # Per the mzTab specification the PSM section's column-header row is
+    # prefixed with "PSH", while the data rows are prefixed with "PSM".
+    mztab_content = (
+"MTD\tmzTab-version\t1.0.0\n"
+"MTD\tmzTab-mode\tSummary\n"
+"MTD\tmzTab-type\tIdentification\n"
+"PSH\tsequence\tPSM_ID\taccession\tunique\tdatabase\tdatabase_version\tsearch_engine\tsearch_engine_score[1]\tmodifications\tretention_time\tcharge\texp_mass_to_charge\tcalc_mass_to_charge\tspectra_ref\tpre\tpost\tstart\tend\n"
+"PSM\tPEPTIDE\t1\tACC\t0\tDB\t1.0\tSE\t10.0\tNULL\t100.0\t2\t500.0\t500.0\tindex=1\t-\t-\t1\t10\n"
+)
+
+    # Sanity-check the fixture with StringIO first, so that a failure below
+    # points at the streaming wrapper rather than at malformed test data.
+    df_orig = load_mztab(io.StringIO(mztab_content))
+    assert len(df_orig) == 1
+
+    # Streaming path: wrap a binary buffer (as an uploaded file would provide)
+    # in a TextIOWrapper instead of decoding the whole payload up front.
+    bytes_buffer = io.BytesIO(mztab_content.encode('utf-8'))
+    text_wrapper = io.TextIOWrapper(bytes_buffer, encoding='utf-8')
+
+    df = load_mztab(text_wrapper)
+    assert isinstance(df, pd.DataFrame)
+    assert len(df) == 1
+    assert df.iloc[0]['sequence'] == 'PEPTIDE'