erayfirat · erayfirat · Jan 29, 2026
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,5 @@
 .gitignore
 /venv
-/.pytest_cache
+/.pytest_cache
+__pycache__/
+*.pyc
diff --git a/.jules/bolt.md b/.jules/bolt.md
@@ -1,3 +1,7 @@
 ## 2024-05-23 - [Regex Pre-compilation in Loops]
 **Learning:** Pre-compiling regular expressions (`re.compile`) at the module level provides a significant performance boost (measured ~1.8x speedup) when the regex is used inside a tight loop or a pandas `apply` function, compared to compiling it repeatedly or implicitly inside the loop. Vectorized string operations in Pandas are usually faster, but in complex logic cases (multiple prioritized regex groups + fallback logic), a simple pre-compiled regex with `apply` can sometimes be cleaner and sufficiently fast, or even faster if the vectorized approach requires multiple passes or expensive intermediate structures.
 **Action:** Always check for regex usage in loops or `apply` calls. If found, refactor to use module-level pre-compiled patterns. When considering vectorization, benchmark against the optimized loop version, as the overhead of complex vectorization might outweigh the benefits for moderate dataset sizes.
+
+## 2025-05-27 - [Streaming File Uploads]
+**Learning:** `pyteomics.mgf.read` and the `pyteomics` mzTab parser both accept an `io.TextIOWrapper` wrapping a binary stream (such as `io.BytesIO` or Streamlit's `UploadedFile`). Decoding through the wrapper avoids calling `read()` on the entire file and decoding it into one massive string before parsing, which significantly reduces the memory footprint for large proteomics files.
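+**Action:** When handling file uploads or binary streams that `pyteomics` needs to parse as text, use `io.TextIOWrapper(binary_stream, encoding='utf-8')` instead of `binary_stream.read().decode('utf-8')` followed by `io.StringIO`. Note that a `TextIOWrapper` closes the underlying binary stream when it is closed or garbage-collected; call its `detach()` method after parsing if the stream must remain usable.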
diff --git a/app.py b/app.py
@@ -32,9 +32,9 @@ def run_streamlit_app():
     # Process files only when both are uploaded
     if mgf_file and mztab_file:
         # Decode uploaded file contents (Streamlit files are bytes by default)
-        # Use StringIO to create file-like objects for pyteomics parsers
-        spectra = load_mgf(io.StringIO(mgf_file.read().decode('utf-8')))
-        psm_df = load_mztab(io.StringIO(mztab_file.read().decode('utf-8')))
+        # Use TextIOWrapper to stream decoding instead of loading full content into memory
+        spectra = load_mgf(io.TextIOWrapper(mgf_file, encoding='utf-8'))
+        psm_df = load_mztab(io.TextIOWrapper(mztab_file, encoding='utf-8'))
 
         # Create mappings between PSMs and spectra
         mapped = map_psms_to_spectra(spectra, psm_df)

diff --git a/tests/test_streaming_io.py b/tests/test_streaming_io.py
@@ -0,0 +1,46 @@
+import io
+import pytest
+from data_loading import load_mgf, load_mztab
+
+def test_load_mgf_with_textiowrapper():
+    """load_mgf should parse one spectrum from a TextIOWrapper over a binary stream."""
+    payload = b"""BEGIN IONS
+TITLE=Spectrum 1
+PEPMASS=1000.0
+100.0 10.0
+200.0 20.0
+END IONS
+"""
+    # Simulate a Streamlit upload: an in-memory binary stream wrapped in TextIOWrapper
+    wrapped = io.TextIOWrapper(io.BytesIO(payload), encoding='utf-8')
+    spectra = load_mgf(wrapped)
+
+    assert len(spectra) == 1
+    assert spectra[0]['title'] == 'Spectrum 1'
+
+def test_load_mztab_with_textiowrapper():
+    """load_mztab should yield one PSM row from a TextIOWrapper over a binary stream."""
+    payload = b"""MTD\tmzTab-version\t1.0.0
+MTD\tmode\tComplete
+MTD\ttype\tIdentification
+PSH\tsequence\tPSM_ID\taccession\tunique\tdatabase\tdatabase_version\tsearch_engine\tsearch_engine_score[1]\tmodifications\tretention_time\tcharge\texp_mass_to_charge\tcalc_mass_to_charge\tspectra_ref\tpre\tpost\tstart\tend
+PSM\tPEPTIDE\t1\tP12345\t0\tDB\t1.0\tMascot\t100\tnull\t100.0\t2\t1000.0\t1000.0\tindex=0\t-\t-\t1\t10
+"""
+    wrapped = io.TextIOWrapper(io.BytesIO(payload), encoding='utf-8')
+    psm_table = load_mztab(wrapped)
+
+    assert len(psm_table) == 1
+    assert psm_table.iloc[0]['sequence'] == 'PEPTIDE'
+
+if __name__ == "__main__":
+    # Entry point for running this file directly with `python`.
+    #
+    # Delegate to pytest rather than calling each test inside
+    # `try/except Exception`: a hand-rolled runner that only prints the
+    # exception message hides the traceback and exits with status 0 even
+    # when a test fails, so callers cannot detect the failure.
+    #
+    # `pytest.main` collects and runs only this file and returns an exit
+    # code, which `SystemExit` passes on to the shell (non-zero on failure).
+    exit_code = pytest.main([__file__])
+    raise SystemExit(exit_code)