4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
.gitignore
/venv
/.pytest_cache
/.pytest_cache
__pycache__/
*.pyc
4 changes: 4 additions & 0 deletions .jules/bolt.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## 2024-05-23 - [Regex Pre-compilation in Loops]
**Learning:** Pre-compiling a regular expression with `re.compile` at module level gives a significant speedup (measured ~1.8x) when the pattern is used inside a tight loop or a pandas `apply`, compared to compiling it repeatedly or implicitly inside the loop. Vectorized Pandas string operations are usually faster still, but when the logic is complex (multiple prioritized regex groups plus fallback logic), a pre-compiled regex with `apply` can be cleaner and sufficiently fast, or even faster if vectorization would require multiple passes or expensive intermediate structures.
**Action:** Check for regex usage in loops or `apply` calls, and refactor to module-level pre-compiled patterns. Before vectorizing, benchmark against the optimized loop version: for moderate dataset sizes, the overhead of a complex vectorized approach can outweigh its benefits.
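A minimal sketch of the pattern described above (the regex and sample data are hypothetical, chosen only to illustrate the refactor):

```python
import re

# Pre-compiled once at module level, instead of inside the loop.
TOKEN_RE = re.compile(r"[A-Z]{3}\d+")

data = [f"sample ABC{i} trailing" for i in range(10_000)]

def with_precompiled():
    # The compiled pattern is reused directly on every iteration.
    return [TOKEN_RE.search(s).group() for s in data]

def with_inline():
    # re.search caches compiled patterns internally, but still pays a
    # cache lookup and argument handling on every call.
    return [re.search(r"[A-Z]{3}\d+", s).group() for s in data]

assert with_precompiled() == with_inline()
```

The same `TOKEN_RE` object can be passed into a pandas `apply` callback, which is where the measured speedup applied.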

## 2025-05-23 - [Streamlit File Upload Memory Optimization]
**Learning:** Streamlit's `UploadedFile` is a bytes stream. Reading it fully into memory with `read().decode('utf-8')` and wrapping the result in `io.StringIO` for a text parser creates large memory overhead, since the raw bytes, the decoded string, and the StringIO buffer are all held at once. Wrapping the upload in `io.TextIOWrapper(file, encoding='utf-8')` instead streams the decoding, sharply reducing peak memory for large files (e.g., proteomics MGF files) without changing downstream logic, since `pyteomics` accepts any file-like object.
**Action:** Prefer `io.TextIOWrapper` over `read().decode()` when processing text-based file uploads in Streamlit, especially for large datasets.
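A minimal sketch of the two approaches, using an in-memory `BytesIO` to stand in for Streamlit's `UploadedFile` (which is likewise a file-like bytes stream); the MGF snippet is a made-up fragment:

```python
import io

# Stand-in for a Streamlit UploadedFile: a file-like bytes stream.
uploaded = io.BytesIO(b"BEGIN IONS\nTITLE=spec_1\nEND IONS\n")

# Memory-heavy approach: bytes, decoded string, and StringIO buffer coexist.
# text = io.StringIO(uploaded.read().decode("utf-8"))

# Streaming approach: bytes are decoded lazily as the parser reads.
text = io.TextIOWrapper(uploaded, encoding="utf-8")
first_line = text.readline()
assert first_line == "BEGIN IONS\n"
```

Downstream parsers that accept file-like text objects (as `pyteomics` does) need no changes when switching between the two.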
Binary file added __pycache__/app.cpython-312.pyc
Binary file not shown.
Binary file added __pycache__/data_loading.cpython-312.pyc
Binary file not shown.
Binary file added __pycache__/processing.cpython-312.pyc
Binary file not shown.
7 changes: 4 additions & 3 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,10 @@ def run_streamlit_app():
# Process files only when both are uploaded
if mgf_file and mztab_file:
# Decode uploaded file contents (Streamlit files are bytes by default)
# Use StringIO to create file-like objects for pyteomics parsers
spectra = load_mgf(io.StringIO(mgf_file.read().decode('utf-8')))
psm_df = load_mztab(io.StringIO(mztab_file.read().decode('utf-8')))
# ⚑ OPTIMIZATION: Wrap file buffer with TextIOWrapper to stream data
# instead of reading entire file into memory with read().decode()
spectra = load_mgf(io.TextIOWrapper(mgf_file, encoding='utf-8'))
psm_df = load_mztab(io.TextIOWrapper(mztab_file, encoding='utf-8'))

# Create mappings between PSMs and spectra
mapped = map_psms_to_spectra(spectra, psm_df)
Expand Down
Binary file added tests/__pycache__/__init__.cpython-312.pyc
Binary file not shown.
22 changes: 22 additions & 0 deletions tests/test_integration.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import pytest
import io
from io import BytesIO, StringIO
import pandas as pd
from data_loading import load_mgf, load_mztab
Expand All @@ -22,6 +23,27 @@ def test_full_pipeline(self, sample_mgf_buffer, sample_mztab_buffer):
matches = mapped['matched_title'].notnull().sum()
assert matches > 0, "At least some PSMs should match spectra"

def test_loading_with_textiowrapper(self, sample_mgf_content, sample_mztab_content):
"""Test data loading with io.TextIOWrapper (simulating Streamlit optimization)."""
# Create BytesIO buffers (simulating file uploads)
mgf_bytes = BytesIO(sample_mgf_content.encode('utf-8'))
mztab_bytes = BytesIO(sample_mztab_content.encode('utf-8'))

# Wrap with TextIOWrapper
mgf_wrapper = io.TextIOWrapper(mgf_bytes, encoding='utf-8')
mztab_wrapper = io.TextIOWrapper(mztab_bytes, encoding='utf-8')

# Load data
spectra = load_mgf(mgf_wrapper)
psm_df = load_mztab(mztab_wrapper)

assert len(spectra) > 0
assert len(psm_df) > 0

# Verify structure
assert 'title' in spectra[0]
assert 'sequence' in psm_df.columns

def test_streamlit_integration(self):
"""Test the Streamlit app with mock file uploads."""
# This would require setting up streamlit testing
Expand Down