diff --git a/.gitignore b/.gitignore index 5a2dce5..c59b0c5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ .gitignore /venv -/.pytest_cache \ No newline at end of file +/.pytest_cache +__pycache__/ +*.pyc diff --git a/.jules/bolt.md b/.jules/bolt.md index 8780446..7bd8812 100644 --- a/.jules/bolt.md +++ b/.jules/bolt.md @@ -1,3 +1,7 @@ ## 2024-05-23 - [Regex Pre-compilation in Loops] **Learning:** Pre-compiling regular expressions (`re.compile`) at the module level provides a significant performance boost (measured ~1.8x speedup) when the regex is used inside a tight loop or a pandas `apply` function, compared to compiling it repeatedly or implicitly inside the loop. Vectorized string operations in Pandas are usually faster, but in complex logic cases (multiple prioritized regex groups + fallback logic), a simple pre-compiled regex with `apply` can sometimes be cleaner and sufficiently fast, or even faster if the vectorized approach requires multiple passes or expensive intermediate structures. **Action:** Always check for regex usage in loops or `apply` calls. If found, refactor to use module-level pre-compiled patterns. When considering vectorization, benchmark against the optimized loop version, as the overhead of complex vectorization might outweigh the benefits for moderate dataset sizes. + +## 2025-05-23 - [Streamlit File Upload Memory Optimization] +**Learning:** Streamlit's `UploadedFile` object is a bytes stream. Reading it entirely into memory with `read().decode('utf-8')` to pass to a text parser (like `io.StringIO`) creates massive memory overhead (original bytes + decoded string + StringIO buffer). Using `io.TextIOWrapper(file, encoding='utf-8')` allows streaming the decoding process, significantly reducing peak memory usage for large files (e.g., proteomics MGF files) without changing downstream logic, as `pyteomics` accepts file-like objects. 
+**Action:** Always prefer `io.TextIOWrapper` over `read().decode()` when processing text-based file uploads in Streamlit, especially for large datasets. Caveat: closing the wrapper (explicitly or when it is garbage-collected) also closes the underlying upload, so call `detach()` on the wrapper once parsing finishes if the upload must be read again. diff --git a/__pycache__/app.cpython-312.pyc b/__pycache__/app.cpython-312.pyc new file mode 100644 index 0000000..c175f49 Binary files /dev/null and b/__pycache__/app.cpython-312.pyc differ diff --git a/__pycache__/data_loading.cpython-312.pyc b/__pycache__/data_loading.cpython-312.pyc new file mode 100644 index 0000000..107979c Binary files /dev/null and b/__pycache__/data_loading.cpython-312.pyc differ diff --git a/__pycache__/processing.cpython-312.pyc b/__pycache__/processing.cpython-312.pyc new file mode 100644 index 0000000..07e2b43 Binary files /dev/null and b/__pycache__/processing.cpython-312.pyc differ diff --git a/app.py b/app.py index aa7990e..32339c3 100644 --- a/app.py +++ b/app.py @@ -32,9 +32,10 @@ def run_streamlit_app(): # Process files only when both are uploaded if mgf_file and mztab_file: # Decode uploaded file contents (Streamlit files are bytes by default) - # Use StringIO to create file-like objects for pyteomics parsers - spectra = load_mgf(io.StringIO(mgf_file.read().decode('utf-8'))) - psm_df = load_mztab(io.StringIO(mztab_file.read().decode('utf-8'))) + # ⚡ OPTIMIZATION: Wrap file buffer with TextIOWrapper to stream data + # instead of reading entire file into memory with read().decode() + spectra = load_mgf(io.TextIOWrapper(mgf_file, encoding='utf-8')) + psm_df = load_mztab(io.TextIOWrapper(mztab_file, encoding='utf-8')) # Create mappings between PSMs and spectra mapped = map_psms_to_spectra(spectra, psm_df) diff --git a/tests/__pycache__/__init__.cpython-312.pyc b/tests/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000..81f2577 Binary files /dev/null and b/tests/__pycache__/__init__.cpython-312.pyc differ diff --git a/tests/__pycache__/conftest.cpython-312-pytest-8.4.2.pyc b/tests/__pycache__/conftest.cpython-312-pytest-8.4.2.pyc new file mode 100644 index 
0000000..14dae09 Binary files /dev/null and b/tests/__pycache__/conftest.cpython-312-pytest-8.4.2.pyc differ diff --git a/tests/__pycache__/conftest.cpython-312-pytest-9.0.2.pyc b/tests/__pycache__/conftest.cpython-312-pytest-9.0.2.pyc new file mode 100644 index 0000000..386758f Binary files /dev/null and b/tests/__pycache__/conftest.cpython-312-pytest-9.0.2.pyc differ diff --git a/tests/__pycache__/temp_test_textiowrapper.cpython-312-pytest-8.4.2.pyc b/tests/__pycache__/temp_test_textiowrapper.cpython-312-pytest-8.4.2.pyc new file mode 100644 index 0000000..71de426 Binary files /dev/null and b/tests/__pycache__/temp_test_textiowrapper.cpython-312-pytest-8.4.2.pyc differ diff --git a/tests/__pycache__/temp_test_textiowrapper.cpython-312-pytest-9.0.2.pyc b/tests/__pycache__/temp_test_textiowrapper.cpython-312-pytest-9.0.2.pyc new file mode 100644 index 0000000..a9cf5d1 Binary files /dev/null and b/tests/__pycache__/temp_test_textiowrapper.cpython-312-pytest-9.0.2.pyc differ diff --git a/tests/__pycache__/test_extract_index_from_spectra_ref.cpython-312-pytest-9.0.2.pyc b/tests/__pycache__/test_extract_index_from_spectra_ref.cpython-312-pytest-9.0.2.pyc new file mode 100644 index 0000000..bf3b384 Binary files /dev/null and b/tests/__pycache__/test_extract_index_from_spectra_ref.cpython-312-pytest-9.0.2.pyc differ diff --git a/tests/__pycache__/test_integration.cpython-312-pytest-9.0.2.pyc b/tests/__pycache__/test_integration.cpython-312-pytest-9.0.2.pyc new file mode 100644 index 0000000..10b33ab Binary files /dev/null and b/tests/__pycache__/test_integration.cpython-312-pytest-9.0.2.pyc differ diff --git a/tests/__pycache__/test_load_mgf.cpython-312-pytest-9.0.2.pyc b/tests/__pycache__/test_load_mgf.cpython-312-pytest-9.0.2.pyc new file mode 100644 index 0000000..38db880 Binary files /dev/null and b/tests/__pycache__/test_load_mgf.cpython-312-pytest-9.0.2.pyc differ diff --git a/tests/__pycache__/test_load_mztab.cpython-312-pytest-9.0.2.pyc 
b/tests/__pycache__/test_load_mztab.cpython-312-pytest-9.0.2.pyc new file mode 100644 index 0000000..320c1a6 Binary files /dev/null and b/tests/__pycache__/test_load_mztab.cpython-312-pytest-9.0.2.pyc differ diff --git a/tests/__pycache__/test_map_psms_to_spectra.cpython-312-pytest-9.0.2.pyc b/tests/__pycache__/test_map_psms_to_spectra.cpython-312-pytest-9.0.2.pyc new file mode 100644 index 0000000..bd3ea26 Binary files /dev/null and b/tests/__pycache__/test_map_psms_to_spectra.cpython-312-pytest-9.0.2.pyc differ diff --git a/tests/test_integration.py b/tests/test_integration.py index 92b9141..26b2cf1 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -1,4 +1,5 @@ import pytest +import io from io import BytesIO, StringIO import pandas as pd from data_loading import load_mgf, load_mztab @@ -22,6 +23,27 @@ def test_full_pipeline(self, sample_mgf_buffer, sample_mztab_buffer): matches = mapped['matched_title'].notnull().sum() assert matches > 0, "At least some PSMs should match spectra" + def test_loading_with_textiowrapper(self, sample_mgf_content, sample_mztab_content): + """Test data loading with io.TextIOWrapper (simulating Streamlit optimization).""" + # Create BytesIO buffers (simulating file uploads) + mgf_bytes = BytesIO(sample_mgf_content.encode('utf-8')) + mztab_bytes = BytesIO(sample_mztab_content.encode('utf-8')) + + # Wrap with TextIOWrapper + mgf_wrapper = io.TextIOWrapper(mgf_bytes, encoding='utf-8') + mztab_wrapper = io.TextIOWrapper(mztab_bytes, encoding='utf-8') + + # Load data + spectra = load_mgf(mgf_wrapper) + psm_df = load_mztab(mztab_wrapper) + + assert len(spectra) > 0 + assert len(psm_df) > 0 + + # Verify structure + assert 'title' in spectra[0] + assert 'sequence' in psm_df.columns + def test_streamlit_integration(self): """Test the Streamlit app with mock file uploads.""" # This would require setting up streamlit testing