diff --git a/.gitignore b/.gitignore index 5a2dce5..5f73537 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ .gitignore /venv -/.pytest_cache \ No newline at end of file +/.pytest_cache +__pycache__/ diff --git a/.jules/bolt.md b/.jules/bolt.md index 8780446..c36e6ec 100644 --- a/.jules/bolt.md +++ b/.jules/bolt.md @@ -1,3 +1,7 @@ ## 2024-05-23 - [Regex Pre-compilation in Loops] **Learning:** Pre-compiling regular expressions (`re.compile`) at the module level provides a significant performance boost (measured ~1.8x speedup) when the regex is used inside a tight loop or a pandas `apply` function, compared to compiling it repeatedly or implicitly inside the loop. Vectorized string operations in Pandas are usually faster, but in complex logic cases (multiple prioritized regex groups + fallback logic), a simple pre-compiled regex with `apply` can sometimes be cleaner and sufficiently fast, or even faster if the vectorized approach requires multiple passes or expensive intermediate structures. **Action:** Always check for regex usage in loops or `apply` calls. If found, refactor to use module-level pre-compiled patterns. When considering vectorization, benchmark against the optimized loop version, as the overhead of complex vectorization might outweigh the benefits for moderate dataset sizes. + +## 2025-05-23 - [Streamlit File Upload Memory Optimization] +**Learning:** When handling file uploads in Streamlit (or other web frameworks), reading the entire file into memory with `.read().decode()` creates a massive memory spike (2x-3x file size) due to holding raw bytes, decoded string, and the file buffer simultaneously. For text-based formats (like MGF/mzTab), wrapping the binary stream directly with `io.TextIOWrapper` allows for streaming processing, significantly reducing memory usage without sacrificing functionality. +**Action:** Use `io.TextIOWrapper` for processing text-based file uploads instead of reading/decoding fully into memory. diff --git a/__pycache__/app.cpython-312.pyc b/__pycache__/app.cpython-312.pyc new file mode 100644 index 0000000..57acb50 Binary files /dev/null and b/__pycache__/app.cpython-312.pyc differ diff --git a/__pycache__/data_loading.cpython-312.pyc b/__pycache__/data_loading.cpython-312.pyc new file mode 100644 index 0000000..310702f Binary files /dev/null and b/__pycache__/data_loading.cpython-312.pyc differ diff --git a/__pycache__/processing.cpython-312.pyc b/__pycache__/processing.cpython-312.pyc new file mode 100644 index 0000000..0a39d6c Binary files /dev/null and b/__pycache__/processing.cpython-312.pyc differ diff --git a/app.py b/app.py index aa7990e..28eff38 100644 --- a/app.py +++ b/app.py @@ -32,9 +32,10 @@ def run_streamlit_app(): # Process files only when both are uploaded if mgf_file and mztab_file: # Decode uploaded file contents (Streamlit files are bytes by default) - # Use StringIO to create file-like objects for pyteomics parsers - spectra = load_mgf(io.StringIO(mgf_file.read().decode('utf-8'))) - psm_df = load_mztab(io.StringIO(mztab_file.read().decode('utf-8'))) + # ⚡ OPTIMIZATION: Use TextIOWrapper to stream decoded text instead of reading full file into memory + # This significantly reduces memory usage for large MGF/mzTab files + spectra = load_mgf(io.TextIOWrapper(mgf_file, encoding='utf-8')) + psm_df = load_mztab(io.TextIOWrapper(mztab_file, encoding='utf-8')) # Create mappings between PSMs and spectra mapped = map_psms_to_spectra(spectra, psm_df) diff --git a/tests/__pycache__/__init__.cpython-312.pyc b/tests/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000..8035e2b Binary files /dev/null and b/tests/__pycache__/__init__.cpython-312.pyc differ diff --git a/tests/__pycache__/conftest.cpython-312-pytest-9.0.2.pyc b/tests/__pycache__/conftest.cpython-312-pytest-9.0.2.pyc new file mode 100644 index 0000000..2a0e2df Binary files /dev/null and b/tests/__pycache__/conftest.cpython-312-pytest-9.0.2.pyc differ diff --git a/tests/__pycache__/test_extract_index_from_spectra_ref.cpython-312-pytest-9.0.2.pyc b/tests/__pycache__/test_extract_index_from_spectra_ref.cpython-312-pytest-9.0.2.pyc new file mode 100644 index 0000000..c0c9aca Binary files /dev/null and b/tests/__pycache__/test_extract_index_from_spectra_ref.cpython-312-pytest-9.0.2.pyc differ diff --git a/tests/__pycache__/test_integration.cpython-312-pytest-9.0.2.pyc b/tests/__pycache__/test_integration.cpython-312-pytest-9.0.2.pyc new file mode 100644 index 0000000..b29b1dd Binary files /dev/null and b/tests/__pycache__/test_integration.cpython-312-pytest-9.0.2.pyc differ diff --git a/tests/__pycache__/test_load_mgf.cpython-312-pytest-9.0.2.pyc b/tests/__pycache__/test_load_mgf.cpython-312-pytest-9.0.2.pyc new file mode 100644 index 0000000..ba706fd Binary files /dev/null and b/tests/__pycache__/test_load_mgf.cpython-312-pytest-9.0.2.pyc differ diff --git a/tests/__pycache__/test_load_mztab.cpython-312-pytest-9.0.2.pyc b/tests/__pycache__/test_load_mztab.cpython-312-pytest-9.0.2.pyc new file mode 100644 index 0000000..f99b9ef Binary files /dev/null and b/tests/__pycache__/test_load_mztab.cpython-312-pytest-9.0.2.pyc differ diff --git a/tests/__pycache__/test_map_psms_to_spectra.cpython-312-pytest-9.0.2.pyc b/tests/__pycache__/test_map_psms_to_spectra.cpython-312-pytest-9.0.2.pyc new file mode 100644 index 0000000..412a37d Binary files /dev/null and b/tests/__pycache__/test_map_psms_to_spectra.cpython-312-pytest-9.0.2.pyc differ