diff --git a/.gitignore b/.gitignore index 5a2dce5..c59b0c5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ .gitignore /venv -/.pytest_cache \ No newline at end of file +/.pytest_cache +__pycache__/ +*.pyc diff --git a/.jules/bolt.md b/.jules/bolt.md index 8780446..1775206 100644 --- a/.jules/bolt.md +++ b/.jules/bolt.md @@ -1,3 +1,7 @@ ## 2024-05-23 - [Regex Pre-compilation in Loops] **Learning:** Pre-compiling regular expressions (`re.compile`) at the module level provides a significant performance boost (measured ~1.8x speedup) when the regex is used inside a tight loop or a pandas `apply` function, compared to compiling it repeatedly or implicitly inside the loop. Vectorized string operations in Pandas are usually faster, but in complex logic cases (multiple prioritized regex groups + fallback logic), a simple pre-compiled regex with `apply` can sometimes be cleaner and sufficiently fast, or even faster if the vectorized approach requires multiple passes or expensive intermediate structures. **Action:** Always check for regex usage in loops or `apply` calls. If found, refactor to use module-level pre-compiled patterns. When considering vectorization, benchmark against the optimized loop version, as the overhead of complex vectorization might outweigh the benefits for moderate dataset sizes. + +## 2024-10-30 - [Streaming Text Decoding] +**Learning:** When handling large text-based file uploads (like MGF/mzTab) in Streamlit/Python, using `io.TextIOWrapper(binary_stream, encoding='utf-8')` is significantly more memory-efficient than `io.StringIO(binary_stream.read().decode('utf-8'))`. The latter approach loads the entire binary content into memory, creates a huge string copy, and then creates another buffer, potentially tripling memory usage. The wrapper approach streams the decoding, keeping memory footprint low. 
+**Action:** Use `io.TextIOWrapper` for parsing large text files from binary streams (like `streamlit.UploadedFile`) instead of reading and decoding the full content. Verify compatibility with downstream parsers (e.g., `pyteomics` supports it). Note that `io.TextIOWrapper` closes the wrapped binary stream when the wrapper is closed or garbage-collected, so call `.detach()` on the wrapper after parsing if the uploaded file must remain readable afterwards. diff --git a/__pycache__/data_loading.cpython-312.pyc b/__pycache__/data_loading.cpython-312.pyc new file mode 100644 index 0000000..77ed2c9 Binary files /dev/null and b/__pycache__/data_loading.cpython-312.pyc differ diff --git a/__pycache__/processing.cpython-312.pyc b/__pycache__/processing.cpython-312.pyc new file mode 100644 index 0000000..b9a17d9 Binary files /dev/null and b/__pycache__/processing.cpython-312.pyc differ diff --git a/app.py b/app.py index aa7990e..f4c3782 100644 --- a/app.py +++ b/app.py @@ -32,9 +32,9 @@ def run_streamlit_app(): # Process files only when both are uploaded if mgf_file and mztab_file: # Decode uploaded file contents (Streamlit files are bytes by default) - # Use StringIO to create file-like objects for pyteomics parsers - spectra = load_mgf(io.StringIO(mgf_file.read().decode('utf-8'))) - psm_df = load_mztab(io.StringIO(mztab_file.read().decode('utf-8'))) + # Use TextIOWrapper to create streaming file-like objects, avoiding full memory load + spectra = load_mgf(io.TextIOWrapper(mgf_file, encoding='utf-8')) + psm_df = load_mztab(io.TextIOWrapper(mztab_file, encoding='utf-8')) # Create mappings between PSMs and spectra mapped = map_psms_to_spectra(spectra, psm_df) diff --git a/tests/__pycache__/__init__.cpython-312.pyc b/tests/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000..9804010 Binary files /dev/null and b/tests/__pycache__/__init__.cpython-312.pyc differ diff --git a/tests/__pycache__/conftest.cpython-312-pytest-9.0.2.pyc b/tests/__pycache__/conftest.cpython-312-pytest-9.0.2.pyc new file mode 100644 index 0000000..6a2f6b8 Binary files /dev/null and b/tests/__pycache__/conftest.cpython-312-pytest-9.0.2.pyc differ diff --git 
a/tests/__pycache__/test_extract_index_from_spectra_ref.cpython-312-pytest-9.0.2.pyc b/tests/__pycache__/test_extract_index_from_spectra_ref.cpython-312-pytest-9.0.2.pyc new file mode 100644 index 0000000..b380bd8 Binary files /dev/null and b/tests/__pycache__/test_extract_index_from_spectra_ref.cpython-312-pytest-9.0.2.pyc differ diff --git a/tests/__pycache__/test_integration.cpython-312-pytest-9.0.2.pyc b/tests/__pycache__/test_integration.cpython-312-pytest-9.0.2.pyc new file mode 100644 index 0000000..7c5342c Binary files /dev/null and b/tests/__pycache__/test_integration.cpython-312-pytest-9.0.2.pyc differ diff --git a/tests/__pycache__/test_load_mgf.cpython-312-pytest-9.0.2.pyc b/tests/__pycache__/test_load_mgf.cpython-312-pytest-9.0.2.pyc new file mode 100644 index 0000000..e434a95 Binary files /dev/null and b/tests/__pycache__/test_load_mgf.cpython-312-pytest-9.0.2.pyc differ diff --git a/tests/__pycache__/test_load_mztab.cpython-312-pytest-9.0.2.pyc b/tests/__pycache__/test_load_mztab.cpython-312-pytest-9.0.2.pyc new file mode 100644 index 0000000..aefee96 Binary files /dev/null and b/tests/__pycache__/test_load_mztab.cpython-312-pytest-9.0.2.pyc differ diff --git a/tests/__pycache__/test_map_psms_to_spectra.cpython-312-pytest-9.0.2.pyc b/tests/__pycache__/test_map_psms_to_spectra.cpython-312-pytest-9.0.2.pyc new file mode 100644 index 0000000..bc3a5d0 Binary files /dev/null and b/tests/__pycache__/test_map_psms_to_spectra.cpython-312-pytest-9.0.2.pyc differ diff --git a/tests/__pycache__/test_streaming_io.cpython-312-pytest-9.0.2.pyc b/tests/__pycache__/test_streaming_io.cpython-312-pytest-9.0.2.pyc new file mode 100644 index 0000000..75b18f1 Binary files /dev/null and b/tests/__pycache__/test_streaming_io.cpython-312-pytest-9.0.2.pyc differ diff --git a/tests/test_streaming_io.py b/tests/test_streaming_io.py new file mode 100644 index 0000000..9856fde --- /dev/null +++ b/tests/test_streaming_io.py @@ -0,0 +1,55 @@ +import io +import pytest +import 
pandas as pd +from data_loading import load_mgf, load_mztab + +class TestStreamingIO: + """ + Test suite to ensure that data loading functions support streaming I/O. + This validates the optimization of using TextIOWrapper instead of reading + entire files into memory. + """ + + def test_load_mgf_streaming(self): + """Test load_mgf with io.TextIOWrapper wrapping a binary stream.""" + mgf_content = b"""BEGIN IONS +TITLE=test_spectrum +PEPMASS=450.25 +1.0 10.0 +2.0 20.0 +END IONS +""" + # Simulate binary stream (like Streamlit's UploadedFile) + binary_stream = io.BytesIO(mgf_content) + + # Wrap with TextIOWrapper to simulate streaming text decoding + text_stream = io.TextIOWrapper(binary_stream, encoding='utf-8') + + # This should not raise an error + spectra = load_mgf(text_stream) + + assert len(spectra) == 1 + assert spectra[0]['title'] == 'test_spectrum' + assert spectra[0]['pepmass'] == (450.25, None) + assert len(spectra[0]['mz_array']) == 2 + + def test_load_mztab_streaming(self): + """Test load_mztab with io.TextIOWrapper wrapping a binary stream.""" + mztab_content = b"""MTD\tmzTab-version\t1.0.0 +MTD\tmzTab-mode\tSummary +PSH\tsequence\tPSM_ID\tspectra_ref +PSM\tPEPTIDE_SEQ\t1\tms_run[1]:index=0 +""" + # Simulate binary stream + binary_stream = io.BytesIO(mztab_content) + + # Wrap with TextIOWrapper + text_stream = io.TextIOWrapper(binary_stream, encoding='utf-8') + + # This should not raise an error + df = load_mztab(text_stream) + + assert isinstance(df, pd.DataFrame) + assert len(df) == 1 + assert df.iloc[0]['sequence'] == 'PEPTIDE_SEQ' + assert df.iloc[0]['spectra_ref'] == 'ms_run[1]:index=0'