diff --git a/.gitignore b/.gitignore index 5a2dce5..5f73537 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ .gitignore /venv -/.pytest_cache \ No newline at end of file +/.pytest_cache +__pycache__/ diff --git a/.jules/bolt.md b/.jules/bolt.md index 8780446..24dc78f 100644 --- a/.jules/bolt.md +++ b/.jules/bolt.md @@ -1,3 +1,7 @@ ## 2024-05-23 - [Regex Pre-compilation in Loops] **Learning:** Pre-compiling regular expressions (`re.compile`) at the module level provides a significant performance boost (measured ~1.8x speedup) when the regex is used inside a tight loop or a pandas `apply` function, compared to compiling it repeatedly or implicitly inside the loop. Vectorized string operations in Pandas are usually faster, but in complex logic cases (multiple prioritized regex groups + fallback logic), a simple pre-compiled regex with `apply` can sometimes be cleaner and sufficiently fast, or even faster if the vectorized approach requires multiple passes or expensive intermediate structures. **Action:** Always check for regex usage in loops or `apply` calls. If found, refactor to use module-level pre-compiled patterns. When considering vectorization, benchmark against the optimized loop version, as the overhead of complex vectorization might outweigh the benefits for moderate dataset sizes. + +## 2025-01-29 - [Streaming File Decoding with TextIOWrapper] +**Learning:** `streamlit.UploadedFile` is a binary stream (`io.BytesIO` subclass). Reading it fully with `.read().decode()` to create a string for `io.StringIO` doubles memory usage (bytes + string) and is O(N) in allocation. Wrapping the binary stream directly with `io.TextIOWrapper` allows streaming decoding, which is compatible with `pyteomics` parsers (MGF and mzTab) and significantly reduces memory footprint for large files. 
+**Action:** When handling `streamlit.UploadedFile` (or any binary stream) for text-based parsers, prefer `io.TextIOWrapper(file, encoding='utf-8')` over `io.StringIO(file.read().decode('utf-8'))`. Ensure the downstream parser accepts a file-like object (for MGF, pass `use_index=False` to `mgf.read` if needed). diff --git a/__pycache__/data_loading.cpython-312.pyc b/__pycache__/data_loading.cpython-312.pyc new file mode 100644 index 0000000..4d1e188 Binary files /dev/null and b/__pycache__/data_loading.cpython-312.pyc differ diff --git a/__pycache__/processing.cpython-312.pyc b/__pycache__/processing.cpython-312.pyc new file mode 100644 index 0000000..8a9d28e Binary files /dev/null and b/__pycache__/processing.cpython-312.pyc differ diff --git a/app.py b/app.py index aa7990e..22c63ca 100644 --- a/app.py +++ b/app.py @@ -33,8 +33,9 @@ def run_streamlit_app(): if mgf_file and mztab_file: # Decode uploaded file contents (Streamlit files are bytes by default) # Use StringIO to create file-like objects for pyteomics parsers - spectra = load_mgf(io.StringIO(mgf_file.read().decode('utf-8'))) - psm_df = load_mztab(io.StringIO(mztab_file.read().decode('utf-8'))) + # ⚡ OPTIMIZATION: Use io.TextIOWrapper to stream decoding instead of loading whole file into memory + spectra = load_mgf(io.TextIOWrapper(mgf_file, encoding='utf-8')) + psm_df = load_mztab(io.TextIOWrapper(mztab_file, encoding='utf-8')) # Create mappings between PSMs and spectra mapped = map_psms_to_spectra(spectra, psm_df) diff --git a/tests/__pycache__/__init__.cpython-312.pyc b/tests/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000..245cea7 Binary files /dev/null and b/tests/__pycache__/__init__.cpython-312.pyc differ diff --git a/tests/__pycache__/conftest.cpython-312-pytest-9.0.2.pyc b/tests/__pycache__/conftest.cpython-312-pytest-9.0.2.pyc new file mode 100644 index 0000000..9240e9b Binary files /dev/null and b/tests/__pycache__/conftest.cpython-312-pytest-9.0.2.pyc differ diff --git 
a/tests/__pycache__/test_extract_index_from_spectra_ref.cpython-312-pytest-9.0.2.pyc b/tests/__pycache__/test_extract_index_from_spectra_ref.cpython-312-pytest-9.0.2.pyc new file mode 100644 index 0000000..781adb3 Binary files /dev/null and b/tests/__pycache__/test_extract_index_from_spectra_ref.cpython-312-pytest-9.0.2.pyc differ diff --git a/tests/__pycache__/test_integration.cpython-312-pytest-9.0.2.pyc b/tests/__pycache__/test_integration.cpython-312-pytest-9.0.2.pyc new file mode 100644 index 0000000..7736d91 Binary files /dev/null and b/tests/__pycache__/test_integration.cpython-312-pytest-9.0.2.pyc differ diff --git a/tests/__pycache__/test_load_mgf.cpython-312-pytest-9.0.2.pyc b/tests/__pycache__/test_load_mgf.cpython-312-pytest-9.0.2.pyc new file mode 100644 index 0000000..f0a1cec Binary files /dev/null and b/tests/__pycache__/test_load_mgf.cpython-312-pytest-9.0.2.pyc differ diff --git a/tests/__pycache__/test_load_mztab.cpython-312-pytest-9.0.2.pyc b/tests/__pycache__/test_load_mztab.cpython-312-pytest-9.0.2.pyc new file mode 100644 index 0000000..56c7a4e Binary files /dev/null and b/tests/__pycache__/test_load_mztab.cpython-312-pytest-9.0.2.pyc differ diff --git a/tests/__pycache__/test_map_psms_to_spectra.cpython-312-pytest-9.0.2.pyc b/tests/__pycache__/test_map_psms_to_spectra.cpython-312-pytest-9.0.2.pyc new file mode 100644 index 0000000..9086857 Binary files /dev/null and b/tests/__pycache__/test_map_psms_to_spectra.cpython-312-pytest-9.0.2.pyc differ diff --git a/tests/__pycache__/test_streaming_io.cpython-312-pytest-9.0.2.pyc b/tests/__pycache__/test_streaming_io.cpython-312-pytest-9.0.2.pyc new file mode 100644 index 0000000..1983955 Binary files /dev/null and b/tests/__pycache__/test_streaming_io.cpython-312-pytest-9.0.2.pyc differ diff --git a/tests/test_streaming_io.py b/tests/test_streaming_io.py new file mode 100644 index 0000000..197ab34 --- /dev/null +++ b/tests/test_streaming_io.py @@ -0,0 +1,58 @@ + +import io +import unittest +from 
pyteomics import mgf, mztab +import pandas as pd + +class TestStreamingIO(unittest.TestCase): + def test_mgf_streaming(self): + # Create a mock MGF content as bytes + mgf_content = b"""BEGIN IONS +TITLE=Spectrum 1 +PEPMASS=1000.0 +CHARGE=2+ +100.0 1000.0 +200.0 500.0 +END IONS +""" + # Wrap in BytesIO to simulate Streamlit's UploadedFile + binary_stream = io.BytesIO(mgf_content) + + # Wrap in TextIOWrapper for streaming decoding + text_stream = io.TextIOWrapper(binary_stream, encoding='utf-8') + + # Verify pyteomics can read from it + with mgf.read(text_stream, use_index=False) as reader: + spectra = list(reader) + + self.assertEqual(len(spectra), 1) + self.assertEqual(spectra[0]['params']['title'], 'Spectrum 1') + + # Check that the underlying binary stream is still open after the reader exits + # (text_stream is still referenced here, so the wrapper has not been finalized) + self.assertFalse(binary_stream.closed, "Binary stream should not be closed implicitly if possible, but strict ownership might vary") + + def test_mztab_streaming(self): + # Create a mock mzTab content as bytes + mztab_content = b"""MTD\tmzTab-version\t1.0.0 +MTD\tmzTab-mode\tSummary +MTD\tmzTab-type\tIdentification +PSH\tsequence\tPSM_ID\taccession\tunique\tdatabase\tdatabase_version\tsearch_engine\tsearch_engine_score[1]\tmodifications\tretention_time\tcharge\texp_mass_to_charge\tcalc_mass_to_charge\tspectra_ref\tpre\tpost\tstart\tend\topt_global_cv_MS:1002217_decoy_peptide\topt_global_cv_MS:1000889_peptidoform_sequence\topt_global_spec_evalue +PSM\tPEPTIDE\t1\tACC\t0\tDB\t1.0\tSE\t0.99\tnull\t100.0\t2\t1000.0\t1000.0\tindex=1\t-\t-\t1\t10\t0\tPEPTIDE\t0.01 +""" + # Wrap in BytesIO to simulate Streamlit's UploadedFile + binary_stream = io.BytesIO(mztab_content) + + # Wrap in TextIOWrapper for streaming decoding + text_stream = io.TextIOWrapper(binary_stream, encoding='utf-8') + + # Verify pyteomics can read from it + # pyteomics.mztab.MzTab reads the whole file, but it should accept a file-like object + tab = mztab.MzTab(text_stream) + psm_df = 
pd.DataFrame(tab['psm']) + + self.assertEqual(len(psm_df), 1) + self.assertEqual(psm_df.iloc[0]['sequence'], 'PEPTIDE') + +if __name__ == '__main__': + unittest.main()