diff --git a/.jules/bolt.md b/.jules/bolt.md index 8780446..92dd6d8 100644 --- a/.jules/bolt.md +++ b/.jules/bolt.md @@ -1,3 +1,7 @@ ## 2024-05-23 - [Regex Pre-compilation in Loops] **Learning:** Pre-compiling regular expressions (`re.compile`) at the module level provides a significant performance boost (measured ~1.8x speedup) when the regex is used inside a tight loop or a pandas `apply` function, compared to compiling it repeatedly or implicitly inside the loop. Vectorized string operations in Pandas are usually faster, but in complex logic cases (multiple prioritized regex groups + fallback logic), a simple pre-compiled regex with `apply` can sometimes be cleaner and sufficiently fast, or even faster if the vectorized approach requires multiple passes or expensive intermediate structures. **Action:** Always check for regex usage in loops or `apply` calls. If found, refactor to use module-level pre-compiled patterns. When considering vectorization, benchmark against the optimized loop version, as the overhead of complex vectorization might outweigh the benefits for moderate dataset sizes. + +## 2024-10-30 - [Streamlit File Upload Memory Optimization] +**Learning:** Loading large text files (like MGF/mzTab) in Streamlit using `file.read().decode('utf-8')` creates massive memory spikes because it loads the entire raw bytes AND the decoded string into RAM before parsing. Using `io.TextIOWrapper(file, encoding='utf-8')` wraps the underlying binary stream and allows libraries like `pyteomics` to stream-read the data, reducing the additional memory needed for decoding from O(N) to O(buffer_size) (the uploaded bytes themselves remain in memory either way). +**Action:** Always wrap `streamlit.UploadedFile` (or any `BytesIO` source) with `io.TextIOWrapper` when passing to parsers that accept file-like objects, instead of reading and decoding the whole file first. Note that a `TextIOWrapper` closes the wrapped stream when it is closed or garbage-collected, so call `detach()` on it first if the upload must be read again afterwards. 
diff --git a/app.py b/app.py index aa7990e..21f47b0 100644 --- a/app.py +++ b/app.py @@ -32,9 +32,9 @@ def run_streamlit_app(): # Process files only when both are uploaded if mgf_file and mztab_file: # Decode uploaded file contents (Streamlit files are bytes by default) - # Use StringIO to create file-like objects for pyteomics parsers - spectra = load_mgf(io.StringIO(mgf_file.read().decode('utf-8'))) - psm_df = load_mztab(io.StringIO(mztab_file.read().decode('utf-8'))) + # ⚡ OPTIMIZATION: Wrap each upload in TextIOWrapper so the parser decodes it incrementally instead of reading and decoding the whole file into memory first + spectra = load_mgf(io.TextIOWrapper(mgf_file, encoding='utf-8')) + psm_df = load_mztab(io.TextIOWrapper(mztab_file, encoding='utf-8')) # Create mappings between PSMs and spectra mapped = map_psms_to_spectra(spectra, psm_df) diff --git a/tests/test_streaming_io.py b/tests/test_streaming_io.py new file mode 100644 index 0000000..307d0c4 --- /dev/null +++ b/tests/test_streaming_io.py @@ -0,0 +1,50 @@ + +import io +import unittest +from data_loading import load_mgf, load_mztab + +class TestStreamingIO(unittest.TestCase): + """ + Verify that the data loading functions accept streaming input (TextIOWrapper). + This guards the TextIOWrapper-based loading path that app.py relies on. 
+ """ + def test_load_mgf_with_textiowrapper(self): + mgf_content = b"""BEGIN IONS +TITLE=Spectrum 1 +PEPMASS=1000.0 +100.0 10.0 +200.0 20.0 +END IONS +""" + # Stand-in for a Streamlit UploadedFile: an in-memory BytesIO stream + mgf_file = io.BytesIO(mgf_content) + + # Wrap the binary stream so it is decoded as UTF-8 text on read + text_wrapper = io.TextIOWrapper(mgf_file, encoding='utf-8') + + # load_mgf must parse the wrapped stream into exactly one spectrum with the expected title + spectra = load_mgf(text_wrapper) + self.assertEqual(len(spectra), 1) + self.assertEqual(spectra[0]['title'], 'Spectrum 1') + + def test_load_mztab_with_textiowrapper(self): + mztab_content = b"""MTD\tmzTab-version\t1.0.0 +MTD\tmzTab-mode\tSummary +MTD\tmzTab-type\tQuantification +PSH\tsequence\tPSM_ID\taccession\tunique\tdatabase\tdatabase_version\tsearch_engine\tsearch_engine_score[1]\tmodifications\tretention_time\tcharge\texp_mass_to_charge\tcalc_mass_to_charge\tspectra_ref\tpre\tpost\tstart\tend\topt_global_cv_MS:1002217_decoy_peptide +PSM\tPEPTIDE\tpsm1\tP12345\t0\tDB\t1.0\tMS:1001207\t100.0\t\t100.0\t2\t1000.0\t1000.0\tindex=1\t-\t-\t-\t-\t0 +""" + # Stand-in for a Streamlit UploadedFile: an in-memory BytesIO stream + mztab_file = io.BytesIO(mztab_content) + + # Wrap the binary stream so it is decoded as UTF-8 text on read + text_wrapper = io.TextIOWrapper(mztab_file, encoding='utf-8') + + # load_mztab must parse the wrapped stream into a single, non-empty row whose sequence is PEPTIDE + psm_df = load_mztab(text_wrapper) + self.assertFalse(psm_df.empty) + self.assertEqual(len(psm_df), 1) + self.assertEqual(psm_df.iloc[0]['sequence'], 'PEPTIDE') + +if __name__ == '__main__': + unittest.main()