Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .jules/bolt.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## 2024-05-23 - [Regex Pre-compilation in Loops]
**Learning:** Pre-compiling regular expressions (`re.compile`) at the module level provides a significant performance boost (measured ~1.8x speedup) when the regex is used inside a tight loop or a pandas `apply` function, compared to compiling it repeatedly or implicitly inside the loop. Vectorized string operations in Pandas are usually faster, but in complex logic cases (multiple prioritized regex groups + fallback logic), a simple pre-compiled regex with `apply` can sometimes be cleaner and sufficiently fast, or even faster if the vectorized approach requires multiple passes or expensive intermediate structures.
**Action:** Always check for regex usage in loops or `apply` calls. If found, refactor to use module-level pre-compiled patterns. When considering vectorization, benchmark against the optimized loop version, as the overhead of complex vectorization might outweigh the benefits for moderate dataset sizes.

## 2024-10-30 - [Streamlit File Upload Memory Optimization]
**Learning:** Loading large text files (like MGF/mzTab) in Streamlit using `file.read().decode('utf-8')` creates massive memory spikes because it loads the entire raw bytes AND the fully decoded string into RAM before parsing begins. Using `io.TextIOWrapper(file, encoding='utf-8')` wraps the underlying binary stream and allows libraries like `pyteomics` to stream-read the data, reducing the *additional* memory needed for decoding from O(N) to O(buffer_size). The uploaded bytes themselves are still held in memory by the upload object, so the saving is the extra decoded copy, not the upload itself.
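**Action:** Always wrap `streamlit.UploadedFile` (or any `BytesIO` source) with `io.TextIOWrapper` when passing to parsers that accept file-like objects, instead of reading and decoding the whole file first. Caveat: closing the wrapper (or letting it be garbage-collected) also closes the underlying binary stream, so call `wrapper.detach()` once parsing is done if the source must remain readable afterwards.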
6 changes: 3 additions & 3 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,9 @@ def run_streamlit_app():
# Process files only when both are uploaded
if mgf_file and mztab_file:
# Decode uploaded file contents (Streamlit files are bytes by default)
# Use StringIO to create file-like objects for pyteomics parsers
spectra = load_mgf(io.StringIO(mgf_file.read().decode('utf-8')))
psm_df = load_mztab(io.StringIO(mztab_file.read().decode('utf-8')))
# ⚡ OPTIMIZATION: Use TextIOWrapper to stream file content instead of loading fully into memory
spectra = load_mgf(io.TextIOWrapper(mgf_file, encoding='utf-8'))
psm_df = load_mztab(io.TextIOWrapper(mztab_file, encoding='utf-8'))

# Create mappings between PSMs and spectra
mapped = map_psms_to_spectra(spectra, psm_df)
Expand Down
50 changes: 50 additions & 0 deletions tests/test_streaming_io.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@

import io
import unittest
from data_loading import load_mgf, load_mztab

class TestStreamingIO(unittest.TestCase):
    """
    Check that the data loading functions accept a streaming text handle.

    Both loaders are fed an ``io.TextIOWrapper`` layered over in-memory bytes,
    which is the same kind of object app.py passes to them for uploaded files.
    """

    @staticmethod
    def _as_text_stream(payload):
        """Return a UTF-8 ``TextIOWrapper`` over a ``BytesIO`` holding *payload*."""
        # BytesIO stands in for a Streamlit UploadedFile
        return io.TextIOWrapper(io.BytesIO(payload), encoding='utf-8')

    def test_load_mgf_with_textiowrapper(self):
        # Minimal MGF document containing a single two-peak spectrum
        raw_mgf = b"""BEGIN IONS
TITLE=Spectrum 1
PEPMASS=1000.0
100.0 10.0
200.0 20.0
END IONS
"""
        # Keep the wrapper referenced for the whole test so it stays open
        stream = self._as_text_stream(raw_mgf)

        parsed = load_mgf(stream)

        self.assertEqual(len(parsed), 1)
        self.assertEqual(parsed[0]['title'], 'Spectrum 1')

    def test_load_mztab_with_textiowrapper(self):
        # Minimal mzTab document: three metadata rows, the PSM header, one PSM row
        raw_mztab = b"""MTD\tmzTab-version\t1.0.0
MTD\tmzTab-mode\tSummary
MTD\tmzTab-type\tQuantification
PSH\tsequence\tPSM_ID\taccession\tunique\tdatabase\tdatabase_version\tsearch_engine\tsearch_engine_score[1]\tmodifications\tretention_time\tcharge\texp_mass_to_charge\tcalc_mass_to_charge\tspectra_ref\tpre\tpost\tstart\tend\topt_global_cv_MS:1002217_decoy_peptide
PSM\tPEPTIDE\tpsm1\tP12345\t0\tDB\t1.0\tMS:1001207\t100.0\t\t100.0\t2\t1000.0\t1000.0\tindex=1\t-\t-\t-\t-\t0
"""
        # Keep the wrapper referenced for the whole test so it stays open
        stream = self._as_text_stream(raw_mztab)

        table = load_mztab(stream)

        self.assertFalse(table.empty)
        self.assertEqual(len(table), 1)
        self.assertEqual(table.iloc[0]['sequence'], 'PEPTIDE')

# Allow this test module to be executed directly as a script.
if __name__ == "__main__":
    unittest.main()