Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
.gitignore
/venv
/.pytest_cache
/.pytest_cache
__pycache__/
4 changes: 4 additions & 0 deletions .jules/bolt.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## 2024-05-23 - [Regex Pre-compilation in Loops]
**Learning:** Pre-compiling regular expressions (`re.compile`) at the module level provides a significant performance boost (measured ~1.8x speedup) when the regex is used inside a tight loop or a pandas `apply` function, compared to compiling it repeatedly or implicitly inside the loop. Vectorized string operations in Pandas are usually faster, but in complex logic cases (multiple prioritized regex groups + fallback logic), a simple pre-compiled regex with `apply` can sometimes be cleaner and sufficiently fast, or even faster if the vectorized approach requires multiple passes or expensive intermediate structures.
**Action:** Always check for regex usage in loops or `apply` calls. If found, refactor to use module-level pre-compiled patterns. When considering vectorization, benchmark against the optimized loop version, as the overhead of complex vectorization might outweigh the benefits for moderate dataset sizes.

## 2025-01-29 - [Streaming File Decoding with TextIOWrapper]
**Learning:** `streamlit.UploadedFile` is a binary stream (`io.BytesIO` subclass). Reading it fully with `.read().decode()` to create a string for `io.StringIO` doubles memory usage (bytes + string) and is O(N) in allocation. Wrapping the binary stream directly with `io.TextIOWrapper` allows streaming decoding, which is compatible with `pyteomics` parsers (MGF and mzTab) and significantly reduces memory footprint for large files.
**Action:** When handling `streamlit.UploadedFile` (or any binary stream) for text-based parsers, prefer `io.TextIOWrapper(file, encoding='utf-8')` over `io.StringIO(file.read().decode('utf-8'))`. Ensure the downstream parser accepts a file-like object (and `use_index=False` for MGF if needed).
Binary file added __pycache__/data_loading.cpython-312.pyc
Binary file not shown.
Binary file added __pycache__/processing.cpython-312.pyc
Binary file not shown.
5 changes: 3 additions & 2 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,9 @@ def run_streamlit_app():
if mgf_file and mztab_file:
# Decode uploaded file contents (Streamlit files are bytes by default)
# Use StringIO to create file-like objects for pyteomics parsers
spectra = load_mgf(io.StringIO(mgf_file.read().decode('utf-8')))
psm_df = load_mztab(io.StringIO(mztab_file.read().decode('utf-8')))
# ⚡ OPTIMIZATION: Use io.TextIOWrapper to stream decoding instead of loading whole file into memory
spectra = load_mgf(io.TextIOWrapper(mgf_file, encoding='utf-8'))
psm_df = load_mztab(io.TextIOWrapper(mztab_file, encoding='utf-8'))

# Create mappings between PSMs and spectra
mapped = map_psms_to_spectra(spectra, psm_df)
Expand Down
Binary file added tests/__pycache__/__init__.cpython-312.pyc
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
58 changes: 58 additions & 0 deletions tests/test_streaming_io.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@

import io
import unittest
from pyteomics import mgf, mztab
import pandas as pd

class TestStreamingIO(unittest.TestCase):
    """Verify pyteomics parsers accept io.TextIOWrapper-wrapped binary streams.

    Simulates a streamlit.UploadedFile (a binary stream) with io.BytesIO and
    confirms that wrapping it in io.TextIOWrapper enables streaming decoding
    for both the MGF and mzTab parsers, as used in app.py.
    """

    def test_mgf_streaming(self):
        """mgf.read parses spectra from a TextIOWrapper over a binary stream."""
        # Create a mock MGF content as bytes
        mgf_content = b"""BEGIN IONS
TITLE=Spectrum 1
PEPMASS=1000.0
CHARGE=2+
100.0 1000.0
200.0 500.0
END IONS
"""
        # Wrap in BytesIO to simulate Streamlit's UploadedFile
        binary_stream = io.BytesIO(mgf_content)

        # Wrap in TextIOWrapper for streaming decoding
        text_stream = io.TextIOWrapper(binary_stream, encoding='utf-8')

        # Verify pyteomics can read from it (use_index=False since the
        # stream is not seekable in the way an indexed reader requires)
        with mgf.read(text_stream, use_index=False) as reader:
            spectra = list(reader)

        self.assertEqual(len(spectra), 1)
        self.assertEqual(spectra[0]['params']['title'], 'Spectrum 1')
        # NOTE: we deliberately do NOT assert on binary_stream.closed here.
        # Whether the underlying buffer is closed after the `with` block
        # depends on pyteomics' stream-ownership policy, and closing (or
        # garbage-collecting) a TextIOWrapper closes its buffer, so any
        # assertion on the closed state would be flaky.

    def test_mztab_streaming(self):
        """mztab.MzTab parses PSMs from a TextIOWrapper over a binary stream."""
        # Create a mock mzTab content as bytes
        mztab_content = b"""MTD\tmzTab-version\t1.0.0
MTD\tmzTab-mode\tSummary
MTD\tmzTab-type\tIdentification
PSH\tsequence\tPSM_ID\taccession\tunique\tdatabase\tdatabase_version\tsearch_engine\tsearch_engine_score[1]\tmodifications\tretention_time\tcharge\texp_mass_to_charge\tcalc_mass_to_charge\tspectra_ref\tpre\tpost\tstart\tend\topt_global_cv_MS:1002217_decoy_peptide\topt_global_cv_MS:1000889_peptidoform_sequence\topt_global_spec_evalue
PSM\tPEPTIDE\t1\tACC\t0\tDB\t1.0\tSE\t0.99\tnull\t100.0\t2\t1000.0\t1000.0\tindex=1\t-\t-\t1\t10\t0\tPEPTIDE\t0.01
"""
        # Wrap in BytesIO to simulate Streamlit's UploadedFile
        binary_stream = io.BytesIO(mztab_content)

        # Wrap in TextIOWrapper for streaming decoding
        text_stream = io.TextIOWrapper(binary_stream, encoding='utf-8')

        # Verify pyteomics can read from it
        # pyteomics.mztab.MzTab reads the whole file, but it should accept a file-like object
        tab = mztab.MzTab(text_stream)
        psm_df = pd.DataFrame(tab['psm'])

        self.assertEqual(len(psm_df), 1)
        self.assertEqual(psm_df.iloc[0]['sequence'], 'PEPTIDE')

# Allow running this test module directly (e.g. `python tests/test_streaming_io.py`)
# in addition to discovery via `pytest` / `unittest discover`.
if __name__ == '__main__':
    unittest.main()