diff --git a/Doc/library/mailbox.rst b/Doc/library/mailbox.rst index d74fc8059fd538..d5bb9305168a68 100644 --- a/Doc/library/mailbox.rst +++ b/Doc/library/mailbox.rst @@ -441,7 +441,7 @@ Supported mailbox formats are Maildir, mbox, MH, Babyl, and MMDF. ^^^^^^^^^^^^^ -.. class:: mbox(path, factory=None, create=True) +.. class:: mbox(path, factory=None, create=True, from_matcher=None) A subclass of :class:`Mailbox` for mailboxes in mbox format. Parameter *factory* is a callable object that accepts a file-like message representation (which @@ -453,6 +453,11 @@ Supported mailbox formats are Maildir, mbox, MH, Babyl, and MMDF. The mbox format is the classic format for storing mail on Unix systems. All messages in an mbox mailbox are stored in a single file with the beginning of each message indicated by a line whose first five characters are "From ". + The optional parameter *from_matcher* overrides this default test: it is a callable + that takes a line (a :class:`bytes` object) as its sole argument and returns a true value if that line starts a new message. + The default matcher is ``lambda line: line.startswith(b'From ')``. + A stricter matcher might be: + ``lambda line: re.match(b'From .+ \\d\\d\\d\\d\\r?\\n', line)``. Several variations of the mbox format exist to address perceived shortcomings in the original. In the interest of compatibility, :class:`mbox` implements the diff --git a/Lib/mailbox.py b/Lib/mailbox.py index 70da07ed2e9e8b..7fe4c07b2ea016 100644 --- a/Lib/mailbox.py +++ b/Lib/mailbox.py @@ -844,9 +844,14 @@ class mbox(_mboxMMDF): # _post_message_hooks outputs an empty line between messages. 
_append_newline = True - def __init__(self, path, factory=None, create=True): + def __init__(self, path, factory=None, create=True, from_matcher=None): """Initialize an mbox mailbox.""" self._message_factory = mboxMessage + if from_matcher is None: + # default to original matcher + self._from_matcher = lambda line: line.startswith(b'From ') + else: + self._from_matcher = from_matcher _mboxMMDF.__init__(self, path, factory, create) def _post_message_hook(self, f): @@ -861,7 +866,7 @@ def _generate_toc(self): while True: line_pos = self._file.tell() line = self._file.readline() - if line.startswith(b'From '): + if self._from_matcher(line): if len(stops) < len(starts): if last_was_empty: stops.append(line_pos - len(linesep)) diff --git a/Lib/test/test_mailbox.py b/Lib/test/test_mailbox.py index 07c2764dfd1b2f..07aa5fe975ee1a 100644 --- a/Lib/test/test_mailbox.py +++ b/Lib/test/test_mailbox.py @@ -1164,6 +1164,41 @@ def test_message_separator(self): data = f.read() self.assertEqual(data[-3:], '0\n\n') + # Test reading an mbox file with un-prefixed From in body text + # currently generates 2 messages + def _test_read_mbox(self, matcher=0, count=2): + # create a basic mbox file + self._box.add('From: foo\n\nHello\n') + # Add an un-prefixed From to create a second entry + self._box._file.write(b'From time to time\n') + self._box.close() + # re-read it using the provided matcher + if matcher == 0: # not provided, so omit + self._box = mailbox.mbox(self._path, create=False) + else: + self._box = mailbox.mbox(self._path, create=False, from_matcher=matcher) + # How many messages were found? 
+ self.assertEqual(len(self._box.keys()), count) + + def test_read_mbox_omitted(self): + self._test_read_mbox() + + def test_read_mbox_none(self): + self._test_read_mbox(None) + + def test_read_mbox_default(self): + self._test_read_mbox(lambda line: re.match(b'From ', line)) + + def test_read_mbox_regex1(self): + import re + # stricter matching should only find one message + self._test_read_mbox(lambda line: re.match(b'From .+ \\d\\d\\d\\d\\r?\\n', line), count=1) + + def test_read_mbox_regex2(self): + import re + # invalid, so don't find any messages + self._test_read_mbox(lambda line: re.match(b'From .+ \\d\\d\\d\\r?\\n', line), count=0) + class TestMMDF(_TestMboxMMDF, unittest.TestCase):