From d9d84f53ea143084f48aed5a38419344de1ab0fc Mon Sep 17 00:00:00 2001 From: Sebb Date: Sun, 7 Aug 2022 22:49:46 +0100 Subject: [PATCH 1/2] gh-93376: Allow override of mbox From matching --- Doc/library/mailbox.rst | 15 ++++++- Lib/mailbox.py | 20 ++++++++- Lib/test/mailbox_data/mailbox_01.mbox | 8 ++++ Lib/test/test_mailbox.py | 44 +++++++++++++++++++ ...2-08-07-18-49-49.gh-issue-93376.G7XqQo.rst | 5 +++ 5 files changed, 89 insertions(+), 3 deletions(-) create mode 100644 Lib/test/mailbox_data/mailbox_01.mbox create mode 100644 Misc/NEWS.d/next/Library/2022-08-07-18-49-49.gh-issue-93376.G7XqQo.rst diff --git a/Doc/library/mailbox.rst b/Doc/library/mailbox.rst index d74fc8059fd538..b83801fafca784 100644 --- a/Doc/library/mailbox.rst +++ b/Doc/library/mailbox.rst @@ -441,7 +441,7 @@ Supported mailbox formats are Maildir, mbox, MH, Babyl, and MMDF. ^^^^^^^^^^^^^ -.. class:: mbox(path, factory=None, create=True) +.. class:: mbox(path, factory=None, create=True, from_matcher=None) A subclass of :class:`Mailbox` for mailboxes in mbox format. Parameter *factory* is a callable object that accepts a file-like message representation (which @@ -453,6 +453,19 @@ Supported mailbox formats are Maildir, mbox, MH, Babyl, and MMDF. The mbox format is the classic format for storing mail on Unix systems. All messages in an mbox mailbox are stored in a single file with the beginning of each message indicated by a line whose first five characters are "From ". + The parameter *from_matcher* can be used to override this default. + The value must either be the string ``'full'`` + or a boolean function which takes a single parameter (the line to be matched). + The default matcher is ``lambda line: line.startswith(b'From ')``. + One alternate matcher is included: + - ``'full'``: this matches the syntax ``From <sender> <asctime>[ info]`` + The ``asctime`` field must match the standard syntax, i.e. the fixed length (24 char) string: + ``(Mon|...|Sun) (Jan|...|Dec) [ |d]d hh:mm:ss yyyy``. 
+ The date field can have a leading space instead of a leading ``0``. + [The month and day-of-week fields are always in English] + A boolean function might be useful in some cases where the body text contains + un-quoted "From " lines. In such cases, it might help to check that the year (and month) + are the expected values for the mbox. Any other "From " lines are likely to be un-quoted body text. Several variations of the mbox format exist to address perceived shortcomings in the original. In the interest of compatibility, :class:`mbox` implements the diff --git a/Lib/mailbox.py b/Lib/mailbox.py index 70da07ed2e9e8b..66bccd30d220b1 100644 --- a/Lib/mailbox.py +++ b/Lib/mailbox.py @@ -838,15 +838,31 @@ def _install_message(self, message): class mbox(_mboxMMDF): """A classic mbox mailbox.""" + # This is the full syntax, i.e. From sender asctime[ moreinfo] + DAY_RE = b' (?:Mon|Tue|Wed|Thu|Fri|Sat|Sun)' + MON_RE = b' (?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)' + DTY_RE = b' [ 0]\\d \\d\\d:\\d\\d:\\d\\d \\d{4}' # day, time, year + FULL_RE = b'From \\S+' + DAY_RE + MON_RE + DTY_RE + b'( .+)?' + linesep + b'\\Z' + # we capture the optional moreinfo group so we can check for lines that end in the date + _mangle_from_ = True # All messages must end in a newline character, and # _post_message_hooks outputs an empty line between messages. 
_append_newline = True - def __init__(self, path, factory=None, create=True): + def __init__(self, path, factory=None, create=True, from_matcher=None): """Initialize an mbox mailbox.""" self._message_factory = mboxMessage + if from_matcher is None: + # default to original matcher + self._from_matcher = lambda line: line.startswith(b'From ') + elif from_matcher == 'full': # From sender date[ moreinfo] + import re + regex = re.compile(self.FULL_RE) # compile once + self._from_matcher = lambda line: re.match(regex, line) + else: # assume it is a boolean function with one parameter + self._from_matcher = from_matcher _mboxMMDF.__init__(self, path, factory, create) def _post_message_hook(self, f): @@ -861,7 +877,7 @@ def _generate_toc(self): while True: line_pos = self._file.tell() line = self._file.readline() - if line.startswith(b'From '): + if self._from_matcher(line): if len(stops) < len(starts): if last_was_empty: stops.append(line_pos - len(linesep)) diff --git a/Lib/test/mailbox_data/mailbox_01.mbox b/Lib/test/mailbox_data/mailbox_01.mbox new file mode 100644 index 00000000000000..759dfca013ebe0 --- /dev/null +++ b/Lib/test/mailbox_data/mailbox_01.mbox @@ -0,0 +1,8 @@ +From MAILER-DAEMON Sun Aug 7 11:40:37 2022 extra info +From: foo +Subject: unquoted From in body; extra info on From line + +Hello + +From time to time + diff --git a/Lib/test/test_mailbox.py b/Lib/test/test_mailbox.py index 07c2764dfd1b2f..4bb6202535b784 100644 --- a/Lib/test/test_mailbox.py +++ b/Lib/test/test_mailbox.py @@ -1164,6 +1164,50 @@ def test_message_separator(self): data = f.read() self.assertEqual(data[-3:], '0\n\n') + # Test reading an mbox file with un-prefixed From in body text + # currently generates 2 messages + def _test_read_mbox(self, matcher=0, count=2): + # create a basic mbox file + self._box.add('From: foo\n\nHello\n') + # Add an un-prefixed From to create a second entry + self._box._file.write(b'From time to time\n') + self._box.close() + # re-read it using the provided 
matcher + if matcher == 0: # not provided, so omit + self._box = mailbox.mbox(self._path, create=False) + else: + self._box = mailbox.mbox(self._path, create=False, from_matcher=matcher) + # How many messages were found? + self.assertEqual(len(self._box.keys()), count) + + def test_read_mbox_omitted(self): + self._test_read_mbox() + + def test_read_mbox_none(self): + self._test_read_mbox(None) + + def test_read_mbox_default(self): + self._test_read_mbox(lambda line: re.match(b'From ', line)) + + def test_read_mbox_full1(self): + self._test_read_mbox('full', count=1) + + def test_read_mbox_full2(self): + path = os.path.join(os.path.dirname(__file__), 'mailbox_data', 'mailbox_01.mbox') + box = mailbox.mbox(path, create=False, from_matcher='full') + self.assertEqual(len(box.keys()), 1) + box.close() + + def test_read_mbox_regex1(self): + import re + # stricter matching should only find one message + self._test_read_mbox(lambda line: re.match(b'From .+ \\d\\d\\d\\d\\r?\\n', line), count=1) + + def test_read_mbox_regex2(self): + import re + # invalid, so don't find any messages + self._test_read_mbox(lambda line: re.match(b'From .+ \\d\\d\\d\\r?\\n', line), count=0) + class TestMMDF(_TestMboxMMDF, unittest.TestCase): diff --git a/Misc/NEWS.d/next/Library/2022-08-07-18-49-49.gh-issue-93376.G7XqQo.rst b/Misc/NEWS.d/next/Library/2022-08-07-18-49-49.gh-issue-93376.G7XqQo.rst new file mode 100644 index 00000000000000..adaecf865b93e2 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2022-08-07-18-49-49.gh-issue-93376.G7XqQo.rst @@ -0,0 +1,5 @@ +Added from_matcher parameter to mailbox.mbox parser. +This allows the user to override +the default matcher (which looks for "From " only) with a more specific +matcher that is less likely to match against un-quoted "From " lines in body +text. 
From 34ce7ce5e6f3165b5ec14424f065d661a85783c5 Mon Sep 17 00:00:00 2001 From: Sebb Date: Mon, 8 Aug 2022 12:23:21 +0100 Subject: [PATCH 2/2] A few more tests --- Lib/test/mailbox_data/mailbox_02.mbox | 8 ++++++ Lib/test/test_mailbox.py | 35 ++++++++++++++++++++++----- 2 files changed, 37 insertions(+), 6 deletions(-) create mode 100644 Lib/test/mailbox_data/mailbox_02.mbox diff --git a/Lib/test/mailbox_data/mailbox_02.mbox b/Lib/test/mailbox_data/mailbox_02.mbox new file mode 100644 index 00000000000000..dcafbf63705f24 --- /dev/null +++ b/Lib/test/mailbox_data/mailbox_02.mbox @@ -0,0 +1,8 @@ +From MAILER-DAEMON Sun Aug 7 11:40:37 20220 extra info +From: foo +Subject: unquoted From in body; invalid extra info on From line + +Hello + +From time to time + diff --git a/Lib/test/test_mailbox.py b/Lib/test/test_mailbox.py index 4bb6202535b784..5f307fd6c946bb 100644 --- a/Lib/test/test_mailbox.py +++ b/Lib/test/test_mailbox.py @@ -1192,12 +1192,6 @@ def test_read_mbox_default(self): def test_read_mbox_full1(self): self._test_read_mbox('full', count=1) - def test_read_mbox_full2(self): - path = os.path.join(os.path.dirname(__file__), 'mailbox_data', 'mailbox_01.mbox') - box = mailbox.mbox(path, create=False, from_matcher='full') - self.assertEqual(len(box.keys()), 1) - box.close() - def test_read_mbox_regex1(self): import re # stricter matching should only find one message @@ -1208,6 +1202,35 @@ def test_read_mbox_regex2(self): # invalid, so don't find any messages self._test_read_mbox(lambda line: re.match(b'From .+ \\d\\d\\d\\r?\\n', line), count=0) +class TestMboxFromFile(unittest.TestCase): + # test class without default setUp/tearDown which we don't want + + def setUp(self): + self._box = None + self._path = None + + def tearDown(self): + if self._box is not None: + self._box.close() + # Don't delete it! 
+ + def checkmbox(self, name, matcher, count): + self._path = os.path.join(os.path.dirname(__file__), 'mailbox_data', name) + self._box = mailbox.mbox(self._path, create=False, from_matcher=matcher) + self.assertEqual(len(self._box.keys()), count) + + # default matcher finds two messages as there are 2 From lines + def test_read_mbox_None_01(self): + self.checkmbox('mailbox_01.mbox', None, 2) + + def test_read_mbox_None_02(self): + self.checkmbox('mailbox_02.mbox', None, 2) + + def test_read_mbox_full_01(self): + self.checkmbox('mailbox_01.mbox', 'full', 1) + + def test_read_mbox_full_02(self): + self.checkmbox('mailbox_02.mbox', 'full', 0) # From line has extra non-space chars after YYYY class TestMMDF(_TestMboxMMDF, unittest.TestCase):