Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

gh-93376: Allow override of mbox From matching #95774

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 14 additions & 1 deletion Doc/library/mailbox.rst
Original file line number Diff line number Diff line change
Expand Up @@ -441,7 +441,7 @@ Supported mailbox formats are Maildir, mbox, MH, Babyl, and MMDF.
^^^^^^^^^^^^^


.. class:: mbox(path, factory=None, create=True)
.. class:: mbox(path, factory=None, create=True, from_matcher=None)

A subclass of :class:`Mailbox` for mailboxes in mbox format. Parameter *factory*
is a callable object that accepts a file-like message representation (which
Expand All @@ -453,6 +453,19 @@ Supported mailbox formats are Maildir, mbox, MH, Babyl, and MMDF.
The mbox format is the classic format for storing mail on Unix systems. All
messages in an mbox mailbox are stored in a single file with the beginning of
each message indicated by a line whose first five characters are "From ".
The parameter *from_matcher* can be used to override this default.
The value must either be the string ``'full'``
or a boolean function which takes a single parameter (the line to be matched).
The default matcher is ``lambda line: line.startswith(b'From ')``.
One alternate matcher is included:
- ``'full'``: this matches the syntax ``From <sender> <asctime>[ info]``
The ``asctime`` field must match the standard syntax, i.e. the fixed-length (24-character) string:
``(Mon|...|Sun) (Jan|...|Dec) [ d]d hh:mm:ss yyyy``.
The day-of-month field can have a leading space instead of a leading ``0``.
The month and day-of-week fields are always in English.
A boolean function might be useful in some cases where the body text contains
un-quoted "From " lines. In such cases, it might help to check that the year (and month)
are the expected values for the mbox. Any other "From " lines are likely to be un-quoted body text.

Several variations of the mbox format exist to address perceived shortcomings in
the original. In the interest of compatibility, :class:`mbox` implements the
Expand Down
20 changes: 18 additions & 2 deletions Lib/mailbox.py
Original file line number Diff line number Diff line change
Expand Up @@ -838,15 +838,31 @@ def _install_message(self, message):
class mbox(_mboxMMDF):
"""A classic mbox mailbox."""

# Pattern for the full separator syntax: From <sender> <asctime>[ <moreinfo>]
# <asctime> is the fixed-width 24-byte form, e.g. b'Sun Aug  7 11:40:37 2022';
# the weekday and month names are always the English abbreviations.
DAY_RE = b' (?:Mon|Tue|Wed|Thu|Fri|Sat|Sun)'
MON_RE = b' (?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)'
# Day of month, time and year.  The first day character is either a space
# (padding for a single-digit day) or a digit 0-3, so that the 10th-31st
# are accepted as well as ' 7' and '07'.
DTY_RE = b' [ 0-3]\\d \\d\\d:\\d\\d:\\d\\d \\d{4}'
# The optional moreinfo group must begin with a space, so a line whose year
# is directly followed by other characters (e.g. b'20220') does not match;
# otherwise the line has to end immediately after the date.
FULL_RE = b'From \\S+' + DAY_RE + MON_RE + DTY_RE + b'( .+)?' + linesep + b'\\Z'

_mangle_from_ = True

# All messages must end in a newline character, and
# _post_message_hooks outputs an empty line between messages.
_append_newline = True

def __init__(self, path, factory=None, create=True):
def __init__(self, path, factory=None, create=True, from_matcher=None):
    """Initialize an mbox mailbox.

    *from_matcher* selects how a line is recognised as a message
    separator when the table of contents is generated:

    - None (default): the line starts with b'From '.
    - 'full': the line matches FULL_RE, i.e.
      ``From <sender> <asctime>[ <moreinfo>]``.
    - any callable: called with the line (bytes); a true result marks
      the line as a separator.

    Raises TypeError if *from_matcher* is none of the above, so that a
    mistyped value is reported here rather than as an obscure
    "object is not callable" error on first use.
    """
    self._message_factory = mboxMessage
    if from_matcher is None:
        # Original behaviour: any line beginning with "From ".
        self._from_matcher = lambda line: line.startswith(b'From ')
    elif from_matcher == 'full':
        import re
        # Compile once and keep the bound method as the matcher.
        self._from_matcher = re.compile(self.FULL_RE).match
    elif callable(from_matcher):
        self._from_matcher = from_matcher
    else:
        raise TypeError(
            "from_matcher must be None, 'full' or a callable, not "
            f"{type(from_matcher).__name__}")
    _mboxMMDF.__init__(self, path, factory, create)

def _post_message_hook(self, f):
Expand All @@ -861,7 +877,7 @@ def _generate_toc(self):
while True:
line_pos = self._file.tell()
line = self._file.readline()
if line.startswith(b'From '):
if self._from_matcher(line):
if len(stops) < len(starts):
if last_was_empty:
stops.append(line_pos - len(linesep))
Expand Down
8 changes: 8 additions & 0 deletions Lib/test/mailbox_data/mailbox_01.mbox
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
From MAILER-DAEMON Sun Aug 7 11:40:37 2022 extra info
From: foo
Subject: unquoted From in body; extra info on From line

Hello

From time to time

8 changes: 8 additions & 0 deletions Lib/test/mailbox_data/mailbox_02.mbox
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
From MAILER-DAEMON Sun Aug 7 11:40:37 20220 extra info
From: foo
Subject: unquoted From in body; invalid extra info on From line

Hello

From time to time

67 changes: 67 additions & 0 deletions Lib/test/test_mailbox.py
Original file line number Diff line number Diff line change
Expand Up @@ -1164,6 +1164,73 @@ def test_message_separator(self):
data = f.read()
self.assertEqual(data[-3:], '0\n\n')

# Helper: read back an mbox whose body text contains an un-prefixed
# "From " line.  With the default matching this yields 2 messages.
def _test_read_mbox(self, matcher=0, count=2):
    """Write a small mbox, re-open it with *matcher*, expect *count* messages.

    *matcher* == 0 means "do not pass from_matcher at all"; any other
    value (including None) is forwarded as the from_matcher argument.
    """
    # One real message ...
    self._box.add('From: foo\n\nHello\n')
    # ... followed by a raw, un-prefixed "From " line in the file.
    self._box._file.write(b'From time to time\n')
    self._box.close()
    # Re-open, forwarding the matcher only when one was supplied.
    extra = {} if matcher == 0 else {'from_matcher': matcher}
    self._box = mailbox.mbox(self._path, create=False, **extra)
    found = self._box.keys()
    self.assertEqual(len(found), count)

def test_read_mbox_omitted(self):
    """from_matcher not passed at all: two messages are expected."""
    self._test_read_mbox(count=2)

def test_read_mbox_none(self):
    """from_matcher=None passed explicitly: two messages are expected."""
    self._test_read_mbox(matcher=None, count=2)

def test_read_mbox_default(self):
    """A callable equivalent to the default matcher finds two messages."""
    # Imported locally, as the other regex-based tests here do.
    import re
    self._test_read_mbox(lambda line: re.match(b'From ', line))

def test_read_mbox_full1(self):
    """The built-in 'full' matcher: one message is expected."""
    self._test_read_mbox(matcher='full', count=1)

def test_read_mbox_regex1(self):
    """A stricter user-supplied regex matcher should find only one message."""
    import re
    # Line must end in a space and exactly four digits before the newline.
    strict = re.compile(b'From .+ \\d\\d\\d\\d\\r?\\n')
    self._test_read_mbox(strict.match, count=1)

def test_read_mbox_regex2(self):
    """A regex matcher that fits no separator line finds no messages."""
    import re
    # Requires a three-digit final field, which is invalid for these lines.
    invalid = re.compile(b'From .+ \\d\\d\\d\\r?\\n')
    self._test_read_mbox(invalid.match, count=0)

class TestMboxFromFile(unittest.TestCase):
    """Read the sample mailboxes stored under mailbox_data/.

    This class has its own minimal setUp/tearDown because the default
    ones are not wanted here: the data files must be left in place.
    """

    def setUp(self):
        self._path = None
        self._box = None

    def tearDown(self):
        # Close the mailbox if a test opened one; the data file itself
        # is deliberately not deleted.
        if self._box is None:
            return
        self._box.close()

    def checkmbox(self, name, matcher, count):
        """Open mailbox_data/<name> with *matcher*; expect *count* messages."""
        here = os.path.dirname(__file__)
        self._path = os.path.join(here, 'mailbox_data', name)
        self._box = mailbox.mbox(self._path, create=False, from_matcher=matcher)
        found = self._box.keys()
        self.assertEqual(len(found), count)

    # The default matcher finds two messages, as there are 2 "From " lines.
    def test_read_mbox_None_01(self):
        self.checkmbox('mailbox_01.mbox', None, 2)

    def test_read_mbox_None_02(self):
        self.checkmbox('mailbox_02.mbox', None, 2)

    def test_read_mbox_full_01(self):
        self.checkmbox('mailbox_01.mbox', 'full', 1)

    def test_read_mbox_full_02(self):
        # The From line has extra non-space chars after YYYY, so no match.
        self.checkmbox('mailbox_02.mbox', 'full', 0)

class TestMMDF(_TestMboxMMDF, unittest.TestCase):

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Added a *from_matcher* parameter to the :class:`mailbox.mbox` parser.
This allows the user to override
the default matcher (which looks for "From " only) with a more specific
matcher that is less likely to match against un-quoted "From " lines in body
text.