Skip to content

Commit 7f814c7

Browse files
committed
gh-93376: Allow override of mbox From matching
This fixes python#93376
1 parent 978e37b commit 7f814c7

File tree

6 files changed

+119
-3
lines changed

6 files changed

+119
-3
lines changed

Doc/library/mailbox.rst

+17-1
Original file line numberDiff line numberDiff line change
@@ -562,7 +562,7 @@ Supported mailbox formats are Maildir, mbox, MH, Babyl, and MMDF.
562562
^^^^^^^^^^^^^^^^^^^^^^
563563

564564

565-
.. class:: mbox(path, factory=None, create=True)
565+
.. class:: mbox(path, factory=None, create=True, from_matcher=None)
566566

567567
A subclass of :class:`Mailbox` for mailboxes in mbox format. Parameter *factory*
568568
is a callable object that accepts a file-like message representation (which
@@ -575,6 +575,22 @@ Supported mailbox formats are Maildir, mbox, MH, Babyl, and MMDF.
575575
messages in an mbox mailbox are stored in a single file with the beginning of
576576
each message indicated by a line whose first five characters are "From ".
577577

578+
The parameter *from_matcher* can be used to override this default, by providing
579+
a function that takes the line (a :class:`bytes` object) as its sole argument and returns a true value if the line begins a new message.
580+
The default matcher is ``lambda line: line.startswith(b'From ')``.
581+
A stricter matcher might be:
582+
``lambda line: re.match(b'From .+ \\d\\d\\d\\d\\r?\\n', line)``.
583+
584+
One alternative matcher is built in and is selected by passing its name as a string:
585+
- ``'full'``: this matches the syntax ``From <sender> <asctime>[ info]``
586+
The ``asctime`` field must match the standard syntax, i.e. the fixed-length (24-character) string:
587+
``(Mon|...|Sun) (Jan|...|Dec) [ |d]d hh:mm:ss yyyy``.
588+
The day-of-month field may have a leading space instead of a leading ``0``.
589+
The month and day-of-week fields are always in English.
590+
A custom matcher function can be useful when the body text contains
591+
un-quoted "From " lines. In such cases, it might help to check that the year (and month)
592+
are the expected values for the mbox. Any other "From " lines are likely to be un-quoted body text.
593+
578594
Several variations of the mbox format exist to address perceived shortcomings in
579595
the original. In the interest of compatibility, :class:`!mbox` implements the
580596
original format, which is sometimes referred to as :dfn:`mboxo`. This means that

Lib/mailbox.py

+18-2
Original file line numberDiff line numberDiff line change
@@ -895,15 +895,31 @@ def _install_message(self, message):
895895
class mbox(_mboxMMDF):
896896
"""A classic mbox mailbox."""
897897

898+
# This is the full syntax, i.e. From sender asctime[ moreinfo]
899+
DAY_RE = b' (?:Mon|Tue|Wed|Thu|Fri|Sat|Sun)'
900+
MON_RE = b' (?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)'
901+
DTY_RE = b' [ 0]\\d \\d\\d:\\d\\d:\\d\\d \\d{4}' # day, time, year
902+
FULL_RE = b'From \\S+' + DAY_RE + MON_RE + DTY_RE + b'( .+)?' + linesep + b'\\Z'
903+
# we capture the optional moreinfo group so we can check for lines that end in the date
904+
898905
_mangle_from_ = True
899906

900907
# All messages must end in a newline character, and
901908
# _post_message_hooks outputs an empty line between messages.
902909
_append_newline = True
903910

904-
def __init__(self, path, factory=None, create=True):
911+
def __init__(self, path, factory=None, create=True, from_matcher=None):
905912
"""Initialize an mbox mailbox."""
906913
self._message_factory = mboxMessage
914+
if from_matcher is None:
915+
# default to original matcher
916+
self._from_matcher = lambda line: line.startswith(b'From ')
917+
elif from_matcher == 'full': # From sender date[ moreinfo]
918+
import re
919+
regex = re.compile(self.FULL_RE) # compile once
920+
self._from_matcher = lambda line: re.match(regex, line)
921+
else: # assume it is a boolean function with one parameter
922+
self._from_matcher = from_matcher
907923
_mboxMMDF.__init__(self, path, factory, create)
908924

909925
def _post_message_hook(self, f):
@@ -918,7 +934,7 @@ def _generate_toc(self):
918934
while True:
919935
line_pos = self._file.tell()
920936
line = self._file.readline()
921-
if line.startswith(b'From '):
937+
if self._from_matcher(line):
922938
if len(stops) < len(starts):
923939
if last_was_empty:
924940
stops.append(line_pos - len(linesep))

Lib/test/mailbox_data/mailbox_01.mbox

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
From MAILER-DAEMON Sun Aug 7 11:40:37 2022 extra info
2+
From: foo
3+
Subject: unquoted From in body; extra info on From line
4+
5+
Hello
6+
7+
From time to time

Lib/test/mailbox_data/mailbox_02.mbox

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
From MAILER-DAEMON Sun Aug 7 11:40:37 20220 extra info
2+
From: foo
3+
Subject: unquoted From in body; invalid extra info on From line
4+
5+
Hello
6+
7+
From time to time

Lib/test/test_mailbox.py

+67
Original file line numberDiff line numberDiff line change
@@ -1310,6 +1310,73 @@ def test_message_separator(self):
13101310
data = f.read()
13111311
self.assertEqual(data[-3:], '0\n\n')
13121312

1313+
# Test reading an mbox file with un-prefixed From in body text
1314+
# currently generates 2 messages
1315+
def _test_read_mbox(self, matcher=0, count=2):
1316+
# create a basic mbox file
1317+
self._box.add('From: foo\n\nHello\n')
1318+
# Add an un-prefixed From to create a second entry
1319+
self._box._file.write(b'From time to time\n')
1320+
self._box.close()
1321+
# re-read it using the provided matcher
1322+
if matcher == 0: # not provided, so omit
1323+
self._box = mailbox.mbox(self._path, create=False)
1324+
else:
1325+
self._box = mailbox.mbox(self._path, create=False, from_matcher=matcher)
1326+
# How many messages were found?
1327+
self.assertEqual(len(self._box.keys()), count)
1328+
1329+
def test_read_mbox_omitted(self):
1330+
self._test_read_mbox()
1331+
1332+
def test_read_mbox_none(self):
1333+
self._test_read_mbox(None)
1334+
1335+
def test_read_mbox_default(self):
1336+
self._test_read_mbox(lambda line: re.match(b'From ', line))
1337+
1338+
def test_read_mbox_full1(self):
1339+
self._test_read_mbox('full', count=1)
1340+
1341+
def test_read_mbox_regex1(self):
1342+
import re
1343+
# stricter matching should only find one message
1344+
self._test_read_mbox(lambda line: re.match(b'From .+ \\d\\d\\d\\d\\r?\\n', line), count=1)
1345+
1346+
def test_read_mbox_regex2(self):
1347+
import re
1348+
# invalid, so don't find any messages
1349+
self._test_read_mbox(lambda line: re.match(b'From .+ \\d\\d\\d\\r?\\n', line), count=0)
1350+
1351+
class TestMboxFromFile(unittest.TestCase):
1352+
# test class without default setUp/tearDown which we don't want
1353+
1354+
def setUp(self):
1355+
self._box = None
1356+
self._path = None
1357+
1358+
def tearDown(self):
1359+
if self._box is not None:
1360+
self._box.close()
1361+
# Don't delete it!
1362+
1363+
def checkmbox(self, name, matcher, count):
1364+
self._path = os.path.join(os.path.dirname(__file__), 'mailbox_data', name)
1365+
self._box = mailbox.mbox(self._path, create=False, from_matcher=matcher)
1366+
self.assertEqual(len(self._box.keys()), count)
1367+
1368+
# default matcher finds two messages as there are 2 From lines
1369+
def test_read_mbox_None_01(self):
1370+
self.checkmbox('mailbox_01.mbox', None, 2)
1371+
1372+
def test_read_mbox_None_02(self):
1373+
self.checkmbox('mailbox_02.mbox', None, 2)
1374+
1375+
def test_read_mbox_full_01(self):
1376+
self.checkmbox('mailbox_01.mbox', 'full', 1)
1377+
1378+
def test_read_mbox_full_02(self):
1379+
self.checkmbox('mailbox_02.mbox', 'full', 0) # From line has extra non-space chars after YYYY
13131380

13141381
class TestMMDF(_TestMboxMMDF, unittest.TestCase):
13151382

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Added the *from_matcher* parameter to :class:`mailbox.mbox`.
2+
This allows the user to override the default matcher (which looks for "From " only) with a
3+
more specific matcher that is less likely to match against un-quoted "From " lines in body text.

0 commit comments

Comments
 (0)