Skip to content

Commit afabff1

Browse files
authored
Ignore UTF-8 BOM sequences in various scenarios (#69)
- Adapt the .open() helper to use encoding=utf-8-sig (for reading only).
- Adapt the Reader() to ignore a (single) UTF-8 BOM sequence at the start of a line; this handles concatenated files, non-file input, etc.
- Add exhaustive tests.

Note that this deliberately does not strip multiple concatenated BOM sequences, since that is indicative of malformed input. Fixes #68.
1 parent 6231269 commit afabff1

File tree

2 files changed

+75
-2
lines changed

2 files changed

+75
-2
lines changed

jsonlines/jsonlines.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
"""
44

55
import builtins
6+
import codecs
67
import enum
78
import io
89
import json
@@ -42,6 +43,13 @@
4243
str,
4344
}
4445

46+
# Characters to skip at the beginning of a line. Note: at most one such
47+
# character is skipped per line.
48+
SKIPPABLE_SINGLE_INITIAL_CHARS = (
49+
"\x1e", # RFC7464 text sequence
50+
codecs.BOM_UTF8.decode(),
51+
)
52+
4553

4654
class DumpsResultConversion(enum.Enum):
4755
LeaveAsIs = enum.auto()
@@ -293,7 +301,7 @@ def read(
293301
)
294302
raise exc from orig_exc
295303

296-
if line.startswith("\x1e"): # RFC7464 text sequence
304+
if line.startswith(SKIPPABLE_SINGLE_INITIAL_CHARS):
297305
line = line[1:]
298306

299307
try:
@@ -611,7 +619,8 @@ def open(
611619
raise ValueError("'mode' must be either 'r', 'w', or 'a'")
612620

613621
cls = Reader if mode == "r" else Writer
614-
fp = builtins.open(file, mode=mode + "t", encoding="utf-8")
622+
encoding = "utf-8-sig" if mode == "r" else "utf-8"
623+
fp = builtins.open(file, mode=mode + "t", encoding=encoding)
615624
kwargs = dict(
616625
loads=loads,
617626
dumps=dumps,

tests/test_jsonlines.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,10 @@
22
Tests for the jsonlines library.
33
"""
44

5+
import codecs
56
import collections
67
import io
8+
import json
79
import tempfile
810

911
import jsonlines
@@ -38,6 +40,48 @@ def test_reader_rfc7464_text_sequences() -> None:
3840
assert list(reader) == ["a", "b"]
3941

4042

43+
def test_reader_utf8_bom_bytes() -> None:
44+
"""
45+
UTF-8 BOM is ignored, even if it occurs in the middle of a stream.
46+
"""
47+
chunks = [
48+
codecs.BOM_UTF8,
49+
b"1\n",
50+
codecs.BOM_UTF8,
51+
b"2\n",
52+
]
53+
fp = io.BytesIO(b"".join(chunks))
54+
with jsonlines.Reader(fp) as reader:
55+
assert list(reader) == [1, 2]
56+
57+
58+
def test_reader_utf8_bom_text() -> None:
59+
"""
60+
Text version of ``test_reader_utf8_bom_bytes()``.
61+
"""
62+
chunks = [
63+
"1\n",
64+
codecs.BOM_UTF8.decode(),
65+
"2\n",
66+
]
67+
fp = io.StringIO("".join(chunks))
68+
with jsonlines.Reader(fp) as reader:
69+
assert list(reader) == [1, 2]
70+
71+
72+
def test_reader_utf8_bom_bom_bom() -> None:
73+
"""
74+
Too many UTF-8 BOM BOM BOM chars cause BOOM 💥 BOOM.
75+
"""
76+
reader = jsonlines.Reader([codecs.BOM_UTF8.decode() * 3 + "1\n"])
77+
with pytest.raises(jsonlines.InvalidLineError) as excinfo:
78+
reader.read()
79+
80+
exc = excinfo.value
81+
assert "invalid json" in str(exc)
82+
assert isinstance(exc.__cause__, json.JSONDecodeError)
83+
84+
4185
def test_writer_text() -> None:
4286
fp = io.StringIO()
4387
with jsonlines.Writer(fp) as writer:
@@ -78,6 +122,7 @@ def test_invalid_lines() -> None:
78122
exc = excinfo.value
79123
assert "invalid json" in str(exc)
80124
assert exc.line == data
125+
assert isinstance(exc.__cause__, json.JSONDecodeError)
81126

82127

83128
def test_skip_invalid() -> None:
@@ -203,6 +248,18 @@ def test_open_reading() -> None:
203248
assert list(reader) == [123]
204249

205250

251+
def test_open_reading_with_utf8_bom() -> None:
252+
"""
253+
The ``.open()`` helper ignores a UTF-8 BOM.
254+
"""
255+
with tempfile.NamedTemporaryFile("wb") as fp:
256+
fp.write(codecs.BOM_UTF8)
257+
fp.write(b"123\n")
258+
fp.flush()
259+
with jsonlines.open(fp.name) as reader:
260+
assert list(reader) == [123]
261+
262+
206263
def test_open_writing() -> None:
207264
with tempfile.NamedTemporaryFile("w+b") as fp:
208265
with jsonlines.open(fp.name, mode="w") as writer:
@@ -224,3 +281,10 @@ def test_open_invalid_mode() -> None:
224281
with pytest.raises(ValueError) as excinfo:
225282
jsonlines.open("foo", mode="foo")
226283
assert "mode" in str(excinfo.value)
284+
285+
286+
def test_single_char_stripping() -> None:
287+
"""
288+
Sanity check that a helper constant actually contains single-char strings.
289+
"""
290+
assert all(len(s) == 1 for s in jsonlines.jsonlines.SKIPPABLE_SINGLE_INITIAL_CHARS)

0 commit comments

Comments
 (0)