Skip to content

Commit 815977d

Browse files
committed
wip: Add support for a maximum line length while reading
Fixes #39.
1 parent afabff1 commit 815977d

File tree

2 files changed

+105
-3
lines changed

2 files changed

+105
-3
lines changed

jsonlines/jsonlines.py

Lines changed: 94 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import typing
1414
from typing import (
1515
Any,
16+
AnyStr,
1617
Callable,
1718
Dict,
1819
Iterable,
@@ -182,16 +183,34 @@ class Reader(ReaderWriterBase):
182183
decoder. If specified, it must be a callable that accepts a
183184
(unicode) string and returns the decoded object.
184185
185-
:param file_or_iterable: file-like object or iterable yielding lines as
186-
strings
186+
The `max_line_length` argument limits the maximum line length. If
187+
specified, this prevents reading and parsing of too large values.
188+
When reading from an input file that has a ``.readline()`` method,
189+
that will be used. For custom iterables, it is not possible to
190+
limit the size of yielded items, but the limit will still prevent
191+
JSON parsing of too large lines. Note that the limit applies per
192+
line, not to the total amount of data.
193+
194+
.. warning::
195+
196+
Use `max_line_length` as a safety measure for untrusted input:
197+
without a limit, (potentially malicious) large input without
198+
newlines will be read into memory in its entirety, and parsed
199+
afterwards. This could quickly exhaust memory and other system
200+
resources.
201+
202+
:param file_or_iterable: file-like object or iterable yielding
203+
lines as strings
187204
:param loads: custom json decoder callable
205+
:param max_line_length: the maximum line length to read/parse
188206
"""
189207

190208
_file_or_iterable: Union[
191209
typing.IO[str], typing.IO[bytes], Iterable[Union[str, bytes]]
192210
]
193211
_line_iter: Iterator[Tuple[int, Union[bytes, str]]] = attr.ib(init=False)
194212
_loads: LoadsCallable = attr.ib(default=default_loads, kw_only=True)
213+
_max_line_length: Optional[int] = attr.ib(default=None, kw_only=True)
195214

196215
def __attrs_post_init__(self) -> None:
197216
if isinstance(self._file_or_iterable, io.IOBase):
@@ -200,7 +219,18 @@ def __attrs_post_init__(self) -> None:
200219
self._file_or_iterable,
201220
)
202221

203-
self._line_iter = enumerate(self._file_or_iterable, 1)
222+
iterable: Iterable[Union[str, bytes]]
223+
if (
224+
self._fp is not None
225+
and hasattr(self._fp, "readline")
226+
and self._max_line_length is not None
227+
):
228+
self._line_iter = ReadlineIterator(
229+
self._fp, # type: ignore[misc]
230+
max_line_length=self._max_line_length,
231+
)
232+
else:
233+
self._line_iter = enumerate(self._file_or_iterable, 1)
204234

205235
# No type specified, None not allowed
206236
@overload
@@ -301,6 +331,10 @@ def read(
301331
)
302332
raise exc from orig_exc
303333

334+
if self._max_line_length is not None and len(line) > self._max_line_length:
335+
# TODO: add tests for this
336+
raise InvalidLineError("line too long", line, lineno)
337+
304338
if line.startswith(SKIPPABLE_SINGLE_INITIAL_CHARS):
305339
line = line[1:]
306340

@@ -643,3 +677,60 @@ def repr_for_fp(fp: typing.IO[Any]) -> str:
643677
return repr(name)
644678
else:
645679
return repr(fp)
680+
681+
682+
@attr.s(auto_attribs=True)
class ReadlineIterator(typing.Iterator[Tuple[int, AnyStr]]):
    """
    Iterator over a file-like object using ``.readline()``, enforcing a length limit.

    This can be used to avoid reading too large values into memory.

    Each call to ``next()`` returns a ``(lineno, line)`` tuple. With the
    default ``lineno`` the first line is numbered 1, which matches
    ``enumerate(fp, 1)``.

    :param fp: file-like object with a ``.readline()`` method
    :param max_line_length: the maximum length of a line, as measured by
        ``len()`` on the value returned by ``.readline()``
    :param at_line_boundary: whether ``fp`` is positioned at the start of
        a line (internal state)
    :param lineno: number of the most recently read line (internal state)
    :raises InvalidLineError: from ``next()`` when a line exceeds the limit
    """

    # TODO: add more tests

    # Note: this iterator is ‘special’ in the sense that it can continue after
    # a call to next() resulted in an exception. Usually this exception will
    # reach the application, which will usually abort reading from the file.
    # However, Reader.iter(skip_invalid=True) continues afterwards: a too long
    # line should not be parsed, but the next line may be fine. This is why the
    # subsequent call to ``next()`` will continue with the next line.
    #
    # This code is implemented as a class instead of a simpler generator
    # function, because the latter cannot do the above.

    fp: typing.IO[AnyStr]
    max_line_length: int
    at_line_boundary: bool = True
    # Starts at 0 because it is incremented *before* a line is returned, so
    # the first line is reported as line 1.
    lineno: int = 0

    def __next__(self) -> Tuple[int, AnyStr]:
        """
        Read the next line.

        If needed, this reads past a previously detected too long line.

        :raises StopIteration: when the input is exhausted
        :raises InvalidLineError: when the line exceeds ``max_line_length``
        """
        # If previously interrupted, read until the next line boundary.
        if not self.at_line_boundary and not self._skip_to_line_boundary():
            raise StopIteration

        # Request one item more than allowed: that is enough to detect a too
        # long line without reading all of it into memory.
        line = self.fp.readline(self.max_line_length + 1)
        if not line:
            raise StopIteration

        self.lineno += 1
        if len(line) > self.max_line_length:
            # Only the remainder of this line must be skipped later. If the
            # chunk already ends with a newline, the complete line has been
            # consumed and the next call must not discard the line after it.
            self.at_line_boundary = self._ends_with_newline(line)
            raise InvalidLineError("line too long", line, self.lineno)

        return self.lineno, line

    def _skip_to_line_boundary(self) -> bool:
        """
        Discard the remainder of a too long line, reading in bounded chunks.

        Returns ``True`` when a line boundary was reached, and ``False`` when
        the input ended first.
        """
        # TODO: make this nicer and simpler, e.g. why not read in
        # chunk of size ‘max_line_length’ here as well.
        buf_size = 16 * 1024
        while True:
            chunk = self.fp.readline(buf_size)
            if not chunk:
                return False
            if self._ends_with_newline(chunk):
                self.at_line_boundary = True
                return True

    @staticmethod
    def _ends_with_newline(data: AnyStr) -> bool:
        """
        Check whether ``data`` (either ``str`` or ``bytes``) ends with a newline.
        """
        return data.endswith("\n" if isinstance(data, str) else b"\n")

tests/test_jsonlines.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,17 @@ def test_skip_invalid() -> None:
133133
assert next(it) == 34
134134

135135

136+
def test_skip_invalid_long_lines() -> None:
137+
"""
138+
A line length limited reader is able to skip over too long lines.
139+
"""
140+
fp = io.StringIO("12\ninvalid\n34")
141+
reader = jsonlines.Reader(fp, max_line_length=3)
142+
it = reader.iter(skip_invalid=True)
143+
assert next(it) == 12
144+
assert next(it) == 34
145+
146+
136147
def test_empty_strings_in_iterable() -> None:
137148
input = ["123", "", "456"]
138149
it = iter(jsonlines.Reader(input))

0 commit comments

Comments
 (0)