13
13
import typing
14
14
from typing import (
15
15
Any ,
16
+ AnyStr ,
16
17
Callable ,
17
18
Dict ,
18
19
Iterable ,
@@ -182,16 +183,34 @@ class Reader(ReaderWriterBase):
182
183
decoder. If specified, it must be a callable that accepts a
183
184
(unicode) string and returns the decoded object.
184
185
185
- :param file_or_iterable: file-like object or iterable yielding lines as
186
- strings
186
+ The `max_line_length` argument limits the maximum line length. If
187
+ specified, this prevents reading and parsing of too large values.
188
+ When reading from an input file that has a ``.readline()`` method,
189
+ that will be used. For custom iterables, it is not possible to
190
+ limit the size of yielded items, but the limit will still prevent
191
+ JSON parsing of too large lines. Note that the limit applies per
192
+ line, not to the total amount of data.
193
+
194
+ .. warning::
195
+
196
+ Use `max_line_length` as a safety measure for untrusted input:
197
+ without a limit, (potentially malicious) large input without
198
+ newlines will be read into memory in its entirety, and parsed
199
+ afterwards. This could quickly exhaust memory and other system
200
+ resources.
201
+
202
+ :param file_or_iterable: file-like object or iterable yielding
203
+ lines as strings
187
204
:param loads: custom json decoder callable
205
+ :param max_line_length: the maximum line length to read/parse
188
206
"""
189
207
190
208
_file_or_iterable : Union [
191
209
typing .IO [str ], typing .IO [bytes ], Iterable [Union [str , bytes ]]
192
210
]
193
211
_line_iter : Iterator [Tuple [int , Union [bytes , str ]]] = attr .ib (init = False )
194
212
_loads : LoadsCallable = attr .ib (default = default_loads , kw_only = True )
213
+ _max_line_length : Optional [int ] = attr .ib (default = None , kw_only = True )
195
214
196
215
def __attrs_post_init__ (self ) -> None :
197
216
if isinstance (self ._file_or_iterable , io .IOBase ):
@@ -200,7 +219,18 @@ def __attrs_post_init__(self) -> None:
200
219
self ._file_or_iterable ,
201
220
)
202
221
203
- self ._line_iter = enumerate (self ._file_or_iterable , 1 )
222
+ iterable : Iterable [Union [str , bytes ]]
223
+ if (
224
+ self ._fp is not None
225
+ and hasattr (self ._fp , "readline" )
226
+ and self ._max_line_length is not None
227
+ ):
228
+ self ._line_iter = ReadlineIterator (
229
+ self ._fp , # type: ignore[misc]
230
+ max_line_length = self ._max_line_length ,
231
+ )
232
+ else :
233
+ self ._line_iter = enumerate (self ._file_or_iterable , 1 )
204
234
205
235
# No type specified, None not allowed
206
236
@overload
@@ -301,6 +331,10 @@ def read(
301
331
)
302
332
raise exc from orig_exc
303
333
334
+ if self ._max_line_length is not None and len (line ) > self ._max_line_length :
335
+ # TODO: add tests for this
336
+ raise InvalidLineError ("line too long" , line , lineno )
337
+
304
338
if line .startswith (SKIPPABLE_SINGLE_INITIAL_CHARS ):
305
339
line = line [1 :]
306
340
@@ -643,3 +677,60 @@ def repr_for_fp(fp: typing.IO[Any]) -> str:
643
677
return repr (name )
644
678
else :
645
679
return repr (fp )
680
+
681
+
682
@attr.s(auto_attribs=True)
class ReadlineIterator(typing.Iterator[Tuple[int, AnyStr]]):
    """
    Iterator over a file-like object using ``.readline()``, enforcing a length limit.

    This can be used to avoid reading too large values into memory: every
    read requests at most ``max_line_length + 1`` characters (or bytes).

    Each successful ``next()`` call returns a ``(lineno, line)`` tuple. With
    the default ``lineno`` the first line is reported as line 1. A line
    longer than ``max_line_length`` (the line terminator is included in the
    count) raises ``InvalidLineError`` instead of being returned.
    """

    # TODO: add more tests

    # Note: this iterator is ‘special’ in the sense that it can continue after
    # a call to next() resulted in an exception. Usually this exception will
    # reach the application, which will usually abort reading from the file.
    # However, Reader.iter(skip_invalid=True) continues afterwards: a too long
    # line should not be parsed, but the next line may be fine. This is why the
    # subsequent call to ``next()`` will continue with the next line.
    #
    # This code is implemented as a class instead of a simpler generator
    # function, because the latter cannot do the above.

    # File-like object to read from; only its ``.readline(size)`` is used.
    fp: typing.IO[AnyStr]
    # Maximum accepted length of a single line.
    max_line_length: int
    # False while the remainder of a too long line is still unread.
    at_line_boundary: bool = True
    # Number of the next line to be read.
    lineno: int = 1

    def __next__(self) -> Tuple[int, AnyStr]:
        """
        Read the next line.

        If needed, this reads past a previously detected too long line.

        :return: a ``(lineno, line)`` tuple
        :raises StopIteration: when the end of the input is reached
        :raises InvalidLineError: when the line exceeds ``max_line_length``
        """
        if not self.at_line_boundary:
            self._skip_to_line_boundary()

        # Ask for one item more than allowed: getting it back proves that
        # the line is too long without reading the rest of it.
        line = self.fp.readline(self.max_line_length + 1)
        if not line:
            raise StopIteration

        # Take the number for this line before advancing the counter, so
        # that numbering starts at the initial value of ``lineno``.
        lineno = self.lineno
        self.lineno += 1

        if len(line) > self.max_line_length:
            # Only the unread remainder (if any) must be skipped later. When
            # the oversized chunk already ends the line, the next line must
            # not be discarded.
            self.at_line_boundary = self._ends_line(line)
            raise InvalidLineError("line too long", line, lineno)

        return lineno, line

    def _skip_to_line_boundary(self) -> None:
        """Discard the unread remainder of a previously rejected line."""
        # TODO: consider reading in chunks of ‘max_line_length’ here as well.
        buf_size = 16 * 1024
        while True:
            chunk = self.fp.readline(buf_size)
            if not chunk:
                # End of input reached in the middle of the rejected line.
                raise StopIteration
            if self._ends_line(chunk):
                self.at_line_boundary = True
                return

    @staticmethod
    def _ends_line(chunk: AnyStr) -> bool:
        """Tell whether ``chunk`` ends with a newline (str or bytes)."""
        return chunk.endswith("\n" if isinstance(chunk, str) else b"\n")
0 commit comments