2525
2626
2727def parse_pdf_to_pages (
28- path : str | os .PathLike , page_size_limit : int | None = None
28+ path : str | os .PathLike ,
29+ page_size_limit : int | None = None ,
30+ use_block_parsing : bool = False ,
2931) -> ParsedText :
3032
3133 with pymupdf .open (path ) as file :
@@ -42,17 +44,23 @@ def parse_pdf_to_pages(
4244 " file is corrupt."
4345 ) from exc
4446
45- # Extract text blocks from the page
46- # Note: sort=False is important to preserve the order of text blocks
47- # as they appear in the PDF
48- blocks = page .get_text ("blocks" , sort = False )
47+ if use_block_parsing :
48+ # NOTE: this block-based parsing appears to be better, but until
49+ # fully validated on 1+ benchmarks, it's considered experimental
4950
50- # Concatenate text blocks into a single string
51- text = "\n " .join (
52- block [BLOCK_TEXT_INDEX ]
53- for block in blocks
54- if len (block ) > BLOCK_TEXT_INDEX
55- )
51+ # Extract text blocks from the page
52+ # Note: sort=False is important to preserve the order of text blocks
53+ # as they appear in the PDF
54+ blocks = page .get_text ("blocks" , sort = False )
55+
56+ # Concatenate text blocks into a single string
57+ text = "\n " .join (
58+ block [BLOCK_TEXT_INDEX ]
59+ for block in blocks
60+ if len (block ) > BLOCK_TEXT_INDEX
61+ )
62+ else :
63+ text = page .get_text ("text" , sort = True )
5664
5765 if page_size_limit and len (text ) > page_size_limit :
5866 raise ImpossibleParsingError (
@@ -281,7 +289,7 @@ async def read_doc(
281289 include_metadata : Literal [True ],
282290 chunk_chars : int = ...,
283291 overlap : int = ...,
284- page_size_limit : int | None = ... ,
292+ ** parser_kwargs ,
285293) -> ParsedText : ...
286294@overload
287295async def read_doc (
@@ -291,7 +299,7 @@ async def read_doc(
291299 include_metadata : Literal [False ] = ...,
292300 chunk_chars : int = ...,
293301 overlap : int = ...,
294- page_size_limit : int | None = ... ,
302+ ** parser_kwargs ,
295303) -> ParsedText : ...
296304@overload
297305async def read_doc (
@@ -301,7 +309,7 @@ async def read_doc(
301309 include_metadata : Literal [True ],
302310 chunk_chars : int = ...,
303311 overlap : int = ...,
304- page_size_limit : int | None = ... ,
312+ ** parser_kwargs ,
305313) -> tuple [list [Text ], ParsedMetadata ]: ...
306314@overload
307315async def read_doc (
@@ -311,7 +319,7 @@ async def read_doc(
311319 include_metadata : Literal [False ] = ...,
312320 chunk_chars : int = ...,
313321 overlap : int = ...,
314- page_size_limit : int | None = ... ,
322+ ** parser_kwargs ,
315323) -> list [Text ]: ...
316324@overload
317325async def read_doc (
@@ -321,7 +329,7 @@ async def read_doc(
321329 include_metadata : Literal [True ],
322330 chunk_chars : int = ...,
323331 overlap : int = ...,
324- page_size_limit : int | None = ... ,
332+ ** parser_kwargs ,
325333) -> tuple [list [Text ], ParsedMetadata ]: ...
326334async def read_doc (
327335 path : str | os .PathLike ,
@@ -330,7 +338,7 @@ async def read_doc(
330338 include_metadata : bool = False ,
331339 chunk_chars : int = 3000 ,
332340 overlap : int = 100 ,
333- page_size_limit : int | None = None ,
341+ ** parser_kwargs ,
334342) -> list [Text ] | ParsedText | tuple [list [Text ], ParsedMetadata ]:
335343 """Parse a document and split into chunks.
336344
@@ -342,32 +350,27 @@ async def read_doc(
342350 include_metadata: return a tuple
343351 chunk_chars: size of chunks
344352 overlap: size of overlap between chunks
345- page_size_limit: optional limit on the number of characters per page
353+ parser_kwargs: Keyword arguments to pass to the used parsing function.
346354 """
347355 str_path = str (path )
348356
349357 # start with parsing -- users may want to store this separately
350358 if str_path .endswith (".pdf" ):
351359 # TODO: Make parse_pdf_to_pages async
352- parsed_text = await asyncio .to_thread (
353- parse_pdf_to_pages , path , page_size_limit = page_size_limit
354- )
360+ parsed_text = await asyncio .to_thread (parse_pdf_to_pages , path , ** parser_kwargs )
355361 elif str_path .endswith (".txt" ):
356362 # TODO: Make parse_text async
357- parsed_text = await asyncio .to_thread (
358- parse_text , path , page_size_limit = page_size_limit
359- )
363+ parser_kwargs .pop ("use_block_parsing" , None ) # Not a parse_text kwarg
364+ parsed_text = await asyncio .to_thread (parse_text , path , ** parser_kwargs )
360365 elif str_path .endswith (".html" ):
366+ parser_kwargs .pop ("use_block_parsing" , None ) # Not a parse_text kwarg
361367 parsed_text = await asyncio .to_thread (
362- parse_text , path , html = True , page_size_limit = page_size_limit
368+ parse_text , path , html = True , ** parser_kwargs
363369 )
364370 else :
371+ parser_kwargs .pop ("use_block_parsing" , None ) # Not a parse_text kwarg
365372 parsed_text = await asyncio .to_thread (
366- parse_text ,
367- path ,
368- split_lines = True ,
369- use_tiktoken = False ,
370- page_size_limit = page_size_limit ,
373+ parse_text , path , split_lines = True , use_tiktoken = False , ** parser_kwargs
371374 )
372375
373376 if parsed_text_only :
0 commit comments