Skip to content

Commit 522efa0

Browse files
committed
Made block-based parsing an opt-in via a keyword argument, with a
ParsingSetting too
1 parent 6865988 commit 522efa0

File tree

5 files changed

+44
-31
lines changed

5 files changed

+44
-31
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -802,6 +802,7 @@ will return much faster than the first query and we'll be certain the authors ma
802802
| `answer.get_evidence_if_no_contexts` | `True` | Allow lazy evidence gathering. |
803803
| `parsing.chunk_size` | `5000` | Characters per chunk (0 for no chunking). |
804804
| `parsing.page_size_limit` | `1,280,000` | Character limit per page. |
805+
| `parsing.pdfs_use_block_parsing` | `False` | Opt-in flag to use block-based PDF parsing instead of text-based PDF parsing. |
805806
| `parsing.use_doc_details` | `True` | Whether to get metadata details for docs. |
806807
| `parsing.overlap` | `250` | Characters to overlap chunks. |
807808
| `parsing.defer_embedding` | `False` | Whether to defer embedding until summarization. |

paperqa/docs.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -298,6 +298,7 @@ async def aadd( # noqa: PLR0912
298298
chunk_chars=parse_config.chunk_size,
299299
overlap=parse_config.overlap,
300300
page_size_limit=parse_config.page_size_limit,
301+
use_block_parsing=parse_config.pdfs_use_block_parsing,
301302
)
302303
if not texts:
303304
raise ValueError(f"Could not read document {path}. Is it empty?")
@@ -390,6 +391,7 @@ async def aadd( # noqa: PLR0912
390391
chunk_chars=parse_config.chunk_size,
391392
overlap=parse_config.overlap,
392393
page_size_limit=parse_config.page_size_limit,
394+
use_block_parsing=parse_config.pdfs_use_block_parsing,
393395
)
394396
# loose check to see if document was loaded
395397
if (

paperqa/readers.py

Lines changed: 33 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,9 @@
2525

2626

2727
def parse_pdf_to_pages(
28-
path: str | os.PathLike, page_size_limit: int | None = None
28+
path: str | os.PathLike,
29+
page_size_limit: int | None = None,
30+
use_block_parsing: bool = False,
2931
) -> ParsedText:
3032

3133
with pymupdf.open(path) as file:
@@ -42,17 +44,23 @@ def parse_pdf_to_pages(
4244
" file is corrupt."
4345
) from exc
4446

45-
# Extract text blocks from the page
46-
# Note: sort=False is important to preserve the order of text blocks
47-
# as they appear in the PDF
48-
blocks = page.get_text("blocks", sort=False)
47+
if use_block_parsing:
48+
# NOTE: this block-based parsing appears to be better, but until
49+
# fully validated on 1+ benchmarks, it's considered experimental
4950

50-
# Concatenate text blocks into a single string
51-
text = "\n".join(
52-
block[BLOCK_TEXT_INDEX]
53-
for block in blocks
54-
if len(block) > BLOCK_TEXT_INDEX
55-
)
51+
# Extract text blocks from the page
52+
# Note: sort=False is important to preserve the order of text blocks
53+
# as they appear in the PDF
54+
blocks = page.get_text("blocks", sort=False)
55+
56+
# Concatenate text blocks into a single string
57+
text = "\n".join(
58+
block[BLOCK_TEXT_INDEX]
59+
for block in blocks
60+
if len(block) > BLOCK_TEXT_INDEX
61+
)
62+
else:
63+
text = page.get_text("text", sort=True)
5664

5765
if page_size_limit and len(text) > page_size_limit:
5866
raise ImpossibleParsingError(
@@ -281,7 +289,7 @@ async def read_doc(
281289
include_metadata: Literal[True],
282290
chunk_chars: int = ...,
283291
overlap: int = ...,
284-
page_size_limit: int | None = ...,
292+
**parser_kwargs,
285293
) -> ParsedText: ...
286294
@overload
287295
async def read_doc(
@@ -291,7 +299,7 @@ async def read_doc(
291299
include_metadata: Literal[False] = ...,
292300
chunk_chars: int = ...,
293301
overlap: int = ...,
294-
page_size_limit: int | None = ...,
302+
**parser_kwargs,
295303
) -> ParsedText: ...
296304
@overload
297305
async def read_doc(
@@ -301,7 +309,7 @@ async def read_doc(
301309
include_metadata: Literal[True],
302310
chunk_chars: int = ...,
303311
overlap: int = ...,
304-
page_size_limit: int | None = ...,
312+
**parser_kwargs,
305313
) -> tuple[list[Text], ParsedMetadata]: ...
306314
@overload
307315
async def read_doc(
@@ -311,7 +319,7 @@ async def read_doc(
311319
include_metadata: Literal[False] = ...,
312320
chunk_chars: int = ...,
313321
overlap: int = ...,
314-
page_size_limit: int | None = ...,
322+
**parser_kwargs,
315323
) -> list[Text]: ...
316324
@overload
317325
async def read_doc(
@@ -321,7 +329,7 @@ async def read_doc(
321329
include_metadata: Literal[True],
322330
chunk_chars: int = ...,
323331
overlap: int = ...,
324-
page_size_limit: int | None = ...,
332+
**parser_kwargs,
325333
) -> tuple[list[Text], ParsedMetadata]: ...
326334
async def read_doc(
327335
path: str | os.PathLike,
@@ -330,7 +338,7 @@ async def read_doc(
330338
include_metadata: bool = False,
331339
chunk_chars: int = 3000,
332340
overlap: int = 100,
333-
page_size_limit: int | None = None,
341+
**parser_kwargs,
334342
) -> list[Text] | ParsedText | tuple[list[Text], ParsedMetadata]:
335343
"""Parse a document and split into chunks.
336344
@@ -342,32 +350,27 @@ async def read_doc(
342350
include_metadata: return a tuple
343351
chunk_chars: size of chunks
344352
overlap: size of overlap between chunks
345-
page_size_limit: optional limit on the number of characters per page
353+
parser_kwargs: Keyword arguments to pass to the underlying parsing function (use_block_parsing is dropped for non-PDF files).
346354
"""
347355
str_path = str(path)
348356

349357
# start with parsing -- users may want to store this separately
350358
if str_path.endswith(".pdf"):
351359
# TODO: Make parse_pdf_to_pages async
352-
parsed_text = await asyncio.to_thread(
353-
parse_pdf_to_pages, path, page_size_limit=page_size_limit
354-
)
360+
parsed_text = await asyncio.to_thread(parse_pdf_to_pages, path, **parser_kwargs)
355361
elif str_path.endswith(".txt"):
356362
# TODO: Make parse_text async
357-
parsed_text = await asyncio.to_thread(
358-
parse_text, path, page_size_limit=page_size_limit
359-
)
363+
parser_kwargs.pop("use_block_parsing", None) # Not a parse_text kwarg
364+
parsed_text = await asyncio.to_thread(parse_text, path, **parser_kwargs)
360365
elif str_path.endswith(".html"):
366+
parser_kwargs.pop("use_block_parsing", None) # Not a parse_text kwarg
361367
parsed_text = await asyncio.to_thread(
362-
parse_text, path, html=True, page_size_limit=page_size_limit
368+
parse_text, path, html=True, **parser_kwargs
363369
)
364370
else:
371+
parser_kwargs.pop("use_block_parsing", None) # Not a parse_text kwarg
365372
parsed_text = await asyncio.to_thread(
366-
parse_text,
367-
path,
368-
split_lines=True,
369-
use_tiktoken=False,
370-
page_size_limit=page_size_limit,
373+
parse_text, path, split_lines=True, use_tiktoken=False, **parser_kwargs
371374
)
372375

373376
if parsed_text_only:

paperqa/settings.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,13 @@ class ParsingSettings(BaseModel):
165165
" (ignoring chars vs tokens difference)."
166166
),
167167
)
168+
pdfs_use_block_parsing: bool = Field(
169+
default=False,
170+
description=(
171+
"Opt-in flag to use block-based parsing for PDFs instead of"
172+
" text-based parsing, which is known to be better for some PDFs."
173+
),
174+
)
168175
use_doc_details: bool = Field(
169176
default=True, description="Whether to try to get metadata details for a Doc."
170177
)

tests/test_paperqa.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -991,7 +991,7 @@ async def test_pdf_reader_w_no_chunks(stub_data_dir: Path) -> None:
991991

992992
def test_parse_pdf_to_pages(stub_data_dir: Path) -> None:
993993
filepath = stub_data_dir / "pasa.pdf"
994-
parsed_text = parse_pdf_to_pages(filepath)
994+
parsed_text = parse_pdf_to_pages(filepath, use_block_parsing=True)
995995
assert isinstance(parsed_text.content, dict)
996996
assert "1" in parsed_text.content, "Parsed text should contain page 1"
997997
assert (

0 commit comments

Comments
 (0)