Made block-based parsing an opt-in via a keyword argument, with a

jamesbraza · jamesbraza · commit 522efa089f0b · 2025-06-13T14:13:25.000-07:00
ParsingSettings field too
diff --git a/README.md b/README.md
@@ -802,6 +802,7 @@ will return much faster than the first query and we'll be certain the authors ma
 | `answer.get_evidence_if_no_contexts`         | `True`                                 | Allow lazy evidence gathering.                                                                          |
 | `parsing.chunk_size`                         | `5000`                                 | Characters per chunk (0 for no chunking).                                                               |
 | `parsing.page_size_limit`                    | `1,280,000`                            | Character limit per page.                                                                               |
+| `parsing.pdfs_use_block_parsing`             | `False`                                | Opt-in flag for block-based PDF parsing over text-based PDF parsing.                                    |
 | `parsing.use_doc_details`                    | `True`                                 | Whether to get metadata details for docs.                                                               |
 | `parsing.overlap`                            | `250`                                  | Characters to overlap chunks.                                                                           |
 | `parsing.defer_embedding`                    | `False`                                | Whether to defer embedding until summarization.                                                         |
diff --git a/paperqa/docs.py b/paperqa/docs.py
@@ -298,6 +298,7 @@ async def aadd(  # noqa: PLR0912
                 chunk_chars=parse_config.chunk_size,
                 overlap=parse_config.overlap,
                 page_size_limit=parse_config.page_size_limit,
+                use_block_parsing=parse_config.pdfs_use_block_parsing,
             )
             if not texts:
                 raise ValueError(f"Could not read document {path}. Is it empty?")
@@ -390,6 +391,7 @@ async def aadd(  # noqa: PLR0912
             chunk_chars=parse_config.chunk_size,
             overlap=parse_config.overlap,
             page_size_limit=parse_config.page_size_limit,
+            use_block_parsing=parse_config.pdfs_use_block_parsing,
         )
         # loose check to see if document was loaded
         if (
diff --git a/paperqa/readers.py b/paperqa/readers.py
@@ -25,7 +25,9 @@
 
 
 def parse_pdf_to_pages(
-    path: str | os.PathLike, page_size_limit: int | None = None
+    path: str | os.PathLike,
+    page_size_limit: int | None = None,
+    use_block_parsing: bool = False,
 ) -> ParsedText:
 
     with pymupdf.open(path) as file:
@@ -42,17 +44,23 @@ def parse_pdf_to_pages(
                     " file is corrupt."
                 ) from exc
 
-            # Extract text blocks from the page
-            # Note: sort=False is important to preserve the order of text blocks
-            # as they appear in the PDF
-            blocks = page.get_text("blocks", sort=False)
+            if use_block_parsing:
+                # NOTE: this block-based parsing appears to be better, but until
+                # fully validated on 1+ benchmarks, it's considered experimental
 
-            # Concatenate text blocks into a single string
-            text = "\n".join(
-                block[BLOCK_TEXT_INDEX]
-                for block in blocks
-                if len(block) > BLOCK_TEXT_INDEX
-            )
+                # Extract text blocks from the page
+                # Note: sort=False is important to preserve the order of text blocks
+                # as they appear in the PDF
+                blocks = page.get_text("blocks", sort=False)
+
+                # Concatenate text blocks into a single string
+                text = "\n".join(
+                    block[BLOCK_TEXT_INDEX]
+                    for block in blocks
+                    if len(block) > BLOCK_TEXT_INDEX
+                )
+            else:
+                text = page.get_text("text", sort=True)
 
             if page_size_limit and len(text) > page_size_limit:
                 raise ImpossibleParsingError(
@@ -281,7 +289,7 @@ async def read_doc(
     include_metadata: Literal[True],
     chunk_chars: int = ...,
     overlap: int = ...,
-    page_size_limit: int | None = ...,
+    **parser_kwargs,
 ) -> ParsedText: ...
 @overload
 async def read_doc(
@@ -291,7 +299,7 @@ async def read_doc(
     include_metadata: Literal[False] = ...,
     chunk_chars: int = ...,
     overlap: int = ...,
-    page_size_limit: int | None = ...,
+    **parser_kwargs,
 ) -> ParsedText: ...
 @overload
 async def read_doc(
@@ -301,7 +309,7 @@ async def read_doc(
     include_metadata: Literal[True],
     chunk_chars: int = ...,
     overlap: int = ...,
-    page_size_limit: int | None = ...,
+    **parser_kwargs,
 ) -> tuple[list[Text], ParsedMetadata]: ...
 @overload
 async def read_doc(
@@ -311,7 +319,7 @@ async def read_doc(
     include_metadata: Literal[False] = ...,
     chunk_chars: int = ...,
     overlap: int = ...,
-    page_size_limit: int | None = ...,
+    **parser_kwargs,
 ) -> list[Text]: ...
 @overload
 async def read_doc(
@@ -321,7 +329,7 @@ async def read_doc(
     include_metadata: Literal[True],
     chunk_chars: int = ...,
     overlap: int = ...,
-    page_size_limit: int | None = ...,
+    **parser_kwargs,
 ) -> tuple[list[Text], ParsedMetadata]: ...
 async def read_doc(
     path: str | os.PathLike,
@@ -330,7 +338,7 @@ async def read_doc(
     include_metadata: bool = False,
     chunk_chars: int = 3000,
     overlap: int = 100,
-    page_size_limit: int | None = None,
+    **parser_kwargs,
 ) -> list[Text] | ParsedText | tuple[list[Text], ParsedMetadata]:
     """Parse a document and split into chunks.
 
@@ -342,32 +350,27 @@ async def read_doc(
         include_metadata: return a tuple
         chunk_chars: size of chunks
         overlap: size of overlap between chunks
-        page_size_limit: optional limit on the number of characters per page
+        parser_kwargs: Keyword arguments to pass to the used parsing function.
     """
     str_path = str(path)
 
     # start with parsing -- users may want to store this separately
     if str_path.endswith(".pdf"):
         # TODO: Make parse_pdf_to_pages async
-        parsed_text = await asyncio.to_thread(
-            parse_pdf_to_pages, path, page_size_limit=page_size_limit
-        )
+        parsed_text = await asyncio.to_thread(parse_pdf_to_pages, path, **parser_kwargs)
     elif str_path.endswith(".txt"):
         # TODO: Make parse_text async
-        parsed_text = await asyncio.to_thread(
-            parse_text, path, page_size_limit=page_size_limit
-        )
+        parser_kwargs.pop("use_block_parsing", None)  # Not a parse_text kwarg
+        parsed_text = await asyncio.to_thread(parse_text, path, **parser_kwargs)
     elif str_path.endswith(".html"):
+        parser_kwargs.pop("use_block_parsing", None)  # Not a parse_text kwarg
         parsed_text = await asyncio.to_thread(
-            parse_text, path, html=True, page_size_limit=page_size_limit
+            parse_text, path, html=True, **parser_kwargs
         )
     else:
+        parser_kwargs.pop("use_block_parsing", None)  # Not a parse_text kwarg
         parsed_text = await asyncio.to_thread(
-            parse_text,
-            path,
-            split_lines=True,
-            use_tiktoken=False,
-            page_size_limit=page_size_limit,
+            parse_text, path, split_lines=True, use_tiktoken=False, **parser_kwargs
         )
 
     if parsed_text_only:
diff --git a/paperqa/settings.py b/paperqa/settings.py
@@ -165,6 +165,13 @@ class ParsingSettings(BaseModel):
             " (ignoring chars vs tokens difference)."
         ),
     )
+    pdfs_use_block_parsing: bool = Field(
+        default=False,
+        description=(
+            "Opt-in flag to use block-based parsing for PDFs instead of"
+            " text-based parsing, which is known to be better for some PDFs."
+        ),
+    )
     use_doc_details: bool = Field(
         default=True, description="Whether to try to get metadata details for a Doc."
     )
diff --git a/tests/test_paperqa.py b/tests/test_paperqa.py
@@ -991,7 +991,7 @@ async def test_pdf_reader_w_no_chunks(stub_data_dir: Path) -> None:
 
 def test_parse_pdf_to_pages(stub_data_dir: Path) -> None:
     filepath = stub_data_dir / "pasa.pdf"
-    parsed_text = parse_pdf_to_pages(filepath)
+    parsed_text = parse_pdf_to_pages(filepath, use_block_parsing=True)
     assert isinstance(parsed_text.content, dict)
     assert "1" in parsed_text.content, "Parsed text should contain page 1"
     assert (

Original file line number	Diff line number	Diff line change
`@@ -165,6 +165,13 @@ class ParsingSettings(BaseModel):`
`165`	`165`	`" (ignoring chars vs tokens difference)."`
`166`	`166`	`),`
`167`	`167`	`)`
	`168`	`+ pdfs_use_block_parsing: bool = Field(`
	`169`	`+ default=False,`
	`170`	`+ description=(`
	`171`	`+ "Opt-in flag to use block-based parsing for PDFs instead of"`
	`172`	`+ " text-based parsing, which is known to be better for some PDFs."`
	`173`	`+ ),`
	`174`	`+ )`
`168`	`175`	`use_doc_details: bool = Field(`
`169`	`176`	`default=True, description="Whether to try to get metadata details for a Doc."`
`170`	`177`	`)`