Fix pdf parsing bug #938

Merged: 10 commits (Jun 13, 2025)
1 change: 1 addition & 0 deletions README.md
@@ -802,6 +802,7 @@ will return much faster than the first query and we'll be certain the authors match
| `answer.get_evidence_if_no_contexts` | `True` | Allow lazy evidence gathering. |
| `parsing.chunk_size` | `5000` | Characters per chunk (0 for no chunking). |
| `parsing.page_size_limit` | `1,280,000` | Character limit per page. |
| `parsing.pdfs_use_block_parsing` | `False` | Opt-in flag to use block-based PDF parsing instead of text-based PDF parsing. |
| `parsing.use_doc_details` | `True` | Whether to get metadata details for docs. |
| `parsing.overlap` | `250` | Characters to overlap chunks. |
| `parsing.defer_embedding` | `False` | Whether to defer embedding until summarization. |
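In practice, opting in to the new setting would look something like the following; a minimal sketch, assuming `Settings` is exported from the package root and exposes this `parsing` group as the `ParsingSettings` model shown in `paperqa/settings.py` below:

```python
from paperqa import Settings
from paperqa.settings import ParsingSettings

# Opt in to the experimental block-based PDF parsing
settings = Settings(parsing=ParsingSettings(pdfs_use_block_parsing=True))
```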
2 changes: 2 additions & 0 deletions paperqa/docs.py
@@ -298,6 +298,7 @@ async def aadd( # noqa: PLR0912
chunk_chars=parse_config.chunk_size,
overlap=parse_config.overlap,
page_size_limit=parse_config.page_size_limit,
use_block_parsing=parse_config.pdfs_use_block_parsing,
)
if not texts:
raise ValueError(f"Could not read document {path}. Is it empty?")
@@ -390,6 +391,7 @@ async def aadd( # noqa: PLR0912
chunk_chars=parse_config.chunk_size,
overlap=parse_config.overlap,
page_size_limit=parse_config.page_size_limit,
use_block_parsing=parse_config.pdfs_use_block_parsing,
)
# loose check to see if document was loaded
if (
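For context, here is roughly how the flag flows from settings into `aadd`; a hedged usage sketch (the PDF path is hypothetical, and we assume `aadd` accepts a `settings=` keyword as used elsewhere in paperqa):

```python
import asyncio

from paperqa import Docs, Settings
from paperqa.settings import ParsingSettings


async def main() -> None:
    settings = Settings(parsing=ParsingSettings(pdfs_use_block_parsing=True))
    docs = Docs()
    # "my_paper.pdf" is a hypothetical path; parse_config.pdfs_use_block_parsing
    # is forwarded to the reader as use_block_parsing (see the diff above)
    await docs.aadd("my_paper.pdf", settings=settings)


asyncio.run(main())
```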
59 changes: 38 additions & 21 deletions paperqa/readers.py
@@ -21,9 +21,13 @@
from paperqa.utils import ImpossibleParsingError
from paperqa.version import __version__ as pqa_version

BLOCK_TEXT_INDEX = 4


def parse_pdf_to_pages(
path: str | os.PathLike, page_size_limit: int | None = None
path: str | os.PathLike,
page_size_limit: int | None = None,
use_block_parsing: bool = False,
) -> ParsedText:

with pymupdf.open(path) as file:
@@ -39,7 +43,25 @@ def parse_pdf_to_pages(
f" {file.page_count} for the PDF at path {path}, likely this PDF"
" file is corrupt."
) from exc
text = page.get_text("text", sort=True)

if use_block_parsing:
# NOTE: this block-based parsing appears to be better, but until
# fully validated on 1+ benchmarks, it's considered experimental

# Extract text blocks from the page
# Note: sort=False is important to preserve the order of text blocks
# as they appear in the PDF
blocks = page.get_text("blocks", sort=False)

# Concatenate text blocks into a single string
text = "\n".join(
block[BLOCK_TEXT_INDEX]
for block in blocks
if len(block) > BLOCK_TEXT_INDEX
)
else:
text = page.get_text("text", sort=True)

if page_size_limit and len(text) > page_size_limit:
raise ImpossibleParsingError(
f"The text in page {i} of {file.page_count} was {len(text)} chars"
@@ -267,7 +289,7 @@ async def read_doc(
include_metadata: Literal[True],
chunk_chars: int = ...,
overlap: int = ...,
page_size_limit: int | None = ...,
**parser_kwargs,
) -> ParsedText: ...
@overload
async def read_doc(
@@ -277,7 +299,7 @@
include_metadata: Literal[False] = ...,
chunk_chars: int = ...,
overlap: int = ...,
page_size_limit: int | None = ...,
**parser_kwargs,
) -> ParsedText: ...
@overload
async def read_doc(
@@ -287,7 +309,7 @@
include_metadata: Literal[True],
chunk_chars: int = ...,
overlap: int = ...,
page_size_limit: int | None = ...,
**parser_kwargs,
) -> tuple[list[Text], ParsedMetadata]: ...
@overload
async def read_doc(
@@ -297,7 +319,7 @@
include_metadata: Literal[False] = ...,
chunk_chars: int = ...,
overlap: int = ...,
page_size_limit: int | None = ...,
**parser_kwargs,
) -> list[Text]: ...
@overload
async def read_doc(
@@ -307,7 +329,7 @@
include_metadata: Literal[True],
chunk_chars: int = ...,
overlap: int = ...,
page_size_limit: int | None = ...,
**parser_kwargs,
) -> tuple[list[Text], ParsedMetadata]: ...
async def read_doc(
path: str | os.PathLike,
@@ -316,7 +338,7 @@
include_metadata: bool = False,
chunk_chars: int = 3000,
overlap: int = 100,
page_size_limit: int | None = None,
**parser_kwargs,
) -> list[Text] | ParsedText | tuple[list[Text], ParsedMetadata]:
"""Parse a document and split into chunks.

@@ -328,32 +350,27 @@
include_metadata: return a tuple
chunk_chars: size of chunks
overlap: size of overlap between chunks
page_size_limit: optional limit on the number of characters per page
parser_kwargs: Keyword arguments to pass to the underlying parsing function.
"""
str_path = str(path)

# start with parsing -- users may want to store this separately
if str_path.endswith(".pdf"):
# TODO: Make parse_pdf_to_pages async
parsed_text = await asyncio.to_thread(
parse_pdf_to_pages, path, page_size_limit=page_size_limit
)
parsed_text = await asyncio.to_thread(parse_pdf_to_pages, path, **parser_kwargs)
elif str_path.endswith(".txt"):
# TODO: Make parse_text async
parsed_text = await asyncio.to_thread(
parse_text, path, page_size_limit=page_size_limit
)
parser_kwargs.pop("use_block_parsing", None) # Not a parse_text kwarg
parsed_text = await asyncio.to_thread(parse_text, path, **parser_kwargs)
elif str_path.endswith(".html"):
parser_kwargs.pop("use_block_parsing", None) # Not a parse_text kwarg
parsed_text = await asyncio.to_thread(
parse_text, path, html=True, page_size_limit=page_size_limit
parse_text, path, html=True, **parser_kwargs
)
else:
parser_kwargs.pop("use_block_parsing", None) # Not a parse_text kwarg
parsed_text = await asyncio.to_thread(
parse_text,
path,
split_lines=True,
use_tiktoken=False,
page_size_limit=page_size_limit,
parse_text, path, split_lines=True, use_tiktoken=False, **parser_kwargs
)

if parsed_text_only:
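For reference, PyMuPDF's `"blocks"` extraction returns 7-tuples of `(x0, y0, x1, y1, text, block_no, block_type)`, which is why `BLOCK_TEXT_INDEX` is 4. A standalone sketch of the underlying call (the file path is hypothetical):

```python
import pymupdf

BLOCK_TEXT_INDEX = 4  # the text component of each block tuple

with pymupdf.open("example.pdf") as doc:  # hypothetical PDF
    for block in doc[0].get_text("blocks", sort=False):
        # Each block: (x0, y0, x1, y1, text, block_no, block_type);
        # sort=False preserves the order blocks are stored in the PDF
        if len(block) > BLOCK_TEXT_INDEX:
            print(block[BLOCK_TEXT_INDEX])
```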
7 changes: 7 additions & 0 deletions paperqa/settings.py
@@ -165,6 +165,13 @@ class ParsingSettings(BaseModel):
" (ignoring chars vs tokens difference)."
),
)
pdfs_use_block_parsing: bool = Field(
default=False,
description=(
"Opt-in flag to use block-based parsing for PDFs instead of"
" text-based parsing, which is known to be better for some PDFs."
),
)
use_doc_details: bool = Field(
default=True, description="Whether to try to get metadata details for a Doc."
)
Binary file added tests/stub_data/pasa.pdf
Binary file not shown.
12 changes: 10 additions & 2 deletions tests/test_agents.py
@@ -89,11 +89,11 @@ async def test_get_directory_index(
"year",
], "Incorrect fields in index"
assert not index.changed, "Expected index to not have changes at this point"
# bates.txt + empty.txt + flag_day.html + gravity_hill.md + obama.txt + paper.pdf,
# bates.txt + empty.txt + flag_day.html + gravity_hill.md + obama.txt + paper.pdf + pasa.pdf,
# but empty.txt fails to be added
path_to_id = await index.index_files
assert (
sum(id_ != FAILED_DOCUMENT_ADD_ID for id_ in path_to_id.values()) == 5
sum(id_ != FAILED_DOCUMENT_ADD_ID for id_ in path_to_id.values()) == 6
), "Incorrect number of parsed index files"

with subtests.test(msg="check-txt-query"):
@@ -248,6 +248,7 @@ async def test_getting_manifest(
"gravity_hill.md",
"obama.txt",
"paper.pdf",
"pasa.pdf",
}


@@ -565,6 +566,13 @@ async def gen_answer(self, state) -> str: # noqa: ARG001, RUF029
async def test_agent_sharing_state(
agent_test_settings: Settings, subtests: SubTests, callback_type: str | None
) -> None:
def files_filter(f) -> bool:
# Filter out pasa.pdf just to speed the test and save API costs
return f.name != "pasa.pdf" and IndexSettings.model_fields[
"files_filter"
].default(f)

agent_test_settings.agent.index.files_filter = files_filter
agent_test_settings.agent.search_count = 3 # Keep low for speed
agent_test_settings.answer.evidence_k = 2
agent_test_settings.answer.answer_max_sources = 1
13 changes: 12 additions & 1 deletion tests/test_paperqa.py
@@ -47,7 +47,7 @@
from paperqa.core import llm_parse_json
from paperqa.prompts import CANNOT_ANSWER_PHRASE
from paperqa.prompts import qa_prompt as default_qa_prompt
from paperqa.readers import read_doc
from paperqa.readers import parse_pdf_to_pages, read_doc
from paperqa.types import ChunkMetadata
from paperqa.utils import (
extract_score,
@@ -989,6 +989,17 @@ async def test_pdf_reader_w_no_chunks(stub_data_dir: Path) -> None:
assert docs.texts[0].embedding is None, "Should have deferred the embedding"


def test_parse_pdf_to_pages(stub_data_dir: Path) -> None:
filepath = stub_data_dir / "pasa.pdf"
parsed_text = parse_pdf_to_pages(filepath, use_block_parsing=True)
assert isinstance(parsed_text.content, dict)
assert "1" in parsed_text.content, "Parsed text should contain page 1"
assert (
"Abstract\n\nWe introduce PaSa, an advanced Paper Search"
"\nagent powered by large language models."
) in parsed_text.content["1"]


@pytest.mark.vcr
@pytest.mark.parametrize("defer_embeddings", [True, False])
@pytest.mark.asyncio