diff --git a/README.md b/README.md
index 1eaf8bc5..6d15b8aa 100644
--- a/README.md
+++ b/README.md
@@ -802,6 +802,7 @@ will return much faster than the first query and we'll be certain the authors ma
 | `answer.get_evidence_if_no_contexts` | `True` | Allow lazy evidence gathering. |
 | `parsing.chunk_size` | `5000` | Characters per chunk (0 for no chunking). |
 | `parsing.page_size_limit` | `1,280,000` | Character limit per page. |
+| `parsing.pdfs_use_block_parsing` | `False` | Opt-in flag for block-based PDF parsing over text-based PDF parsing. |
 | `parsing.use_doc_details` | `True` | Whether to get metadata details for docs. |
 | `parsing.overlap` | `250` | Characters to overlap chunks. |
 | `parsing.defer_embedding` | `False` | Whether to defer embedding until summarization. |
diff --git a/paperqa/docs.py b/paperqa/docs.py
index 7ac46f09..f7e0bdc6 100644
--- a/paperqa/docs.py
+++ b/paperqa/docs.py
@@ -298,6 +298,7 @@ async def aadd(  # noqa: PLR0912
                 chunk_chars=parse_config.chunk_size,
                 overlap=parse_config.overlap,
                 page_size_limit=parse_config.page_size_limit,
+                use_block_parsing=parse_config.pdfs_use_block_parsing,
             )
             if not texts:
                 raise ValueError(f"Could not read document {path}. Is it empty?")
@@ -390,6 +391,7 @@ async def aadd(  # noqa: PLR0912
             chunk_chars=parse_config.chunk_size,
             overlap=parse_config.overlap,
             page_size_limit=parse_config.page_size_limit,
+            use_block_parsing=parse_config.pdfs_use_block_parsing,
         )
         # loose check to see if document was loaded
         if (
diff --git a/paperqa/readers.py b/paperqa/readers.py
index 2db0e614..7f33f2ac 100644
--- a/paperqa/readers.py
+++ b/paperqa/readers.py
@@ -21,9 +21,13 @@
 from paperqa.utils import ImpossibleParsingError
 from paperqa.version import __version__ as pqa_version
 
+BLOCK_TEXT_INDEX = 4
+
 
 def parse_pdf_to_pages(
-    path: str | os.PathLike, page_size_limit: int | None = None
+    path: str | os.PathLike,
+    page_size_limit: int | None = None,
+    use_block_parsing: bool = False,
 ) -> ParsedText:
 
     with pymupdf.open(path) as file:
@@ -39,7 +43,25 @@ def parse_pdf_to_pages(
                     f" {file.page_count} for the PDF at path {path}, likely this PDF"
                     " file is corrupt."
                 ) from exc
-            text = page.get_text("text", sort=True)
+
+            if use_block_parsing:
+                # NOTE: this block-based parsing appears to be better, but until
+                # fully validated on 1+ benchmarks, it's considered experimental
+
+                # Extract text blocks from the page
+                # Note: sort=False is important to preserve the order of text blocks
+                # as they appear in the PDF
+                blocks = page.get_text("blocks", sort=False)
+
+                # Concatenate text blocks into a single string
+                text = "\n".join(
+                    block[BLOCK_TEXT_INDEX]
+                    for block in blocks
+                    if len(block) > BLOCK_TEXT_INDEX
+                )
+            else:
+                text = page.get_text("text", sort=True)
+
             if page_size_limit and len(text) > page_size_limit:
                 raise ImpossibleParsingError(
                     f"The text in page {i} of {file.page_count} was {len(text)} chars"
@@ -267,7 +289,7 @@ async def read_doc(
     include_metadata: Literal[True],
     chunk_chars: int = ...,
     overlap: int = ...,
-    page_size_limit: int | None = ...,
+    **parser_kwargs,
 ) -> ParsedText: ...
 @overload
 async def read_doc(
@@ -277,7 +299,7 @@ async def read_doc(
     include_metadata: Literal[False] = ...,
     chunk_chars: int = ...,
     overlap: int = ...,
-    page_size_limit: int | None = ...,
+    **parser_kwargs,
 ) -> ParsedText: ...
 @overload
 async def read_doc(
@@ -287,7 +309,7 @@ async def read_doc(
     include_metadata: Literal[True],
     chunk_chars: int = ...,
     overlap: int = ...,
-    page_size_limit: int | None = ...,
+    **parser_kwargs,
 ) -> tuple[list[Text], ParsedMetadata]: ...
 @overload
 async def read_doc(
@@ -297,7 +319,7 @@ async def read_doc(
     include_metadata: Literal[False] = ...,
     chunk_chars: int = ...,
     overlap: int = ...,
-    page_size_limit: int | None = ...,
+    **parser_kwargs,
 ) -> list[Text]: ...
 @overload
 async def read_doc(
@@ -307,7 +329,7 @@ async def read_doc(
     include_metadata: Literal[True],
     chunk_chars: int = ...,
     overlap: int = ...,
-    page_size_limit: int | None = ...,
+    **parser_kwargs,
 ) -> tuple[list[Text], ParsedMetadata]: ...
 async def read_doc(
     path: str | os.PathLike,
@@ -316,7 +338,7 @@ async def read_doc(
     include_metadata: bool = False,
     chunk_chars: int = 3000,
     overlap: int = 100,
-    page_size_limit: int | None = None,
+    **parser_kwargs,
 ) -> list[Text] | ParsedText | tuple[list[Text], ParsedMetadata]:
     """Parse a document and split into chunks.
 
@@ -328,32 +350,27 @@ async def read_doc(
         include_metadata: return a tuple
         chunk_chars: size of chunks
         overlap: size of overlap between chunks
-        page_size_limit: optional limit on the number of characters per page
+        parser_kwargs: Keyword arguments to pass to the used parsing function.
     """
     str_path = str(path)
 
     # start with parsing -- users may want to store this separately
     if str_path.endswith(".pdf"):
         # TODO: Make parse_pdf_to_pages async
-        parsed_text = await asyncio.to_thread(
-            parse_pdf_to_pages, path, page_size_limit=page_size_limit
-        )
+        parsed_text = await asyncio.to_thread(parse_pdf_to_pages, path, **parser_kwargs)
     elif str_path.endswith(".txt"):
         # TODO: Make parse_text async
-        parsed_text = await asyncio.to_thread(
-            parse_text, path, page_size_limit=page_size_limit
-        )
+        parser_kwargs.pop("use_block_parsing", None)  # Not a parse_text kwarg
+        parsed_text = await asyncio.to_thread(parse_text, path, **parser_kwargs)
     elif str_path.endswith(".html"):
+        parser_kwargs.pop("use_block_parsing", None)  # Not a parse_text kwarg
         parsed_text = await asyncio.to_thread(
-            parse_text, path, html=True, page_size_limit=page_size_limit
+            parse_text, path, html=True, **parser_kwargs
         )
     else:
+        parser_kwargs.pop("use_block_parsing", None)  # Not a parse_text kwarg
         parsed_text = await asyncio.to_thread(
-            parse_text,
-            path,
-            split_lines=True,
-            use_tiktoken=False,
-            page_size_limit=page_size_limit,
+            parse_text, path, split_lines=True, use_tiktoken=False, **parser_kwargs
         )
 
     if parsed_text_only:
diff --git a/paperqa/settings.py b/paperqa/settings.py
index 0709abeb..c66bbc6a 100644
--- a/paperqa/settings.py
+++ b/paperqa/settings.py
@@ -165,6 +165,13 @@ class ParsingSettings(BaseModel):
             " (ignoring chars vs tokens difference)."
         ),
     )
+    pdfs_use_block_parsing: bool = Field(
+        default=False,
+        description=(
+            "Opt-in flag to use block-based parsing for PDFs instead of"
+            " text-based parsing, which is known to be better for some PDFs."
+        ),
+    )
     use_doc_details: bool = Field(
         default=True, description="Whether to try to get metadata details for a Doc."
     )
diff --git a/tests/stub_data/pasa.pdf b/tests/stub_data/pasa.pdf
new file mode 100755
index 00000000..8affd170
Binary files /dev/null and b/tests/stub_data/pasa.pdf differ
diff --git a/tests/test_agents.py b/tests/test_agents.py
index c0c9ee2c..8d2202de 100644
--- a/tests/test_agents.py
+++ b/tests/test_agents.py
@@ -89,11 +89,11 @@ async def test_get_directory_index(
         "year",
     ], "Incorrect fields in index"
     assert not index.changed, "Expected index to not have changes at this point"
-    # bates.txt + empty.txt + flag_day.html + gravity_hill.md + obama.txt + paper.pdf,
+    # bates.txt + empty.txt + flag_day.html + gravity_hill.md + obama.txt + paper.pdf + pasa.pdf,
     # but empty.txt fails to be added
     path_to_id = await index.index_files
     assert (
-        sum(id_ != FAILED_DOCUMENT_ADD_ID for id_ in path_to_id.values()) == 5
+        sum(id_ != FAILED_DOCUMENT_ADD_ID for id_ in path_to_id.values()) == 6
     ), "Incorrect number of parsed index files"
 
     with subtests.test(msg="check-txt-query"):
@@ -248,6 +248,7 @@ async def test_getting_manifest(
         "gravity_hill.md",
         "obama.txt",
         "paper.pdf",
+        "pasa.pdf",
     }
 
 
@@ -565,6 +566,13 @@ async def gen_answer(self, state) -> str:  # noqa: ARG001, RUF029
 async def test_agent_sharing_state(
     agent_test_settings: Settings, subtests: SubTests, callback_type: str | None
 ) -> None:
+    def files_filter(f) -> bool:
+        # Filter out pasa.pdf just to speed the test and save API costs
+        return f.name != "pasa.pdf" and IndexSettings.model_fields[
+            "files_filter"
+        ].default(f)
+
+    agent_test_settings.agent.index.files_filter = files_filter
     agent_test_settings.agent.search_count = 3  # Keep low for speed
     agent_test_settings.answer.evidence_k = 2
     agent_test_settings.answer.answer_max_sources = 1
diff --git a/tests/test_paperqa.py b/tests/test_paperqa.py
index 9a16fe15..9db9720a 100644
--- a/tests/test_paperqa.py
+++ b/tests/test_paperqa.py
@@ -47,7 +47,7 @@
 from paperqa.core import llm_parse_json
 from paperqa.prompts import CANNOT_ANSWER_PHRASE
 from paperqa.prompts import qa_prompt as default_qa_prompt
-from paperqa.readers import read_doc
+from paperqa.readers import parse_pdf_to_pages, read_doc
 from paperqa.types import ChunkMetadata
 from paperqa.utils import (
     extract_score,
@@ -989,6 +989,17 @@ async def test_pdf_reader_w_no_chunks(stub_data_dir: Path) -> None:
     assert docs.texts[0].embedding is None, "Should have deferred the embedding"
 
 
+def test_parse_pdf_to_pages(stub_data_dir: Path) -> None:
+    filepath = stub_data_dir / "pasa.pdf"
+    parsed_text = parse_pdf_to_pages(filepath, use_block_parsing=True)
+    assert isinstance(parsed_text.content, dict)
+    assert "1" in parsed_text.content, "Parsed text should contain page 1"
+    assert (
+        "Abstract\n\nWe introduce PaSa, an advanced Paper Search"
+        "\nagent powered by large language models."
+    ) in parsed_text.content["1"]
+
+
 @pytest.mark.vcr
 @pytest.mark.parametrize("defer_embeddings", [True, False])
 @pytest.mark.asyncio
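
For context on the constant `BLOCK_TEXT_INDEX = 4`: PyMuPDF's `"blocks"` extraction returns one tuple per block laid out as `(x0, y0, x1, y1, text, block_no, block_type)`, so index 4 holds the block's text. Below is a minimal standalone sketch of the new extraction path; it mirrors the `parse_pdf_to_pages` change and uses the stub PDF added in this diff:

```python
# Standalone sketch of the block-based extraction path in parse_pdf_to_pages.
# Assumes PyMuPDF's documented "blocks" tuple layout:
#   (x0, y0, x1, y1, text, block_no, block_type)
import pymupdf

BLOCK_TEXT_INDEX = 4  # index of the text field in a block tuple

with pymupdf.open("tests/stub_data/pasa.pdf") as file:
    page = file.load_page(0)  # page 1, zero-indexed
    # sort=False preserves the order in which blocks appear in the PDF,
    # which the diff notes is important for reading order
    blocks = page.get_text("blocks", sort=False)
    # Guard on tuple length in case a block lacks a text field
    text = "\n".join(
        block[BLOCK_TEXT_INDEX] for block in blocks if len(block) > BLOCK_TEXT_INDEX
    )
    print(text[:200])
```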
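To opt in end to end, users flip the new `parsing.pdfs_use_block_parsing` setting. A minimal sketch, assuming `Settings` and `Docs` are importable from the package root and that `aadd` accepts a `settings=` argument as in the `docs.py` change above:

```python
import asyncio

from paperqa import Docs, Settings


async def main() -> None:
    settings = Settings()
    settings.parsing.pdfs_use_block_parsing = True  # defaults to False (opt-in)

    docs = Docs()
    # aadd forwards parse_config.pdfs_use_block_parsing into read_doc as
    # use_block_parsing, which parse_pdf_to_pages then honors
    await docs.aadd("tests/stub_data/pasa.pdf", settings=settings)


asyncio.run(main())
```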
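The `read_doc` refactor folds `page_size_limit` into `**parser_kwargs`, so parser-specific flags can ride along without widening every overload, and the non-PDF branches pop `use_block_parsing` before calling `parse_text`. A sketch of calling it directly, assuming `Doc` is exported from the package root (the docname/citation/dockey values here are placeholders):

```python
import asyncio

from paperqa import Doc
from paperqa.readers import read_doc


async def main() -> None:
    doc = Doc(docname="pasa", citation="Placeholder citation", dockey="pasa")
    texts = await read_doc(
        "tests/stub_data/pasa.pdf",
        doc,
        chunk_chars=3000,
        overlap=100,
        page_size_limit=1_280_000,  # now forwarded via parser_kwargs
        use_block_parsing=True,  # the new opt-in flag
    )
    print(f"Parsed into {len(texts)} chunks")


asyncio.run(main())
```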