Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 18 additions & 5 deletions src/parse/page_item_sanitators/cells.h
Original file line number Diff line number Diff line change
Expand Up @@ -184,11 +184,14 @@ namespace pdflib

LOG_S(INFO) << "# char-cells: " << line_cells.size();

sanitize_bbox(line_cells,
config.horizontal_cell_tolerance,
config.enforce_same_font,
config.line_space_width_factor_for_merge,
config.line_space_width_factor_for_merge_with_space);
// Line cells can legitimately mix fonts (e.g., fallback symbol glyphs such as arrows).
// Also, PDF content-stream order is not guaranteed to match visual reading order.
// Use the order-independent merge path for line construction and do not require font equality.
contract_cells_into_lines_v2(line_cells,
config.horizontal_cell_tolerance,
false,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why hard-code this value when we already have a config parameter for it?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Due to lack of understanding of the code base.

The bug surfaced in docling-serve and it took a bit of triage to dig this deep.

What about adding a new config option, e.g. `decode_page_config.enforce_same_font_for_line_cells`? The existing `decode_page_config.enforce_same_font` option is too coarse for this case.

config.line_space_width_factor_for_merge,
config.line_space_width_factor_for_merge_with_space);

LOG_S(INFO) << "# line-cells: " << line_cells.size();

Expand Down Expand Up @@ -580,6 +583,16 @@ namespace pdflib
erased_cell = true;

LOG_S(INFO) << " -> merging cell-" << i << " with " << j << " '" << cells[j].text << "'"<< ": " << cells[i].text;
}
else if(cells[j].is_adjacent_to(cells[i], delta_0))
{
cells[j].merge_with(cells[i], delta_1);

cells[i].active = false;
erased_cell = true;

LOG_S(INFO) << " -> merging reverse cell-" << j << " with " << i << " '" << cells[i].text << "'"<< ": " << cells[j].text;
break;
}
}
}
Expand Down
Binary file added tests/data/targeted/mixed_font_arrows_01.pdf
Binary file not shown.
25 changes: 25 additions & 0 deletions tests/test_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -918,3 +918,28 @@ def test_annotations_match_groundtruth():
verify_annotations_recursive(true_annotations, pred_dict)

pdf_doc.unload()


def test_mixed_font_arrows_are_merged_into_line_cells():
    """Regression: mixed-font arrow glyphs must stay in the surrounding line text.

    Loads the targeted fixture PDF eagerly (``lazy=False``), collects the
    normalized text of every textline cell on page 1, and requires it to equal
    the expected lines exactly, in order. Each expected line keeps its arrow
    glyphs inline with the neighbouring words.
    """
    parser = DoclingPdfParser(loglevel="fatal")
    pdf_doc = parser.load(
        path_or_stream="tests/data/targeted/mixed_font_arrows_01.pdf",
        lazy=False,
    )

    expected_line_texts = [
        "Arrow Ordering Parser Fixture",
        "Alpha → Beta → Gamma",
        "One → Two → Three",
        "North → South",
    ]

    try:
        page = pdf_doc.get_page(1)
        line_texts = [normalize_text(cell.text) for cell in page.textline_cells]

        assert (
            line_texts == expected_line_texts
        ), f"unexpected line cells: {line_texts}"
    finally:
        # Unload even when the page lookup or the assertion fails, so a failing
        # run does not leave the document loaded for the tests that follow.
        pdf_doc.unload()
Loading