Skip to content

Commit f9adab1

Browse files
David Huggins-Daines
David Huggins-Daines
authored and
David Huggins-Daines
committed
fix: handle case of overlapping cells in contract_cells_into_lines_v2
Fixes: docling-project#99
Signed-off-by: David Huggins-Daines <[email protected]>
Signed-off-by: David Huggins-Daines <[email protected]>
1 parent c14ec54 commit f9adab1

File tree

2 files changed

+43
-7
lines changed

2 files changed

+43
-7
lines changed

src/v2/pdf_resources/page_cell.h

+42-6
Original file line numberDiff line numberDiff line change
@@ -262,10 +262,38 @@ namespace pdflib
262262

263263
bool pdf_resource<PAGE_CELL>::is_adjacent_to(pdf_resource<PAGE_CELL>& other, double eps)
264264
{
265-
double d0 = std::sqrt((r_x1-other.r_x0)*(r_x1-other.r_x0) + (r_y1-other.r_y0)*(r_y1-other.r_y0));
266-
double d1 = std::sqrt((r_x2-other.r_x3)*(r_x2-other.r_x3) + (r_y2-other.r_y3)*(r_y2-other.r_y3));
265+
// This assumes (even for right-to-left text) that other is to the
266+
// right of this. If two cells overlap then they are obviously
267+
// adjacent, otherwise the right side corners of this must be
268+
// within eps of the left side corners of other.
269+
270+
// Intersection of bounding rectangles (FIXME: this does not actually
271+
// imply overlap for rotated cells; it is not yet clear what the
272+
// correct and efficient way to compute that would be)
273+
double max_x0 = std::max(x0, other.x0);
274+
double min_x1 = std::min(x1, other.x1);
275+
double max_y0 = std::max(y0, other.y0);
276+
double min_y1 = std::min(y1, other.y1);
277+
if (max_x0 < min_x1 and max_y0 < min_y1)
278+
return true;
279+
280+
// lower_right(this) : lower_left(other)
281+
double dx0 = other.r_x0 - r_x1;
282+
double dy0 = other.r_y0 - r_y1;
283+
double d0 = std::sqrt(dx0 * dx0 + dy0 * dy0);
284+
285+
if (d0 >= eps)
286+
return false;
287+
288+
// upper_right(this) : upper_left(other)
289+
double dx1 = other.r_x3 - r_x2;
290+
double dy1 = other.r_y3 - r_y2;
291+
double d1 = std::sqrt(dx1 * dx1 + dy1 * dy1);
292+
293+
if (d1 >= eps)
294+
return false;
267295

268-
return ((d0<eps) and (d1<eps));
296+
return true;
269297
}
270298

271299
bool pdf_resource<PAGE_CELL>::has_same_reading_orientation(pdf_resource<PAGE_CELL>& other)
@@ -285,11 +313,19 @@ namespace pdflib
285313
LOG_S(ERROR) << "inconsistent merging of cells!";
286314
}
287315

288-
double d0 = std::sqrt((r_x1-other.r_x0)*(r_x1-other.r_x0) + (r_y1-other.r_y0)*(r_y1-other.r_y0));
316+
double max_x0 = std::max(x0, other.x0);
317+
double min_x1 = std::min(x1, other.x1);
318+
double max_y0 = std::max(y0, other.y0);
319+
double min_y1 = std::min(y1, other.y1);
320+
bool overlap = (max_x0 < min_x1 and max_y0 < min_y1);
321+
322+
double dx0 = other.r_x0 - r_x1;
323+
double dy0 = other.r_y0 - r_y1;
324+
double d0 = std::sqrt(dx0 * dx0 + dy0 * dy0);
289325

290326
if((not left_to_right) or (not other.left_to_right))
291327
{
292-
if(delta<d0)
328+
if(d0 >= delta and not overlap)
293329
{
294330
text = " " + text;
295331
}
@@ -299,7 +335,7 @@ namespace pdflib
299335
}
300336
else
301337
{
302-
if(delta<d0)
338+
if(d0 >= delta and not overlap)
303339
{
304340
text += " ";
305341
}

src/v2/pdf_sanitators/cells.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -283,7 +283,7 @@ namespace pdflib
283283
*/
284284

285285
void pdf_sanitator<PAGE_CELLS>::contract_cells_into_lines_v2(pdf_resource<PAGE_CELLS>& cells,
286-
double horizontal_cell_tolerance,
286+
double horizontal_cell_tolerance, // FIXME: UNUSED
287287
bool enforce_same_font,
288288
double space_width_factor_for_merge,
289289
double space_width_factor_for_merge_with_space)

0 commit comments

Comments
 (0)