Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: handle case of overlapping cells in contract_cells_into_lines_v2 #105

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 67 additions & 7 deletions src/v2/pdf_resources/page_cell.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ namespace pdflib

// Adjacency test: compares this cell's lower-right / upper-right corners
// with `other`'s lower-left / upper-left corners against the tolerance
// `delta`.
bool is_adjacent_to(pdf_resource<PAGE_CELL>& other, double delta);

// Overlap test between the bounding quadrilaterals of this cell and
// `other`, built on contains().
bool intersects(pdf_resource<PAGE_CELL>& other);

// Point-in-polygon test (even-odd rule) of the point (x, y) against this
// cell's bounding quadrilateral (r_x0, r_y0) .. (r_x3, r_y3).
bool contains(double x, double y);

// NOTE(review): the definition is not visible here; presumably compares the
// reading direction of the two cells -- confirm against the implementation.
bool has_same_reading_orientation(pdf_resource<PAGE_CELL>& other);

// Merges `other` into this cell. In the visible part of the definition,
// `delta` is the gap threshold that decides whether a space is inserted
// into `text` between the two cells.
bool merge_with(pdf_resource<PAGE_CELL>& other, double delta);
Expand Down Expand Up @@ -259,13 +263,64 @@ namespace pdflib

return (num_chars>0? len/num_chars : 0.0);
}


// Reports whether the bounding quadrilateral of this cell overlaps the
// bounding quadrilateral of `other`.
//
// Two checks are combined:
//   1. corner containment: a corner of either quadrilateral lies inside the
//      other one (point-in-polygon via the even-odd rule, see contains());
//   2. edge crossing: an edge of this quadrilateral properly crosses an edge
//      of the other one. This catches overlaps in which no corner of either
//      cell is inside the other, e.g. a wide cell crossed by a tall cell in
//      a plus-shaped configuration.
//
// Only proper crossings are counted in step 2, so cells that merely touch
// along a shared or collinear edge are not reported by the edge test.
bool pdf_resource<PAGE_CELL>::intersects(pdf_resource<PAGE_CELL>& other)
{
  // Step 1: corner containment, checked in both directions.
  if (contains(other.r_x0, other.r_y0)
      or contains(other.r_x1, other.r_y1)
      or contains(other.r_x2, other.r_y2)
      or contains(other.r_x3, other.r_y3)
      or other.contains(r_x0, r_y0)
      or other.contains(r_x1, r_y1)
      or other.contains(r_x2, r_y2)
      or other.contains(r_x3, r_y3))
    {
      return true;
    }

  // Step 2: edge crossing. Corners are listed in polygon order so that
  // corner k and corner (k + 1) % 4 always form an edge.
  const double ax[4] = {r_x0, r_x1, r_x2, r_x3};
  const double ay[4] = {r_y0, r_y1, r_y2, r_y3};
  const double bx[4] = {other.r_x0, other.r_x1, other.r_x2, other.r_x3};
  const double by[4] = {other.r_y0, other.r_y1, other.r_y2, other.r_y3};

  // Cross product of (q - p) and (r - p): its sign tells on which side of
  // the directed line p -> q the point r lies (zero means collinear).
  auto orientation = [](double px, double py,
                        double qx, double qy,
                        double rx, double ry) -> double
  {
    return (qx - px) * (ry - py) - (qy - py) * (rx - px);
  };

  // True when the two signed values lie strictly on opposite sides of zero.
  auto opposite_sides = [](double s, double t) -> bool
  {
    return (s > 0.0 and t < 0.0) or (s < 0.0 and t > 0.0);
  };

  for (int i = 0; i < 4; i++)
    {
      const int i_next = (i + 1) % 4;

      for (int j = 0; j < 4; j++)
        {
          const int j_next = (j + 1) % 4;

          // Where do the end points of other's edge j lie relative to our edge i?
          const double b_start = orientation(ax[i], ay[i], ax[i_next], ay[i_next],
                                             bx[j], by[j]);
          const double b_end = orientation(ax[i], ay[i], ax[i_next], ay[i_next],
                                           bx[j_next], by[j_next]);

          // ... and the end points of our edge i relative to other's edge j?
          const double a_start = orientation(bx[j], by[j], bx[j_next], by[j_next],
                                             ax[i], ay[i]);
          const double a_end = orientation(bx[j], by[j], bx[j_next], by[j_next],
                                           ax[i_next], ay[i_next]);

          if (opposite_sides(b_start, b_end) and opposite_sides(a_start, a_end))
            {
              return true;
            }
        }
    }

  return false;
}

// Edge-crossing predicate used by the even-odd point-in-polygon test.
//
// Returns true when the edge (xi, yi) -> (xj, yj) straddles the horizontal
// line through y and meets that line at an x-coordinate strictly greater
// than x, i.e. a ray cast from (x, y) towards +x crosses the edge.
inline bool inside_plane(double x, double y, double xi, double yi, double xj, double yj)
{
  // The edge has to straddle the horizontal line through y. Passing this
  // check also guarantees yi != yj, so the division below is safe.
  const bool start_above = (yi > y);
  const bool end_above = (yj > y);
  if (start_above == end_above)
    {
      return false;
    }

  // x-coordinate at which the edge meets the horizontal line through y.
  const double crossing_x = (xj - xi) * (y - yi) / (yj - yi) + xi;

  return x < crossing_x;
}

// Point-in-polygon test for this cell's bounding quadrilateral using the
// even-odd rule: the point (x, y) is inside when an odd number of the four
// edges is crossed according to inside_plane().
bool pdf_resource<PAGE_CELL>::contains(double x, double y)
{
  int crossings = 0;

  // Walk the closed outline 3 -> 0 -> 1 -> 2 -> 3, one edge at a time.
  crossings += inside_plane(x, y, r_x3, r_y3, r_x0, r_y0) ? 1 : 0;
  crossings += inside_plane(x, y, r_x0, r_y0, r_x1, r_y1) ? 1 : 0;
  crossings += inside_plane(x, y, r_x1, r_y1, r_x2, r_y2) ? 1 : 0;
  crossings += inside_plane(x, y, r_x2, r_y2, r_x3, r_y3) ? 1 : 0;

  // Odd number of crossings <=> inside.
  return (crossings % 2) == 1;
}


// Reports whether `other` starts where this cell ends.
//
// The Euclidean distance between lower_right(this) and lower_left(other),
// and between upper_right(this) and upper_left(other), must both be
// strictly smaller than `eps`.
//
// @param other  the cell expected to follow this one
// @param eps    maximum allowed corner-to-corner distance (exclusive)
// @return       true when both corner pairs are closer than `eps`
bool pdf_resource<PAGE_CELL>::is_adjacent_to(pdf_resource<PAGE_CELL>& other, double eps)
{
  // NOTE: This assumes (even for right-to-left text) that other is
  // to the right of this, as the calling code seems to do that.

  // lower_right(this) : lower_left(other)
  const double dx0 = other.r_x0 - r_x1;
  const double dy0 = other.r_y0 - r_y1;
  const double d0 = std::sqrt(dx0 * dx0 + dy0 * dy0);

  // Written as not(d < eps) rather than (d >= eps) so that a NaN distance
  // (from NaN coordinates) is rejected instead of slipping through.
  if (not (d0 < eps))
    {
      return false;
    }

  // upper_right(this) : upper_left(other); only computed when the lower
  // corners already match.
  const double dx1 = other.r_x3 - r_x2;
  const double dy1 = other.r_y3 - r_y2;
  const double d1 = std::sqrt(dx1 * dx1 + dy1 * dy1);

  return (d1 < eps);
}

bool pdf_resource<PAGE_CELL>::has_same_reading_orientation(pdf_resource<PAGE_CELL>& other)
Expand All @@ -285,11 +340,16 @@ namespace pdflib
LOG_S(ERROR) << "inconsistent merging of cells!";
}

double d0 = std::sqrt((r_x1-other.r_x0)*(r_x1-other.r_x0) + (r_y1-other.r_y0)*(r_y1-other.r_y0));
// FIXME: Redundant calculation with is_adjacent_to
double dx0 = other.r_x0 - r_x1;
double dy0 = other.r_y0 - r_y1;
double d0 = std::sqrt(dx0 * dx0 + dy0 * dy0);


if((not left_to_right) or (not other.left_to_right))
{
if(delta<d0)
// FIXME: Redundant calculation of intersects here as well...
if(d0 >= delta and not intersects(other))
{
text = " " + text;
}
Expand All @@ -299,7 +359,7 @@ namespace pdflib
}
else
{
if(delta<d0)
if(d0 >= delta and not intersects(other))
{
text += " ";
}
Expand Down
4 changes: 2 additions & 2 deletions src/v2/pdf_sanitators/cells.h
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,7 @@ namespace pdflib
*/

void pdf_sanitator<PAGE_CELLS>::contract_cells_into_lines_v2(pdf_resource<PAGE_CELLS>& cells,
double horizontal_cell_tolerance,
double horizontal_cell_tolerance, // FIXME: UNUSED
bool enforce_same_font,
double space_width_factor_for_merge,
double space_width_factor_for_merge_with_space)
Expand Down Expand Up @@ -319,7 +319,7 @@ namespace pdflib
double delta_0 = cells[i].average_char_width()*space_width_factor_for_merge;
double delta_1 = cells[i].average_char_width()*space_width_factor_for_merge_with_space;

if(cells[i].is_adjacent_to(cells[j], delta_0))
if(cells[i].is_adjacent_to(cells[j], delta_0) or cells[i].intersects(cells[j]))
{
cells[i].merge_with(cells[j], delta_1);

Expand Down