Skip to content

Commit

Permalink
fixed parameter-operator confusion in the qpdf-stream, added regressi…
Browse files Browse the repository at this point in the history
…on test files

Signed-off-by: Peter Staar <[email protected]>
  • Loading branch information
PeterStaar-IBM committed Jan 31, 2025
1 parent 75a28e5 commit 5ef3e35
Show file tree
Hide file tree
Showing 4 changed files with 44 additions and 22 deletions.
44 changes: 23 additions & 21 deletions docling_parse/visualize.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,29 +273,31 @@ def visualise_py(

pdf_doc: PdfDocument = parser.load(path_or_stream=pdf_path, lazy=True)

pdf_page: ParsedPdfPage = pdf_doc.get_page(page_no=page_num)
page_nos = [page_num]
if page_num==-1:
page_nos = [(page_ind+1) for page_ind in range(0, pdf_doc.number_of_pages())]

if category == "both":
pdf_page.original.render(
draw_cells_bbox=(not display_text), draw_cells_text=display_text
).show()
pdf_page.sanitized.render(
draw_cells_bbox=(not display_text), draw_cells_text=display_text
).show()
elif category == "sanitized":
pdf_page.sanitized.render(
draw_cells_bbox=(not display_text), draw_cells_text=display_text
).show()
elif category == "original":
pdf_page.original.render(
draw_cells_bbox=(not display_text), draw_cells_text=display_text
).show()

lines = pdf_page.original.export_to_textlines(add_fontkey=True)
print("text-lines (original): \n", "\n".join(lines))
for page_no in page_nos:
print(f"parsing {pdf_path} on page: {page_no}")

pdf_page: ParsedPdfPage = pdf_doc.get_page(page_no=page_no)

if category in ["sanitized", "both"]:
pdf_page.sanitized.render(
draw_cells_bbox=(not display_text), draw_cells_text=display_text
).show()
elif category in ["original", "both"]:
pdf_page.original.render(
draw_cells_bbox=(not display_text), draw_cells_text=display_text
).show()

lines = pdf_page.original.export_to_textlines(add_fontkey=True)
print(f"text-lines (original, page_no: {page_no}):")
print("\n".join(lines))

lines = pdf_page.sanitized.export_to_textlines(add_fontkey=True)
print("text-lines (sanitized): \n", "\n".join(lines))
lines = pdf_page.sanitized.export_to_textlines(add_fontkey=True)
print(f"text-lines (sanitized, page_no: {page_no}):")
print("\n".join(lines))

"""
lines = pdf_page.original.export_to_textlines(add_fontkey=True)
Expand Down
22 changes: 21 additions & 1 deletion src/v2/qpdf/stream_decoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,15 @@ namespace pdflib
private:

std::vector<qpdf_instruction>& stream;

std::regex value_pattern_0;
};

qpdf_stream_decoder::qpdf_stream_decoder(std::vector<qpdf_instruction>& stream_):
QPDFObjectHandle::ParserCallbacks(),
stream(stream_)
stream(stream_),

value_pattern_0(R"(^(\d\.\d+)(\-\d+)$)")
{}

qpdf_stream_decoder::~qpdf_stream_decoder()
Expand Down Expand Up @@ -95,6 +99,7 @@ namespace pdflib
row.key = obj.getTypeName();
row.val = obj.unparse();
row.obj = obj;

//LOG_S(INFO) << std::setw(12) << row.key << " | " << row.val;
}

Expand All @@ -105,6 +110,8 @@ namespace pdflib
}
*/

std::smatch match;

// If the row is null, reinterpret it as an empty array. We encountered
// this use case for a parameter of the d operator (see Table 56) that is
// null but in reality should be an empty array.
Expand All @@ -113,6 +120,19 @@ namespace pdflib
row.key = "parameter";
row.val = "[]";
}
else if (std::regex_match(row.val, match, value_pattern_0))
{
LOG_S(WARNING) << std::setw(12) << row.key << " | " << row.val << " => new matched value: " << match[1];

double value = std::stod(match[1].str());

// Creating a real (floating-point) QPDFObjectHandle
QPDFObjectHandle new_obj = QPDFObjectHandle::newReal(value);

row.key = new_obj.getTypeName();
row.val = new_obj.unparse();
row.obj = new_obj;
}

stream.push_back(row);
}
Expand Down
Binary file added tests/data/regression/duplicate_bold_text_01.pdf
Binary file not shown.
Binary file not shown.

0 comments on commit 5ef3e35

Please sign in to comment.