Skip to content

Commit 5ef3e35

Browse files
fixed parameter-operator confusion in the qpdf-stream, added regression test files
Signed-off-by: Peter Staar <[email protected]>
1 parent 75a28e5 commit 5ef3e35

File tree

4 files changed

+44
-22
lines changed

4 files changed

+44
-22
lines changed

docling_parse/visualize.py

Lines changed: 23 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -273,29 +273,31 @@ def visualise_py(
273273

274274
pdf_doc: PdfDocument = parser.load(path_or_stream=pdf_path, lazy=True)
275275

276-
pdf_page: ParsedPdfPage = pdf_doc.get_page(page_no=page_num)
276+
page_nos = [page_num]
277+
if page_num==-1:
278+
page_nos = [(page_ind+1) for page_ind in range(0, pdf_doc.number_of_pages())]
277279

278-
if category == "both":
279-
pdf_page.original.render(
280-
draw_cells_bbox=(not display_text), draw_cells_text=display_text
281-
).show()
282-
pdf_page.sanitized.render(
283-
draw_cells_bbox=(not display_text), draw_cells_text=display_text
284-
).show()
285-
elif category == "sanitized":
286-
pdf_page.sanitized.render(
287-
draw_cells_bbox=(not display_text), draw_cells_text=display_text
288-
).show()
289-
elif category == "original":
290-
pdf_page.original.render(
291-
draw_cells_bbox=(not display_text), draw_cells_text=display_text
292-
).show()
293-
294-
lines = pdf_page.original.export_to_textlines(add_fontkey=True)
295-
print("text-lines (original): \n", "\n".join(lines))
280+
for page_no in page_nos:
281+
print(f"parsing {pdf_path} on page: {page_no}")
282+
283+
pdf_page: ParsedPdfPage = pdf_doc.get_page(page_no=page_no)
284+
285+
if category in ["sanitized", "both"]:
286+
pdf_page.sanitized.render(
287+
draw_cells_bbox=(not display_text), draw_cells_text=display_text
288+
).show()
289+
elif category in ["original", "both"]:
290+
pdf_page.original.render(
291+
draw_cells_bbox=(not display_text), draw_cells_text=display_text
292+
).show()
293+
294+
lines = pdf_page.original.export_to_textlines(add_fontkey=True)
295+
print(f"text-lines (original, page_no: {page_no}):")
296+
print("\n".join(lines))
296297

297-
lines = pdf_page.sanitized.export_to_textlines(add_fontkey=True)
298-
print("text-lines (sanitized): \n", "\n".join(lines))
298+
lines = pdf_page.sanitized.export_to_textlines(add_fontkey=True)
299+
print(f"text-lines (sanitized, page_no: {page_no}):")
300+
print("\n".join(lines))
299301

300302
"""
301303
lines = pdf_page.original.export_to_textlines(add_fontkey=True)

src/v2/qpdf/stream_decoder.h

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,15 @@ namespace pdflib
3232
private:
3333

3434
std::vector<qpdf_instruction>& stream;
35+
36+
std::regex value_pattern_0;
3537
};
3638

3739
qpdf_stream_decoder::qpdf_stream_decoder(std::vector<qpdf_instruction>& stream_):
3840
QPDFObjectHandle::ParserCallbacks(),
39-
stream(stream_)
41+
stream(stream_),
42+
43+
value_pattern_0(R"(^(\d\.\d+)(\-\d+)$)")
4044
{}
4145

4246
qpdf_stream_decoder::~qpdf_stream_decoder()
@@ -95,6 +99,7 @@ namespace pdflib
9599
row.key = obj.getTypeName();
96100
row.val = obj.unparse();
97101
row.obj = obj;
102+
98103
//LOG_S(INFO) << std::setw(12) << row.key << " | " << row.val;
99104
}
100105

@@ -105,6 +110,8 @@ namespace pdflib
105110
}
106111
*/
107112

113+
std::smatch match;
114+
108115
// if the row is null, reinterpret it as an empty array. We encountered
109116
// this use case for a parameter of the d operator (see Table 56) that is
110117
// null but in reality should be an empty array.
@@ -113,6 +120,19 @@ namespace pdflib
113120
row.key = "parameter";
114121
row.val = "[]";
115122
}
123+
else if (std::regex_match(row.val, match, value_pattern_0))
124+
{
125+
LOG_S(WARNING) << std::setw(12) << row.key << " | " << row.val << " => new matched value: " << match[1];
126+
127+
double value = std::stod(match[1].str());
128+
129+
// Creating a real (floating-point) QPDFObjectHandle
130+
QPDFObjectHandle new_obj = QPDFObjectHandle::newReal(value);
131+
132+
row.key = new_obj.getTypeName();
133+
row.val = new_obj.unparse();
134+
row.obj = new_obj;
135+
}
116136

117137
stream.push_back(row);
118138
}
40.2 KB
Binary file not shown.
1.16 MB
Binary file not shown.

0 commit comments

Comments
 (0)