Skip to content

Commit 81ef9c4

Browse files
added page-tokens
Signed-off-by: Peter Staar <[email protected]>
1 parent be49711 commit 81ef9c4

File tree

2 files changed

+120
-109
lines changed

2 files changed

+120
-109
lines changed

docling_core/types/doc/document.py

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -413,19 +413,24 @@ def get_special_tokens(cls):
413413
return special_tokens
414414

415415
@staticmethod
416-
def get_loc_token(val: float, rnorm: int = 100):
416+
def get_page_token(page: int) -> str:
    """Return the page token for ``page``.

    The token has the form ``__page_<page>/``; the trailing slash is part of
    the token, matching the format used by the location tokens.

    Args:
        page: Page number to embed in the token.

    Returns:
        The formatted page token, e.g. ``"__page_3/"`` for ``page=3``.
    """
    return f"__page_{page}/"
419+
420+
@staticmethod
421+
def get_location_token(val: float, rnorm: int = 100) -> str:
    """Return the location token for a normalized coordinate.

    ``val`` is scaled by ``rnorm``, rounded to the nearest integer and
    rendered as ``__loc_<n>/`` (the trailing slash is part of the token).

    Args:
        val: Normalized coordinate; must satisfy ``0 <= val <= 1.0``.
        rnorm: Scale factor applied to ``val`` before rounding; it is also
            the upper clamp for the resulting index.

    Returns:
        The formatted location token, e.g. ``"__loc_50/"`` for ``val=0.5``
        with the default ``rnorm``.

    Raises:
        AssertionError: If ``val`` lies outside ``[0, 1]`` (only while
            assertions are enabled).
    """
    assert 0 <= val <= 1.0, "0<=val and val<=1.0"

    bin_index = round(rnorm * val)

    # Clamp the index; the lower bound is checked first, as in the original
    # chain of early returns. These branches act as a safety net when the
    # assertion above is disabled (``python -O``).
    if bin_index < 0:
        bin_index = 0
    elif bin_index > rnorm:
        bin_index = rnorm

    return f"__loc_{bin_index}/"
429434

430435

431436
class ExportedCCSDocument(
@@ -629,6 +634,7 @@ def export_to_xml(
629634
"table",
630635
"figure",
631636
],
637+
page_tagging: bool = True,
632638
location_tagging: bool = True,
633639
location_dimensions: list[int] = [500, 500],
634640
add_new_line: bool = True,
@@ -703,21 +709,26 @@ def export_to_xml(
703709
x1 = float(prov[0].bbox[2]) / float(page_w)
704710
y1 = float(prov[0].bbox[3]) / float(page_h)
705711

706-
x0_tok = DocumentToken.get_loc_token(
712+
page_tok = ""
713+
if page_tagging:
714+
page_tok = DocumentToken.get_page_token(page=page)
715+
716+
x0_tok = DocumentToken.get_location_token(
707717
val=min(x0, x1), rnorm=location_dimensions[0]
708718
)
709-
y0_tok = DocumentToken.get_loc_token(
719+
y0_tok = DocumentToken.get_location_token(
710720
val=min(y0, y1), rnorm=location_dimensions[1]
711721
)
712-
x1_tok = DocumentToken.get_loc_token(
722+
x1_tok = DocumentToken.get_location_token(
713723
val=max(x0, x1), rnorm=location_dimensions[0]
714724
)
715-
y1_tok = DocumentToken.get_loc_token(
725+
y1_tok = DocumentToken.get_location_token(
716726
val=max(y0, y1), rnorm=location_dimensions[1]
717727
)
718728

719729
# update
720730
loc_str = f"{DocumentToken.BEG_LOCATION.value}"
731+
loc_str += f"{page_tok}"
721732
loc_str += f"{x0_tok}{y0_tok}{x1_tok}{y1_tok}"
722733
loc_str += f"{DocumentToken.END_LOCATION.value}"
723734

0 commit comments

Comments
 (0)