@@ -413,19 +413,24 @@ def get_special_tokens(cls):
413
413
return special_tokens
414
414
415
415
@staticmethod
416
- def get_loc_token (val : float , rnorm : int = 100 ):
416
+ def get_page_token (page : int ):
417
+ """Build the page token string by embedding ``page`` in the ``__page_`` template."""
418
+ return f"__page_{ page } /"
419
+
420
+ @staticmethod
421
def get_location_token(val: float, rnorm: int = 100) -> str:
    """Return the location token for a normalised coordinate.

    The coordinate is scaled by ``rnorm``, rounded to the nearest integer
    and embedded in a ``__loc_<n>/`` token.

    Args:
        val: Coordinate, nominally in the closed interval [0, 1].
        rnorm: Scale factor; also the largest bucket index that is emitted.

    Returns:
        ``"__loc_<n>/"`` where ``n`` is ``round(rnorm * val)`` limited to
        the range ``0..rnorm``.
    """
    bucket = round(rnorm * val)

    # Out-of-range inputs are clamped rather than asserted on: an ``assert``
    # is stripped under ``python -O``, so relying on it would make the
    # function raise in normal mode but clamp in optimized mode.
    if bucket < 0:
        return "__loc_0/"
    if bucket > rnorm:
        return f"__loc_{rnorm}/"
    return f"__loc_{bucket}/"
429
434
430
435
431
436
class ExportedCCSDocument (
@@ -629,6 +634,7 @@ def export_to_xml(
629
634
"table" ,
630
635
"figure" ,
631
636
],
637
+ page_tagging : bool = True ,
632
638
location_tagging : bool = True ,
633
639
location_dimensions : list [int ] = [500 , 500 ],
634
640
add_new_line : bool = True ,
@@ -703,21 +709,26 @@ def export_to_xml(
703
709
x1 = float (prov [0 ].bbox [2 ]) / float (page_w )
704
710
y1 = float (prov [0 ].bbox [3 ]) / float (page_h )
705
711
706
- x0_tok = DocumentToken .get_loc_token (
712
+ page_tok = ""
713
+ if page_tagging :
714
+ page_tok = DocumentToken .get_page_token (page = page )
715
+
716
+ x0_tok = DocumentToken .get_location_token (
707
717
val = min (x0 , x1 ), rnorm = location_dimensions [0 ]
708
718
)
709
- y0_tok = DocumentToken .get_loc_token (
719
+ y0_tok = DocumentToken .get_location_token (
710
720
val = min (y0 , y1 ), rnorm = location_dimensions [1 ]
711
721
)
712
- x1_tok = DocumentToken .get_loc_token (
722
+ x1_tok = DocumentToken .get_location_token (
713
723
val = max (x0 , x1 ), rnorm = location_dimensions [0 ]
714
724
)
715
- y1_tok = DocumentToken .get_loc_token (
725
+ y1_tok = DocumentToken .get_location_token (
716
726
val = max (y0 , y1 ), rnorm = location_dimensions [1 ]
717
727
)
718
728
719
729
# update
720
730
loc_str = f"{ DocumentToken .BEG_LOCATION .value } "
731
+ loc_str += f"{ page_tok } "
721
732
loc_str += f"{ x0_tok } { y0_tok } { x1_tok } { y1_tok } "
722
733
loc_str += f"{ DocumentToken .END_LOCATION .value } "
723
734
0 commit comments