Skip to content

Commit

Permalink
updating the to-xml method
Browse files Browse the repository at this point in the history
Signed-off-by: Peter Staar <[email protected]>
  • Loading branch information
PeterStaar-IBM committed Sep 9, 2024
1 parent 5cd16cb commit c7a6b79
Showing 1 changed file with 30 additions and 9 deletions.
39 changes: 30 additions & 9 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -412,6 +412,19 @@ def get_special_tokens(cls):

return special_tokens

@staticmethod
def get_loc_token(val: float, rnorm: int = 100) -> str:
    """Return the location token for a coordinate value.

    The value is scaled by ``rnorm``, rounded to the nearest integer, and
    limited to the range ``0..rnorm``; the resulting integer is embedded in
    a ``__loc_<n>`` token.

    Declared as a static method because the body uses no class state. With
    ``@classmethod`` and no ``cls`` parameter, the class itself would be
    bound to ``val`` and a call such as ``get_loc_token(x, dim)`` on the
    class would fail with a ``TypeError``.

    Args:
        val: Coordinate to encode; it is multiplied by ``rnorm`` before
            rounding.
        rnorm: Scale factor and largest index that can be emitted.

    Returns:
        A string of the form ``__loc_<n>``.
    """
    val_ = round(rnorm * val)

    # Values below the range collapse onto the first bin.
    if val_ < 0:
        return "__loc_0"

    # Values above the range collapse onto the last bin.
    if val_ > rnorm:
        return f"__loc_{rnorm}"

    return f"__loc_{val_}"


class ExportedCCSDocument(
MinimalDocument,
Expand Down Expand Up @@ -598,6 +611,8 @@ def export_to_xml(
"subtitle-level-1",
"paragraph",
"caption",
"table",
"figure"
],
location_tagging: bool = True,
location_dimensions: list[int] = [500, 500],
Expand Down Expand Up @@ -654,7 +669,7 @@ def export_to_xml(

prov = item.prov

loc_str = ""
loc_str = "" # default is zero
if (
location_tagging
and self.page_dimensions is not None
Expand All @@ -665,16 +680,22 @@ def export_to_xml(
page = prov[0].page
page_dim = self.page_dimensions[page - 1]

page_w = float(page_dim.width) / float(location_dimensions[0])
page_h = float(page_dim.height) / float(location_dimensions[1])

page_w = float(page_dim.width)
page_h = float(page_dim.height)
x0 = round(float(prov[0].bbox[0]) / float(page_w))
x1 = round(float(prov[0].bbox[2]) / float(page_w))
y0 = round(float(prov[0].bbox[1]) / float(page_h))
y1 = round(float(prov[0].bbox[3]) / float(page_h))

x0_tok = DocumentToken.get_loc_token(min(x0, x1), location_dimensions[0])
y0_tok = DocumentToken.get_loc_token(min(y0, y1), location_dimensions[1])
x1_tok = DocumentToken.get_loc_token(max(x0, x1), location_dimensions[0])
y1_tok = DocumentToken.get_loc_token(max(y0, y1), location_dimensions[1])

# update
loc_str = (
f"<location>__loc_{x0}__loc_{y0}__loc_{x1}__loc_{y1}</location>"
f"{DocumentToken.BEG_LOCATION.value}{x0_tok}{y0_tok}{x1_tok}{y1_tok}{DocumentToken.END_LOCATION.value}"
)

item_type = item.obj_type
Expand All @@ -683,12 +704,12 @@ def export_to_xml(

xml_str += f"<{item_type}>{loc_str}{text}</{item_type}>{new_line}"

elif isinstance(item, Table):
elif isinstance(item, Table) and (item_type in main_text_labels):

xml_str += f"<{item_type}>{loc_str}"

if item.text is not None and len(item.text) > 0:
xml_str += f"<caption>{item.text}</caption>{new_line}"
xml_str += f"{DocumentToken.END_CAPTION.value}{item.text}{DocumentToken.END_CAPTION.value}{new_line}"

if item.data is not None and len(item.data) > 0:
for i, row in enumerate(item.data):
Expand All @@ -701,12 +722,12 @@ def export_to_xml(

xml_str += f"</{item_type}>{new_line}"

elif isinstance(item, Figure):
elif isinstance(item, Figure) and (item_type in main_text_labels):

xml_str += f"<{item_type}>{loc_str}"

if item.text is not None and len(item.text) > 0:
xml_str += f"<caption>{item.text}</caption>{new_line}"
xml_str += f"{DocumentToken.END_CAPTION.value}{item.text}{DocumentToken.END_CAPTION.value}{new_line}"

xml_str += f"</{item_type}>{new_line}"

Expand Down

0 comments on commit c7a6b79

Please sign in to comment.