Skip to content

Commit

Permalink
added the XML export
Browse files Browse the repository at this point in the history
Signed-off-by: Peter Staar <[email protected]>
  • Loading branch information
PeterStaar-IBM committed Sep 9, 2024
1 parent 25af125 commit ee1a743
Show file tree
Hide file tree
Showing 5 changed files with 239 additions and 204 deletions.
6 changes: 6 additions & 0 deletions docling_core/types/doc/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,12 @@ class Table(BaseCell):
data: Optional[list[list[Union[GlmTableCell, TableCell]]]] = None
model: Optional[str] = None


class Figure(BaseCell):
    """Figure element of a document.

    Adds two optional string fields on top of whatever ``BaseCell`` provides.
    Both default to ``None``, so they are annotated as ``Optional[str]``.
    """

    # presumably the category/kind of the figure -- TODO confirm with producer
    figure_type: Optional[str] = None
    # presumably the name of the model that produced this item, mirroring
    # ``Table.model`` -- TODO confirm
    model: Optional[str] = None

class BaseText(AliasModel):
"""Base model for text objects."""
Expand Down
98 changes: 94 additions & 4 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
Ref,
S3Data,
Table,
Figure,
)
from docling_core.utils.alias import AliasModel

Expand Down Expand Up @@ -427,6 +428,10 @@ def export_to_markdown(
delim: str = "\n\n",
main_text_start: int = 0,
main_text_stop: Optional[int] = None,
main_text_labels: list[str] = ["title",
"subtitle-level-1",
"paragraph",
"caption",]
) -> str:
r"""Serialize to Markdown.
Expand Down Expand Up @@ -462,10 +467,7 @@ def export_to_markdown(

item_type = item.obj_type
if isinstance(item, BaseText) and item_type in {
"title",
"subtitle-level-1",
"paragraph",
"caption",

}:
text = item.text

Expand Down Expand Up @@ -518,3 +520,91 @@ def export_to_markdown(

result = delim.join(md_texts)
return result

def export_to_xml(
    self,
    delim: str = "\n\n",
    main_text_start: int = 0,
    main_text_stop: Optional[int] = None,
    main_text_labels: Optional[list[str]] = None,
    location_tagging: bool = True,
    location_dimensions: Optional[list[int]] = None,
    add_new_line: bool = True,
) -> str:
    r"""Serialize the main text to an XML string.

    Walks ``self.main_text[main_text_start:main_text_stop]``, resolves any
    ``Ref`` entries, and emits one element per supported item inside a
    ``<document>`` root:

    * ``BaseText`` items whose ``obj_type`` is in ``main_text_labels`` become
      ``<{obj_type}>{location}{text}</{obj_type}>``.
    * ``Table`` items become ``<{obj_type}>`` with an optional ``<caption>``
      and one ``<row_i>`` per row holding ``<col_j>`` cells.
    * ``Figure`` items become ``<{obj_type}>`` with an optional ``<caption>``.

    Any other item is skipped. Text content (item text, captions, table
    cells) is XML-escaped so that ``&``, ``<`` and ``>`` cannot break the
    markup.

    Args:
        delim: Currently unused by this exporter; kept so the signature stays
            parallel to ``export_to_markdown``.
        main_text_start: Index of the first ``main_text`` item to export.
        main_text_stop: Index one past the last item to export; ``None``
            exports through the end.
        main_text_labels: ``obj_type`` values of text items to export.
            ``None`` selects ``["title", "subtitle-level-1", "paragraph",
            "caption"]``.
        location_tagging: When true, prefix each element's content with a
            ``<location>`` tag built from the first provenance entry.
        location_dimensions: ``[width, height]`` of the grid the bounding box
            is rescaled to. ``None`` selects ``[500, 500]``.
        add_new_line: When true, terminate emitted elements with ``"\n"``.

    Returns:
        The XML document as a single string.
    """
    # Function-scope import keeps this block self-contained.
    from xml.sax.saxutils import escape

    # Resolve defaults here rather than in the signature so that no mutable
    # list object is shared between calls.
    if main_text_labels is None:
        main_text_labels = ["title", "subtitle-level-1", "paragraph", "caption"]
    if location_dimensions is None:
        location_dimensions = [500, 500]

    new_line = "\n" if add_new_line else ""

    # Collect fragments and join once instead of repeated string "+=".
    parts: list[str] = ["<document>"]

    if self.main_text is not None:
        for orig_item in self.main_text[main_text_start:main_text_stop]:
            item = (
                self._resolve_ref(orig_item)
                if isinstance(orig_item, Ref)
                else orig_item
            )
            if item is None:
                continue

            loc_str = ""
            prov = item.prov
            if location_tagging and prov:
                page_dims = self.page_dimensions
                page = prov[0].page
                # Page numbers are 1-based indices into page_dimensions. Skip
                # the location tag (instead of crashing or wrapping around to
                # the last page) when the page has no known dimensions.
                if page_dims and 1 <= page <= len(page_dims):
                    page_dim = page_dims[page - 1]

                    # Page units per grid cell along each axis.
                    page_w = float(page_dim.width) / float(location_dimensions[0])
                    page_h = float(page_dim.height) / float(location_dimensions[1])

                    # A degenerate (zero-sized) page cannot be rescaled.
                    if page_w != 0.0 and page_h != 0.0:
                        bbox = prov[0].bbox
                        x0 = round(float(bbox[0]) / page_w)
                        x1 = round(float(bbox[2]) / page_w)
                        y0 = round(float(bbox[1]) / page_h)
                        y1 = round(float(bbox[3]) / page_h)

                        loc_str = (
                            f"<location>__loc_{x0}__loc_{y0}"
                            f"__loc_{x1}__loc_{y1}</location>"
                        )

            item_type = item.obj_type

            if isinstance(item, BaseText) and item_type in main_text_labels:
                text = escape(str(item.text))
                parts.append(f"<{item_type}>{loc_str}{text}</{item_type}>{new_line}")

            elif isinstance(item, Table):
                parts.append(f"<{item_type}>{loc_str}")

                if item.text:
                    caption = escape(str(item.text))
                    parts.append(f"<caption>{caption}</caption>{new_line}")

                for i, row in enumerate(item.data or []):
                    parts.append(f"<row_{i}>")
                    for j, col in enumerate(row):
                        cell = escape(str(col.text))
                        parts.append(f"<col_{j}>{cell}</col_{j}>")
                    parts.append(f"</row_{i}>{new_line}")

                parts.append(f"</{item_type}>{new_line}")

            elif isinstance(item, Figure):
                parts.append(f"<{item_type}>{loc_str}")

                if item.text:
                    caption = escape(str(item.text))
                    parts.append(f"<caption>{caption}</caption>{new_line}")

                parts.append(f"</{item_type}>{new_line}")

    parts.append("</document>")

    return "".join(parts)
Loading

0 comments on commit ee1a743

Please sign in to comment.