Fixed the to-md method (export_to_markdown)

Signed-off-by: Peter Staar <[email protected]>
DS4SD · Sep 9, 2024 · 3999028 · 3999028
1 parent c7a6b79
commit 3999028
Show file tree

Hide file tree

Showing 2 changed files with 245 additions and 26 deletions.
diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
@@ -412,17 +412,19 @@ def get_special_tokens(cls):
 
         return special_tokens
 
-    @classmethod
-    def get_loc_token(val:float, rnorm:int=100):
+    @staticmethod
+    def get_loc_token(val: float, rnorm: int = 100):
         """Function to get location tokens."""
-        val_ = round(rnorm*val)
+        assert 0 <= val and val <= 1.0, "0<=val and val<=1.0"
+
+        val_ = round(rnorm * val)
 
-        if val_<0:
-            return f"__loc_0"
+        if val_ < 0:
+            return "__loc_0"
 
-        if val_>rnorm:
+        if val_ > rnorm:
             return f"__loc_{rnorm}"
-        
+
         return f"__loc_{val_}"
 
 
@@ -513,6 +515,7 @@ def export_to_markdown(
             "subtitle-level-1",
             "paragraph",
             "caption",
+            "table",
         ],
     ) -> str:
         r"""Serialize to Markdown.
@@ -548,7 +551,7 @@ def export_to_markdown(
                     continue
 
                 item_type = item.obj_type
-                if isinstance(item, BaseText) and item_type in {}:
+                if isinstance(item, BaseText) and item_type in main_text_labels:
                     text = item.text
 
                     # ignore repeated text
@@ -572,7 +575,11 @@ def export_to_markdown(
                     else:
                         markdown_text = text
 
-                elif isinstance(item, Table) and item.data:
+                elif (
+                    isinstance(item, Table)
+                    and item.data
+                    and item_type in main_text_labels
+                ):
                     table = []
                     for row in item.data:
                         tmp = []
@@ -612,7 +619,7 @@ def export_to_xml(
             "paragraph",
             "caption",
             "table",
-            "figure"
+            "figure",
         ],
         location_tagging: bool = True,
         location_dimensions: list[int] = [500, 500],
@@ -669,7 +676,7 @@ def export_to_xml(
 
                 prov = item.prov
 
-                loc_str = "" # default is zero
+                loc_str = ""  # default is zero
                 if (
                     location_tagging
                     and self.page_dimensions is not None
@@ -682,21 +689,29 @@ def export_to_xml(
 
                     page_w = float(page_dim.width)
                     page_h = float(page_dim.height)
-
-                    x0 = round(float(prov[0].bbox[0]) / float(page_w))
-                    x1 = round(float(prov[0].bbox[2]) / float(page_w))
-                    y0 = round(float(prov[0].bbox[1]) / float(page_h))
-                    y1 = round(float(prov[0].bbox[3]) / float(page_h))
 
-                    x0_tok = DocumentToken.get_loc_token(min(x0, x1), location_dimensions[0])
-                    y0_tok = DocumentToken.get_loc_token(min(y0, y1), location_dimensions[1])
-                    x1_tok = DocumentToken.get_loc_token(max(x0, x1), location_dimensions[0])
-                    y1_tok = DocumentToken.get_loc_token(max(y0, y1), location_dimensions[1])
+                    x0 = float(prov[0].bbox[0]) / float(page_w)
+                    y0 = float(prov[0].bbox[1]) / float(page_h)
+                    x1 = float(prov[0].bbox[2]) / float(page_w)
+                    y1 = float(prov[0].bbox[3]) / float(page_h)
 
-                    # update
-                    loc_str = (
-                        f"{DocumentToken.BEG_LOCATION.value}{x0_tok}{y0_tok}{x1_tok}{y1_tok}{DocumentToken.END_LOCATION.value}"
+                    x0_tok = DocumentToken.get_loc_token(
+                        val=min(x0, x1), rnorm=location_dimensions[0]
                     )
+                    y0_tok = DocumentToken.get_loc_token(
+                        val=min(y0, y1), rnorm=location_dimensions[1]
+                    )
+                    x1_tok = DocumentToken.get_loc_token(
+                        val=max(x0, x1), rnorm=location_dimensions[0]
+                    )
+                    y1_tok = DocumentToken.get_loc_token(
+                        val=max(y0, y1), rnorm=location_dimensions[1]
+                    )
+
+                    # update
+                    loc_str = f"{DocumentToken.BEG_LOCATION.value}"
+                    loc_str += f"{x0_tok}{y0_tok}{x1_tok}{y1_tok}"
+                    loc_str += f"{DocumentToken.END_LOCATION.value}"
 
                 item_type = item.obj_type
                 if isinstance(item, BaseText) and (item_type in main_text_labels):
@@ -709,7 +724,10 @@ def export_to_xml(
                     xml_str += f"<{item_type}>{loc_str}"
 
                     if item.text is not None and len(item.text) > 0:
-                        xml_str += f"{DocumentToken.END_CAPTION.value}{item.text}{DocumentToken.END_CAPTION.value}{new_line}"
+                        xml_str += f"{DocumentToken.BEG_CAPTION.value}"
+                        xml_str += (
+                            f"{item.text}{DocumentToken.END_CAPTION.value}{new_line}"
+                        )
 
                     if item.data is not None and len(item.data) > 0:
                         for i, row in enumerate(item.data):
@@ -727,7 +745,10 @@ def export_to_xml(
                     xml_str += f"<{item_type}>{loc_str}"
 
                     if item.text is not None and len(item.text) > 0:
-                        xml_str += f"{DocumentToken.END_CAPTION.value}{item.text}{DocumentToken.END_CAPTION.value}{new_line}"
+                        xml_str += f"{DocumentToken.BEG_CAPTION.value}"
+                        xml_str += (
+                            f"{item.text}{DocumentToken.END_CAPTION.value}{new_line}"
+                        )
 
                     xml_str += f"</{item_type}>{new_line}"