diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
index cee2e23..cb7ba4c 100644
--- a/docling_core/types/doc/document.py
+++ b/docling_core/types/doc/document.py
@@ -539,7 +539,6 @@ def get_location_tokens(
new_line: str,
xsize: int = 100,
ysize: int = 100,
- add_page_index: bool = True,
) -> str:
"""Get the location string for the BaseCell."""
if not len(self.prov):
@@ -613,7 +612,6 @@ def export_to_document_tokens(
ysize: int = 100,
add_location: bool = True,
add_content: bool = True,
- add_page_index: bool = True,
):
r"""Export text element to document tokens format.
@@ -623,7 +621,6 @@ def export_to_document_tokens(
:param ysize: int: (Default value = 100)
:param add_location: bool: (Default value = True)
:param add_content: bool: (Default value = True)
- :param add_page_index: bool: (Default value = True)
"""
body = f"<{self.label.value}>"
@@ -639,7 +636,6 @@ def export_to_document_tokens(
new_line="",
xsize=xsize,
ysize=ysize,
- add_page_index=add_page_index,
)
if add_content and self.text is not None:
@@ -666,7 +662,6 @@ def export_to_document_tokens(
ysize: int = 100,
add_location: bool = True,
add_content: bool = True,
- add_page_index: bool = True,
):
r"""Export text element to document tokens format.
@@ -676,7 +671,6 @@ def export_to_document_tokens(
:param ysize: int: (Default value = 100)
:param add_location: bool: (Default value = True)
:param add_content: bool: (Default value = True)
- :param add_page_index: bool: (Default value = True)
"""
body = f"<{self.label.value}>"
@@ -687,7 +681,6 @@ def export_to_document_tokens(
new_line="",
xsize=xsize,
ysize=ysize,
- add_page_index=add_page_index,
)
if add_content and self.text is not None:
@@ -714,7 +707,6 @@ def export_to_document_tokens(
ysize: int = 100,
add_location: bool = True,
add_content: bool = True,
- add_page_index: bool = True,
):
r"""Export text element to document tokens format.
@@ -724,7 +716,6 @@ def export_to_document_tokens(
:param ysize: int: (Default value = 100)
:param add_location: bool: (Default value = True)
:param add_content: bool: (Default value = True)
- :param add_page_index: bool: (Default value = True)
"""
body = f"<{self.label.value}_level_{self.level}>"
@@ -740,7 +731,6 @@ def export_to_document_tokens(
new_line="",
xsize=xsize,
ysize=ysize,
- add_page_index=add_page_index,
)
if add_content and self.text is not None:
@@ -941,7 +931,6 @@ def export_to_document_tokens(
add_location: bool = True,
add_caption: bool = True,
add_content: bool = True, # not used at the moment
- add_page_index: bool = True,
):
r"""Export picture to document tokens format.
@@ -952,7 +941,7 @@ def export_to_document_tokens(
:param add_location: bool: (Default value = True)
:param add_caption: bool: (Default value = True)
:param add_content: bool: (Default value = True)
- :param # not used at the momentadd_page_index: bool: (Default value = True)
+ Note: ``add_content`` is accepted for interface compatibility but is not used at the moment.
"""
body = f"{DocumentToken.BEG_PICTURE.value}{new_line}"
@@ -963,7 +952,6 @@ def export_to_document_tokens(
new_line=new_line,
xsize=xsize,
ysize=ysize,
- add_page_index=add_page_index,
)
classifications = [
@@ -1239,12 +1227,9 @@ def export_to_document_tokens(
xsize: int = 100,
ysize: int = 100,
add_location: bool = True,
- add_caption: bool = True,
- add_content: bool = True,
add_cell_location: bool = True,
- add_cell_label: bool = True,
add_cell_text: bool = True,
- add_page_index: bool = True,
+ add_caption: bool = True,
):
r"""Export table to document tokens format.
@@ -1253,15 +1238,11 @@ def export_to_document_tokens(
:param xsize: int: (Default value = 100)
:param ysize: int: (Default value = 100)
:param add_location: bool: (Default value = True)
- :param add_caption: bool: (Default value = True)
- :param add_content: bool: (Default value = True)
:param add_cell_location: bool: (Default value = True)
- :param add_cell_label: bool: (Default value = True)
:param add_cell_text: bool: (Default value = True)
- :param add_page_index: bool: (Default value = True)
-
+ :param add_caption: bool: (Default value = True)
"""
- body = f"{DocumentToken.BEG_TABLE.value}{new_line}"
+ body = f"{DocumentToken.BEG_OTSL.value}{new_line}"
if add_location:
body += self.get_location_tokens(
@@ -1271,6 +1252,8 @@ def export_to_document_tokens(
ysize=ysize,
)
+ body += self.export_to_otsl(doc, add_cell_location, add_cell_text, xsize, ysize)
+
if add_caption and len(self.captions):
text = self.caption_text(doc)
@@ -1280,62 +1263,7 @@ def export_to_document_tokens(
body += f"{DocumentToken.END_CAPTION.value}"
body += f"{new_line}"
- if add_content and len(self.data.table_cells) > 0:
- for i, row in enumerate(self.data.grid):
- body += f""
- for j, col in enumerate(row):
-
- text = ""
- if add_cell_text:
- text = col.text.strip()
-
- cell_loc = ""
- if (
- col.bbox is not None
- and add_cell_location
- and add_page_index
- and len(self.prov) > 0
- ):
- page_w, page_h = doc.pages[self.prov[0].page_no].size.as_tuple()
- cell_loc = DocumentToken.get_location(
- bbox=col.bbox.to_bottom_left_origin(page_h).as_tuple(),
- page_w=page_w,
- page_h=page_h,
- xsize=xsize,
- ysize=ysize,
- )
- elif (
- col.bbox is not None
- and add_cell_location
- and not add_page_index
- and len(self.prov) > 0
- ):
- page_w, page_h = doc.pages[self.prov[0].page_no].size.as_tuple()
-
- cell_loc = DocumentToken.get_location(
- bbox=col.bbox.to_bottom_left_origin(page_h).as_tuple(),
- page_w=page_w,
- page_h=page_h,
- xsize=xsize,
- ysize=ysize,
- )
-
- cell_label = ""
- if add_cell_label:
- if col.column_header:
- cell_label = ""
- elif col.row_header:
- cell_label = ""
- elif col.row_section:
- cell_label = ""
- else:
- cell_label = ""
-
- body += f"{cell_loc}{cell_label}{text}"
-
- body += f"{new_line}"
-
- body += f"{DocumentToken.END_TABLE.value}{new_line}"
+ body += f"{DocumentToken.END_OTSL.value}{new_line}"
return body
@@ -2623,7 +2551,10 @@ def close_lists(
return (in_ordered_list, result)
- def add_page_break(result, item, previous_page_no, delim):
+ def add_page_break(result, item, previous_page_no, delim, add_page_break):
+ if not add_page_break:
+ return result, previous_page_no
+
prov_list = item.prov
if len(prov_list) == 0:
return result, previous_page_no
@@ -2690,7 +2621,7 @@ def add_page_break(result, item, previous_page_no, delim):
elif isinstance(item, SectionHeaderItem):
result, previous_page_no = add_page_break(
- result, item, previous_page_no, delim
+ result, item, previous_page_no, delim, add_page_index
)
result += item.export_to_document_tokens(
@@ -2700,11 +2631,10 @@ def add_page_break(result, item, previous_page_no, delim):
ysize=ysize,
add_location=add_location,
add_content=add_content,
- add_page_index=add_page_index,
)
elif isinstance(item, CodeItem) and (item.label in labels):
result, previous_page_no = add_page_break(
- result, item, previous_page_no, delim
+ result, item, previous_page_no, delim, add_page_index
)
result += item.export_to_document_tokens(
@@ -2714,12 +2644,11 @@ def add_page_break(result, item, previous_page_no, delim):
ysize=ysize,
add_location=add_location,
add_content=add_content,
- add_page_index=add_page_index,
)
elif isinstance(item, TextItem) and (item.label in labels):
result, previous_page_no = add_page_break(
- result, item, previous_page_no, delim
+ result, item, previous_page_no, delim, add_page_index
)
result += item.export_to_document_tokens(
@@ -2729,31 +2658,26 @@ def add_page_break(result, item, previous_page_no, delim):
ysize=ysize,
add_location=add_location,
add_content=add_content,
- add_page_index=add_page_index,
)
elif isinstance(item, TableItem) and (item.label in labels):
result, previous_page_no = add_page_break(
- result, item, previous_page_no, delim
+ result, item, previous_page_no, delim, add_page_index
)
-
result += item.export_to_document_tokens(
doc=self,
new_line=delim,
xsize=xsize,
ysize=ysize,
- add_caption=True,
add_location=add_location,
- add_content=add_content,
add_cell_location=add_table_cell_location,
- add_cell_label=add_table_cell_label,
add_cell_text=add_table_cell_text,
- add_page_index=add_page_index,
+ add_caption=True,
)
elif isinstance(item, PictureItem) and (item.label in labels):
result, previous_page_no = add_page_break(
- result, item, previous_page_no, delim
+ result, item, previous_page_no, delim, add_page_index
)
result += item.export_to_document_tokens(
@@ -2764,7 +2688,6 @@ def add_page_break(result, item, previous_page_no, delim):
add_caption=True,
add_location=add_location,
add_content=add_content,
- add_page_index=add_page_index,
)
result += DocumentToken.END_DOCUMENT.value
diff --git a/docling_core/types/doc/labels.py b/docling_core/types/doc/labels.py
index 8decdd4..50ef67a 100644
--- a/docling_core/types/doc/labels.py
+++ b/docling_core/types/doc/labels.py
@@ -111,7 +111,7 @@ class PictureClassificationLabel(str, Enum):
SIGNATURE = "signature"
STAMP = "stamp"
QR_CODE = "qr_code"
- BAR_CODE = "bat_code"
+ BAR_CODE = "bar_code"
SCREENSHOT = "screenshot"
# Geology/Geography
diff --git a/test/data/doc/2206.01062.yaml.dt b/test/data/doc/2206.01062.yaml.dt
index 83f62aa..9056b62 100644
--- a/test/data/doc/2206.01062.yaml.dt
+++ b/test/data/doc/2206.01062.yaml.dt
@@ -52,23 +52,9 @@
4 ANNOTATION CAMPAIGN
The annotation campaign was carried out in four phases. In phase one, we identified and prepared the data sources for annotation. In phase two, we determined the class labels and how annotations should be done on the documents in order to obtain maximum consistency. The latter was guided by a detailed requirement analysis and exhaustive experiments. In phase three, we trained the annotation staff and performed exams for quality assurance. In phase four,
-
+
-% of Total% of Total% of Totaltriple inter-annotator mAP @ 0.5-0.95 (%)triple inter-annotator mAP @ 0.5-0.95 (%)triple inter-annotator mAP @ 0.5-0.95 (%)triple inter-annotator mAP @ 0.5-0.95 (%)triple inter-annotator mAP @ 0.5-0.95 (%)triple inter-annotator mAP @ 0.5-0.95 (%)triple inter-annotator mAP @ 0.5-0.95 (%)
-class labelCountTrainTestValAllFinManSciLawPatTen
-Caption225242.041.772.3284-8940-6186-9294-9995-9969-78n/a
-Footnote63180.600.310.5883-91n/a10062-8885-94n/a82-97
-Formula250272.251.902.9683-85n/an/a84-8786-96n/an/a
-List-item18566017.1913.3415.8287-8874-8390-9297-9781-8575-8893-95
-Page-footer708786.515.586.0093-9488-9095-9610092-9710096-98
-Page-header580225.106.705.0685-8966-7690-9498-10091-9297-9981-86
-Picture459764.212.785.3169-7156-5982-8669-8280-9566-7159-76
-Section-header14288412.6015.7712.8583-8476-8190-9294-9587-9469-7378-86
-Table347333.202.273.6077-8175-8083-8698-9958-8079-8470-85
-Text51037745.8249.2845.0084-8681-8688-9389-9387-9271-7987-95
-Title50710.470.300.5060-7224-6350-6394-10082-9668-7924-56
-Total1107470941123998166653182-8371-7479-8189-9486-9171-7668-85
-
+% of Totaltriple inter-annotator mAP @ 0.5-0.95 (%)class labelCountTrainTestValAllFinManSciLawPatTenCaption225242.041.772.3284-8940-6186-9294-9995-9969-78n/aFootnote63180.600.310.5883-91n/a10062-8885-94n/a82-97Formula250272.251.902.9683-85n/an/a84-8786-96n/an/aList-item18566017.1913.3415.8287-8874-8390-9297-9781-8575-8893-95Page-footer708786.515.586.0093-9488-9095-9610092-9710096-98Page-header580225.106.705.0685-8966-7690-9498-10091-9297-9981-86Picture459764.212.785.3169-7156-5982-8669-8280-9566-7159-76Section-header14288412.6015.7712.8583-8476-8190-9294-9587-9469-7378-86Table347333.202.273.6077-8175-8083-8698-9958-8079-8470-85Text51037745.8249.2845.0084-8681-8688-9389-9387-9271-7987-95Title50710.470.300.5060-7224-6350-6394-10082-9668-7924-56Total1107470941123998166653182-8371-7479-8189-9486-9171-7668-85
we distributed the annotation workload and performed continuous quality controls. Phase one and two required a small team of experts only. For phases three and four, a group of 40 dedicated annotators were assembled and supervised.
Phase 1: Data selection and preparation. Our inclusion criteria for documents were described in Section 3. A large effort went into ensuring that all documents are free to use. The data sources
include publication repositories such as arXiv$^{3}$, government offices, company websites as well as data directory services for financial reports and patents. Scanned documents were excluded wherever possible because they can be rotated or skewed. This would not allow us to perform annotation with rectangular bounding-boxes and therefore complicate the annotation process.
@@ -93,24 +79,10 @@
were carried out over a timeframe of 12 weeks, after which 8 of the 40 initially allocated annotators did not pass the bar.
Phase 4: Production annotation. The previously selected 80K pages were annotated with the defined 11 class labels by 32 annotators. This production phase took around three months to complete. All annotations were created online through CCS, which visualises the programmatic PDF text-cells as an overlay on the page. The page annotation are obtained by drawing rectangular bounding-boxes, as shown in Figure 3. With regard to the annotation practices, we implemented a few constraints and capabilities on the tooling level. First, we only allow non-overlapping, vertically oriented, rectangular boxes. For the large majority of documents, this constraint was sufficient and it speeds up the annotation considerably in comparison with arbitrary segmentation shapes. Second, annotator staff were not able to see each other's annotations. This was enforced by design to avoid any bias in the annotation, which could skew the numbers of the inter-annotator agreement (see Table 1). We wanted
-
+
-Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models with ResNet-50 or ResNet-101 backbone were trained based on the network architectures from the detectron2 model zoo (Mask R-CNN R50, R101-FPN 3x, Faster R-CNN R101-FPN 3x), with default configurations. The YOLO implementation utilized was YOLOv5x6 [13]. All models were initialised using pre-trained weights from the COCO 2017 dataset.
-humanMRCNNMRCNNFRCNNYOLO
-humanR50R101R101v5x6
-Caption84-8968.471.570.177.7
-Footnote83-9170.971.873.777.2
-Formula83-8560.163.463.566.2
-List-item87-8881.280.881.086.2
-Page-footer93-9461.659.358.961.1
-Page-header85-8971.970.072.067.9
-Picture69-7171.772.772.077.1
-Section-header83-8467.669.368.474.6
-Table77-8182.282.982.286.3
-Text84-8684.685.885.488.1
-Title60-7276.780.479.982.7
-All82-8372.473.573.476.8
-
+humanMRCNNFRCNNYOLOR50R101R101v5x6Caption84-8968.471.570.177.7Footnote83-9170.971.873.777.2Formula83-8560.163.463.566.2List-item87-8881.280.881.086.2Page-footer93-9461.659.358.961.1Page-header85-8971.970.072.067.9Picture69-7171.772.772.077.1Section-header83-8467.669.368.474.6