diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
index cee2e23..cb7ba4c 100644
--- a/docling_core/types/doc/document.py
+++ b/docling_core/types/doc/document.py
@@ -539,7 +539,6 @@ def get_location_tokens(
         new_line: str,
         xsize: int = 100,
         ysize: int = 100,
-        add_page_index: bool = True,
     ) -> str:
         """Get the location string for the BaseCell."""
         if not len(self.prov):
@@ -613,7 +612,6 @@ def export_to_document_tokens(
         ysize: int = 100,
         add_location: bool = True,
         add_content: bool = True,
-        add_page_index: bool = True,
     ):
         r"""Export text element to document tokens format.

@@ -623,7 +621,6 @@ def export_to_document_tokens(
         :param ysize: int: (Default value = 100)
         :param add_location: bool: (Default value = True)
         :param add_content: bool: (Default value = True)
-        :param add_page_index: bool: (Default value = True)
         """

         body = f"<{self.label.value}>"
@@ -639,7 +636,6 @@ def export_to_document_tokens(
             new_line="",
             xsize=xsize,
             ysize=ysize,
-            add_page_index=add_page_index,
         )

         if add_content and self.text is not None:
@@ -666,7 +662,6 @@ def export_to_document_tokens(
         ysize: int = 100,
         add_location: bool = True,
         add_content: bool = True,
-        add_page_index: bool = True,
     ):
         r"""Export text element to document tokens format.

@@ -676,7 +671,6 @@ def export_to_document_tokens(
         :param ysize: int: (Default value = 100)
         :param add_location: bool: (Default value = True)
         :param add_content: bool: (Default value = True)
-        :param add_page_index: bool: (Default value = True)
         """

         body = f"<{self.label.value}>"
@@ -687,7 +681,6 @@ def export_to_document_tokens(
             new_line="",
             xsize=xsize,
             ysize=ysize,
-            add_page_index=add_page_index,
         )

         if add_content and self.text is not None:
@@ -714,7 +707,6 @@ def export_to_document_tokens(
         ysize: int = 100,
         add_location: bool = True,
         add_content: bool = True,
-        add_page_index: bool = True,
     ):
         r"""Export text element to document tokens format.

@@ -724,7 +716,6 @@ def export_to_document_tokens(
         :param ysize: int: (Default value = 100)
         :param add_location: bool: (Default value = True)
         :param add_content: bool: (Default value = True)
-        :param add_page_index: bool: (Default value = True)
         """

         body = f"<{self.label.value}_level_{self.level}>"
@@ -740,7 +731,6 @@ def export_to_document_tokens(
             new_line="",
             xsize=xsize,
             ysize=ysize,
-            add_page_index=add_page_index,
         )

         if add_content and self.text is not None:
@@ -941,7 +931,6 @@ def export_to_document_tokens(
         add_location: bool = True,
         add_caption: bool = True,
         add_content: bool = True,  # not used at the moment
-        add_page_index: bool = True,
     ):
         r"""Export picture to document tokens format.

@@ -952,7 +941,6 @@ def export_to_document_tokens(
         :param add_location: bool: (Default value = True)
         :param add_caption: bool: (Default value = True)
         :param add_content: bool: (Default value = True)
-        :param # not used at the momentadd_page_index: bool: (Default value = True)
         """

         body = f"{DocumentToken.BEG_PICTURE.value}{new_line}"
@@ -963,7 +952,6 @@ def export_to_document_tokens(
             new_line=new_line,
             xsize=xsize,
             ysize=ysize,
-            add_page_index=add_page_index,
         )

         classifications = [
@@ -1239,12 +1227,9 @@ def export_to_document_tokens(
         xsize: int = 100,
         ysize: int = 100,
         add_location: bool = True,
-        add_caption: bool = True,
-        add_content: bool = True,
         add_cell_location: bool = True,
-        add_cell_label: bool = True,
         add_cell_text: bool = True,
-        add_page_index: bool = True,
+        add_caption: bool = True,
     ):
         r"""Export table to document tokens format.

@@ -1253,15 +1238,11 @@ def export_to_document_tokens(
         :param xsize: int: (Default value = 100)
         :param ysize: int: (Default value = 100)
         :param add_location: bool: (Default value = True)
-        :param add_caption: bool: (Default value = True)
-        :param add_content: bool: (Default value = True)
         :param add_cell_location: bool: (Default value = True)
-        :param add_cell_label: bool: (Default value = True)
         :param add_cell_text: bool: (Default value = True)
-        :param add_page_index: bool: (Default value = True)
-
+        :param add_caption: bool: (Default value = True)
         """
-        body = f"{DocumentToken.BEG_TABLE.value}{new_line}"
+        body = f"{DocumentToken.BEG_OTSL.value}{new_line}"

         if add_location:
             body += self.get_location_tokens(
@@ -1271,6 +1252,8 @@ def export_to_document_tokens(
                 ysize=ysize,
             )

+        body += self.export_to_otsl(doc, add_cell_location, add_cell_text, xsize, ysize)
+
         if add_caption and len(self.captions):
             text = self.caption_text(doc)
@@ -1280,62 +1263,7 @@ def export_to_document_tokens(
             body += f"{DocumentToken.END_CAPTION.value}"
             body += f"{new_line}"

-        if add_content and len(self.data.table_cells) > 0:
-            for i, row in enumerate(self.data.grid):
-                body += f"<row_{i}>"
-                for j, col in enumerate(row):
-
-                    text = ""
-                    if add_cell_text:
-                        text = col.text.strip()
-
-                    cell_loc = ""
-                    if (
-                        col.bbox is not None
-                        and add_cell_location
-                        and add_page_index
-                        and len(self.prov) > 0
-                    ):
-                        page_w, page_h = doc.pages[self.prov[0].page_no].size.as_tuple()
-                        cell_loc = DocumentToken.get_location(
-                            bbox=col.bbox.to_bottom_left_origin(page_h).as_tuple(),
-                            page_w=page_w,
-                            page_h=page_h,
-                            xsize=xsize,
-                            ysize=ysize,
-                        )
-                    elif (
-                        col.bbox is not None
-                        and add_cell_location
-                        and not add_page_index
-                        and len(self.prov) > 0
-                    ):
-                        page_w, page_h = doc.pages[self.prov[0].page_no].size.as_tuple()
-
-                        cell_loc = DocumentToken.get_location(
-                            bbox=col.bbox.to_bottom_left_origin(page_h).as_tuple(),
-                            page_w=page_w,
-                            page_h=page_h,
-                            xsize=xsize,
-                            ysize=ysize,
-                        )
-
-                    cell_label = ""
-                    if add_cell_label:
-                        if col.column_header:
-                            cell_label = "<col_header>"
-                        elif col.row_header:
-                            cell_label = "<row_header>"
-                        elif col.row_section:
-                            cell_label = "<row_section>"
-                        else:
-                            cell_label = "<body>"
-
-                    body += f"{cell_loc}{cell_label}{text}"
-
-                body += f"{new_line}"
-
-        body += f"{DocumentToken.END_TABLE.value}{new_line}"
+        body += f"{DocumentToken.END_OTSL.value}{new_line}"

         return body
@@ -2623,7 +2551,10 @@ def close_lists(
         return (in_ordered_list, result)

-    def add_page_break(result, item, previous_page_no, delim):
+    def add_page_break(result, item, previous_page_no, delim, add_page_break):
+        if not add_page_break:
+            return result, previous_page_no
+
         prov_list = item.prov
         if len(prov_list) == 0:
             return result, previous_page_no
@@ -2690,7 +2621,7 @@ def add_page_break(result, item, previous_page_no, delim):
         elif isinstance(item, SectionHeaderItem):
             result, previous_page_no = add_page_break(
-                result, item, previous_page_no, delim
+                result, item, previous_page_no, delim, add_page_index
             )

             result += item.export_to_document_tokens(
@@ -2700,11 +2631,10 @@ def add_page_break(result, item, previous_page_no, delim):
                 doc=self,
                 new_line=delim,
                 xsize=xsize,
                 ysize=ysize,
                 add_location=add_location,
                 add_content=add_content,
-                add_page_index=add_page_index,
             )

         elif isinstance(item, CodeItem) and (item.label in labels):
             result, previous_page_no = add_page_break(
-                result, item, previous_page_no, delim
+                result, item, previous_page_no, delim, add_page_index
             )

             result += item.export_to_document_tokens(
                 doc=self,
                 new_line=delim,
                 xsize=xsize,
                 ysize=ysize,
                 add_location=add_location,
                 add_content=add_content,
-                add_page_index=add_page_index,
             )

         elif isinstance(item, TextItem) and (item.label in labels):
             result, previous_page_no = add_page_break(
-                result, item, previous_page_no, delim
+                result, item, previous_page_no, delim, add_page_index
             )

             result += item.export_to_document_tokens(
@@ -2729,31 +2658,26 @@ def add_page_break(result, item, previous_page_no, delim):
                 doc=self,
                 new_line=delim,
                 xsize=xsize,
                 ysize=ysize,
                 add_location=add_location,
                 add_content=add_content,
-                add_page_index=add_page_index,
             )

         elif isinstance(item, TableItem) and (item.label in labels):
             result, previous_page_no = add_page_break(
-                result, item, previous_page_no, delim
+                result, item, previous_page_no, delim, add_page_index
             )
-
             result += item.export_to_document_tokens(
                 doc=self,
                 new_line=delim,
                 xsize=xsize,
                 ysize=ysize,
-                add_caption=True,
                 add_location=add_location,
-                add_content=add_content,
                 add_cell_location=add_table_cell_location,
-                add_cell_label=add_table_cell_label,
                 add_cell_text=add_table_cell_text,
-                add_page_index=add_page_index,
+                add_caption=True,
             )

         elif isinstance(item, PictureItem) and (item.label in labels):
             result, previous_page_no = add_page_break(
-                result, item, previous_page_no, delim
+                result, item, previous_page_no, delim, add_page_index
             )

             result += item.export_to_document_tokens(
@@ -2764,7 +2688,6 @@ def add_page_break(result, item, previous_page_no, delim):
                 add_caption=True,
                 add_location=add_location,
                 add_content=add_content,
-                add_page_index=add_page_index,
             )

     result += DocumentToken.END_DOCUMENT.value
diff --git a/docling_core/types/doc/labels.py b/docling_core/types/doc/labels.py
index 8decdd4..50ef67a 100644
--- a/docling_core/types/doc/labels.py
+++ b/docling_core/types/doc/labels.py
@@ -111,7 +111,7 @@ class PictureClassificationLabel(str, Enum):
     SIGNATURE = "signature"
     STAMP = "stamp"
     QR_CODE = "qr_code"
-    BAR_CODE = "bat_code"
+    BAR_CODE = "bar_code"
     SCREENSHOT = "screenshot"

     # Geology/Geography
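A usage sketch for the reshaped API (an annotation, not part of the patch): the per-element `add_page_index` switch is gone, so page handling now lives solely in the document-level exporter, and a table serializes its body as OTSL. Assuming `doc` is an existing DoclingDocument and `table` one of its TableItem instances, a call against the new signature looks like the sketch below; the keyword set and defaults are taken from the hunks above, while the helper name is made up for illustration:

    from docling_core.types.doc.document import DoclingDocument, TableItem

    def dump_table(doc: DoclingDocument, table: TableItem) -> str:
        # Keywords mirror TableItem.export_to_document_tokens after this patch;
        # all values shown are the new defaults.
        return table.export_to_document_tokens(
            doc=doc,
            new_line="\n",
            xsize=100,               # quantization grid for location tokens
            ysize=100,
            add_location=True,
            add_cell_location=True,  # per-cell location tokens in the OTSL body
            add_cell_text=True,
            add_caption=True,        # caption is now emitted inside the OTSL envelope
        )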
diff --git a/test/data/doc/2206.01062.yaml.dt b/test/data/doc/2206.01062.yaml.dt
index 83f62aa..9056b62 100644
--- a/test/data/doc/2206.01062.yaml.dt
+++ b/test/data/doc/2206.01062.yaml.dt
@@ -52,23 +52,9 @@
 4 ANNOTATION CAMPAIGN
 The annotation campaign was carried out in four phases. In phase one, we identified and prepared the data sources for annotation. In phase two, we determined the class labels and how annotations should be done on the documents in order to obtain maximum consistency. The latter was guided by a detailed requirement analysis and exhaustive experiments. In phase three, we trained the annotation staff and performed exams for quality assurance. In phase four,
-
+
-% of Total% of Total% of Totaltriple inter-annotator mAP @ 0.5-0.95 (%)triple inter-annotator mAP @ 0.5-0.95 (%)triple inter-annotator mAP @ 0.5-0.95 (%)triple inter-annotator mAP @ 0.5-0.95 (%)triple inter-annotator mAP @ 0.5-0.95 (%)triple inter-annotator mAP @ 0.5-0.95 (%)triple inter-annotator mAP @ 0.5-0.95 (%)
-class labelCountTrainTestValAllFinManSciLawPatTen
-Caption225242.041.772.3284-8940-6186-9294-9995-9969-78n/a
-Footnote63180.600.310.5883-91n/a10062-8885-94n/a82-97
-Formula250272.251.902.9683-85n/an/a84-8786-96n/an/a
-List-item18566017.1913.3415.8287-8874-8390-9297-9781-8575-8893-95
-Page-footer708786.515.586.0093-9488-9095-9610092-9710096-98
-Page-header580225.106.705.0685-8966-7690-9498-10091-9297-9981-86
-Picture459764.212.785.3169-7156-5982-8669-8280-9566-7159-76
-Section-header14288412.6015.7712.8583-8476-8190-9294-9587-9469-7378-86
-Table347333.202.273.6077-8175-8083-8698-9958-8079-8470-85
-Text51037745.8249.2845.0084-8681-8688-9389-9387-9271-7987-95
-Title50710.470.300.5060-7224-6350-6394-10082-9668-7924-56
-Total1107470941123998166653182-8371-7479-8189-9486-9171-7668-85
-
+% of Totaltriple inter-annotator mAP @ 0.5-0.95 (%)class labelCountTrainTestValAllFinManSciLawPatTenCaption225242.041.772.3284-8940-6186-9294-9995-9969-78n/aFootnote63180.600.310.5883-91n/a10062-8885-94n/a82-97Formula250272.251.902.9683-85n/an/a84-8786-96n/an/aList-item18566017.1913.3415.8287-8874-8390-9297-9781-8575-8893-95Page-footer708786.515.586.0093-9488-9095-9610092-9710096-98Page-header580225.106.705.0685-8966-7690-9498-10091-9297-9981-86Picture459764.212.785.3169-7156-5982-8669-8280-9566-7159-76Section-header14288412.6015.7712.8583-8476-8190-9294-9587-9469-7378-86Table347333.202.273.6077-8175-8083-8698-9958-8079-8470-85Text51037745.8249.2845.0084-8681-8688-9389-9387-9271-7987-95Title50710.470.300.5060-7224-6350-6394-10082-9668-7924-56Total1107470941123998166653182-8371-7479-8189-9486-9171-7668-85
 we distributed the annotation workload and performed continuous quality controls. Phase one and two required a small team of experts only. For phases three and four, a group of 40 dedicated annotators were assembled and supervised.
 Phase 1: Data selection and preparation. Our inclusion criteria for documents were described in Section 3. A large effort went into ensuring that all documents are free to use. The data sources include publication repositories such as arXiv$^{3}$, government offices, company websites as well as data directory services for financial reports and patents. Scanned documents were excluded wherever possible because they can be rotated or skewed. This would not allow us to perform annotation with rectangular bounding-boxes and therefore complicate the annotation process.
@@ -93,24 +79,10 @@
 were carried out over a timeframe of 12 weeks, after which 8 of the 40 initially allocated annotators did not pass the bar.
 Phase 4: Production annotation. The previously selected 80K pages were annotated with the defined 11 class labels by 32 annotators. This production phase took around three months to complete. All annotations were created online through CCS, which visualises the programmatic PDF text-cells as an overlay on the page. The page annotation are obtained by drawing rectangular bounding-boxes, as shown in Figure 3.
 With regard to the annotation practices, we implemented a few constraints and capabilities on the tooling level. First, we only allow non-overlapping, vertically oriented, rectangular boxes. For the large majority of documents, this constraint was sufficient and it speeds up the annotation considerably in comparison with arbitrary segmentation shapes. Second, annotator staff were not able to see each other's annotations. This was enforced by design to avoid any bias in the annotation, which could skew the numbers of the inter-annotator agreement (see Table 1). We wanted
-
+
-
-humanMRCNNMRCNNFRCNN
-YOLO
-humanR50R101R101v5x6
-Caption84-8968.471.570.177.7
-Footnote83-9170.971.873.777.2
-Formula83-8560.163.463.566.2
-List-item87-8881.280.881.086.2
-Page-footer93-9461.659.358.961.1
-Page-header85-8971.970.072.067.9
-Picture69-7171.772.772.077.1
-Section-header83-8467.669.368.474.6
-Table77-8182.282.982.286.3
-Text84-8684.685.885.488.1
-Title60-7276.780.479.982.7
-All82-8372.473.573.476.8
-
 Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models with ResNet-50 or ResNet-101 backbone were trained based on the network architectures from the detectron2 model zoo (Mask R-CNN R50, R101-FPN 3x, Faster R-CNN R101-FPN 3x), with default configurations. The YOLO implementation utilized was YOLOv5x6 [13]. All models were initialised using pre-trained weights from the COCO 2017 dataset.
+humanMRCNNFRCNNYOLOR50R101R101v5x6Caption84-8968.471.570.177.7Footnote83-9170.971.873.777.2Formula83-8560.163.463.566.2List-item87-8881.280.881.086.2Page-footer93-9461.659.358.961.1Page-header85-8971.970.072.067.9Picture69-7171.772.772.077.1Section-header83-8467.669.368.474.6Table77-8182.282.982.286.3Text84-8684.685.885.488.1Title60-7276.780.479.982.7All82-8372.473.573.476.8Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models with ResNet-50 or ResNet-101 backbone were trained based on the network architectures from the detectron2 model zoo (Mask R-CNN R50, R101-FPN 3x, Faster R-CNN R101-FPN 3x), with default configurations. The YOLO implementation utilized was YOLOv5x6 [13]. All models were initialised using pre-trained weights from the COCO 2017 dataset.
+
 to avoid this at any cost in order to have clear, unbiased baseline numbers for human document-layout annotation. Third, we introduced the feature of snapping boxes around text segments to obtain a pixel-accurate annotation and again reduce time and effort. The CCS annotation tool automatically shrinks every user-drawn box to the minimum bounding-box around the enclosed text-cells for all purely text-based segments, which excludes only Table and Picture . For the latter, we instructed annotation staff to minimise inclusion of surrounding whitespace while including all graphical lines. A downside of snapping boxes to enclosed text cells is that some wrongly parsed PDF pages cannot be annotated correctly and need to be skipped. Fourth, we established a way to flag pages as rejected for cases where no valid annotation according to the label guidelines could be achieved. Example cases for this would be PDF pages that render incorrectly or contain layouts that are impossible to capture with non-overlapping rectangles. Such rejected pages are not contained in the final dataset. With all these measures in place, experienced annotation staff managed to annotate a single page in a typical timeframe of 20s to 60s, depending on its complexity.
 5 EXPERIMENTS
@@ -122,70 +94,28 @@
 Baselines for Object Detection
 In Table 2, we present baseline experiments (given in mAP) on Mask R-CNN [12], Faster R-CNN [11], and YOLOv5 [13]. Both training and evaluation were performed on RGB images with dimensions of 1025 × 1025 pixels. For training, we only used one annotation in case of redundantly annotated pages. As one can observe, the variation in mAP between the models is rather low, but overall between 6 and 10% lower than the mAP computed from the pairwise human annotations on triple-annotated pages. This gives a good indication that the DocLayNet dataset poses a worthwhile challenge for the research community to close the gap between human recognition and ML approaches. It is interesting to see that Mask R-CNN and Faster R-CNN produce very comparable mAP scores, indicating that pixel-based image segmentation derived from bounding-boxes does not help to obtain better predictions. On the other hand, the more recent Yolov5x model does very well and even out-performs humans on selected labels such as Text , Table and Picture . This is not entirely surprising, as Text , Table and Picture are abundant and the most visually distinctive in a document.
-
+
-
-Class-count11654
-Caption68TextTextText
-Footnote71TextTextText
-Formula60TextTextText
-List-item81Text82Text
-Page-footer6262--
-Page-header7268--
-Picture72727272
-Section-header68676968
-Table82838282
-Text85848484
-Title77Sec.-h.Sec.-h.Sec.-h.
-Overall72737877
-
 Table 3: Performance of a Mask R-CNN R50 network in mAP@0.5-0.95 scores trained on DocLayNet with different class label sets. The reduced label sets were obtained by either down-mapping or
+Class-count11654Caption68TextTextTextFootnote71TextTextTextFormula60TextTextTextList-item81Text82TextPage-footer6262--Page-header7268--Picture72727272Section-header68676968Table82838282Text85848484Title77Sec.-h.Sec.-h.Sec.-h.Overall72737877Table 3: Performance of a Mask R-CNN R50 network in mAP@0.5-0.95 scores trained on DocLayNet with different class label sets. The reduced label sets were obtained by either down-mapping or
+
 Learning Curve
 One of the fundamental questions related to any dataset is if it is "large enough". To answer this question for DocLayNet, we performed a data ablation study in which we evaluated a Mask R-CNN model trained on increasing fractions of the DocLayNet dataset. As can be seen in Figure 5, the mAP score rises sharply in the beginning and eventually levels out. To estimate the error-bar on the metrics, we ran the training five times on the entire data-set. This resulted in a 1% error-bar, depicted by the shaded area in Figure 5. In the inset of Figure 5, we show the exact same data-points, but with a logarithmic scale on the x-axis. As is expected, the mAP score increases linearly as a function of the data-size in the inset. The curve ultimately flattens out between the 80% and 100% mark, with the 80% mark falling within the error-bars of the 100% mark. This provides a good indication that the model would not improve significantly by yet increasing the data size. Rather, it would probably benefit more from improved data consistency (as discussed in Section 3), data augmentation methods [23], or the addition of more document categories and styles.
 Impact of Class Labels
 The choice and number of labels can have a significant effect on the overall model performance. Since PubLayNet, DocBank and DocLayNet all have different label sets, it is of particular interest to understand and quantify this influence of the label set on the model performance. We investigate this by either down-mapping labels into more common ones (e.g. Caption → Text ) or excluding them from the annotations entirely. Furthermore, it must be stressed that all mappings and exclusions were performed on the data before model training. In Table 3, we present the mAP scores for a Mask R-CNN R50 network on different label sets. Where a label is down-mapped, we show its corresponding label, otherwise it was excluded. We present three different label sets, with 6, 5 and 4 different labels respectively. The set of 5 labels contains the same labels as PubLayNet. However, due to the different definition of
-
+
-
-Class-count111155
-SplitDocPageDocPage
-Caption6883
-Footnote7184
-Formula6066
-List-item81888288
-Page-footer6289
-Page-header7290
-Picture72827282
-Section-header68836983
-Table82898290
-Text85918490
-Title7781
-All72847887
-
 Table 4: Performance of a Mask R-CNN R50 network with document-wise and page-wise split for different label sets. Naive page-wise
+Class-count115SplitDocPageDocPageCaption6883Footnote7184Formula6066List-item81888288Page-footer6289Page-header7290Picture72827282Section-header68836983Table82898290Text85918490Title7781All72847887Table 4: Performance of a Mask R-CNN R50 network with document-wise and page-wise split for different label sets. Naive page-wise
+
 lists in PubLayNet (grouped list-items) versus DocLayNet (separate list-items), the label set of size 4 is the closest to PubLayNet, in the assumption that the List is down-mapped to Text in PubLayNet. The results in Table 3 show that the prediction accuracy on the remaining class labels does not change significantly when other classes are merged into them. The overall macro-average improves by around 5%, in particular when Page-footer and Page-header are excluded.
 Impact of Document Split in Train and Test Set
 Many documents in DocLayNet have a unique styling. In order to avoid overfitting on a particular style, we have split the train-, test- and validation-sets of DocLayNet on document boundaries, i.e. every document contributes pages to only one set. To the best of our knowledge, this was not considered in PubLayNet or DocBank. To quantify how this affects model performance, we trained and evaluated a Mask R-CNN R50 model on a modified dataset version. Here, the train-, test- and validation-sets were obtained by a randomised draw over the individual pages. As can be seen in Table 4, the difference in model performance is surprisingly large: pagewise splitting gains ˜ 10% in mAP over the document-wise splitting. Thus, random page-wise splitting of DocLayNet can easily lead to accidental overestimation of model performance and should be avoided.
 Dataset Comparison
 Throughout this paper, we claim that DocLayNet's wider variety of document layouts leads to more robust layout detection models. In Table 5, we provide evidence for that. We trained models on each of the available datasets (PubLayNet, DocBank and DocLayNet) and evaluated them on the test sets of the other datasets. Due to the different label sets and annotation styles, a direct comparison is not possible. Hence, we focussed on the common labels among the datasets. Between PubLayNet and DocLayNet, these are Picture ,
-
+
-
-Testing onTesting onTesting on
-labelsPLNDBDLN
-Figure964323
-Sec-header87-32
-Table952449
-Text96-42
-total933430
-Figure777131
-Table196522
-total486827
-Figure675172
-Sec-header53-68
-Table874382
-Text77-84
-total594778
-
 Table 5: Prediction Performance (mAP@0.5-0.95) of a Mask R-CNN R50 network across the PubLayNet, DocBank & DocLayNet data-sets. By evaluating on common label classes of each dataset, we observe that the DocLayNet-trained model has much less pronounced variations in performance across all datasets.
+Testing onlabelsPLNDBDLNFigure964323Sec-header87-32Table952449Text96-42total933430Figure777131Table196522total486827Figure675172Sec-header53-68Table874382Text77-84total594778Table 5: Prediction Performance (mAP@0.5-0.95) of a Mask R-CNN R50 network across the PubLayNet, DocBank & DocLayNet data-sets. By evaluating on common label classes of each dataset, we observe that the DocLayNet-trained model has much less pronounced variations in performance across all datasets.
+
 Section-header , Table and Text . Before training, we either mapped or excluded DocLayNet's other labels as specified in table 3, and also PubLayNet's List to Text . Note that the different clustering of lists (by list-element vs. whole list objects) naturally decreases the mAP score for Text .
 For comparison of DocBank with DocLayNet, we trained only on Picture and Table clusters of each dataset. We had to exclude Text because successive paragraphs are often grouped together into a single object in DocBank. This paragraph grouping is incompatible with the individual paragraphs of DocLayNet. As can be seen in Table 5, DocLayNet trained models yield better performance compared to the previous datasets. It is noteworthy that the models trained on PubLayNet and DocBank perform very well on their own test set, but have a much lower performance on the foreign datasets. While this also applies to DocLayNet, the difference is far less pronounced. Thus we conclude that DocLayNet trained models are overall more robust and will produce better results for challenging, unseen layouts.
 Example Predictions
diff --git a/test/data/doc/constructed_doc.dt b/test/data/doc/constructed_doc.dt
index 339c740..26b4a1b 100644
--- a/test/data/doc/constructed_doc.dt
+++ b/test/data/doc/constructed_doc.dt
@@ -17,12 +17,9 @@
 Affiliation 2
 list item 4
-
-
-ProductYearsYears
-Product20162017
-Apple49823695944
-
 This is the caption of table 1.
+
+ProductYears20162017Apple49823695944This is the caption of table 1.
+
 This is the caption of figure 1.
diff --git a/test/data/doc/constructed_doc.dt.gt b/test/data/doc/constructed_doc.dt.gt
index 339c740..26b4a1b 100644
--- a/test/data/doc/constructed_doc.dt.gt
+++ b/test/data/doc/constructed_doc.dt.gt
@@ -17,12 +17,9 @@
 Affiliation 2
 list item 4
-
-
-ProductYearsYears
-Product20162017
-Apple49823695944
-
 This is the caption of table 1.
+
+ProductYears20162017Apple49823695944This is the caption of table 1.
+
 This is the caption of figure 1.
diff --git a/test/data/doc/constructed_document.yaml.dt b/test/data/doc/constructed_document.yaml.dt
index 339c740..26b4a1b 100644
--- a/test/data/doc/constructed_document.yaml.dt
+++ b/test/data/doc/constructed_document.yaml.dt
@@ -17,12 +17,9 @@
 Affiliation 2
 list item 4
-
-
-ProductYearsYears
-Product20162017
-Apple49823695944
-
 This is the caption of table 1.
+
+ProductYears20162017Apple49823695944This is the caption of table 1.
+
 This is the caption of figure 1.
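For orientation (an aside, not part of the patch): the angle-bracket tokens of the .dt fixtures are not visible in this rendering, which is why the old and new table bodies above read as bare concatenated cell text. Reconstructed by hand from that cell text, the new one-line OTSL body for the constructed_doc table would look roughly as below. The cell tokens (<ched> column header, <fcel> filled cell, <lcel>/<ucel> left/upward span continuations, <nl> row break) follow OTSL notation; which header token each cell actually receives, and the omitted location tokens, depend on the stored cell metadata, so treat this as an assumed sketch rather than a byte-accurate copy of the fixture:

    <otsl><ched>Product<ched>Years<lcel><nl><ucel><fcel>2016<fcel>2017<nl><fcel>Apple<fcel>49823<fcel>695944<nl><caption>This is the caption of table 1.</caption></otsl>

Spanned cells are written once and then continued with merge tokens, which is why "Years" and the row-spanning "Product" each appear only once in the new fixture line, against twice in the removed <table> serialization.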
diff --git a/test/data/doc/dummy_doc.yaml.dt b/test/data/doc/dummy_doc.yaml.dt
index 9323a85..1c11aab 100644
--- a/test/data/doc/dummy_doc.yaml.dt
+++ b/test/data/doc/dummy_doc.yaml.dt
@@ -4,7 +4,7 @@
 Figure 1: Four examples of complex page layouts across different document categories
-
+
+
\ No newline at end of file
diff --git a/test/test_docling_doc.py b/test/test_docling_doc.py
index 1e1652a..0041161 100644
--- a/test/test_docling_doc.py
+++ b/test/test_docling_doc.py
@@ -280,6 +280,10 @@ def _test_export_methods(doc: DoclingDocument, filename: str):
     # Test DocTags export ...
     dt_pred = doc.export_to_document_tokens()
+    # print("\n\n\n\n\n\n\n")
+    # print(filename)
+    # print(dt_pred)
+    # print("\n\n\n\n\n\n\n")
     _verify_regression_test(dt_pred, filename=filename, ext="dt")

     # Test Tables export ...
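Closing note on the test side (again an annotation, not part of the patch): `_test_export_methods` compares the DocTags export verbatim against the committed .dt fixtures, which is why every fixture above had to be regenerated when table serialization switched from <table> to OTSL. A minimal sketch of what the comparison amounts to, assuming the helper simply reads the stored ground truth next to the input file; the real `_verify_regression_test` in test_docling_doc.py is not shown in this patch and may differ (for instance, it may also support regenerating fixtures):

    def _verify_regression_test(pred: str, filename: str, ext: str) -> None:
        # Ground-truth fixture sits next to the input, e.g. "2206.01062.yaml.dt".
        with open(f"{filename}.{ext}", encoding="utf-8") as fp:
            assert pred == fp.read()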