From 2e5d07f54560538cc421f0d20809b5fb305a8ffa Mon Sep 17 00:00:00 2001 From: Michele Dolfi Date: Fri, 31 Jan 2025 10:59:44 +0100 Subject: [PATCH] escape also html export Signed-off-by: Michele Dolfi --- docling_core/types/doc/document.py | 34 +++++++++++++++++++++--------- test/data/doc/2206.01062.yaml.html | 6 +++--- 2 files changed, 27 insertions(+), 13 deletions(-) diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 8af2d80f..87815319 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -1046,7 +1046,7 @@ def export_to_html( text = "" if doc is not None and add_caption and len(self.captions): - text = self.caption_text(doc) + text = html.escape(self.caption_text(doc)) if len(self.data.table_cells) == 0: return "" @@ -1072,7 +1072,7 @@ def export_to_html( if colstart != j: continue - content = cell.text.strip() + content = html.escape(cell.text.strip()) celltag = "td" if cell.column_header: celltag = "th" @@ -2381,6 +2381,11 @@ def close_lists( in_ordered_list: List[bool] = [] # False + def _sanitize_text(text: str, do_escape_html=True) -> str: + if do_escape_html: + text = html.escape(text, quote=False) + return text + for ix, (item, curr_level) in enumerate( self.iterate_items(self.body, with_groups=True, page_no=page_no) ): @@ -2431,14 +2436,17 @@ def close_lists( elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]: - text = f"

{item.text}

" + text = f"

{_sanitize_text(item.text)}

" html_texts.append(text.strip()) elif isinstance(item, SectionHeaderItem): section_level: int = item.level + 1 - text = f"{item.text}" + text = ( + f"" + f"{_sanitize_text(item.text)}" + ) html_texts.append(text.strip()) elif isinstance(item, TextItem) and item.label in [ @@ -2453,31 +2461,37 @@ def close_lists( if section_level >= 6: section_level = 6 - text = f"{item.text}" + text = ( + f"{_sanitize_text(item.text)}" + ) html_texts.append(text.strip()) elif isinstance(item, TextItem) and item.label in [DocItemLabel.CODE]: - text = f"
{item.text}
" + text = f"
{_sanitize_text(item.text, do_escape_html=False)}
" html_texts.append(text) elif isinstance(item, ListItem): - text = f"
  • {item.text}
  • " + text = f"
  • {_sanitize_text(item.text)}
  • " html_texts.append(text) elif isinstance(item, TextItem) and item.label in [DocItemLabel.LIST_ITEM]: - text = f"
  • {item.text}
  • " + text = f"
  • {_sanitize_text(item.text)}
  • " html_texts.append(text) elif isinstance(item, CodeItem) and item.label in labels: - text = f"
    {item.text}
    " + text = ( + "
    "
    +                    f"{_sanitize_text(item.text, do_escape_html=False)}"
    +                    "
    " + ) html_texts.append(text.strip()) elif isinstance(item, TextItem) and item.label in labels: - text = f"

    {item.text}

    " + text = f"

    {_sanitize_text(item.text)}

    " html_texts.append(text.strip()) elif isinstance(item, TableItem): diff --git a/test/data/doc/2206.01062.yaml.html b/test/data/doc/2206.01062.yaml.html index c2ae548f..ba2add20 100644 --- a/test/data/doc/2206.01062.yaml.html +++ b/test/data/doc/2206.01062.yaml.html @@ -81,7 +81,7 @@

    1 INTRODUCTION

  • (3) Detailed Label Set : We define 11 class labels to distinguish layout features in high detail. PubLayNet provides 5 labels; DocBank provides 13, although not a superset of ours.
  • (4) Redundant Annotations : A fraction of the pages in the DocLayNet data set carry more than one human annotation.
  • This enables experimentation with annotation uncertainty and quality control analysis.

    -
  • (5) Pre-defined Train-, Test- & Validation-set : Like DocBank, we provide fixed train-, test- & validation-sets to ensure proportional representation of the class-labels. Further, we prevent leakage of unique layouts across sets, which has a large effect on model accuracy scores.
  • +
  • (5) Pre-defined Train-, Test- & Validation-set : Like DocBank, we provide fixed train-, test- & validation-sets to ensure proportional representation of the class-labels. Further, we prevent leakage of unique layouts across sets, which has a large effect on model accuracy scores.
  • All aspects outlined above are detailed in Section 3. In Section 4, we will elaborate on how we designed and executed this large-scale human annotation campaign. We will also share key insights and lessons learned that might prove helpful for other parties planning to set up annotation campaigns.

    In Section 5, we will present baseline accuracy numbers for a variety of object detection methods (Faster R-CNN, Mask R-CNN and YOLOv5) trained on DocLayNet. We further show how the model performance is impacted by varying the DocLayNet dataset size, reducing the label set and modifying the train/test-split. Last but not least, we compare the performance of models trained on PubLayNet, DocBank and DocLayNet and demonstrate that a model trained on DocLayNet provides overall more robust layout recovery.

    2 RELATED WORK

    @@ -91,7 +91,7 @@

    3 THE DOCLAYNET DATASET

    DocLayNet contains 80863 PDF pages. Among these, 7059 carry two instances of human annotations, and 1591 carry three. This amounts to 91104 total annotation instances. The annotations provide layout information in the shape of labeled, rectangular boundingboxes. We define 11 distinct labels for layout features, namely Caption , Footnote , Formula , List-item , Page-footer , Page-header , Picture , Section-header , Table , Text , and Title . Our reasoning for picking this particular label set is detailed in Section 4.

    In addition to open intellectual property constraints for the source documents, we required that the documents in DocLayNet adhere to a few conditions. Firstly, we kept scanned documents

    Figure 2: Distribution of DocLayNet pages across document categories.
    -

    The pages in DocLayNet can be grouped into six distinct categories, namely Financial Reports , Manuals , Scientific Articles , Laws & Regulations , Patents and Government Tenders . Each document category was sourced from various repositories. For example, Financial Reports contain both free-style format annual reports 2 which expose company-specific, artistic layouts as well as the more formal SEC filings. The two largest categories ( Financial Reports and Manuals ) contain a large amount of free-style layouts in order to obtain maximum variability. In the other four categories, we boosted the variability by mixing documents from independent providers, such as different government websites or publishers. In Figure 2, we show the document categories contained in DocLayNet with their respective sizes.

    +

    The pages in DocLayNet can be grouped into six distinct categories, namely Financial Reports , Manuals , Scientific Articles , Laws & Regulations , Patents and Government Tenders . Each document category was sourced from various repositories. For example, Financial Reports contain both free-style format annual reports 2 which expose company-specific, artistic layouts as well as the more formal SEC filings. The two largest categories ( Financial Reports and Manuals ) contain a large amount of free-style layouts in order to obtain maximum variability. In the other four categories, we boosted the variability by mixing documents from independent providers, such as different government websites or publishers. In Figure 2, we show the document categories contained in DocLayNet with their respective sizes.

    We did not control the document selection with regard to language. The vast majority of documents contained in DocLayNet (close to 95%) are published in English language. However, DocLayNet also contains a number of documents in other languages such as German (2.5%), French (1.0%) and Japanese (1.0%). While the document language has negligible impact on the performance of computer vision methods such as object detection and segmentation models, it might prove challenging for layout analysis methods which exploit textual features.

    To ensure that future benchmarks in the document-layout analysis community can be easily compared, we have split up DocLayNet into pre-defined train-, test- and validation-sets. In this way, we can avoid spurious variations in the evaluation scores due to random splitting in train-, test- and validation-sets. We also ensured that less frequent labels are represented in train and test sets in equal proportions.

    Table 1 shows the overall frequency and distribution of the labels among the different sets. Importantly, we ensure that subsets are only split on full-document boundaries. This avoids that pages of the same document are spread over train, test and validation set, which can give an undesired evaluation advantage to models and lead to overestimation of their prediction accuracy. We will show the impact of this decision in Section 5.

    @@ -138,7 +138,7 @@

    Impact of Document Split in Train and Test Set

    Many documents in DocLayNet have a unique styling. In order to avoid overfitting on a particular style, we have split the train-, test- and validation-sets of DocLayNet on document boundaries, i.e. every document contributes pages to only one set. To the best of our knowledge, this was not considered in PubLayNet or DocBank. To quantify how this affects model performance, we trained and evaluated a Mask R-CNN R50 model on a modified dataset version. Here, the train-, test- and validation-sets were obtained by a randomised draw over the individual pages. As can be seen in Table 4, the difference in model performance is surprisingly large: pagewise splitting gains ˜ 10% in mAP over the document-wise splitting. Thus, random page-wise splitting of DocLayNet can easily lead to accidental overestimation of model performance and should be avoided.

    Dataset Comparison

    Throughout this paper, we claim that DocLayNet's wider variety of document layouts leads to more robust layout detection models. In Table 5, we provide evidence for that. We trained models on each of the available datasets (PubLayNet, DocBank and DocLayNet) and evaluated them on the test sets of the other datasets. Due to the different label sets and annotation styles, a direct comparison is not possible. Hence, we focussed on the common labels among the datasets. Between PubLayNet and DocLayNet, these are Picture ,

    -
    Table 5: Prediction Performance (mAP@0.5-0.95) of a Mask R-CNN R50 network across the PubLayNet, DocBank & DocLayNet data-sets. By evaluating on common label classes of each dataset, we observe that the DocLayNet-trained model has much less pronounced variations in performance across all datasets.
    Testing on
    labelsPLNDBDLN
    Figure964323
    Sec-header87-32
    Table952449
    Text96-42
    total933430
    Figure777131
    Table196522
    total486827
    Figure675172
    Sec-header53-68
    Table874382
    Text77-84
    total594778
    +
    Table 5: Prediction Performance (mAP@0.5-0.95) of a Mask R-CNN R50 network across the PubLayNet, DocBank & DocLayNet data-sets. By evaluating on common label classes of each dataset, we observe that the DocLayNet-trained model has much less pronounced variations in performance across all datasets.
    Testing on
    labelsPLNDBDLN
    Figure964323
    Sec-header87-32
    Table952449
    Text96-42
    total933430
    Figure777131
    Table196522
    total486827
    Figure675172
    Sec-header53-68
    Table874382
    Text77-84
    total594778

    Section-header , Table and Text . Before training, we either mapped or excluded DocLayNet's other labels as specified in table 3, and also PubLayNet's List to Text . Note that the different clustering of lists (by list-element vs. whole list objects) naturally decreases the mAP score for Text .

    For comparison of DocBank with DocLayNet, we trained only on Picture and Table clusters of each dataset. We had to exclude Text because successive paragraphs are often grouped together into a single object in DocBank. This paragraph grouping is incompatible with the individual paragraphs of DocLayNet. As can be seen in Table 5, DocLayNet trained models yield better performance compared to the previous datasets. It is noteworthy that the models trained on PubLayNet and DocBank perform very well on their own test set, but have a much lower performance on the foreign datasets. While this also applies to DocLayNet, the difference is far less pronounced. Thus we conclude that DocLayNet trained models are overall more robust and will produce better results for challenging, unseen layouts.

    Example Predictions