From 19b7c9aa494047d1f8164158a9b82281caa4dc98 Mon Sep 17 00:00:00 2001
From: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
Date: Fri, 31 Jan 2025 13:52:50 +0100
Subject: [PATCH] fix: fix code handling in HTML export
Also did some minor refactoring
Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
---
docling_core/types/doc/document.py | 71 +++++++++++-------------------
1 file changed, 26 insertions(+), 45 deletions(-)
diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
index 8781531..192d537 100644
--- a/docling_core/types/doc/document.py
+++ b/docling_core/types/doc/document.py
@@ -2381,10 +2381,16 @@ def close_lists(
in_ordered_list: List[bool] = [] # False
- def _sanitize_text(text: str, do_escape_html=True) -> str:
+ def _prepare_tag_content(
+ text: str,
+ do_escape_html=True,
+ replace_newlines_with_br=True,
+ ) -> str:
if do_escape_html:
- text = html.escape(text, quote=False)
- return text
+ _text = html.escape(text, quote=False)
+ if replace_newlines_with_br:
+ _text = _text.replace("\n", "<br>")
+ return _text
for ix, (item, curr_level) in enumerate(
self.iterate_items(self.body, with_groups=True, page_no=page_no)
@@ -2416,7 +2422,7 @@ def _sanitize_text(text: str, do_escape_html=True) -> str:
]:
text = "
{_sanitize_text(item.text, do_escape_html=False)}" + text = f"
{preped_cont}" html_texts.append(text) elif isinstance(item, ListItem): - text = f"
{_prepare_tag_content(item.text)}
" html_texts.append(text) - elif isinstance(item, CodeItem) and item.label in labels: - text = ( - ""
- f"{_sanitize_text(item.text, do_escape_html=False)}"
- "
"
- )
- html_texts.append(text.strip())
-
- elif isinstance(item, TextItem) and item.label in labels:
-
- text = f"{_sanitize_text(item.text)}
" - html_texts.append(text.strip()) elif isinstance(item, TableItem): text = item.export_to_html(doc=self, add_caption=True) @@ -2506,15 +2488,14 @@ def _sanitize_text(text: str, do_escape_html=True) -> str: ) ) - elif isinstance(item, DocItem) and item.label in labels: + elif isinstance(item, DocItem): continue html_texts.append("