From 19b7c9aa494047d1f8164158a9b82281caa4dc98 Mon Sep 17 00:00:00 2001 From: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Date: Fri, 31 Jan 2025 13:52:50 +0100 Subject: [PATCH] fix: fix code handling in HTML export Also did some minor refactoring Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --- docling_core/types/doc/document.py | 71 +++++++++++------------------- 1 file changed, 26 insertions(+), 45 deletions(-) diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 8781531..192d537 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -2381,10 +2381,16 @@ def close_lists( in_ordered_list: List[bool] = [] # False - def _sanitize_text(text: str, do_escape_html=True) -> str: + def _prepare_tag_content( + text: str, + do_escape_html=True, + replace_newlines_with_br=True, + ) -> str: if do_escape_html: - text = html.escape(text, quote=False) - return text + _text = html.escape(text, quote=False) + if replace_newlines_with_br: + _text = _text.replace("\n", "<br>") + return _text for ix, (item, curr_level) in enumerate( self.iterate_items(self.body, with_groups=True, page_no=page_no) @@ -2416,7 +2422,7 @@ def _sanitize_text(text: str, do_escape_html=True) -> str: ]: text = "
    " - html_texts.append(text.strip()) + html_texts.append(text) # Increment list nesting level when entering a new list in_ordered_list.append(True) @@ -2426,7 +2432,7 @@ def _sanitize_text(text: str, do_escape_html=True) -> str: ]: text = "