Skip to content

Commit

Permalink
fix: fix code handling in HTML export
Browse files Browse the repository at this point in the history
Also did some minor refactoring

Signed-off-by: Panos Vagenas <[email protected]>
  • Loading branch information
vagenas committed Jan 31, 2025
1 parent c6590e8 commit 19b7c9a
Showing 1 changed file with 26 additions and 45 deletions.
71 changes: 26 additions & 45 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -2381,10 +2381,16 @@ def close_lists(

in_ordered_list: List[bool] = [] # False

def _prepare_tag_content(
    text: str,
    do_escape_html=True,
    replace_newlines_with_br=True,
) -> str:
    """Prepare raw item text for insertion between HTML tags.

    Args:
        text: The raw text content of a document item.
        do_escape_html: If true, escape ``&``, ``<`` and ``>`` via
            ``html.escape`` (quotes are left untouched).
        replace_newlines_with_br: If true, replace every newline with a
            ``<br>`` tag. Applied after escaping so the inserted tag is
            not itself escaped.

    Returns:
        The transformed text; the input is returned unchanged when both
        flags are false.
    """
    # Start from the input so the result is always bound, even when
    # escaping is disabled (e.g. for code content placed inside <pre>).
    _text = text
    if do_escape_html:
        _text = html.escape(_text, quote=False)
    if replace_newlines_with_br:
        _text = _text.replace("\n", "<br>")
    return _text

for ix, (item, curr_level) in enumerate(
self.iterate_items(self.body, with_groups=True, page_no=page_no)
Expand Down Expand Up @@ -2416,7 +2422,7 @@ def _sanitize_text(text: str, do_escape_html=True) -> str:
]:

text = "<ol>"
html_texts.append(text.strip())
html_texts.append(text)

# Increment list nesting level when entering a new list
in_ordered_list.append(True)
Expand All @@ -2426,7 +2432,7 @@ def _sanitize_text(text: str, do_escape_html=True) -> str:
]:

text = "<ul>"
html_texts.append(text.strip())
html_texts.append(text)

# Increment list nesting level when entering a new list
in_ordered_list.append(False)
Expand All @@ -2436,63 +2442,39 @@ def _sanitize_text(text: str, do_escape_html=True) -> str:

elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:

text = f"<h1>{_sanitize_text(item.text)}</h1>"
html_texts.append(text.strip())
text = f"<h1>{_prepare_tag_content(item.text)}</h1>"
html_texts.append(text)

elif isinstance(item, SectionHeaderItem):

section_level: int = item.level + 1

text = (
f"<h{(section_level)}>"
f"{_sanitize_text(item.text)}</h{(section_level)}>"
f"{_prepare_tag_content(item.text)}</h{(section_level)}>"
)
html_texts.append(text.strip())

elif isinstance(item, TextItem) and item.label in [
DocItemLabel.SECTION_HEADER
]:

section_level = curr_level

if section_level <= 1:
section_level = 2
html_texts.append(text)

if section_level >= 6:
section_level = 6
elif isinstance(item, CodeItem):

text = (
f"<h{section_level}>{_sanitize_text(item.text)}</h{section_level}>"
preped_cont = _prepare_tag_content(
item.text,
do_escape_html=False,
replace_newlines_with_br=False,
)
html_texts.append(text.strip())

elif isinstance(item, TextItem) and item.label in [DocItemLabel.CODE]:

text = f"<pre>{_sanitize_text(item.text, do_escape_html=False)}</pre>"
text = f"<pre>{preped_cont}</pre>"
html_texts.append(text)

elif isinstance(item, ListItem):

text = f"<li>{_sanitize_text(item.text)}</li>"
text = f"<li>{_prepare_tag_content(item.text)}</li>"
html_texts.append(text)

elif isinstance(item, TextItem) and item.label in [DocItemLabel.LIST_ITEM]:
elif isinstance(item, TextItem):

text = f"<li>{_sanitize_text(item.text)}</li>"
text = f"<p>{_prepare_tag_content(item.text)}</p>"
html_texts.append(text)

elif isinstance(item, CodeItem) and item.label in labels:
text = (
"<pre><code>"
f"{_sanitize_text(item.text, do_escape_html=False)}"
"</code></pre>"
)
html_texts.append(text.strip())

elif isinstance(item, TextItem) and item.label in labels:

text = f"<p>{_sanitize_text(item.text)}</p>"
html_texts.append(text.strip())
elif isinstance(item, TableItem):

text = item.export_to_html(doc=self, add_caption=True)
Expand All @@ -2506,15 +2488,14 @@ def _sanitize_text(text: str, do_escape_html=True) -> str:
)
)

elif isinstance(item, DocItem) and item.label in labels:
elif isinstance(item, DocItem):
continue

html_texts.append("</html>")

lines = []
lines.extend(head_lines)
for i, line in enumerate(html_texts):
lines.append(line.replace("\n", "<br>"))
lines.extend(html_texts)

delim = "\n"
html_text = (delim.join(lines)).strip()
Expand Down

0 comments on commit 19b7c9a

Please sign in to comment.