From f4885c63243071e3ddd4c18f5abb96ac30f1d097 Mon Sep 17 00:00:00 2001
From: Maksym Lysak
Date: Mon, 11 Nov 2024 13:24:20 +0100
Subject: [PATCH] Added handling of code blocks in html with <pre>
 tag

Signed-off-by: Maksym Lysak 
---
 docling/backend/html_backend.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py
index 7d14c2ebf..9cd1e29b9 100644
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -120,6 +120,8 @@ def analyse_element(self, element, idx, doc):
             self.handle_header(element, idx, doc)
         elif element.name in ["p"]:
             self.handle_paragraph(element, idx, doc)
+        elif element.name in ["pre"]:
+            self.handle_code(element, idx, doc)
         elif element.name in ["ul", "ol"]:
             self.handle_list(element, idx, doc)
         elif element.name in ["li"]:
@@ -205,6 +207,16 @@ def handle_header(self, element, idx, doc):
                 level=hlevel,
             )
 
+    def handle_code(self, element, idx, doc):
+        """Handles monospace code snippets (pre)."""
+        if element.text is None:
+            return
+        text = element.text.strip()
+        label = DocItemLabel.CODE
+        if len(text) == 0:
+            return
+        doc.add_text(parent=self.parents[self.level], label=label, text=text)
+
     def handle_paragraph(self, element, idx, doc):
         """Handles paragraph tags (p)."""
         if element.text is None: