Skip to content

Commit c7289f6

Browse files
committed
Add standalone equations as DocItem formula
Signed-off-by: Rafael Teixeira de Lima <[email protected]>
1 parent f503494 commit c7289f6

File tree

1 file changed

+25
-6
lines changed

1 file changed

+25
-6
lines changed

docling/backend/msword_backend.py

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -242,8 +242,8 @@ def handle_equations_in_text(self, element, text):
242242
def handle_text_elements(self, element, docx_obj, doc):
243243
paragraph = docx.text.paragraph.Paragraph(element, docx_obj)
244244

245-
text = paragraph.text
246-
text = self.handle_equations_in_text(element=element, text=text)
245+
raw_text = paragraph.text
246+
text = self.handle_equations_in_text(element=element, text=raw_text)
247247

248248
if text is None:
249249
return
@@ -281,21 +281,20 @@ def handle_text_elements(self, element, docx_obj, doc):
281281
self.parents[key] = None
282282
self.level = self.level_at_new_list - 1
283283
self.level_at_new_list = None
284+
284285
if p_style_id in ["Title"]:
285286
for key, val in self.parents.items():
286287
self.parents[key] = None
287288
self.parents[0] = doc.add_text(
288289
parent=None, label=DocItemLabel.TITLE, text=text
289290
)
291+
290292
elif "Heading" in p_style_id:
291293
self.add_header(element, docx_obj, doc, p_style_id, p_level, text)
292294

293295
elif p_style_id in [
294-
"Paragraph",
295-
"Normal",
296296
"Subtitle",
297297
"Author",
298-
"DefaultText",
299298
"ListParagraph",
300299
"ListBullet",
301300
"Quote",
@@ -305,12 +304,32 @@ def handle_text_elements(self, element, docx_obj, doc):
305304
label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text
306305
)
307306

307+
elif (raw_text is None or len(raw_text) == 0) and len(text) > 0:
308+
# Standalone equation
309+
# Entities in which all text comes from equations
310+
level = self.get_level()
311+
if text.strip().startswith("$") and text.strip().endswith("$"):
312+
text = text.strip()[1:-1]
313+
doc.add_text(
314+
label=DocItemLabel.FORMULA, parent=self.parents[level - 1], text=text
315+
)
316+
317+
elif p_style_id in [
318+
"Paragraph",
319+
"Normal",
320+
"DefaultText",
321+
]:
322+
level = self.get_level()
323+
doc.add_text(
324+
label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text
325+
)
326+
308327
else:
309328
# Style names are not limited to the built-in defaults; documents can also define their own,
310329
# hence we treat every other style as plain text
311330
level = self.get_level()
312331
doc.add_text(
313-
label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text
332+
label=DocItemLabel.TEXT, parent=self.parents[level - 1], text=text
314333
)
315334
self.update_history(p_style_id, p_level, numid, ilevel)
316335
return

0 commit comments

Comments
 (0)