Skip to content

Commit de7b963

Browse files
authored
fix(html): use 'start' attribute when parsing ordered lists from HTML docs (#1062)
* fix(html): use 'start' attribute in ordered lists

  When parsing ordered lists in HTML, take into account the 'start' attribute if it exists.

  Signed-off-by: Cesar Berrospi Ramis <[email protected]>

* chore(html): reduce verbosity in HTML backend

  Signed-off-by: Cesar Berrospi Ramis <[email protected]>

---------

Signed-off-by: Cesar Berrospi Ramis <[email protected]>
1 parent 37dd8c1 commit de7b963

File tree

2 files changed

+76
-6
lines changed

2 files changed

+76
-6
lines changed

docling/backend/html_backend.py

+19-5
Original file line numberDiff line numberDiff line change
@@ -256,10 +256,16 @@ def handle_list(self, element: Tag, doc: DoclingDocument) -> None:
256256
parent=self.parents[self.level], name="list", label=GroupLabel.LIST
257257
)
258258
elif element.name == "ol":
259+
start_attr = element.get("start")
260+
start: int = (
261+
int(start_attr)
262+
if isinstance(start_attr, str) and start_attr.isnumeric()
263+
else 1
264+
)
259265
# create a list group
260266
self.parents[self.level + 1] = doc.add_group(
261267
parent=self.parents[self.level],
262-
name="ordered list",
268+
name="ordered list" + (f" start {start}" if start != 1 else ""),
263269
label=GroupLabel.ORDERED_LIST,
264270
)
265271
self.level += 1
@@ -270,15 +276,23 @@ def handle_list(self, element: Tag, doc: DoclingDocument) -> None:
270276
self.level -= 1
271277

272278
def handle_list_item(self, element: Tag, doc: DoclingDocument) -> None:
273-
"""Handles listitem tags (li)."""
279+
"""Handles list item tags (li)."""
274280
nested_list = element.find(["ul", "ol"])
275281

276282
parent = self.parents[self.level]
277283
if parent is None:
278-
_log.warning(f"list-item has no parent in DoclingDocument: {element}")
284+
_log.debug(f"list-item has no parent in DoclingDocument: {element}")
279285
return
280286
parent_label: str = parent.label
281287
index_in_list = len(parent.children) + 1
288+
if (
289+
parent_label == GroupLabel.ORDERED_LIST
290+
and isinstance(parent, GroupItem)
291+
and parent.name
292+
):
293+
start_in_list: str = parent.name.split(" ")[-1]
294+
start: int = int(start_in_list) if start_in_list.isnumeric() else 1
295+
index_in_list += start - 1
282296

283297
if nested_list:
284298
# Text in list item can be hidden within hierarchy, hence
@@ -324,13 +338,13 @@ def handle_list_item(self, element: Tag, doc: DoclingDocument) -> None:
324338
parent=parent,
325339
)
326340
else:
327-
_log.warning(f"list-item has no text: {element}")
341+
_log.debug(f"list-item has no text: {element}")
328342

329343
@staticmethod
330344
def parse_table_data(element: Tag) -> Optional[TableData]:
331345
nested_tables = element.find("table")
332346
if nested_tables is not None:
333-
_log.warning("Skipping nested table.")
347+
_log.debug("Skipping nested table.")
334348
return None
335349

336350
# Count the number of rows (number of <tr> elements)

tests/test_backend_html.py

+57-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import os
1+
from io import BytesIO
22
from pathlib import Path
33

44
from docling.backend.html_backend import HTMLDocumentBackend
@@ -41,6 +41,62 @@ def test_heading_levels():
4141
assert found_lvl_2 and found_lvl_3
4242

4343

44+
def test_ordered_lists():
    """Check the Markdown numbering produced for ``<ol>`` with a ``start`` attribute.

    Every case pairs a raw HTML document (as bytes) with the Markdown text
    expected from exporting the converted document. The cases cover a missing
    attribute, explicit positive values, zero, a negative value and a
    non-numeric value.
    """
    cases: list[tuple[bytes, str]] = [
        # No 'start' attribute: numbering begins at 1.
        (
            b"<html><body><ol><li>1st item</li><li>2nd item</li></ol></body></html>",
            "1. 1st item\n2. 2nd item",
        ),
        # Explicit start of 1 gives the same output as the default.
        (
            b'<html><body><ol start="1"><li>1st item</li><li>2nd item</li></ol></body></html>',
            "1. 1st item\n2. 2nd item",
        ),
        # A positive start shifts the numbering.
        (
            b'<html><body><ol start="2"><li>1st item</li><li>2nd item</li></ol></body></html>',
            "2. 1st item\n3. 2nd item",
        ),
        # Zero is accepted as a start value.
        (
            b'<html><body><ol start="0"><li>1st item</li><li>2nd item</li></ol></body></html>',
            "0. 1st item\n1. 2nd item",
        ),
        # A negative start is expected to fall back to numbering from 1.
        (
            b'<html><body><ol start="-5"><li>1st item</li><li>2nd item</li></ol></body></html>',
            "1. 1st item\n2. 2nd item",
        ),
        # A non-numeric start is expected to fall back to numbering from 1.
        (
            b'<html><body><ol start="foo"><li>1st item</li><li>2nd item</li></ol></body></html>',
            "1. 1st item\n2. 2nd item",
        ),
    ]

    for html_bytes, expected_markdown in cases:
        # The input document and the backend each get their own fresh stream
        # over the same bytes.
        input_document = InputDocument(
            path_or_stream=BytesIO(html_bytes),
            format=InputFormat.HTML,
            backend=HTMLDocumentBackend,
            filename="test",
        )
        html_backend = HTMLDocumentBackend(
            in_doc=input_document,
            path_or_stream=BytesIO(html_bytes),
        )
        doc: DoclingDocument = html_backend.convert()
        assert doc
        assert doc.export_to_markdown() == expected_markdown
98+
99+
44100
def get_html_paths():
45101

46102
# Define the directory you want to search

0 commit comments

Comments
 (0)