Skip to content

Commit 85c4df8

Browse files
authored
fix(html): fix HTML parsed heading level (docling-project#1244)
Signed-off-by: Panos Vagenas <[email protected]>
1 parent 9eb1686 commit 85c4df8

13 files changed

+68
-68
lines changed

docling/backend/html_backend.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -243,7 +243,7 @@ def handle_header(self, element: Tag, doc: DoclingDocument) -> None:
243243
self.parents[hlevel] = doc.add_heading(
244244
parent=self.parents[hlevel - 1],
245245
text=text,
246-
level=hlevel,
246+
level=hlevel - 1,
247247
content_layer=self.content_layer,
248248
)
249249

tests/data/groundtruth/docling_v2/example_01.html.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@
117117
"prov": [],
118118
"orig": "Background",
119119
"text": "Background",
120-
"level": 2
120+
"level": 1
121121
},
122122
{
123123
"self_ref": "#/texts/3",

tests/data/groundtruth/docling_v2/example_01.html.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
This is the first paragraph of the introduction.
44

5-
### Background
5+
## Background
66

77
Some background information here.
88

tests/data/groundtruth/docling_v2/example_02.html.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@
114114
"prov": [],
115115
"orig": "Background",
116116
"text": "Background",
117-
"level": 2
117+
"level": 1
118118
},
119119
{
120120
"self_ref": "#/texts/3",

tests/data/groundtruth/docling_v2/example_02.html.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
This is the first paragraph of the introduction.
44

5-
### Background
5+
## Background
66

77
Some background information here.
88

tests/data/groundtruth/docling_v2/example_03.html.json

+3-3
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@
133133
"prov": [],
134134
"orig": "Introduction",
135135
"text": "Introduction",
136-
"level": 2
136+
"level": 1
137137
},
138138
{
139139
"self_ref": "#/texts/2",
@@ -168,7 +168,7 @@
168168
"prov": [],
169169
"orig": "Background",
170170
"text": "Background",
171-
"level": 2
171+
"level": 1
172172
},
173173
{
174174
"self_ref": "#/texts/4",
@@ -317,7 +317,7 @@
317317
"prov": [],
318318
"orig": "Data Table",
319319
"text": "Data Table",
320-
"level": 2
320+
"level": 1
321321
}
322322
],
323323
"pictures": [],

tests/data/groundtruth/docling_v2/example_03.html.md

+3-3
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
# Example Document
22

3-
### Introduction
3+
## Introduction
44

55
This is the first paragraph of the introduction.
66

7-
### Background
7+
## Background
88

99
Some background information here.
1010

@@ -18,7 +18,7 @@ Some background information here.
1818
2. Nested ordered item 2
1919
2. Second item in ordered list
2020

21-
### Data Table
21+
## Data Table
2222

2323
| Header 1 | Header 2 | Header 3 |
2424
|--------------|--------------|--------------|

tests/data/groundtruth/docling_v2/mixed.md.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
Some text
44

5-
### Famous ducks
5+
## Famous ducks
66

77
Here is a table:
88

tests/data/groundtruth/docling_v2/unit_test_01.html.json

+6-6
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@
7676
"prov": [],
7777
"orig": "section-1",
7878
"text": "section-1",
79-
"level": 2
79+
"level": 1
8080
},
8181
{
8282
"self_ref": "#/texts/2",
@@ -89,7 +89,7 @@
8989
"prov": [],
9090
"orig": "section-1.1",
9191
"text": "section-1.1",
92-
"level": 3
92+
"level": 2
9393
},
9494
{
9595
"self_ref": "#/texts/3",
@@ -112,7 +112,7 @@
112112
"prov": [],
113113
"orig": "section-2",
114114
"text": "section-2",
115-
"level": 2
115+
"level": 1
116116
},
117117
{
118118
"self_ref": "#/texts/4",
@@ -125,7 +125,7 @@
125125
"prov": [],
126126
"orig": "section-2.0.1",
127127
"text": "section-2.0.1",
128-
"level": 4
128+
"level": 3
129129
},
130130
{
131131
"self_ref": "#/texts/5",
@@ -138,7 +138,7 @@
138138
"prov": [],
139139
"orig": "section-2.2",
140140
"text": "section-2.2",
141-
"level": 3
141+
"level": 2
142142
},
143143
{
144144
"self_ref": "#/texts/6",
@@ -151,7 +151,7 @@
151151
"prov": [],
152152
"orig": "section-2.3",
153153
"text": "section-2.3",
154-
"level": 3
154+
"level": 2
155155
}
156156
],
157157
"pictures": [],

tests/data/groundtruth/docling_v2/unit_test_01.html.md

+6-6
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
# Title
22

3-
### section-1
3+
## section-1
44

5-
#### section-1.1
5+
### section-1.1
66

7-
### section-2
7+
## section-2
88

9-
##### section-2.0.1
9+
#### section-2.0.1
1010

11-
#### section-2.2
11+
### section-2.2
1212

13-
#### section-2.3
13+
### section-2.3

tests/data/groundtruth/docling_v2/wiki_duck.html.json

+20-20
Original file line numberDiff line numberDiff line change
@@ -1932,7 +1932,7 @@
19321932
"prov": [],
19331933
"orig": "Contents",
19341934
"text": "Contents",
1935-
"level": 2
1935+
"level": 1
19361936
},
19371937
{
19381938
"self_ref": "#/texts/23",
@@ -4785,7 +4785,7 @@
47854785
"prov": [],
47864786
"orig": "Etymology",
47874787
"text": "Etymology",
4788-
"level": 2
4788+
"level": 1
47894789
},
47904790
{
47914791
"self_ref": "#/texts/215",
@@ -4895,7 +4895,7 @@
48954895
"prov": [],
48964896
"orig": "Taxonomy",
48974897
"text": "Taxonomy",
4898-
"level": 2
4898+
"level": 1
48994899
},
49004900
{
49014901
"self_ref": "#/texts/223",
@@ -4966,7 +4966,7 @@
49664966
"prov": [],
49674967
"orig": "Morphology",
49684968
"text": "Morphology",
4969-
"level": 2
4969+
"level": 1
49704970
},
49714971
{
49724972
"self_ref": "#/texts/228",
@@ -5028,7 +5028,7 @@
50285028
"prov": [],
50295029
"orig": "Distribution and habitat",
50305030
"text": "Distribution and habitat",
5031-
"level": 2
5031+
"level": 1
50325032
},
50335033
{
50345034
"self_ref": "#/texts/232",
@@ -5102,7 +5102,7 @@
51025102
"prov": [],
51035103
"orig": "Behaviour",
51045104
"text": "Behaviour",
5105-
"level": 2
5105+
"level": 1
51065106
},
51075107
{
51085108
"self_ref": "#/texts/237",
@@ -5140,7 +5140,7 @@
51405140
"prov": [],
51415141
"orig": "Feeding",
51425142
"text": "Feeding",
5143-
"level": 3
5143+
"level": 2
51445144
},
51455145
{
51465146
"self_ref": "#/texts/238",
@@ -5256,7 +5256,7 @@
52565256
"prov": [],
52575257
"orig": "Breeding",
52585258
"text": "Breeding",
5259-
"level": 3
5259+
"level": 2
52605260
},
52615261
{
52625262
"self_ref": "#/texts/247",
@@ -5300,7 +5300,7 @@
53005300
"prov": [],
53015301
"orig": "Communication",
53025302
"text": "Communication",
5303-
"level": 3
5303+
"level": 2
53045304
},
53055305
{
53065306
"self_ref": "#/texts/250",
@@ -5347,7 +5347,7 @@
53475347
"prov": [],
53485348
"orig": "Predators",
53495349
"text": "Predators",
5350-
"level": 3
5350+
"level": 2
53515351
},
53525352
{
53535353
"self_ref": "#/texts/253",
@@ -5409,7 +5409,7 @@
54095409
"prov": [],
54105410
"orig": "Relationship with humans",
54115411
"text": "Relationship with humans",
5412-
"level": 2
5412+
"level": 1
54135413
},
54145414
{
54155415
"self_ref": "#/texts/257",
@@ -5429,7 +5429,7 @@
54295429
"prov": [],
54305430
"orig": "Hunting",
54315431
"text": "Hunting",
5432-
"level": 3
5432+
"level": 2
54335433
},
54345434
{
54355435
"self_ref": "#/texts/258",
@@ -5473,7 +5473,7 @@
54735473
"prov": [],
54745474
"orig": "Domestication",
54755475
"text": "Domestication",
5476-
"level": 3
5476+
"level": 2
54775477
},
54785478
{
54795479
"self_ref": "#/texts/261",
@@ -5517,7 +5517,7 @@
55175517
"prov": [],
55185518
"orig": "Heraldry",
55195519
"text": "Heraldry",
5520-
"level": 3
5520+
"level": 2
55215521
},
55225522
{
55235523
"self_ref": "#/texts/264",
@@ -5561,7 +5561,7 @@
55615561
"prov": [],
55625562
"orig": "Cultural references",
55635563
"text": "Cultural references",
5564-
"level": 3
5564+
"level": 2
55655565
},
55665566
{
55675567
"self_ref": "#/texts/267",
@@ -5605,7 +5605,7 @@
56055605
"prov": [],
56065606
"orig": "See also",
56075607
"text": "See also",
5608-
"level": 2
5608+
"level": 1
56095609
},
56105610
{
56115611
"self_ref": "#/texts/270",
@@ -5723,7 +5723,7 @@
57235723
"prov": [],
57245724
"orig": "Notes",
57255725
"text": "Notes",
5726-
"level": 2
5726+
"level": 1
57275727
},
57285728
{
57295729
"self_ref": "#/texts/278",
@@ -5740,7 +5740,7 @@
57405740
"prov": [],
57415741
"orig": "Citations",
57425742
"text": "Citations",
5743-
"level": 3
5743+
"level": 2
57445744
},
57455745
{
57465746
"self_ref": "#/texts/279",
@@ -6527,7 +6527,7 @@
65276527
"prov": [],
65286528
"orig": "Sources",
65296529
"text": "Sources",
6530-
"level": 3
6530+
"level": 2
65316531
},
65326532
{
65336533
"self_ref": "#/texts/335",
@@ -6860,7 +6860,7 @@
68606860
"prov": [],
68616861
"orig": "External links",
68626862
"text": "External links",
6863-
"level": 2
6863+
"level": 1
68646864
},
68656865
{
68666866
"self_ref": "#/texts/356",

0 commit comments

Comments
 (0)