From c2f048bd2d0f425574aa6d4a09eaaea7566b31eb Mon Sep 17 00:00:00 2001 From: Maksym Lysak Date: Thu, 30 Jan 2025 14:00:09 +0100 Subject: [PATCH] Fix for docx when headers are also lists, now recorded as appropriate headers and subheaders, unit test included Signed-off-by: Maksym Lysak --- docling/backend/msword_backend.py | 30 +- .../data/docx/unit_test_headers_numbered.docx | Bin 0 -> 16880 bytes .../unit_test_headers_numbered.docx.itxt | 52 ++ .../unit_test_headers_numbered.docx.json | 753 ++++++++++++++++++ .../unit_test_headers_numbered.docx.md | 43 + 5 files changed, 870 insertions(+), 8 deletions(-) create mode 100644 tests/data/docx/unit_test_headers_numbered.docx create mode 100644 tests/data/groundtruth/docling_v2/unit_test_headers_numbered.docx.itxt create mode 100644 tests/data/groundtruth/docling_v2/unit_test_headers_numbered.docx.json create mode 100644 tests/data/groundtruth/docling_v2/unit_test_headers_numbered.docx.md diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index f8148d525..f372d83a2 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -240,7 +240,11 @@ def handle_text_elements(self, element, docx_obj, doc): numid = None # Handle lists - if numid is not None and ilevel is not None: + if ( + numid is not None + and ilevel is not None + and p_style_id not in ["Title", "Heading"] + ): self.add_listitem( element, docx_obj, @@ -254,12 +258,22 @@ def handle_text_elements(self, element, docx_obj, doc): ) self.update_history(p_style_id, p_level, numid, ilevel) return - elif numid is None and self.prev_numid() is not None: # Close list - for key, val in self.parents.items(): - if key >= self.level_at_new_list: + elif ( + numid is None + and self.prev_numid() is not None + and p_style_id not in ["Title", "Heading"] + ): # Close list + if self.level_at_new_list: + for key, val in self.parents.items(): + if key >= self.level_at_new_list: + self.parents[key] = None + self.level = self.level_at_new_list - 1 + self.level_at_new_list = None + else: + for key, val in self.parents.items(): self.parents[key] = None - self.level = self.level_at_new_list - 1 - self.level_at_new_list = None + self.level = 0 + if p_style_id in ["Title"]: for key, val in self.parents.items(): self.parents[key] = None @@ -520,11 +534,11 @@ def get_docx_image(element, drawing_blip): image_data = image_part.blob # Get the binary image data return image_data - image_data = get_docx_image(element, drawing_blip) - image_bytes = BytesIO(image_data) level = self.get_level() # Open the BytesIO object with PIL to create an Image try: + image_data = get_docx_image(element, drawing_blip) + image_bytes = BytesIO(image_data) pil_image = Image.open(image_bytes) doc.add_picture( parent=self.parents[level - 1], diff --git a/tests/data/docx/unit_test_headers_numbered.docx b/tests/data/docx/unit_test_headers_numbered.docx new file mode 100644 index 0000000000000000000000000000000000000000..259125c35f4c3eff60cace0aa21bfceb6eac63f2 GIT binary patch literal 16880 zcmeIZWmH_*)-GH?aCdjNK!OGj?(XjH?ykYz9TJ@2?i$?P-6go+>OQAWcjxpszH#s0 z_bmpq2CLRH_h#3#)|_jKtOVFQGyntu3IG5Q1E9?*;oU(1fN}@`02KfQsv%%w?Pz4} zsH5m+YviEy-qp&IDDNF8RSp0YIRAfN|AU`EZQ`(W4p~a9PJh8cn1M3I`9SCjalyhuN z^rD3OcwA^VjT{H;r{&Lv@s~HVruA1ysPfAxHRO3d8(c%=SjbHKUGjytT~GxEs9ju>6~UlJj4qqB05N>SqX>DwyAKoergqn!*A6PY%>iVOKluIiL{<)OpL|zNl={WDvRf8&y)mcJ-rh(wHM9Ox_u&_Q3DcXz zbVOu0WfWSG%nEhYVq-jRdU=oFpaqFKy6s586E%Ppz(i z#Gux~53X~zTX5-nx5h6+q+--V0pV)sU&e7DV(&v!Mmte=X#{U~zrmW3P>;_Tkk#a8 z*(uH6rFb*tq<)y4GZ)qHfGpt_Y7fS59>eyAG3KpGW}=UY?p~|b>Qy|7Z~Pc(PJKg% zW68)so)N2G27`0aI(g)t-|?+4EQk&|7dnjL$p8Jnte!ZAM+gCZ8S6TgU88AmNU> zeFJu~mb8;`IV85^@B}eugTk~CT9o1Gc%Fio*pTCx0)DDq9)Rn!e_u215;g7p`ZLa? zhEk2qOigoq6Fp_z5II>)ovw1<=M9SY@}bNupCqihI%fV#WpSiq<@sS?e_u$x|5y{_ zRPr<0=vLF0%Kg&%a*U+eUfNWVQKhVYr^STmQzgwk<%j!}{weqO$G!Xj|FepcVO7@1 z>V1(ug_)5C+skllb$O9-^Y0L~<*w+n<(`WMoh|`L!&UbmkT^g@N0GV=sGhB<%*`sJ zd{5$i6I4V*cbE%wB%`u|1EXRW%tY_8!m@)MYcqdt!)q&7%VtI!Wr~!$mnAC`%<|69 zQi5v=Sv|%Wn?02c>J*I6nvd#b8~c1hpVr=`QUP^M?@TN&S(!3#(=^)*p z9)G;cqBX@bw-HNUpz9%+=zTd|+Fk6wS|{b#{ho%S&(_@=B#_=l*|gN<(L*W6M1+a+ zMF!C~3SdeHA1@T$)qthk8f_gJj}xpEU89B`8GjZQ6pX{}Yz&AgTL<^EJr8L=*aR!1 zFpHmk+IVECzFsJbRFQ`HfU{mf39D zdJ)HIuB7%&ojm?+-j|~TWh;jfD>$#^Y?pGH;kxF;hYSw9!?~am==k$_2LL-jHb*Ny zCVG;?!YR5Z2K9UUzV*Zmc57+Nu-8)>bqJ6l=W0@m@`xNA8oUjrZW!%p3Fa zmgTcT_p75zI4WnnvqwkffYb3VID{AGbU!D{SUv>o{H|Of6o}4UKbK~qXnC50<%}0{ zQmvStNSQv4_0|Mq8MQZ3VViMQRQ)@ofB6u z`Vx(BcYQ#bMt%ndBanQEg{R)@-KpgrEYve7Ncx%6&<9Wr+YE-`k>~WN%B+M&TUTkm zIQS?wW&`I!zOixzK>S9U+1=T*LRg4ILIm!ut3$^hx!_9&1-c_O7#x|Rt3LwqvvFg@ zj$t5JjBkm+ky;Jm-{W1;&oh16+QxTw<1s~rA!X+XbdLIbVXEpEeFh~Mpd9*l&y&I#) zw+U;)?ieIR!;r_E849h0k2C~|XZk{*S}-9L2&M(1ZHP+48sQ%YO(~eMd3aC6A4qV@ zx-vmhm$(uTzU!&LQ-oTy#g!skKxLde-ePW2f)2=vX2izl|A5$}9`trjJR|57c8-!9 zPc8HO-k9e&DqWyj3j@in7zHyGpT8-NJYB}MzZ#$L={O!gm%eo96w z@?ds4*V-tA(ISQs2If1;P&rx(-|2{YJ6>b|^urJJgX~}bq#1*_!$Gj?m-Wz@H|iN; zd%Fyk;g3kg4)D*8@sybl(TUw@8bKUb4T| z%Q9fz3G^_*n}2gKNkwytfSeIgNK)lz6oDSk721Ff#3D;J=qZAz)_M;$t1a0FVe5w( zhdQ=t((jb-zgNN9caEpC_Lk3)orpzTTP&u0lx6Fok)}Ls6Dw>vEMB*~TurW3vGlSn zuvl-@?lZQto#Rs2tx3}Y=M)`nw1_~Ed|dCbk`_K(CRe@;R_XoCLVD=lqWsOZVQ`9n ztclQDKkt`KXf=iLd$6Z>Ncxmg{yinu2-BUl4t=c$*ShK zbd;7#%=$L;Aes_Q)WgTl98CzN~V)b(j{Do3nS(BRk_mf_x zB_;nn~6nk~1X zAE!ngoliBF_cE)5cSb(A#u>@G}uSl@0ks`&YnOcN?$}7C3r%eM4>uiSm-rU z&yj@CAgMh?GuC$&kd*6ECB1Lu^d_(*rfeq;$b6tiiL3g=&)Jpl9UT{N#TR{TG2 zYoK*#)xZ^pgNO$*vfuU>_B&T!!i>j|C4|^DT3#dfIZ}wTkepRpI()pcg(~uy4WUQa zdo1oE5X}{AvqF_DW-AkpQ%7Z z(q_ley9rgz-^{IAhFNPniGP;1SH$^c=xB#{ueh7&3L?cO==Q!zWc2T97 z6yrcpIY<89F581v^3{|MIU3e>abv7D-p1};Q|={(3zI(5GwODxe!koLFhW6lYh&E6F<(Z*>JU@YmZaP1&VfIWSh2oZ3{m1cUNtbF@$!_4c*M&_&lVOpkef!bg3t_Jbxzogl(tNPg?IPf@(5O#!Z%^|LKc?;?`bfG`(Hu{+i!RUEvUyk0KcI)jB;g<;0i8(_3*I{A zJ}B%s`KVEN6F~`5m)l^`s2%G8D50Pk=r8_^R3_pQ{E>by1Xb+1+bZn#Cn6He@A(!% zY>>sO>aQ0*c6Y^KtLGYFqD2DyLgW^Nq#*P62GwRkR--`ul<=o4@eaQ@T-g;ANvKDV zO0^X1vdizYU!H*^e}Pe55gV-v>Tr){pQ~(21E}f^YpeF3K&N!I_Ulq;i>SU+hxe0- z?+OIp`k1@OTug0$y06}w_*yXO@^!%EDi+IG(-WXiDLT7RCMLt{l);~) zEja}9CkFyM?#k#lZT*)cMS}K78uiBa6u|Njbfrzlhx3f-Xdy_foH!AV)s$KGGtdY0 ztaN*_xG%~-VEU2~28MpE!#~@cVNj*0661hV*@yb1Woh?CN`W~_G~^Szg)OBwvQ6Ea zs+`ySM-}==uZZ8d?R5YLqju_K!JUN z-2?)Ors2ADvZnp!qz@^tlD`(}2Oudok7aZ+w@m&K5F68D47i?b7?U z7N4kB{O80lXUB`X*$jRaH(1~M4C#tu#a21?1rGyC(GU6*R9C5?!Vnn0EoYVnw{5F)vZeF>H)JXVep z0RZTK?A(r~Mpi})zsnE5ZQdtpqv5!0XdRf3yrE5;1koMIt^vF?QqJ+@#|1l7%ReLL z(k6+1w0t0zjZ;nh!I>jHyvy=- zKR-NFbGTUwi19`3W&bKSOs!>y@0<4_b7RovlS@bde-DUerzt3Or0lR7(V6D*%XNII zt;jnnLB1goPssWVzEIb1syhaY@DJ!X`Jz!$)UdfRQhS(`a9QGQ#MJH;8nnSkA_$?n zRI_VY(wA6%x$=i66$y7?&zR5^9Q}d#)U$&z6AM4e3fk}eU7*1zV`}j}jR$5SOoD9! zK$~K~=m%zfr0wp^?dSysqb!B3bKa8dCH#JjOxuqTmV>Xh&le@kJ+kqD` z00s=2rUE4}G)m@Hf*X2``F`XJxzuRWq~Axp{;YpBF5gyyYlR?>PCR8nNPE$q?X-~n zsk++TpjA0dhwBP2PGF&NFW#Yb_XwdK0P3H=$l9!I!)z{bjq|b4a2&>F&fL1KA@Ey~ zK1DOigXm75I#e_@uBfl`do*@XedK&}tQ=->k3ufV@85gv`ryRBgLqydCW`=)AlRbQEJkl-p#txxb$5y6C(-Oe?^)j&4xF z5^f>c*}`V}b22Bu)=K&cpPZ5fgY>F}Iqx`NeI_^fd?%!vV?$5mhoK3X>fAbD*!&%& zay%NoTLdH>))Hh5jvvgGkNQy7SZi8}MP&}=YOq0OY)@?kA4$G2rgQ0iPwy52WY_Q}Li%TdJ4p@5l+S ztdlcM#pk)GsML25WUPsySjMdg`=G(f{fuO4J3MH9kvo(Ak?;Wy#N4N({GWaN z*&BYW7{dmfNl{%T&7MLUBBzN`T#N*-gd`Zpa&sI*Fh-PT*r6F#WnZSk$w46$e~-)A zLy)sNgv8oyej*SKZW&)+b}_=Kh783?Wm-<+n^}p5{TMX(wRM!Aw67k!URcEB3^%te z8p-dQtv^!ZQ!2cSLC+rbA+FyDjmMyII_^Y-(rgq`a$OvaHC20yFHUH(4SCX{coGzdo`{+#rvJiz|+^Zn|=X% zXv`Sve1JPL=`6a(3*Vsft;+aXU~=&wUpM|)vWYI+u|cT;hv@3US32p=wPb9{C0!oy zrFOl%;>oC(B5qBcyA4XKbe>*Er@jJ;YO#ShHg$wdu0E!yl4G1WSH+o_E)4?%zK|z5 z=vCcah(H=pVsU9su1D4*6?`rDKpm^&cGu0FX(H*cm}TT9)pw)MWfe0Rm@YV`NK6^m!lA1tfU$ZfO)vblL z;bohJh{M1dx#X&UaQPkYvt*8NFi(#vrJnBU3%A=(d1N>5!xsBVPK|uxn<+LMCnP+J z8UHSR)!s!Twwm5l(KO`b_O17c5DMG;&=7*9Z-Jlwl%-fp_j63DHGZ0#Nv}{gM7mrl zLa35fv4#di!{LFdu(ynIsRK#lR~nx>#s#gmB*~9{XIe(dkf0;j)}Ihp&0D5!N>=@s zi*8{P?fdANOHS=`waSwuUd(@-I<<;SQISAN%o?8gJy*$K47=Pl`y9V`HPX~oMwqD< zG$Xt>*L!(w-`VZxfK*lA(;LNQ9v+o0pP*}i3=*%LrebcXlg?E+p7RLMZdfu!Ao#Q) z7(jb!-yna?22&^BpiwlVDbz7ZH$w|_31^lgy$fGV%`BRJ&fX2Rrp=%IvjK!-zivqY zo{Ep*|Ki*^7&$tcS)2UpSS1C+bt{6Hs?>WlKTtF&) zMP5WmcsaH?>3yeet<-!ZQMI4`VEi!)yJbE*>(jAi=#=6YF}7ByW_%L+uwqr2X5OT$ zHKT@ncoeJ;)Pn^rq|HxDBZlNw!C>PAxte2<&qyw~IG+xc#mV4>qNg>*=vDgD$n5L#KACy92Dyi? zFBwu?eL>aB7-%Pf(~sfDtewe<>_~?1?2fnezCp$vYh7c7Tf1n|D85+RqTf^tkpGck z3J=+viWu`{?(zbCa_tWM-kN7p&d&|2aaM09{|YKk<(-{$*7$a#CHteU+n~Uu8`ou4 zNPQUm_8qtbpKoamRi0mT&#USc*!xKM_hb22CNmB7gE^pwUP9`^_G^$)w~m}^dm1$j zin{RJQ;hB$%!aYuQBe79FDS8Rudg$P_+3#ET$ zWf=Q2(lM(VZLglbTUbq(VGxaR5|uDkJP4POL?6%^s-{cELH7INn!Q={qbj(C_mw{#Ti)`BPG6k@*fNHkG)#t6{?a_ zmd7E6fB1^vf)sL^q^M)P%pTll8(tWATQ%!s2ct3oH#cRnF7_DixE{37}VQK zz*l&%96pEBNEVX6M<>nU&msFozq*zsB@Lg{fjvLrwYKoGa4Hxyzcgb#bVCIMNsrqn zs6y>b{3GR{qAWb&&ItNS#-#n{#JHo+x~A!f4#Fz5d7U6ZN$Py$^5hNg$FrfkC(;Oa z8m4x1WP@{ARQd0?r3yd9`K%>^?>yc~PtDtFv|lK9SM;20aXJoZvozv}sN8KE6PR%h zbD@mpJv|*#?|g*nR&zx0%l+8S9i}O~x%xuALxZy}OlkGXTffZSm^La+JWbV(pAlh& zGjK`A6SNZ9-_!W1+`y0(>MIkbhEKLo)`ClXCeqDNy?)3%hD)f)m%Op=_OeH1QUw_W z8m=tll*=6?vC@%%Hf?y*D6Dh=IoD2nDjCLE@(76rW7vW@dYk-QX>mO&pGAj7+R&no z#IhlrRT{-P`aPxY&F@tO-CXi#yUj!LHsl*|T)s4Z9o7lM3h7x^`ZwVW6V)@HRdB4xm>t6mt^r=`^;plJ-q&XIeW8d8b-&_sUk5Z-|@D`tzi?_o0~ zoSvImHWFLWS7z@vqN}5(dlq#xxR7`su6i;}{7p9MFv{dKd?ODbLk`}cJa8Z&cF4fz z6CECf>K@{)5p&sYj~W-@*!eLJqR82#-pWpZ!<)3SLIHHe)5u%0^A+Jg`tWk$2B!sJ z06+!saQdIpsYc9JEeqM3T2LN4#`zg2mST_#sTCeb@9CYX)Q2YXpz!w%(hNk2GSJ>Om1 zqGN^2qY%W??K@iGnVEUEJ-jq9JjJM-q{6V+rn5k6~CD0dkN>Ma)0d6Q(B< zOg9MwE`XE;B2S?25o%A7&-JbZg2)9!v=}EQ=}MH>ZY*7-TOf`%dIwIdi)(6wqiPd; z9Cm1Sg90-`^oF`V6!}sfT?IxHDV67&8cp;UGdCOo`PYvEvDHUVBVm-Np1nU)jHSz{ zAC`*VA!9eg$}oT=9zS!3B-|5>D;Fzh}blasnMkO54Q zIWGE*78Z923OFaT>JEJ5w$WK26*yHXKGkr5qujuHI9;qNlk-kB)gYBdMat~Mf_`DC zL-}fbVYaEJ;ZWanJGhe%;tIGR;iy(M6`uGg>-fcbqkyzW+#yW?MB05x)x!|MlYS2l zNhKa>VUi;UMT3_3g@*HZK<<^25z-zRxMyB$ z=PoEKFk7V+8kO_zmq5dlJe z_Mpz3-o1KtM1Fh}a?YurSsdGJa_FZRIxe^I(!AE&o#>shBWe2CmJW#2P#58WHf|K; z6B0$B&2ch=XK7PWW`y~iMfojWc^{(NNVC*qIEt(Jw$6jz){lyN+7*XBC-X-mm5Y$3 zElt}#M+y%EgOl48HTARQGj>9UaGW1sIkoDd1G3D=;n$Q`1&;@cS(D>~3{I|X5Ldpk z1u^hyHy%pOpK)r-JPj42&KRApeKTcyIi!YadBHQ(=n5s<@k#P#N;&gCzYVC1T4U3K z;+X_@U|EDsWYobAJbt|%b@bZqi29fE2?@3a1OZ_Q*h@AHez zI&YTAgF-Kp44`-MVWszK+cxGwqFsN{U5%=Vnu=RS$NcMt)$Cm+kiSEb=bTgW14>WCQ^)W&8B$W#B}B z>8TAsgW-*p^+E^WL}?hA1G*rgr!$dAqCXDBoA2C4 zIWr@F^4QH@Ul|Y^+Kc5wA=rVYyg>W@taDxXobUa!(!9gc|Ct!)Hb7P-=HBMs-ztwQ zKK=09jAa`A%|1ui)rbd|Nh)EmOI})DnG)3(uRD&h?pXG2wWN+Q{yBGBYM)wy%bM2u zr;l*mlYVrUe&4FtpM!)j^TXB>U$oEIX>K)th_l=Z(X}(&Uf$2{FT3xWx9sH=g(FMo zthOr$Z|7f8wVFIopyvu(Q@_KHo!Stu&Gs2(ql=>?Oo^KGu~@Bp-*By_;YCXl5=z>@ zkr9^fdC7B0=xUoD4u+X8pvF*jbv62pSyCRk`n|A;iSa(YPSANb(T3e=uPZjh^E`Pp zuL)Pt?VD$Lchur0WxDEfR@MmGx*SI9%h6RGoc@;Sh~c?U0z@pA2dZ)q?bREGuEBJd z)vd&sA=zF3GE&JSRcKi4pn18~z=~y^R%Q^5X>+B2)UDFQWjjpTX=zTp7rG;G1Z~6t z1Fhc=>iWo26^?H;$Uu%1XMlv2IZ8;v6fazHf*pfA$Br&4svns6#~IlA3YEciX{D{| za9iBB++ybvjcnc667r61WtG$y_w3B~Jh+9ky=C(ZVIMf2-vAuajU1R&pg$KLq#J`Q zwTvZm%h*|opPueArNdA)casR9gyhbOV~zs|i?A{wAqromF0Gvsix-@2yS-O&)~>$5 z*lg3xfU97S?zo^*bBG4_>m+?A*`&&&=HA=)O`&~*I@(M6aKSN04$FaIt!(dIdz0~< zp(+CK<&1Hfw9N(yJq5+>ccEY}KJNg?erGk8djo8a7Pz8N+0*KhLQ8 zYf-|?wJS9RjWc^(nOX&KO%=CjE<<;ewveIAdU4-ZA?l~mQJS8c&M`HlyRFfv2hB*- zHqb2{K%d&yRe3gsD1kO2Oh-sUoPK{t@GK{7LY|2hTUoMu@kL3s$EXfJQMA;A2-+N* z{1r?J%loc&uUIvS8&0@cI*-b22Lyl1DvM%i{wOY+O59!S_0+u0P_1ZA96t0_z{`#{ z;LficlI=jDw>u@cp`lWlE`qBWySJqU@w~0YkUlxJ+Q1aR5YHu}Y?wlUdLOOx@SM$z z?n#f)b%`oxp@Ui&g0`R}$~b6RKt{xmGCjhGAl5G%bZ7I*ETf7uk>e%a)b2sGFBsZj zSvMDt0BRN1cu%#3*kviY`4Uo?I|3^gb`10b=E%pLL@M#{<2J4z@g5HuJJ5O=Cl@!R3Bye1lvr z?ma8A79@tRA-5_Z&Pfl&O4&|$Q9npzK{rTbRhvIq&h;Jjr=Yt^X1KvYSzSd*Dn=R{ zG%R#Tn1}@5O_zh2&fxX03ShY2l~E9+X`nNmMb8nA_ms^;3Sjv^nZSj2O<*F1#?i16 zv!&KuJ*$x7b(@L5LWu1;u!uJMQH(v9XBNq<|lvZX56H;UFkDvUx zqZ>5Te4!*O$md5w?hMNJEZ@(p15opN-$h=^W~ptJBEY=w#g0yFbc!sMBP!(zRS) z1Q#NfN7+v8^6uh{Z+TK`X)EG1U@Y&778<3)W;wda3DB%rtS76$5^=sT8#J6#60~p& z3pt}jWSmg7IT?i|F=$1W~-vcw3AMLl_T3O8AEc(!}IiAB;ARy)K-L^4*$-380)q2r^zk*cE0;z)muE$z4MJt|;)1 zvIHc8a~5eR)f@`=30#C~2{CAr`Q3)4h;1HDXQKjqZbosithHiKE}R65V5dAh5QT^r zQ_gs%C_rUWLY#m*Yil1;h(n@V91H~n`=ibR`ZIybBv}IO^Kr`JCV>~Bn?(WvqW`K< zORz%2WO!;xf%w%!?!c6i0qw)#$)`2ad)A#Sdu~>$(}?zX`)aoCfuyfMC<)>@TiYRx7CIoZP7T=@Z^qgnOMCK%x%Mq9M5o;wqhbHCZA~+W`CI57bJ_CS` z*NS)|qe-3;?~gS=(7?~WQ2M7r0ko%Z{fZ>ISrJ+l`mqoCWlKL@vyAm*@p!nnI8!s) zSo93aJ0@>CA0G3mbWY=O{Ry**%O!dFXh-&cX6olber38jLD$&fYQxIJnWcCvmpbnd zG!hHuP!QhV-&nB0&kl%(Tu<}*>AD?m3qO1mZfk#M^dQ8pQ_@;Ce>3?*@*$W*O~^MK zS>dtLc{pAE-NbI4a1jrYil$|S51;gTsn^nIpBshp{s-dFem4s4jBmx=_^*zi(nr@) z{45)I`_dD6C?jZ8J{%zpSc`A)KNNyo_u9$G(B-@uX0fw)%iRgK+2l}4muH!t!R21| z9>QJ=P8W{vhJkZp$gJ$zJYAH8r6RKO#1!AcUzd=Us-(6Ao3q^5^38N64{C$(-)4HSS%%)ZOQ*qXgxz)V)1i%maPJJ81(Woda_#&cl)QOtJ1K*?d&H|MRV@vMZ@31Dm?qw#Fp@6P;@-o+*bARr#>2)G-mPbj2 ztPB7-I&?5<%{!T$1$UGGaqgs4vZcYX!QI-zROaql&nG^tqA8Kma=tQFZMCBC(?Mh# z&6p*4g);s^cjBe3MNu_mc1K*1skl@7wD4hIsC+!zk(e2`+>Ht=id3;Q-V#+Nik!K9 z=DP&%(n7b)EMhZmGHwnqll=YAP~eg0gme9vP$64y?uE__os5e@pRBz&bXPx6VE3Jsp1nPn>-;-*?)#7rib(L^Dj;9(J zcs(vz*1r$7`suig!Zw2CmsFe2mFBG?a?x`(q|sq|M2-5YXY(9;IhK@CnL7pK+Lu1W zWxgb*x-amWoRZM1vz>*qH*WlQFlA<8$r0q98 zty*WX37|{YfJ?NVZaZ1K=-!VQmsD#pzS#$>Po4Cjo+*iGF5viZg*D(;_kzKc|6|~4 z{TWo`y?#>(xUBgA+0}vU>v0e9jeFx!qq2Rl`PcsMnemzaix%T`oZnmd_G-q;X~B*Sfwl=UY9+ z_h^kDx)$=tvG5i$O7^&hcD>nZ-ddY6`drlxH>2YPt(ch%E%R}G`&DZVoYDvCE$&jV zSjMTvz`M0o^>Kt;3_B(1)IEpa+N`0(?8Nb_d49#}5xvvU#KxP{Rc>cAhnCBM?F7;f zglI3j=JSZyS{<#G$P%%s5%A^K zY6|*hvAG%?G`LP-qWjw5=^G-MJ+7}EKeoFshsJ$MFa5UByIu|;Vx1SG@Sne2(CjG7 z%>+q+g*@7>$n=M*={YQtuf>te9NPx{i) zEL488>{W~i?*uC_quW5`ZN&Qwj6VE#!B@Mtv%WU4LT~~Yc0~IlJmF%buke?|NyfCC zOgAGEV8aFVt!K*RfR8wsQ8c3KUSD#CP<9E|!qP)R1MmL0O}Z0XJB8u6ZrQ8pciQYQjUEXzCh?5b?)qA%QF5;Hqx(D@OeI8{K1Ug$8t zHL~mhoZ`jz5}Psc`?1LiKAd!_mPo1h*!7O-tlg`MB&6)JcJKY3stAs}`v440NeQ9L zvk2Gj7b+U$k_xC7oQ#adHmN78-4nCziSRn&Flkxg5F(4s>w-2(0o?8iEgAX(UmT4I z^d1iL(hl9A*EIvw&Z}Cwh_1cehggc|ac)gd;1mDv?*ml$9_6>b0asut$A3b=#=xdG zMLm5>qu)ZnEAdh`8H`B5=ZTMmbn7)(x1_jetm5fqGZKhOk6^9+bjsf&xz>yx?&?NG zLT@Ju2T1%vTv9}?qDHr;g_I9Q;GxPt;xRx8YktUtbsD)iZ+Kb7ApNemTRZi|q;c0b z+gYcBi`6U_m{&MLFNia0tgV;Mhf@7P9u!p3WdDO7SFGHFWk6K^%Ddz|C5feOi-<;V znTpsjz$2n5C@-9!@ilKz%-%@A%g}WrK1Le${C9V@BG4(eD33lqrB zH?>g1PuH86Iy`!xt*`6}tQYbJ=jA0NV0IG*_pvpj8sVY z`Myk#PB9SgB&JOCF=3pqP2gOS3J$(Cj~SM z($mpGzDP!1z&lYRi6pak`57)`y79-Ia-StE#P`&lBpND;R?_j((%E)fJbyn=M3g>Y zI0opQ>~I$LzX_L?N8!t{yI%9F73?4N5isSif>4S*Dx2UfBn151E` zb-%LqHnt871~&G;GX&UX^}lt$z`ZO({_}5jzpbQ~A%l0lcIq@}|en86K3~^lM$H{K^pvA z!s!8oEy@5BU6-6ReqXs>5u}J?^^WED84m?#hzvcd&;Irp@u3^xbAGY6M|=a%^5UB~ z@G6BS_OgWKSb!uJ=VCQmSdf86y~bpRiGV58R*g{dHE}qxh$YZCU(Ys$X-F5!0ktm3bG!l%RWFaF zso<^+jf%X02x2kst|t}+kAB!TfT`0g)ew3fyS_uUlP*0v2P=j#Hs;fZ@1!e1NAr_c z6$`2{9~1iP^fxt)e2%q;S z{`(j67x53B=CTrhSMc{v*S}x^H9*_`m%i6ufxor@{t2B1n*HB1M1O7O_!a*59*{r5 zKyv`Befb^!|IrchtEOMe{r}Wu3$#7IYx-O9|9|0sDX9JvFAa2Fe#if{y!u!8uQjWG z!fAjdtiQv5tzi9C#jjOAf2yFx`?rd})&Ttq|25!={;A=T_1_x)<~#n1|GO*kCmMJb_yhp_mt*lO k{O|kyU*Q#;e}Vsd_m`Cb2eKOgKmh*u0vCun&u@SIA3ZH*&Hw-a literal 0 HcmV?d00001 diff --git a/tests/data/groundtruth/docling_v2/unit_test_headers_numbered.docx.itxt b/tests/data/groundtruth/docling_v2/unit_test_headers_numbered.docx.itxt new file mode 100644 index 000000000..fe3177281 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/unit_test_headers_numbered.docx.itxt @@ -0,0 +1,52 @@ +item-0 at level 0: unspecified: group _root_ + item-1 at level 1: title: Test Document + item-2 at level 2: paragraph: + item-3 at level 2: section_header: Section 1 + item-4 at level 1: paragraph: + item-5 at level 1: paragraph: Paragraph 1.1 + item-6 at level 1: paragraph: + item-7 at level 1: paragraph: Paragraph 1.2 + item-8 at level 1: paragraph: + item-9 at level 1: section: group header-0 + item-10 at level 2: section: group header-1 + item-11 at level 3: section_header: Section 1.1 + item-12 at level 4: paragraph: + item-13 at level 4: paragraph: Paragraph 1.1.1 + item-14 at level 4: paragraph: + item-15 at level 4: paragraph: Paragraph 1.1.2 + item-16 at level 4: paragraph: + item-17 at level 3: section_header: Section 1.2 + item-18 at level 4: paragraph: + item-19 at level 4: paragraph: Paragraph 1.1.1 + item-20 at level 4: paragraph: + item-21 at level 4: paragraph: Paragraph 1.1.2 + item-22 at level 4: paragraph: + item-23 at level 4: section_header: Section 1.2.3 + item-24 at level 5: paragraph: + item-25 at level 5: paragraph: Paragraph 1.2.3.1 + item-26 at level 5: paragraph: + item-27 at level 5: paragraph: Paragraph 1.2.3.1 + item-28 at level 5: paragraph: + item-29 at level 5: paragraph: + item-30 at level 2: section_header: Section 2 + item-31 at level 1: paragraph: + item-32 at level 1: paragraph: Paragraph 2.1 + item-33 at level 1: paragraph: + item-34 at level 1: paragraph: Paragraph 2.2 + item-35 at level 1: paragraph: + item-36 at level 1: section: group header-0 + item-37 at level 2: section: group header-1 + item-38 at level 3: section: group header-2 + item-39 at level 4: section_header: Section 2.1.1 + item-40 at level 5: paragraph: + item-41 at level 5: paragraph: Paragraph 2.1.1.1 + item-42 at level 5: paragraph: + item-43 at level 5: paragraph: Paragraph 2.1.1.1 + item-44 at level 5: paragraph: + item-45 at level 3: section_header: Section 2.1 + item-46 at level 4: paragraph: + item-47 at level 4: paragraph: Paragraph 2.1.1 + item-48 at level 4: paragraph: + item-49 at level 4: paragraph: Paragraph 2.1.2 + item-50 at level 4: paragraph: + item-51 at level 4: paragraph: \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/unit_test_headers_numbered.docx.json b/tests/data/groundtruth/docling_v2/unit_test_headers_numbered.docx.json new file mode 100644 index 000000000..38a25d339 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/unit_test_headers_numbered.docx.json @@ -0,0 +1,753 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.0.0", + "name": "unit_test_headers_numbered", + "origin": { + "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "binary_hash": 7684538628968220703, + "filename": "unit_test_headers_numbered.docx" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/texts/0" + }, + { + "$ref": "#/texts/3" + }, + { + "$ref": "#/texts/4" + }, + { + "$ref": "#/texts/5" + }, + { + "$ref": "#/texts/6" + }, + { + "$ref": "#/texts/7" + }, + { + "$ref": "#/groups/0" + }, + { + "$ref": "#/texts/28" + }, + { + "$ref": "#/texts/29" + }, + { + "$ref": "#/texts/30" + }, + { + "$ref": "#/texts/31" + }, + { + "$ref": "#/texts/32" + }, + { + "$ref": "#/groups/2" + } + ], + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/groups/1" + }, + { + "$ref": "#/texts/27" + } + ], + "name": "header-0", + "label": "section" + }, + { + "self_ref": "#/groups/1", + "parent": { + "$ref": "#/groups/0" + }, + "children": [ + { + "$ref": "#/texts/8" + }, + { + "$ref": "#/texts/14" + } + ], + "name": "header-1", + "label": "section" + }, + { + "self_ref": "#/groups/2", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/groups/3" + } + ], + "name": "header-0", + "label": "section" + }, + { + "self_ref": "#/groups/3", + "parent": { + "$ref": "#/groups/2" + }, + "children": [ + { + "$ref": "#/groups/4" + }, + { + "$ref": "#/texts/39" + } + ], + "name": "header-1", + "label": "section" + }, + { + "self_ref": "#/groups/4", + "parent": { + "$ref": "#/groups/3" + }, + "children": [ + { + "$ref": "#/texts/33" + } + ], + "name": "header-2", + "label": "section" + } + ], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/1" + }, + { + "$ref": "#/texts/2" + } + ], + "label": "title", + "prov": [], + "orig": "Test Document", + "text": "Test Document" + }, + { + "self_ref": "#/texts/1", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/2", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "section_header", + "prov": [], + "orig": "Section 1", + "text": "Section 1", + "level": 1 + }, + { + "self_ref": "#/texts/3", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/4", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Paragraph 1.1", + "text": "Paragraph 1.1" + }, + { + "self_ref": "#/texts/5", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/6", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Paragraph 1.2", + "text": "Paragraph 1.2" + }, + { + "self_ref": "#/texts/7", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/8", + "parent": { + "$ref": "#/groups/1" + }, + "children": [ + { + "$ref": "#/texts/9" + }, + { + "$ref": "#/texts/10" + }, + { + "$ref": "#/texts/11" + }, + { + "$ref": "#/texts/12" + }, + { + "$ref": "#/texts/13" + } + ], + "label": "section_header", + "prov": [], + "orig": "Section 1.1", + "text": "Section 1.1", + "level": 2 + }, + { + "self_ref": "#/texts/9", + "parent": { + "$ref": "#/texts/8" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/10", + "parent": { + "$ref": "#/texts/8" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Paragraph 1.1.1", + "text": "Paragraph 1.1.1" + }, + { + "self_ref": "#/texts/11", + "parent": { + "$ref": "#/texts/8" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/12", + "parent": { + "$ref": "#/texts/8" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Paragraph 1.1.2", + "text": "Paragraph 1.1.2" + }, + { + "self_ref": "#/texts/13", + "parent": { + "$ref": "#/texts/8" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/14", + "parent": { + "$ref": "#/groups/1" + }, + "children": [ + { + "$ref": "#/texts/15" + }, + { + "$ref": "#/texts/16" + }, + { + "$ref": "#/texts/17" + }, + { + "$ref": "#/texts/18" + }, + { + "$ref": "#/texts/19" + }, + { + "$ref": "#/texts/20" + } + ], + "label": "section_header", + "prov": [], + "orig": "Section 1.2", + "text": "Section 1.2", + "level": 2 + }, + { + "self_ref": "#/texts/15", + "parent": { + "$ref": "#/texts/14" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/16", + "parent": { + "$ref": "#/texts/14" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Paragraph 1.1.1", + "text": "Paragraph 1.1.1" + }, + { + "self_ref": "#/texts/17", + "parent": { + "$ref": "#/texts/14" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/18", + "parent": { + "$ref": "#/texts/14" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Paragraph 1.1.2", + "text": "Paragraph 1.1.2" + }, + { + "self_ref": "#/texts/19", + "parent": { + "$ref": "#/texts/14" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/20", + "parent": { + "$ref": "#/texts/14" + }, + "children": [ + { + "$ref": "#/texts/21" + }, + { + "$ref": "#/texts/22" + }, + { + "$ref": "#/texts/23" + }, + { + "$ref": "#/texts/24" + }, + { + "$ref": "#/texts/25" + }, + { + "$ref": "#/texts/26" + } + ], + "label": "section_header", + "prov": [], + "orig": "Section 1.2.3", + "text": "Section 1.2.3", + "level": 3 + }, + { + "self_ref": "#/texts/21", + "parent": { + "$ref": "#/texts/20" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/22", + "parent": { + "$ref": "#/texts/20" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Paragraph 1.2.3.1", + "text": "Paragraph 1.2.3.1" + }, + { + "self_ref": "#/texts/23", + "parent": { + "$ref": "#/texts/20" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/24", + "parent": { + "$ref": "#/texts/20" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Paragraph 1.2.3.1", + "text": "Paragraph 1.2.3.1" + }, + { + "self_ref": "#/texts/25", + "parent": { + "$ref": "#/texts/20" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/26", + "parent": { + "$ref": "#/texts/20" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/27", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "label": "section_header", + "prov": [], + "orig": "Section 2", + "text": "Section 2", + "level": 1 + }, + { + "self_ref": "#/texts/28", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/29", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Paragraph 2.1", + "text": "Paragraph 2.1" + }, + { + "self_ref": "#/texts/30", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/31", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Paragraph 2.2", + "text": "Paragraph 2.2" + }, + { + "self_ref": "#/texts/32", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/33", + "parent": { + "$ref": "#/groups/4" + }, + "children": [ + { + "$ref": "#/texts/34" + }, + { + "$ref": "#/texts/35" + }, + { + "$ref": "#/texts/36" + }, + { + "$ref": "#/texts/37" + }, + { + "$ref": "#/texts/38" + } + ], + "label": "section_header", + "prov": [], + "orig": "Section 2.1.1", + "text": "Section 2.1.1", + "level": 3 + }, + { + "self_ref": "#/texts/34", + "parent": { + "$ref": "#/texts/33" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/35", + "parent": { + "$ref": "#/texts/33" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Paragraph 2.1.1.1", + "text": "Paragraph 2.1.1.1" + }, + { + "self_ref": "#/texts/36", + "parent": { + "$ref": "#/texts/33" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/37", + "parent": { + "$ref": "#/texts/33" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Paragraph 2.1.1.1", + "text": "Paragraph 2.1.1.1" + }, + { + "self_ref": "#/texts/38", + "parent": { + "$ref": "#/texts/33" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/39", + "parent": { + "$ref": "#/groups/3" + }, + "children": [ + { + "$ref": "#/texts/40" + }, + { + "$ref": "#/texts/41" + }, + { + "$ref": "#/texts/42" + }, + { + "$ref": "#/texts/43" + }, + { + "$ref": "#/texts/44" + }, + { + "$ref": "#/texts/45" + } + ], + "label": "section_header", + "prov": [], + "orig": "Section 2.1", + "text": "Section 2.1", + "level": 2 + }, + { + "self_ref": "#/texts/40", + "parent": { + "$ref": "#/texts/39" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/41", + "parent": { + "$ref": "#/texts/39" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Paragraph 2.1.1", + "text": "Paragraph 2.1.1" + }, + { + "self_ref": "#/texts/42", + "parent": { + "$ref": "#/texts/39" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/43", + "parent": { + "$ref": "#/texts/39" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Paragraph 2.1.2", + "text": "Paragraph 2.1.2" + }, + { + "self_ref": "#/texts/44", + "parent": { + "$ref": "#/texts/39" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/45", + "parent": { + "$ref": "#/texts/39" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + } + ], + "pictures": [], + "tables": [], + "key_value_items": [], + "pages": {} +} \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/unit_test_headers_numbered.docx.md b/tests/data/groundtruth/docling_v2/unit_test_headers_numbered.docx.md new file mode 100644 index 000000000..d4c8accde --- /dev/null +++ b/tests/data/groundtruth/docling_v2/unit_test_headers_numbered.docx.md @@ -0,0 +1,43 @@ +# Test Document + +## Section 1 + +Paragraph 1.1 + +Paragraph 1.2 + +### Section 1.1 + +Paragraph 1.1.1 + +Paragraph 1.1.2 + +### Section 1.2 + +Paragraph 1.1.1 + +Paragraph 1.1.2 + +#### Section 1.2.3 + +Paragraph 1.2.3.1 + +Paragraph 1.2.3.1 + +## Section 2 + +Paragraph 2.1 + +Paragraph 2.2 + +#### Section 2.1.1 + +Paragraph 2.1.1.1 + +Paragraph 2.1.1.1 + +### Section 2.1 + +Paragraph 2.1.1 + +Paragraph 2.1.2 \ No newline at end of file