From 7fe32073de7c93cfef72769ea02139c44d807d2a Mon Sep 17 00:00:00 2001 From: Hankyeol Kyung Date: Thu, 26 Dec 2024 17:27:03 +0900 Subject: [PATCH 1/3] Add LLM-based image description to PptxConverter Signed-off-by: Hankyeol Kyung --- src/markitdown/_markitdown.py | 36 +++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 789c1e5..2f471aa 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -768,6 +768,17 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: except Exception: pass + # Try describing the image using GPTV + llm_client = kwargs.get("llm_client") + llm_model = kwargs.get("llm_model") + if llm_client is not None and llm_model is not None: + alt_text += self._get_llm_description( + shape.image.blob, + llm_client, + llm_model, + prompt=kwargs.get("llm_prompt"), + ).strip() + # A placeholder name filename = re.sub(r"\W", "", shape.name) + ".jpg" md_content += ( @@ -857,6 +868,31 @@ def _convert_chart_to_markdown(self, chart): separator = "|" + "|".join(["---"] * len(data[0])) + "|" return md + "\n".join([header, separator] + markdown_table[1:]) + def _get_llm_description(self, image_blob, client, model, prompt=None): + if prompt is None or prompt.strip() == "": + prompt = "Write a caption for this image." + content_type = "image/jpeg" + image_base64 = base64.b64encode(image_blob).decode("utf-8") + data_uri = f"data:{content_type};base64,{image_base64}" + + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": prompt}, + { + "type": "image_url", + "image_url": { + "url": data_uri, + }, + }, + ], + } + ] + + response = client.chat.completions.create(model=model, messages=messages) + return response.choices[0].message.content + class MediaConverter(DocumentConverter): """ From 9449d5b9598bbdee105015f20e835c8461ee687e Mon Sep 17 00:00:00 2001 From: Hankyeol Kyung Date: Fri, 27 Dec 2024 15:28:54 +0900 Subject: [PATCH 2/3] Update LLM description method to accept image object and validate content type Signed-off-by: Hankyeol Kyung --- src/markitdown/_markitdown.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 2f471aa..faaaa24 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -773,7 +773,7 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: llm_model = kwargs.get("llm_model") if llm_client is not None and llm_model is not None: alt_text += self._get_llm_description( - shape.image.blob, + shape.image, llm_client, llm_model, prompt=kwargs.get("llm_prompt"), @@ -868,12 +868,18 @@ def _convert_chart_to_markdown(self, chart): separator = "|" + "|".join(["---"] * len(data[0])) + "|" return md + "\n".join([header, separator] + markdown_table[1:]) - def _get_llm_description(self, image_blob, client, model, prompt=None): + def _get_llm_description(self, image, client, model, prompt=None): + if image.content_type not in [ + "image/jpeg", + "image/png", + "image/webp", + "image/gif", + ]: + return "" # https://platform.openai.com/docs/guides/vision#what-type-of-files-can-i-upload if prompt is None or prompt.strip() == "": prompt = "Write a caption for this image." - content_type = "image/jpeg" - image_base64 = base64.b64encode(image_blob).decode("utf-8") - data_uri = f"data:{content_type};base64,{image_base64}" + image_base64 = base64.b64encode(image.blob).decode("utf-8") + data_uri = f"data:{image.content_type};base64,{image_base64}" messages = [ { From 06ccea5219ba47083958e2e4b3521805e611b914 Mon Sep 17 00:00:00 2001 From: Hankyeol Kyung Date: Fri, 27 Dec 2024 16:07:33 +0900 Subject: [PATCH 3/3] Test for PPTX conversion with OpenAI client Signed-off-by: Hankyeol Kyung --- tests/test_files/test.pptx | Bin 124277 -> 128721 bytes tests/test_markitdown.py | 16 ++++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/tests/test_files/test.pptx b/tests/test_files/test.pptx index ea1bbcb0f84f4e1007be35f2262173529215e484..d86f4dd1e0ed0ae9bfa47adc3fcf761146e23bac 100644 GIT binary patch delta 12479 zcmajF1zc3!*DpRpHv=N2(lIo`&^#+s(I^@kV^Ss z?sYRJA^#I-8y@Mf>;hF@EkixnzvF;lFR#PvUz`T00f3um^?nxRl9O7;l+PSapo{iH%3)C8ZQT z8Sx-Y5Igz&J3aU%{FwcZz6&##9`1S4bU~O&Fh8(rA)EDm%;GY`9F-u|x}c!%6!L>w+g9s>tTV$gxsTMKY6R0VoiP?) zy0w|%t51HydmWfzgDmnbYTI1mD_s0ZbQx!KGjX3AvFTX8Z>!FvtLv9<@eT!j7vz$U zex|=?Kr)uMs0(~j?q|vtk6=>JxL|{-UM@$yTE%uOp}r^~yKGkO?|q`m?MUjhA^vA$ zg-H%XP_Mt@!zqh8RTm!&t+{#cA^bUjO(+>}wYbRp^~PmuHvyonO~Vg^VCv~}qsPC@ z)G=2jHO3NMByITG8lk1#zk_zZ_@-kvjLXW9IlcQ+s8#yp*^kV&kf~3xGXvYTiD}p~ zyn6mh@~baC(4ZR?6~ie?o7Y3@3&d+9iotzW4-}}*)f*GctrAjNXP&Xhy^%woajIBJ zu>E?Ghl3t%Z@pZvW@$k7+P#piB}|lq{5!K*+0fogT`!cW<1~+BGA$P^MQXO!z8GBN zuq;%sTr$N)elj#PCPmB+dL({ZQ* zu7xmw1Y_ttT1+iZ<;R3wQ+yN^*Bvd1P zr6C>aSStXwCnXYYQ&gaqb^BeYlfRj7#iA3tS}OQyz_F1WFNXXi>|8m>aaiwz8YNL! zJtdRr*u2))wMycQFoRYVYJbR!XW?YSVvoKbGRcIJOQhr;f=LB~|aaFt<)uCF9h#VEL^w7<2F*ItzdGn%2DlDn({4F2Vyr8rI z6-JH)TOn}mv)cMA)IXUbp4-X|HDbOe-!r>VI<2L3<=pIfI2LeU%<%?*K73XH>2#-#%^*VWkZ1(y#D3HT6vBCOAwjTp?hr*0=Lc zV-iOw8mNhD!m01SfSh-m(dKB%ANTz6$lsVJBTD@;6YZ6T6&$!>!<6q>8DgicZZP<= zz!=fr$`Bqum~|FI0$-5UjKZag(RFTDa`n_YKWF5))3Z-U@vsETB4mo(iQPg{18X@rxQCfQ%wvr_LNM z5n48?R^<5?u@sqNg`wU|~ffA;$QV#Buk;*>f zh699x<~F%nAGX>VHS~?Rhy0X$Xzj`8(&DUL_ao@{gR^aCy*C+p1p=;|Pb~Qc0tP3% zB=!t0T~YMrjYf>3^F2D|->O-vR5=GUqg z$4&p`=p2{{QH;FV;m8kvgnPKCISCc|O@aS5E>X4V{s~=##981vI+AcEG?L&b@(>?y@y};>c zYVA}XGhl^E3ku!~tPkjIsGa(#sxYgkKs%@*PyKdKXTtR+AaU5|O3tfG5=AKX@spIj zv#0&8XDc$NN-Q&rjO-q$5%oCJp7cvA`}1E39gFfmr6^qImW{5I>EowkF(2`M zC;7&Rm-Wk>wv@8dXq`5u2rA$sN1*tOiJmT>3&Jp+Jr&~I?`PX?;>AI`dxA4*6y}U8 za@_Fhn0Rz3nv0C?qYGah%2NtfVnMQ~^0ubSFj+eZy>W}Z>>)?vtM`;Wh|$iBvEv{O z!T5w;9iP+OHNNG8W%HFXap~CEJd0+_MQJf=b9a9AIohm~NTau?MLqNt+p+=vr zB&)(us3Chw4#}Y}v9E$;zNHJe4ZoSLROqh@%dpdQ2qhDqbYW3Q{U&WGrDwt{CgZ2n8WVJY^Pk`eDe51gEn*%^E-;8EAz&SiPO?JCi!8(Y}=1m*L2at;aRuruqXiyHaY+ayzo%pAczh zDtD}~tZ?;-G)UAnw<~d03c<+Cw0~Q$WzPF1`=N@gTM@LG3~zoN^kXzmb|Y4q!YRnpuUL5N==Q0^b=YOe++Rpb<^5#CSX zGhr4~G_Y!(Oqta)ejtwW8$WF!bO7M_!~0f$P7oRz9}FC+w7`sTzwmwS%6yf1JN{?m zSaF>Hvn8xN!Fhz0(5Iz+a^h^>slRRix~(!V1t0FiIQ@c5>_|VjDiHkr^m@?8r(I%4 zxS@N7r%1Yoa87Z)jw{OdX3fKGaAM5?>dP|TJ;XnKOC#>At>A+Bb%Km-e)WcT=s-*S z4bIf6!a}!GQS3!v<=iOMq0cg($5`!(XU%|TiJlhvxA1p!`&#%i3`I^^H+$NzCRE02 zeF|J+&ZZ;uZ)y~`N=JtBaUBlVxh6`ER%Jd(oXG_nux|}p9?1|U>tiuLEF3o5fo3#{ zz><>;>MrzM!~bYbKP=+U8O~EMHq!g;xf0-z*kB`2xia_KSIp%oU{{?#do01eGWZ9^ z&amDy-u9|)mY|k*0=fmtf4;maC>1gD8o`)nPD;$h*su)_$hPliedW}#+HvVH`@Jip zqJUuTEP|HO3=cKi3vfk(nz%&q77PvHB`G06U+pdkT21{t`|eOy$N@LzjXVD!EGsx;#S_jwO7qaoae{SB zmm<=w()H79Ub5|=QHjx-Xg0f+?KZ#br zZofh{7c3D+o|dyyf0&^DxKOvwX9Vx`vkIb&A|yZj9EhX-Nlwh$0;a^I#rtrc{R4Bv ztE_+*ZDw>LaAm20w_j)Y#I5+xIwwD~;eu%k?ohmPpFsV5D_!lv@>G7z)g!z=0-e8f z^T*~%52#4LuW93ny{tQbKaKx2*OfQqH~Yvt{1ex=RCIXU3G@JmSW|AOQklmG-&9wQ z?^K!kDD=k~_2O+;;%$$Vms2*vpxJuYQgPugNTq!xTCBesea<8TK275V1+f@AiPM2E zfx$)UHC6LXtqEsp1jdRt@^3RD-(u)|KuLZ02UWielsXgvNamJ|AsGgT|H$TwP&dxa z4=UU8=D6sZ_LiD8iyjexvKD&G8*IOR@_dyxT_bWoN^Kf5-0RKPbsoeKKUq_toyqc} z2R}7Gs!))LskIYj3w)wQjaC*4xdx+H?#KevNZ*e?=Qo}@GoMxO@^^$5Ipt{xl>GT> zyzt`nS+%O{gVFF>)|w-$x%wKQphqbqQ!}^)kDI#RB{WS;PEzVb7HSX#lDbV+LtLTG zPh|B^rF>a%(mT&qfSIx@ImiKrwSCCLatVZknpMgm&GKPH^b>K}0-r0V>wIu~I|vci zrw6S~pH%g&d2dJ)N^QO9-M}eCvF#+PxwOBIbkLkj@PH$^%dzdDwmUchL0q~?5OQ7?$lp&@yNGA4y!(}1Ohmr)r^!{%f79AYwd??&Ie zekDwA%^KCD+9sdvnThJ}9E~rEHn#OMySdfh{+VH%_o@zco0sE;epz8^@PdcVC1~vC zb$&|pmd;nkyb~jx#ON^oWhl!Gd9#3M`o?l?w9YLjb zr?mQ;j`-Kr5}2?gS+Gw4>P>#!e}HftENcq6mwNK5oj-V#zO!(|xzsUzu`@*6uFXt9 zeI;C}_(HMSS@DPBN2OFBG*fg)(MlS}mo{GUrGC zNcIn&BNnph0B@~m;h#buwkTdt(swa6NR78!&6`jAxb!8Rsri)iP_n#?W>C^DbeSU5 zCmj0m>Dv#&TlBlv2AaiX@ow3S{2U6@9xJPT^%pp#WpO?pu6IaChKRINAq?wz86Ies@*_88PVWH3&GVD!DgoBF_LRmeI+KN0 z>%i@r2R;P#K_YP<4TztkccG0ibVh5GQ=UimQ@86juH3Y>e9nA&Cc>L;@W4s)S>ERC z2HDk?;eDVN{+9oHASwWGihchseWa|70yP@6l60xd&VEiHbM~WlIi?9(+&#M3GHMxq z$Zm-kc=OhS?a}~fDo!lxtG>LPEUor9&;nmYH8$vdEQmB6+-va_o&ELs@k!_ms}Swk z#8*X&l&t>tMt}vqXJmB0}|{M1TCOA z(uo~hK0y!W;yvDx@9SbqhIkm5Q0QldG(S`$%?9g8kzY`7E5jmUKFp;+GUCevcD{Nw zdtY4+*ex{PUZLcKQ|!o2yUz8Rxei0j7{b+K-|5oSM!i4_UPE~p?l;UH8N=SmCxV*& zX$rVX38MWid4FgR%H17EA_LqPGlq z6a=(~E1SO`3c%-RYMSCLbFyvK=iKI9eWhGFvJ#aw4dbOb3j}($Za98+Io+&FF8?CT z)wLsk-ki}4d>T4N!Kr1A)sM>MJT>xQ9$u4J<|TxH_S61RQ-08ay)lUkRM+;y#(%!} z6zegS6B*`X`kORMSWQ;M2=+spzRX+C85Na?Oug*LP!xC{f+yLGmhG4oVZgsG0&&W| z+m52bx;=0LbhE2;vw>tZWucBc%upO;fPZ(<0zmZK@EznA`2m*6OFh?gPu@*dvuc$9 zwI$CZ`hR!al1PS+%_VqX{6@L@Y>MT&zg;^IpIvzm^=yWzl1k_ROo6fJ{|~}XRcEi0 z3^lEkM?SWq6eogBO^e;3)E6U;R6DuAq}0SGo8+N(e1ZK3lN(YsWxeO6hw{-UZep|! zS)HC}M5TW|!a&*|3v#%*wtab49Qt}ZL9L}w3-&Sx9n!bz;|Ly#h@Mh_aj7dxLKu?Z zEj=9}4@DBFrOMJn)NoT^t!YYMnh<9XN_lraRvIsoIPS_=?aZ}m-N&){z0{UNV-l)3s z-ildyY5~ZV1jtJ(OZ||1`{C}h@2`iXASSR9k=}7v{tq6)!)b15CMz!^C}3oQbPNBx zFULD$|6J^M)g4~~3jqLPJ(ep@Z3)1FK48`BEKX|;2GJ`d%attu0+8}r&4Ri8IzEvL zg*^@hWy*hM&~Oy(BF<8X0sIQ zSvf-?MWG^p?7N!MyE#gyZOs$|uUlsd{*B2p^Bzhm5slp!m7;TEHrvJ7v;NEh(DjC% zs-lG^KU7?*ol|WzR8^=gyiUWYdRbq6u+j-*GwiRgaoYI#O#s5)U^}vf z=Woo_>2+m2Ie)O_bQ+a(y|NoqViWL+y4KTl{Yz5E#%1m8W#{S9`+?kVCwsSvp%^qa z+W3M7W*D_*)F!u2&vlJ2SV)(wE?KZfy?u!$hkPh1;h^A}srJ~AiWJ#|P%KyYsJ}tU zCb_%OE_19EhdMki@_3DWK1%d-limpk1c;P5B(ekVHUR-$Ve&_1$esAEJVY+czaDl7 zrv&FqG;@_U~B*eu+hkky{-<1`GFP?sy1Q+Tc|ZB`?p-m5aHiH>Mx}AK$bt z90P|DfPDh-{ft{8J3{CnrXm8+EQkR7tc8|kU3Y}uRu3Wo-&FQwdo{I3n)bupy>9)4 z4R3+ST?P7+V`=Xa@Edv##iTx0yGh|%Cp>8nzP^NdioN@v9Tah&H)UB)o1Y3yvUAvo z4v0geaXRX#40!$fd7#4mKX5b`*XKLT41blERDW4o!ALGDoRC`B^oS%;(=#6qBvF$< zzbif>OLi9|5W~y;00cm~(7Vl0`5gM@RvMq|ww?w77*z)*v7^FQS55uhxQWQ&WoR>c=rA0P5jU^-)B6$9&idm09Gv(Z|4z!VV%`- zHEDPAtu$#8Rp4Dg4(~rlXZTb*$I_4t0Vsonvk~ry^X?SO5N@&h@{Ya6wo15P0P?&L0A}cwKJ|W`_?7_z(6w+oAv*zv z3M{tXyy-I$-{PA zZ(wQXe6ZAOEEL34SwfN_T9rdP&Wi{o;^?fkQ$d>F%SnguLCk|4M>;nUH(!{s#> z=>|o1h9&FOjsg0GR`v~aqq1nk?J{`s=vf|!4YY>$>6$VAw&zA1;T}zRBt*PbFC%m+ zfdy;zULA;o<_8;rkA!}F_U?b*`?hd3T@okz9Vsb_zW;t$kUYtWB>gWpltFql&5&Z( zrs@G-gLsCH2VVsqnswt4eW63g>J-W|W#jndMOQ;xLRYgG1K;xKXJalH!~WPdXKy+t z=BG&$xbi&K6CM(E-ATw7O~+sM5uI>8StorFpUpnn0ot-C%zysfTQD037bBggpTuGp zQt&6xJ@8k09;#{;27S&);tGh_m5eL@#@i2WLE?N;Z-owz1odBh79;V?rT!DxMd>g2 zM4=*cl9EI=l%GW}<-Mk$WUR7U;~=*+chT#KZO=;Fpn21%5mafCHZyx03C0U=1UjX% zM~UdeuZvKHfUPcJaXQI-V}>yn;zhTTq6~?K02TR*vCfXz4_qM2-+m}$-+cBIQax9D zRPc!>&pi}fv|`Hio?3n@>OL;0)foQsdb=m(3hNeK2BPei0dXcQdQG7 zqw+GpX#LK$eo4+bc~VAR=jy2e9HHJ1XBz;~KrcOd%glVAQ5G+?SmgkNg!t!SLk8S! z)eoGZmzH|j((_Z>@w(+anAOIMGnX$XYeB(RITke!n}=C`Raed}w^d>pDYlq~I=QgN zF;$9GShFp4J%yro{3;l%st8Xptuip>(j%P43#4r0IyU*hqBW_rC0dZJ7l|%YsmJb- zal=fl9NjssiMfL9@MSG|+J_fR&u7`AsQd^U+C@y}dO#d%+Rn1T%C!iatY%>OQPrtO zG8yVd@(5(VVqQI(;fU~#DhWe)y6&@&S5mMRm}7Caeo5m2zwc&*nkODEg150#M{b2k z(G&-;sv)2;VXy%DRH%^4L%0o|Z%>@+*BAcW8}s|2!EJ0t!qWzDy92!B-@~uK!HQAP z-oWDQT43+F4$nsww?G2!!oiYP!Yk;gv&gjy7LAii%(r zdBpIkIFhND_l?vzXuQwZ$!uW3#M|^G6T_tZ7O?P;8WXQ^oM1!7fS#t!w@Wys0o*g3 zD)RdYh9#u)LqMxC@#9mNJkO?EZ3_Dpw00<8tXfS;z_Wb5*l1NFz_5=JO#iTT-$ zDP=dvlR2MmCDIFd64O?AtikS_yO2r^8DVMLSUp;gw|`MVk&j>uOstFNrK(;vhvxg* zkrK z@_a)14~DT3YiIny=4EUhM7)Azzb-?;V|$M zvX2i3O0d z;?QLr7t%N(;G)rR!OyS1=2<>-EP8XY&cg2d%g|=rH@jgEHF}TS#bE7222H6LKnPdJ zl@m)P{N#m6I`!je!qn71B-^J?9riPB4#nn|BYf1$ zP|QzUic`-7+5XXaV|yD=j(|lck-~qk06|spT|fkC)lLU65s1FaNTWy!ml^}IL5K9m zKN$;rnQWQFCh>epEz#*nS6nWmo)SOoR&*!}ZdfhH>+$6+`+kx`0n-sGYS9YGB4O-S z*sPe--t?(zQ;a>QvF4H~&X+Xo&TzT8E>C>jZ}EHd+6q&SuI*A?a})GalX!XUImMF3 zm)OWUU@{S{LKSnvgAO?X`lVRXUr;QYp_BP%j)bkLSz8jmqSD(LxE|b zCGqs3VgF#-Oyb^$GE9Q30{&u0ua(IgR`d7x7QCxjgtj)5Hyb4vINl3*v{zPV)VGDF zFIE4p#dDj9;AdYbe>zX7u#R(ZJ}s8aWcarIDRgQmy|*leYlLn9l&8H~G=ri4)mQJI z0}=H8@L0&Fs>D7Z6A;>CJHrihhK4ha8qQ7pn<8&D1h38l!!HI8@3sAx*L-=CFf!WdW3!f2Q77HV9DtoJ*Yc(q z(=$rK;MP^{Cj~W#up7OiM#2oz=@T9KRQ>M^vXL0IZ3H)D_Q~F^%T--9F-7*j=nSLB ztdvM;$vM#1uOC3C-aj7~38;N-ePppZUgs+iFkMPC)oP07FXoEN$gXp#?`%)WPL{Ut znFxErBjJp4v|~hke!|m`*r@1gU~sIgh3D&xQ~4)CKRO}eTk=KenxuK(X?9C(_{l3Z zR;>*PmXGNk5y3ov2SZ%=VVSsRz`it-u#buW;``M{KWT zb_g5^1iY&<8U)e;p)Ix}5}*e9146b1eGyvu8H<&C1(s(Jo6Op^`h%#Hho?U(>hL*o z93>~xdrLk<|jo-PY^9`^5^J1*LaDQ;>~%+Oyo$=8I1-S(=vCmJ{U z1$CD~NGQ<0kJj;SI%U;hZ&fgley5r1J<^owO{aRUBBe|5@J*>9DXq^5v9lqzWuAbF z6kqe(li_L3Bq%Nubw3k)p<|DWX@lC%!0?l}PsThk>o;I+SUJ%pck@VRIC9eis)0NS}=k8_%pI&zT04gkDE`0fq7xO=d6X6r|4$b>m7`cS7>B?G097z zpXILvRxLuNuj?-rD?491!YQ@+hUuCR__(Mw0v~>KJ2I&}<$h5o*~lI2kz4sC@V70O z@Nb|i_G6Vc>oaqL+JKc2%a%QYgqMCQLWHPok%A=o%4VwX3(wEl23kh7>yLhO;QHR@ zO2F4wky|bOC=diB2cmEwp^X2T%?ghj2dd#%BjGi}yTg7i_|iP^;iLb~Q^fxV0Qm>m zxceQ1(@X%F;m@Xklqgv^FgqG@SRow_+-nL5g2zn&DS;XA;t8N6kRN_@SH;2kCy`Y> z++z|5HvF4=_Rq)oU$b^J|9qM|SzDTzI&s}s_o->BivRna|4sh@0I2`!BPkh1c+M2? z>EpkN3;+Pq>VI6L%y;(-|GWUCgzrrv2Y>+I4U0P?IKg!fk-#CqS_jDrJD zTtE&l?M?-k4auZ~FH8bKkMA-9u#oA&5dJ$@@jovbcGzPkdN|oMQcV@v9f~;w=3vc$ zOcua_HzS#k|7HyQTh)dSwgjPv6V4!|{vL||Tk5+oQi=+$GXs41`0wHPzomr5@1=~U zky3vT$^R`?Ds?ZFi?nrr2nBDM0g|F5%inX)?znfSEO5{)lKWfbo+~qppl;v?tl%z?Bz~Yq2?pTnjW?*rgfgKQ4Q} zIL3DOQa%j`uXue;_DcPDq;b&M{a~{GIRl zNc0(k66|tMpWo4uu6H!7&>k1Azl4;Fb-QmXTmlloRI@4n!_|`9;e0DVW>}~r2>HbQ z*H!uj2wz(Qilg3q1Hze?fmHt&5c|J$0D#dr14AC+YtmwCWEz5bbN=2~a(d)<3xoqhJ+_gX`{7$KpM6(OsVyreZ>GtW>DY>XxWK2hW)CBijeT*Tig)JW#hKI~hSk&1 z8V4oUhlw|B`_@ttvBmJS$OMBIS--d}VSYcD^gj(oWU%j??bZ*W#b;uqhK^zr*U5k5 zv#)rj=_ZD5uH?8zRCh2RxI9~k!_(d~otvL+2Au&*i|%GptK4?dTuk`|4(gRjbIBcb zgbx%w6NoN69-0VfGE2S|6t^GRaD3jYmc|h#Kh;4$X7qM16g?rC3GK;EU|}_e&4&yw zHd~3Wyb&I5FWR#sej(&?$8QSOYM8n<(Ldh;Xs?&Q`nmE*4yRhLbZNcG-I6Bfw39R^ zV}Zu{4q#tA6XtvP(T?cqd1gWU#GGbD++HMsqcw283+ueP|d0W4dgFYl36)ipg!NK%_*`@XCep*ZB5y6q#OTrot& z`(ll%is3Mi!DQ>mDVELV@b}di&3@P`n_kQ^^5`$}b+I;c7w8@Co<9Vfa)Msd2~i%j zS1jV&Ue&ssYKM6-RY=($OtKvvb<1ACo{NwoUZL8cU@=;e{mezqm}J-n1}${Xb%ufD z8*;wT^0=~3bS%U!>kmjifk|>YPGjUgaQJLh8B(BH+L@%MtTz&F0n8rkS<`-M5DJ|hwDH3}&M3StW;*J~rKYx(F z*GwJ?NZwV$To6xwJLdnVh-t^Tpj(^N5xgnE8zq8QWp5Jr z*1HqR460pZ$_sj8rrh7o{M?p&pNBOgY!<4ix~dq)fNEJ$X-rw&z8>51S*|Ijl4HP; zSdH#nw=ErRmtNK}!>3{@iHUaQR=bk!YO%$MgO=^?eBGks(1u~;SH|5DC(TPyDrF}& zw69?j03JHd^RFj2c&V>W&utWp{%jD|hQ$D=!WOWYpr_0XiiKz-ry*uy_}gFFF?+Qg zfJZ0~llNcSAIGZUaO?q;k}Hfs`NQB0d{Csk2;DF{*U7-iWt{Oq?%cd>p6#wpa;}`* zaI^RCSJ^UHt)r)Nw$HyEj3A27KEcub{G-`@p#!}180+ZQUWThAncgR;_~l%2)x8DU zRv+Ai=ymD}WA$Qvxwv&k$FO|fzshg4R^*#Q3=T?Ed?338!7N2cV02tp@bnD;0@H^; z5JNbhNf4CWqyPeslMsNb-?evP?_odV564sjK%gZwgas=f0DU{46(K+pdB)a;^4kLY z#~ZnHA!_T+&hE`~P?l$?@(Wc{o%wl5VWy-i4|${ck<(fJGZA0J&1ba=psw)DYq%3Frig1X!CvL(5Bp?>l3fba9%kaZ0l z=N;3^DE+nPn38WF3m%tIc5YCT=?QJ76w5>@`s*^2cb~k4;6=Y-XWbo@O49O_R}$9j zqc16DzEU0uyGd1dBr`l z>-ngueAgGaE|btj{4Yau)c*9ZmG1JhT1`!xZ4u-n?|xko$sA&!5|DXaqum7_)sA0Q z4(iHhRx>5K>y_=AFK=XD^B(2x^HMb$8*R7AumiUY7W-a_!}EdU$V=+TGDm=&#! zfQtS_f!d@`M!1rq$-bC>zbsCiPIQX*TK19KUO|10OKPSs>Enfu%S8O%mD{>+AIilK z$29b61tx$Od~ae>1QIReJ=8ynnRr6Wgn>`*_0Q;oI<583s`U}2f1d;uQ$}xJ9-}Yq zQM1mbE6P!5)AvwdzHuSId}0iGGN0)ex-EekPyzi#V1Tx|qcs>^zH7^^nNr}PF^5Tb z{^+egA-1`8BQ3k&afcu)XLE^2Ykm!|PiOKxIPmtWo_^uhKXfZU`nMOfuPOWWbZmY| z^5h%s2#2^$=>rIfI(cEaq65u^`g2) zqfV}b&7Y6duO7TMq%!_^ZuzM5+_YG`+LFDg0}I!m1EV@$%jRn!*;`(rDn(I)YkMlUTynQ z6xm-UAJo<^RaYnlyjZa>cR8$s*p6k>Pgne;h;uGA^fz6veQ}W&^gZCJXQ<+f^gjM& zlQeXYzsJP*`G;MCBgKG1ecwg#psHofc{yTQx1}lFUN%E_?w?t@>*UXyKhZ8Xg^sK~ zb$YYCCcE9^X-pc(7x%KuRFopQWsvlIp`xTb=kv=eqivS`js)YIM)j@g@E2o|;{Jc& zF6sWBD9n?xVZY)ZnVY!Gx3^D4R_YFM30TuXPv+8B&CaF=_Nz_3i-$k^)k>rqJaxob zQJbNpPZ;^((kw{3@%8A>r}y;E@aiF6o6c#vY0zY0o8+qmx74S&;XST3Mp?6h^+vY7 zKYWVtGM_vc)4>~@!;jWj%6wX~{%-Ki;Yx!c?6H$E?esn+b8L=a9K|u@C2?Fz!XSj% z0{S*NsjwTT{MeC(h9rer<1Yuh>(U}ww&qinq4>gA1~GOlU%UB3h(-w2cGm*ngEwhQ zBt49=Itt7Lz$9>F&oV&aLoC;0Xbo=P_#~<$Zrx(NNDK+`<%z3r9m58xtw=8V$XJ8) z>gODnVS_{BZhkK$n|a>wXwN9Ug};J{L4Rd*T<7kMDVUGX&U#;3%_f1AvYZ-^$XYFM z7fU|Wl*iS-BO>GHwr%|p&{Jr8iXjtDO1lUtI_;$J_Mw&{UVVl>|2~9%$}MZ$zr!$} z&yx8_$st5_x#<%;k0IlVL&YlL+Iu}Oq){~r(O%4XlP$8j5PIw1I=biV3;WU@36)x< z^C#5j{9sK(D97-3qwmZk0!2oiJL8q?Y0G9&ImSo;;`0dV%;-eq%F^eNLCWD1-^#sp zJ~sAe3$`6NL+=a*#9HRNm>L(Br%F<;sE_x=db>?Z#^%X>(~*^~8RE)nG#@)npJadZ z5!Qeiqg$O=N{O6dJrYSHj&Mk`6^Cjf{567eeeg=@3g$r3tHplMxHD+nL{7KVPtseB z#^YEn_$3dnIlnx@Us)$J3Wy7{i3*gnb>52tu%PWrrTV93XZ!B!L2tVziDa=fY8VFv z;$x#9F7btBJkll_1S?sSp?bu@1aK;f_!Ec(^q+PazifMVZBr)4NiBOS@IP z{G2G@!O-8$>iv!5pks0VU2)@&+m?YIwwD(TE^M1AY_W~cdF=#XZqt!(xre>dF=VZ`D0?p^1|rCI zjy>dvJDfY-0yE%f4iDQ7`8`238pB;KKDfk|&%(`{8NoEzD^KdvK=a~Y8Ye0k;NMKtewZa= zk(!~=C`+%}xGs7TvTNI78~pMj6c_T! zT`<~x%}E`HJwi)xz@+{H4C0l9-iLpP=hvaJ(}=Q6=!VOW4?>!d2wkErpdc8qoJHXy zg-i-1r8T7FCYjr280eXBZTn~nEVTW->94Q2y$;#2;#)E7^O(?d`jxmLfvdU%5TrmL z3q#L-P@pmPM5sJzAi^+_5@o5c{UGT(qGg^{tZ8n$uE$q{aq*eyWrP>%m~ds~;}o`l z*w3%WT7K4CZDokME25jmq?%D~vRzHzM3-+)FK14FjdLHgCMMTO#MP5IFf_N1 z!qL}?r2DzMTfaKrMRYXkja(qJ@WN5~h*?{|_-(4t;6N7(n`@LbeaQ3QlA%#u3N>F< z-&KiQ2FE?i_Aq6A+;`KsL8K;zaIFhMAD7`D?Z7=LDp#{?T@cIJ1GFqS7ged)b(AJJ zx&TEFiZuxMI6Y=fB$hJ~oYN+~2t9ZZo*@$_5ge)ovz4%B$;Ipp*LM2{cdiHb&$e@UQ@ZACo(Ps;JZC-~usE(7k!y^O z>58X7@mg=)K0~VYF`Xwy#FG`Rg&@M}eXc(tC$q7#7#3YwY6|?ZxnVY84y>!aoF;hJ z${PgMybFham!GQ)r(I=H>Qux^lw|9B@BN7nI^n-RDPkKpY-UHIfIz2Mcl#wmO5YF+ z^-e#J*z(cT45vxhGk@8`3cY=v`l%!Q(+Js`@$lcA`t~D(<5qBAFiYZfL3fWte7-pL z5LI?A2i^n+Baf0!+K@DkQRTt!ZQ}Z;o(g*vGVouL>RGRYfGrCHN5{f>k(aEQzE`K0 zp^F_)xcpLtn?lUBvsWnA-fGt*I^b{MLXCyj5gd6v+c@~o>^7%h88Aa}hohEWp(Yi= zQ0{yl4V~&3fg|DPnQWA-Xw6a%%DfI3Q!-~E7pEsV3vEF+XJ}gF!jo@mcAL@%b_yJR zsE`$M^wd1`rj5qL??>Yz0}73)SdEjMe9$%M5h!0bs)UboB55wCuiKtOzizZs^Pd+9 zMPZGj4hU%`%-WYmE^J;}I zAkahd`#9bj#-O59_H`rT%p?K4USc|JC*+|dh5!U&f(@C|Bg(9mP*7|rXAsAB;sB~F zcNRZF;6)Y+iu}7+#4jgl0DC$rNii77hw~r419J;hsAzw$yk99tYS-bJr_gU+2dc6) z^B(`7`W}Rb{@p>C&zDPiY4z|`c9_~#D1>MfR=*96aJ%v~%X>n3N3=BO3Yzv?cCTZ$ z64gbsKDP11n;E&i8@n=%fNfk3M0Us0YuC;TgoQi~MX@3(6*l`k2+72-*sLyQZ{hb6 zwLkOx3(ZoT{gBC)NIY0Y4lXDM5mSY=TSLU9*ZJ_pl`j=X*u)+=lvPWOK5q-Y*r=~~ z_E#pXIc?_nlV`@u5vp3^C|UW;NQMY;K>UbP+wOO0x0m2LBTfUg({2r9VJvY=T0_d^ zkB+yCUYc9eFUmE9ntD*6O)sNH);r}4^w-XaF^2AETbE<4I~K*eGpU5Cwx__UopNNYa&GQqH+*6bfFPJ$bI@V3E~AJy-Vjt6)C;@5Jq_4lhv)qPsOl@b0ek@F zB!XRE-Vf-Y5H=LAQdVQh!g~ZFCGJA&>drSubBssHs7M#(6!ZZ%X0{;afS+I zdbO~_Za1;iZw)sHmfA=85a6My8ryAaJjO|NR#g_w@K;IgcCguVxt-wVB#Ra^gVE+r z1)@Ra;A`n4nIE1OkO%Y^`}~RFC<`#`DzMJrLGLrm-(P5MFZ<&I)A*CLH$?lYFx1j$ zpzF-gJbf8l#dLo6$W<>3YjkT?a#G!2q<<4nhc2|Lh91vjt6-Ov^K&cHDnO4F&Je&tAj+P?0cql&hiR)E5)pu=K z#2BiWJ$MP4DeLH^%u^~$m}(o!;uTL@Z0(|_W~!vQ@XvH*P@%_uI7&d!X%qf16Ti?z z#~Cl~gVKAF@(EE>apkvz+QFYJCo@{InS=xIv?pODe9u1zr0>tVM&EJ#JcHVZszbqd`|=9hb* z@_a#&2SbpdNNMK#+h2VOtsJRaW~SDXwf>n{H%}LjXm^x@!#-iOcAweV8Fov*PyWd; z>OG0&8T)d7wkB|&XeY`LWMdtcE~^M7(#uyAKB{&7&dHgGRao@lC$7D|(ZeyJ2VO7q zvW`0YUd?d`x!QHR^$*8Mm5I*|TLsMEY*t!kn~XJ}(otM|G33eZE9rXS24a><>TeA- zYf9x2DEggSO#o8=hMS3vGoOPpPLil~HgOmp<(MF8%O5vv{EK@;NJUT*(~iOy8m1-p zy1%AZ3S&;U)W8m?$CWk}!!oT{a$hAW^RY5$!V@fasrRd*m7i2|H(oZ&QYnnweQ#1f zcCZ?MtMjb;^Dm5orRw>H)3#dXDdF+g@1nIooJa0y2=-u0sb}epy)BJx?`4Z5b*t%L z?V=ls9seO29nilwxoJH=PrRQV&_#N{l9zVO1&?CsfKH;f=W zG9MQE9_nw4&l=cB*^RyAsIxp1HB|7&cWD0{(8W$imLKsO6ky)g;pJY$4Tnb4@JR?^ zbY^U!3&8BZL+Iv2B?TN_Cl2&b@*kX&M$IOEOs`AcV@{h-WQZ`SiJ!MiOk@?aCs-QD zv=L@+bkedHF7{d}c%VF`6`+exF?nTvPU9Ge4>bs+5%Bv&J6PMI4C)CUl*|g}=pW9N z+NDKv5a|u;`k+Nba{_Y^+JyQxW$|29KTVQtpwX=7s+zB~T(u5=CPj2f)A$6U$zJ*D zO8^OOiYtkC(TK(PlPgvB^k6|t+tb-#YHsgQBV|r4JN*nJ^Mh?-X^f0(;(gX96c-_z zc*aqq)cU(o>F-dCXSw9M6rwNjE-Z!L>jY}ax_lvWb#Y1D`l6CYR$d8Bz(~B-$ge09 zQMojTd&jHb!M>$D=M9n4ouO2P)odttnh!>Wq*0tUnZkY$+VnNrJ_kQYzlzQ>CG?4V zr?o%c-D#6VyE1f?AeE70jCDTeQP`q+o)Ad>G^yAh7MdO9zN7SpmG7@Ld2=#aD-Di8f5|bf72v?}w7cVy?k#!EQnA(#(ce4r0r53_j%*r7d5o z+y^THq6|9a+-6f}j-j^e zk|$jh6{wswaHDnK$-ndp(k%n+^yyE(To7Fh9*F)b-Xx8!SFAvGu%l^#t z2neW-qG4d$oeO*LA%n<$n0wa;=hCIQWc@7*i5AuhEzc$;QS0t}n}hA*@Z)o$ppY0U zqVFv523(xW5DIZ?RoM_yhqttR@n^Co(9jDCUmna0)18pl+n2P65Ctrf&lm=l48u6O z-vP?gA(D_3TwEjfG$UchNfyW7Bvy}M(bKg!tjVv%V>?e+QO|@2=zl@dnS|#1IpWL z=caAgZ%+4m)_-EZ=p!$6l8l)$WYfA{zcVFu;W5UBRU`IIRDw#w_x_fN-1(f_qratR ztc3VYx47oQ{O^`#E%Ji`K_uUk559pa`p&-3MeoM>th|yg+fp6-QFoH?)Aii>60yJg z=!W>0h2%MX6(#xdTQRXfNvLuQxCvwUu-|R!2DlRqSOa(!JCVo?y?A}r62J&RHQh!Y zLRy&*=?&VXe8NWFyjbB<UrICYc zr@g**R*AimA8jQW8{zF1!LqHNE}nx%74Je}5v_$24DDQ{IFI{dpF4Tozr~3A zUJ8%?i|2R4HmguWetq!yy_SVXoB|Md{wY8Sk6Q!IU=AX+iE%4Cw8^7;E7^b&(>>5DxCS z+}-j>w;B7I5C$bn3_*!flk-ntvzKoO1~epCYYn5xr6iL$9*pKkUQujNtuUPR#UwQv zPt{Ooi7~PL_VqhhPTREWOh_VvQs?ouoZmpNU1@6do!A#%S0CaYIuZ$rVg|@WbMJaD z(GI=%ZSHRUqd=IPB708fV(K^X8&8ez7v*~fe?+@WQ(#)(9}@6ha~b^%XtMjraYS^e z*#j|l9sjta`nP-1)A;F_7-}?GHBA^}k*ZNBhvHi|Rh^OQ_4tG}n&t8iD94EF(b=~- zq?!WV-lBlC)d4Us=>J!yg{Q9rI)DLudmW&o{@+?09ta8kzpo}Z%Lc#>ANmbYffKM{ z9oQ6bv@L+@u7pAbj>ds~!6AqL*Z{bYO=lZ`0@$A%Rz*$@Pu)aTQu=5hIR9|v!h<#e z4mds18?480#{`q(#)g}3A@64R9sLW&d-{5$eOHO00{b%|8<^m+$f^v@KNT4ei2ff^ zq~*&DNBs>5-q#o49$Nqz7{msX=3$0^Ml#dfl^7(DVo@F`Ul{-8_m~g1WJm|!+XhJC zatFxaz6isTJz4IG4^%XFg@;?DELA4@Z?_(idpB*Q8}(h5T^otgL5BO^Mn&dg-l1BBV+n+)OXc;)XW{~ zpQOcqqn0%9QCz!7)IXB^zfpxcNE9<`D<3@e0N?`4ytt3M$37qeR&=uc*7=fQSA=~~3`_%caAJ*)J{U2Xbm-}m3m;br8biMZ%K1BN4yWQKb?`#+M zd)$Bz&Yd3(9RCQw2YY+mH3EWgC~~&JbB~a}E5H*zmh03>m4C^G>YK< zN61I&CDI#g{S}^k3<$xaP5^3xZve>M-J9RX!PZyX)6E_3ae|zh=qCUb#P9omy4*c& zz5eM+p8_Db|BNHj54Su4SZVH$Qve9Gg50_Z|LI0r(F1ViQ-GQ4e>ZmIg8}&$LhTm- zvT?J%8^ep=(%l`Nya|xpt+-TRv{88ODL@OqI05i!?iXC-7@v`!e=it?}%9|Q^*M*>0r582JL%m4rY diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py index 4a981bd..770ef31 100644 --- a/tests/test_markitdown.py +++ b/tests/test_markitdown.py @@ -300,6 +300,21 @@ def test_markitdown_llm() -> None: assert test_string in result.text_content.lower() +@pytest.mark.skipif( + skip_llm, + reason="do not run llm tests without a key", +) +def test_markitdown_pptx_llm() -> None: + client = openai.OpenAI() + markitdown = MarkItDown(llm_client=client, llm_model="gpt-4o-mini") + + result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.pptx")) + + # like test_markitdown_llm, this should be improved + for test_string in ["red", "blue"]: + assert test_string in result.text_content.lower() + + if __name__ == "__main__": """Runs this file's tests from the command line.""" test_markitdown_remote() @@ -307,3 +322,4 @@ def test_markitdown_llm() -> None: test_markitdown_exiftool() test_markitdown_deprecation() test_markitdown_llm() + test_markitdown_pptx_llm()