From 4678c8a2a4c5f2984b2a8b3051b0376cf0c2bec4 Mon Sep 17 00:00:00 2001 From: AbSadiki Date: Fri, 3 Jan 2025 16:29:26 -0500 Subject: [PATCH 1/8] fix(transcription): IS_AUDIO_TRANSCRIPTION_CAPABLE should be iniztialized (#194) --- src/markitdown/_markitdown.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 789c1e5..6df13e3 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -33,6 +33,7 @@ from charset_normalizer import from_path # Optional Transcription support +IS_AUDIO_TRANSCRIPTION_CAPABLE = False try: # Using warnings' catch_warnings to catch # pydub's warning of ffmpeg or avconv missing From d248621ba4e7f4f91dba22c000a17c62b394d0c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Murat=20Can=20Kurtulu=C5=9F?= Date: Sat, 4 Jan 2025 00:34:39 +0300 Subject: [PATCH 2/8] feat: outlook ".msg" file converter (#196) * feat: outlook .msg converter * add test, adjust docstring --- pyproject.toml | 1 + src/markitdown/_markitdown.py | 75 ++++++++++++++++++++++++++ tests/test_files/test_outlook_msg.msg | Bin 0 -> 13312 bytes tests/test_markitdown.py | 13 +++++ 4 files changed, 89 insertions(+) create mode 100644 tests/test_files/test_outlook_msg.msg diff --git a/pyproject.toml b/pyproject.toml index 3e14cec..67f6825 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,6 +35,7 @@ dependencies = [ "pdfminer.six", "puremagic", "pydub", + "olefile", "youtube-transcript-api", "SpeechRecognition", "pathvalidate", diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 6df13e3..d209b5e 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -21,6 +21,7 @@ import mammoth import markdownify +import olefile import pandas as pd import pdfminer import pdfminer.high_level @@ -1077,6 +1078,79 @@ def _get_llm_description(self, local_path, extension, client, model, prompt=None return response.choices[0].message.content +class 
OutlookMsgConverter(DocumentConverter): + """Converts Outlook .msg files to markdown by extracting email metadata and content. + + Uses the olefile package to parse the .msg file structure and extract: + - Email headers (From, To, Subject) + - Email body content + """ + + def convert( + self, local_path: str, **kwargs: Any + ) -> Union[None, DocumentConverterResult]: + # Bail if not a MSG file + extension = kwargs.get("file_extension", "") + if extension.lower() != ".msg": + return None + + try: + msg = olefile.OleFileIO(local_path) + # Extract email metadata + md_content = "# Email Message\n\n" + + # Get headers + headers = { + "From": self._get_stream_data(msg, "__substg1.0_0C1F001F"), + "To": self._get_stream_data(msg, "__substg1.0_0E04001F"), + "Subject": self._get_stream_data(msg, "__substg1.0_0037001F"), + } + + # Add headers to markdown + for key, value in headers.items(): + if value: + md_content += f"**{key}:** {value}\n" + + md_content += "\n## Content\n\n" + + # Get email body + body = self._get_stream_data(msg, "__substg1.0_1000001F") + if body: + md_content += body + + msg.close() + + return DocumentConverterResult( + title=headers.get("Subject"), text_content=md_content.strip() + ) + + except Exception as e: + raise FileConversionException( + f"Could not convert MSG file '{local_path}': {str(e)}" + ) + + def _get_stream_data( + self, msg: olefile.OleFileIO, stream_path: str + ) -> Union[str, None]: + """Helper to safely extract and decode stream data from the MSG file.""" + try: + if msg.exists(stream_path): + data = msg.openstream(stream_path).read() + # Try UTF-16 first (common for .msg files) + try: + return data.decode("utf-16-le").strip() + except UnicodeDecodeError: + # Fall back to UTF-8 + try: + return data.decode("utf-8").strip() + except UnicodeDecodeError: + # Last resort - ignore errors + return data.decode("utf-8", errors="ignore").strip() + except Exception: + pass + return None + + class ZipConverter(DocumentConverter): """Converts ZIP 
files to markdown by extracting and converting all contained files. @@ -1286,6 +1360,7 @@ def __init__( self.register_page_converter(IpynbConverter()) self.register_page_converter(PdfConverter()) self.register_page_converter(ZipConverter()) + self.register_page_converter(OutlookMsgConverter()) def convert( self, source: Union[str, requests.Response, Path], **kwargs: Any diff --git a/tests/test_files/test_outlook_msg.msg b/tests/test_files/test_outlook_msg.msg new file mode 100644 index 0000000000000000000000000000000000000000..05b087b77c785c8b57a479485b9715fb7dfecbb4 GIT binary patch literal 13312 zcmeHN-BVk~6+c3bjg7ILwn=KIj;=}Trgp#*2rveM2nZVkG6mtd9nCNjlCWAKQAxyM zJj|q-$xPdqzW6l{{rCs^(npW~gZ8mAopjpA=DE{8CG_{(s}+Li<6J|AWX6m4-g|b> z?(dwl=j@mK*T1~{&)@y&(!b?y~s1$oPDb zzCjk~blees#sL(WabNo9xsPR^kLX*voQG{cD~qxqeG-$RR3zgSUgBs|MoUMcvLQ*y zNgm$|rnC%ty-lCX3-QHU3oA>L@u|sJ-`vVld}V%RIdXepa(2FN>fS;-fhfBpQ|37* zTT+57TaaN(R(+0)K_-?ZQM!g_0enB-$5oaHWVDj^fvX7Wop!Lb`seGv)*P0aMUG0Z z+=rz~uw@Ps6yz4P3PIYSbYq^FHX3A1=`!Rm$lIvz$Df0`45TI%L=KyFA#nD~0G>ho zIdUD(0rSn?_K!|4B$zfmkHLq_e5c+?J??|1!8`+*RQp=S5%;;7z z(rG-6EoWdwjv*~LE)&3wqpTM?OhD=hP|J{f9a3$Z_cAPVAKz(6ejggAVa@j>3(j%$ z@cbS`S>lky%CQ9>%pQ{*Q^uzl-vvm~3%Rz<2vIhAa2};Olq603+`yY9^v8nnyd1nC zBctF>p(pxZ+VF0}Mm@%_=w}E2Mo^N1b=OSI5Ik}O?S}CjLfIf%(nidOOTWAx_~X+* zT)6m2K|b}Jk9S~I{gBdgUh2b>TBnUR5j;PDDkPm@#u2AZ#f#9jxI%_4<;8D=cAgrll|}MtwH-T&ta{#*S>`DSTkOj!x#Ou`DpFQ z5eM&K)}J_Lq#Sy1s?Q4OOx4fRh!O|>p2gFaTX`LD*;RuvL@Dfg=f$Hx_K7erMCT`| z)#F5|k_{!2g>ue3);8Be{e%69Y^k{Mu(FX4If-m4m(G=)L^_kr1|KJ~#X_Z$%Adhk zFp(`k%9YE`WGR=-kM8WCaIKgxmQbH4WzxaKdMT4B1QV(JRwg*Q(|snfwH8WjCQIqW zWmx(XSUUHL}xnBI1ZS@T`Vlq*hVBbm!P8<}!Bxt^)(Yq-DG7iF#yL8tr7 zJ~2bVurqc8g4T&w$6c=x6h&9g&d#1odHVXgteKvKu47&=e)4-m-hKMFWY4Fabyc1| z{Z(%JwvGMrn%UpN#c?7M3CJZwf2+nnRNMc$^}~j~TCI;@wJ)^Tye4@Tqj_2R{i4~uY6VlUX_Jv4rG{xL_0rRdb` z;%qb)_j-z;&$|9=*R5vr&lAYZ`NGaWd-IJu|JZf0+5BVc?fmm*wLjY8I1=w}8)Z(2 zww!wf-us?K?nXX`d>)x==F7<33w;~;JIJpfv)yZW^FG-2(dg||>wEB~9lnqJ1LUj7 
zKSXAmAK`ti`c1kQ-+jn8khy0GA@?H>Acv6$ksD!+qI|4L-Mc8Cs;is9_w72mzCT{* zaW!QbUf=k%g!3V9PI>>yGiwQFMlb1AdC3P;n8euW>UJaIncE3X*)Tn$!yv8Vk9RaY zf#z__v1)E=_U;kgJ4Bba(eo|w8GOv+wLNqrzGh{_FPpnMp2F9wcb2QwyCB|4hLGuD zjng&kRH!`W1M}NQW3DfkGIp3+Pha6&%hb&N z#2vs{RBM0kqqX?-l`8~CYK=dw)}L#X7GL9k)IV(ePdENtAGP?Ls~#XTwbFkGtC@EE zxi)L@=___0(h7eVzk_J;xi8S-(^pP12#}iNPa*Dj1FxI^xF^x#zY5-u2#(YmUmyS6 z4{7l)g7h%EKrR03;B7zua{Oo1_}qhP@o9VR zA(@)%Pu;ox=;EK-87;nU{vp2mo2O>|;oke`@wxu%^yj``i%;LRACU+>D0i@zARKZgXX`+v@9T72GNF1p7g zqyOFwQDf9qoQ|z8+Bdo|=~ngp^Cl>}rc|p`uV(fSV(y*6ENDB_>GGr7ICe)7kX`uWd${PWh!O*Z~Hr`a{l)@k0& z#@}oF^WILY|2TM$I{vSEz~}#tJL>r7eWh0auOj}rrfTsgz_knM-~6Pm)F{V8oWmx|2)rW@uT2B_4^;s;ye$t&F2*A JMf%!W;Qy?8Su_9u literal 0 HcmV?d00001 diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py index 4a981bd..a0626d1 100644 --- a/tests/test_markitdown.py +++ b/tests/test_markitdown.py @@ -63,6 +63,15 @@ "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", ] +MSG_TEST_STRINGS = [ + "# Email Message", + "**From:** test.sender@example.com", + "**To:** test.recipient@example.com", + "**Subject:** Test Email Message", + "## Content", + "This is the body of the test email message", +] + DOCX_COMMENT_TEST_STRINGS = [ "314b0a30-5b04-470b-b9f7-eed2c2bec74a", "49e168b7-d2ae-407f-a055-2167576f39a1", @@ -232,6 +241,10 @@ def test_markitdown_local() -> None: result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv")) validate_strings(result, CSV_CP932_TEST_STRINGS) + # Test MSG (Outlook email) processing + result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_outlook_msg.msg")) + validate_strings(result, MSG_TEST_STRINGS) + @pytest.mark.skipif( skip_exiftool, From 08ed32869eae01d0b7c39944a092b90221f81ae6 Mon Sep 17 00:00:00 2001 From: yeungadrian <47532646+yeungadrian@users.noreply.github.com> Date: Fri, 3 Jan 2025 21:58:17 +0000 Subject: [PATCH 3/8] Feature/ Add xls support (#169) * add xlrd * add xls converter with tests --- pyproject.toml | 1 + 
src/markitdown/_markitdown.py | 27 ++++++++++++++++++++++++++- tests/test_files/test.xls | Bin 0 -> 27648 bytes tests/test_markitdown.py | 12 ++++++++++++ 4 files changed, 39 insertions(+), 1 deletion(-) create mode 100644 tests/test_files/test.xls diff --git a/pyproject.toml b/pyproject.toml index 67f6825..9c113ad 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,6 +32,7 @@ dependencies = [ "python-pptx", "pandas", "openpyxl", + "xlrd", "pdfminer.six", "puremagic", "pydub", diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index d209b5e..50c83b4 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -726,7 +726,31 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: if extension.lower() != ".xlsx": return None - sheets = pd.read_excel(local_path, sheet_name=None) + sheets = pd.read_excel(local_path, sheet_name=None, engine="openpyxl") + md_content = "" + for s in sheets: + md_content += f"## {s}\n" + html_content = sheets[s].to_html(index=False) + md_content += self._convert(html_content).text_content.strip() + "\n\n" + + return DocumentConverterResult( + title=None, + text_content=md_content.strip(), + ) + + +class XlsConverter(HtmlConverter): + """ + Converts XLS files to Markdown, with each sheet presented as a separate Markdown table. 
+ """ + + def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: + # Bail if not a XLS + extension = kwargs.get("file_extension", "") + if extension.lower() != ".xls": + return None + + sheets = pd.read_excel(local_path, sheet_name=None, engine="xlrd") md_content = "" for s in sheets: md_content += f"## {s}\n" @@ -1353,6 +1377,7 @@ def __init__( self.register_page_converter(BingSerpConverter()) self.register_page_converter(DocxConverter()) self.register_page_converter(XlsxConverter()) + self.register_page_converter(XlsConverter()) self.register_page_converter(PptxConverter()) self.register_page_converter(WavConverter()) self.register_page_converter(Mp3Converter()) diff --git a/tests/test_files/test.xls b/tests/test_files/test.xls new file mode 100644 index 0000000000000000000000000000000000000000..de4f368c24d489ff7aca786acc29d80652189eb9 GIT binary patch literal 27648 zcmeHQ2V7J~x1U|wf+$5qipWY)L_m5`aVesxfY=K*$|6JrL{R)9fvCh9MKo5TF^C#t z?_JbbP)t1ti zc8Api!a2GVE%I5SPjp#y9$b6N^2P-2OC*?;eR{z)kTl2tA`9pXpdod2$cTOy^o|og z(G?K#06w>CV~&uAkVcYpVn|4ELPm0Is&KeCD|MXkzxoKFI>ho&hf;97K%O??0j&s< z39+QQnbLDhdfrAk57ToXF(r~*L#?wT{C)kQJb{wK_(7EKTY9#m=N540k$tqxO-k_~ zZOLum7mxyUb`Y+|3nj56g(QUkPaT|0Kki%#I6?!zJsWVA}TKLx0r8%cA zLJ|Qr8b^{yCasqXj|bf9Bx*Ovfm*A_K7>fOW({(5YmlRR0~|aeQd`^CP#5n8 zI4~lr_Z1$|ZxF+v0SrEA-2g{JzJgZQSBcS9yA9C+8N{yG&fdk|-hE8Rt|OgfnUNja zYPW-tppD%9m;vI1QSwZ7M330iGr76T<3JvvI)Gc-ak)5XQdU3WOgsr0gQ_?SWtvH| zBb_K=WXG;jf}N`!=sm_MXbebkBOT!nin7Z&3)zL7g>ofC#D#Q`7sk{;ULIAAB~m1+ z7wStoK~Jg|Dl1|BSOZ+c;++s81Sk6 zcS~<+ucG!>q;FF}pQD1lQU!gD3i>7$^vx>hs`Oc^g8yd~bXE9G<^Ol-vqOcR2PoZK zp}iP>Mfx5U{QFeUcd4KsRY7O;ff*9Vf3{G5m|rpUB_<}znk{Qaw3&7>t)nL<9dVmyn4Qtnpq?&-OGP?IZVMTIoqZVpF_F=)kg;%SCBttnqu1%l_$RJZXuXO| zOw5~^gPoavXC$KoK4WLb{<<(8Dzm4ajGv=tD;fO?ObE%2IyIxRq&!?>UYml%U^@PuKdNd>S`np6P0MUx6(e`!(y?5IsDfRU+51+a@XsQ~uj zCKbR=*`xy9n_8e#Qwz8>wSa3=3%E740L(_4W~Huj1+c@iqXsWW5X=cPFj<8FBMSoD z0(^lq{$NjLCe}isA_&aT8wQbx)PvyWQG=RzddY)uqoy~2)Vq~Jbre;1uak=55CXe| 
zZ7{wz)E{r0e)X$Fb^V>>K#GX2auCTGbs&E7AVXDXBB?H7X|jmZoIU`eN1GG17KtXS z(4>4t9f+Q^in+PD5~1|KW=8-vO`k=BTl6qcLdy|wl?nh}wo)SHO1*viRzWF|NTi?? znDtD|sJKtc5?YF;0WEP?-P7 zI;KQYu274eJ9jEL4Vxyb1=0YU9x|t4(-h}eoq@N$7U&0Zrp*xy=JdyIP(ZsbyeiZJ z?T*!B2Q{?2j3(=kigC*hO4Ub8$S5|l3`BL@!j(xqh=O)#V=W?1WNW9W2`;x91SyYD z4+1+&4TErlKezYT%Yi^fxlM4AsoEyE>S(Y!;zaczu#wgM5ZFj-7{t>{J&1!G z2xOGo1n1GJZGtP+M%Y9>2&|VH1}TqF57M?VAf8_8K^)~kAfwzSxKvPW6I}H*!Y1lL zEE@w-9-$t@wlN@{Ug|;G%Yi^fxlM32q}nECav&<>iFy!N=QnKF@(A@H!p4AjdZ`EL zAO`{&Kcg-UZZ!w zEWi;rGf<2PSB$5k7&|p$m^BC$qa#lI`_Wb9VobSWycET>S0jd5rBE@t;=~s(UMLsS zk}JkrQH--1F}RMA8ck1}D0y41T#Ol4jE|xiH#K5#xgZrI5GUSv@K(8)R$MW@iefy~ zh+(F+)M)zR#D}kLDi>ppV!YLRnk|U&ZiJ_mM;y8KQn{GcTro;K&0dWdwx^Xxyg$E1 zxflzs7$u(8L5&!;rLJ;HI2v76$T6N-yaxpesF-knm zL5&!;r+Iq4+dfmdn6_LoN<7U;jTpA4d3u$Vl_?iv%N3)<(_GYuVSAdVm*hsGaxp@# z7$u(Ou0{;o(>%Qv{G_7awBw3V;%VM$#IQZh)9a^;3zXHy5T0|xysK{94CaiRHWEFR zVl^H52SR#e{GuGRLp>;X0vFVS1IliF zLejVNQU)p@PB6NXkl2JmL4bVA^QO9Nf^n7H}$aX zQ)&CBfg)4E=7T_;Z14x%yppv(Hn!Bg^r3FBJ<}^K4IH(rJTA{lqD^{%(b71siaX*m zED$EcY5}jmzzQN<9Gj2~ZwbO$fupFFF<3Sn6hhxm21Un#g3MD0gwul#(__#V3>>w@77i67`sqQqI58L<#vIDwhh1_E;w0Ao!vz^5>%FqrlTMXn;T{sr7kq3DLQ(zxZQ+iAu{97YgF!G57}1Ul1b`lp;_+S>(ly zYFS9bP3HPQ%j9GsjmgPE8k1YhZ$<(LY-;t(%A_qmg(I{TK=-3IP;7H{$j1mN*XBs5 z*M!N*G+}ZwO_&@8%r1F4N`jvb1ixsga|X1qZ0V0iLuc)Z&N?VLGX?DBA+NfY5j4;s z@Q+MrI<%Ogzw;2H8-+;AL?7VnK4@7|2sAm<{@T>7^|5PHba z*o;&RT1O=J$>^F6N$yoZ%az51R6h0#c-x!!A7H4|olMbXJ7%8^I zJ;2@9-NVz{HO@!u=j!d};pZBV;O^(@74PotD~?+!upld0>M=gfJZdS1m%@u#_YiPC z_&g8ZkUY*4KqAEFj~3)_vIxh2nxIF7rlkZkA0f;~DD%++nm;%tJt-ElA>zzf$o7hz zFabY%h*L6S9pJ%hVq&~sLTrMok7q)>t9M+0ziU984`e;WzCH>5vHr0M?!e}onCR`{ z=N0Gb?HTI>b@KNHHje<;MDKujfbdQT@Jf)t+us7x8y>bpo06IE*$Y171?G_8LVB1> zKlx;-U~#jhtUGuky04bD8CVs`?Z~scM#rF8NgH@})e=5qLeo++VJB?NpsDHNNu%7d zC!}m%bUx~&yKz`e_}?=gyY?yC86*QT>VMm!dd?#MsE9H8d27#@!_k+DyIt_GE(zc3{++4O>>K?K57-)# zaH-7ra-XnU-f<4O8x~C*bIo@7YRmh+5fy)#54h$xcI(28Ws+Sd-OE?^JD$7E<#CvH zs^5}`o3vI;))!{s%%u05`r;rHu3 zv&vO{n-6y>(rZ3fe8Fw@zSq-SHdt@G8Q$iUOUi>Yt)4u~wkX^EZOePT&ra%}V_#X2 
zzh%!I;nh8zD%-tsxik8J!xw?`dJTPAQPc5ZSaqlN27G_z$`@C==Qe{uWimX4z}~=8 zo2h~as{MTUg?ZL~X2)885E)KNOL_Ide)PlUhd!U(dm}RB`MO6=zn%4(>r)(DIHfA4 zchnIxgXCu=7h{Y{ye|fYcK#G^ZhG_G!l#Q9yDd!#_;`H4$0qSElo?>{pfPaCU-%8?xdQkg{eQ- zSXOC&Id;uln=5bPJMQ|vPnfp*{9k;QJ}`S3Y0X5Y4hLl#7priFBhsr+JBp|J47 z=((opUrzCC$NOi`b0x5MzH47;ZTbA+4;?e@6KW9Eq zn7JnWUU7K%{!4Fu{_xwj=XW>$@coI|=bG>R=1!2L)#UtO|2vl_y-gY!HDa6J9~X{X zKBgOeU_tf14;Q}L^=<>tNien~pmoK#_+v-zjW-*nwIny|{mVW7xaHFMQrBA!e?JbsxV!ZFO{X+JvlX^U*UhrcG*oZ_~@a{%{2@565Sj%|}h} z-gjtT<D(K;1Hu{?Z?Z@svIM`;;z6ZxXT=~n-#$@-5MX_75c`bMNu6v(tci&;L zjl}EqAJzrewvGP2d(PAk@Ag%U`z>nC=K!p0?P30$+&}+ZmiCpTYShcvQ|F@(fBxsGRm(b$&aJ6jBMO|7eXlAh<|ML8G{@oS!f1G#`-*u)@z{dH_vd91P`ok{s zYa>k0l<%whh2PerXN~0)-TQl2o}c|XX=%@wWwqtM-0Ai7y3dr>B}1lv+-3OayZs|4 z$ERF6+^fHH|Cs|_0<#uIdQ9H@(xlyxkqg61SB!1>EYR+5ce`H;=AXUpysqRDueY%C zq}IXVds?k3o^`i-@z%jLuXlVq^9;YgUaiUcAr@(CSM->3{;|OEisAcqg*)awO8-@> z<+awPo;I=P4u3T}KmEq-$J$|ZOyFN9xZRgSESA_EYG(o z^!DAcW^TtNPH$g&JezLo@ln73wF12*sS!1y)3;=>M~ZD#HIru@={t8Wd59K4odpEG2% zarEBS?P4Pm7hj)TkmRa6xZqt`o9R!4YllvDbh+8W{MvUP!!}(VM)r7i+CK5^*h7_X zth3yC_WlFf%?f3;Dc@x6<)K8eoXz|9(4GT$WhyGPcd??D#HKH6}}j6%QWBGn|umO|+-m%64DZtSBjPtxhZPXti_v z(da8aOBUL%Jhkzu)B5GVr@KCOw8gsf8V~N zdi8{LcH?G0**R{+rXLTkk2$s3(tN^L$EC~0zt#OYY<6%(d5blDYc@UF+j_*lo&B5+ zX=@*zzoxi)b#Xh3D@zv-zhYLEWjSQ!sQEXK#P2Be+W7s3-WHqgjod$KPRtGaqKf19 zOS*`zZyDlh7-{~ok4^HYle21XXSlT!BZ!3_A6!Wsdk6M#sA6Ygd+Yo<>Tq@`6HG+xyH&g_d@ zu4v7CMHlA|s?FQ#U%YV0BfH}D!DkO$ z46n(``Qgnf-n3H=N8*gXtR$98qgP6Hee$=OGp@6(`KZtRgD2g{*KzvsboqVzRhuHT z&&3=(_{%-07*ll~-xXemQ8gn3HrAnJei+?X9)LSVsxR#h_GC?%xCOaWj{1U@Fc)Jt z0@wUpB<}cMk91WT5}gfjF}O}pkI@P?YS`MSaf#@@I{3l{tlD(QD5$+LE-A<=Sc9x$ zP0Qg1nGdx2EE@75GxH@@W4SM}HshP&ZF=ClC%53;*nnUH+y3~aPeV8-5F>hR0%wp4 zll{McdUH84E?P8((w!;26FpC-lsvj1U@#qFDNKk9?`ybo92N_Nw1PB)a?hma_Q0J; zk0&Dh&YSGosSf3-s3YroEj%sI0z@9@$HN^4IK1dlA(DL>9V>!;$0CjmD690;4Y#D| zF&MNkAUSjyM0k|~eCHYt*y+LpZNj{HCg9_XV0i6ZMfri?1 z`Cz%HK*NUT^1*(O0u3w7prJQ#`EUusrQy<$OM_>E3N*A8mk;g8rJ)76G<=ne zOEXnM!zCq`5B-TtL;vE^aN^FT;oE0y+9%pa2?J)j7^oL= 
zv_Wc6e|pb>o;{=(s4LtnrKP1(7^J>Z4%8np{Af92JTrVGK+1u20%{(E&y?r&_OQWS6=#PjIO*uU2 znL!Gda-er21|!?7o`Yh11^Z#>tH==q%cz887*8Zx~-TPRc%cFjoG1#!Ivs z##Q)ni2sK1l8M9^v+-bZ7_+hNOb%l<#$G0eF&pD7D`pegccBNNe+lT>SnAnW4yjCJ zVWI$yy+kPGki~2}T8@du?ddfW_bsR;5W|^LUFg{@h)?y=23=qd0e7##7B$W)qKT}R zQ2&QJpcp+j;QywyT097AaHZ8K|ChpQJ{n_z&#wlNpt+@Vw3JEuwas3|+3AccTLBh>a+@!J|x^9I0=cpAAoxacrfdAV~T}&#KeR9*%pLgdEAd+-hU)QxNP9X zM`9=~hFc8?IW#R}{3LvRauSt{h?+xV0gVMT7SLEgV*!l?G#1cUKw|-o1vD1WSU_We z|63Nobv&+kam|hEZ+y)K-%i3cH9phFwLZSmh6`m}v*Y?*`YIV+d@ zg8#w+eq?~T?*gB$klY}3hU5+jzdG&-3Cn+igrAzD@fH{W-!sLp&I;kz1Mug%@D~G` za+nhx`tlh{L_ZP=nEjJd2BfDlEZP5y!h1tOdIo$qJdWxyUKzVu8vGBSI{B!BT4vVG ziXKcxhYY0Oy2YP_z`ElX=M$+;NU8q~4OUUPODB+zeF*zcW98c;0&bAOp+kCrH2lqu W3Gf|YY=Nff None: result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx")) validate_strings(result, XLSX_TEST_STRINGS) + # Test XLS processing + result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xls")) + for test_string in XLS_TEST_STRINGS: + text_content = result.text_content.replace("\\", "") + assert test_string in text_content + # Test DOCX processing result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx")) validate_strings(result, DOCX_TEST_STRINGS) From 731b39e7f5d36469b2912ed1608fd86c04a1ddcc Mon Sep 17 00:00:00 2001 From: afourney Date: Fri, 3 Jan 2025 14:34:33 -0800 Subject: [PATCH 4/8] Added a test for leading spaces. (#258) --- tests/test_markitdown.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py index 1ac9041..9dc7374 100644 --- a/tests/test_markitdown.py +++ b/tests/test_markitdown.py @@ -257,6 +257,11 @@ def test_markitdown_local() -> None: result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_outlook_msg.msg")) validate_strings(result, MSG_TEST_STRINGS) + # Test input with leading blank characters + input_data = b" \n\n\n

<html><body><h1>Test</h1></body></html>

" + result = markitdown.convert_stream(io.BytesIO(input_data), file_extension=".html") + assert "# Test" in result.text_content + @pytest.mark.skipif( skip_exiftool, From 436407288f01b5a2c31111062b0c2ac959dad443 Mon Sep 17 00:00:00 2001 From: afourney Date: Fri, 3 Jan 2025 16:03:11 -0800 Subject: [PATCH 5/8] If puremagic has no guesses, try again after ltrim. (#260) --- src/markitdown/_markitdown.py | 19 +++++++++++++++++++ tests/test_markitdown.py | 2 +- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 50c83b4..aceaa86 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -1594,6 +1594,25 @@ def _guess_ext_magic(self, path): # Use puremagic to guess try: guesses = puremagic.magic_file(path) + + # Fix for: https://github.com/microsoft/markitdown/issues/222 + # If there are no guesses, then try again after trimming leading ASCII whitespaces. + # ASCII whitespace characters are those byte values in the sequence b' \t\n\r\x0b\f' + # (space, tab, newline, carriage return, vertical tab, form feed). + if len(guesses) == 0: + with open(path, "rb") as file: + while True: + char = file.read(1) + if not char: # End of file + break + if not char.isspace(): + file.seek(file.tell() - 1) + break + try: + guesses = puremagic.magic_stream(file) + except puremagic.main.PureError: + pass + extensions = list() for g in guesses: ext = g.extension.strip() diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py index 9dc7374..e2d2e75 100644 --- a/tests/test_markitdown.py +++ b/tests/test_markitdown.py @@ -259,7 +259,7 @@ def test_markitdown_local() -> None: # Test input with leading blank characters input_data = b" \n\n\n

<html><body><h1>Test</h1></body></html>

" - result = markitdown.convert_stream(io.BytesIO(input_data), file_extension=".html") + result = markitdown.convert_stream(io.BytesIO(input_data)) assert "# Test" in result.text_content From 05b78e7ce18cf2f8d8d75058a1f2c98f9930318b Mon Sep 17 00:00:00 2001 From: afourney Date: Fri, 3 Jan 2025 16:40:43 -0800 Subject: [PATCH 6/8] Recognize json as plain text (if no other handlers are present). (#261) * Recognize json as plain text (if no other handlers are present). --- src/markitdown/_markitdown.py | 5 ++++- tests/test_files/test.json | 10 ++++++++++ tests/test_markitdown.py | 9 +++++++++ 3 files changed, 23 insertions(+), 1 deletion(-) create mode 100644 tests/test_files/test.json diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index aceaa86..b6acfe8 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -173,7 +173,10 @@ def convert( # Only accept text files if content_type is None: return None - elif "text/" not in content_type.lower(): + elif all( + not content_type.lower().startswith(type_prefix) + for type_prefix in ["text/", "application/json"] + ): return None text_content = str(from_path(local_path).best()) diff --git a/tests/test_files/test.json b/tests/test_files/test.json new file mode 100644 index 0000000..eba3059 --- /dev/null +++ b/tests/test_files/test.json @@ -0,0 +1,10 @@ +{ + "key1": "string_value", + "key2": 1234, + "key3": [ + "list_value1", + "list_value2" + ], + "5b64c88c-b3c3-4510-bcb8-da0b200602d8": "uuid_key", + "uuid_value": "9700dc99-6685-40b4-9a3a-5e406dcb37f3" +} diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py index e2d2e75..3333bcb 100644 --- a/tests/test_markitdown.py +++ b/tests/test_markitdown.py @@ -145,6 +145,11 @@ "5bda1dd6", ] +JSON_TEST_STRINGS = [ + "5b64c88c-b3c3-4510-bcb8-da0b200602d8", + "9700dc99-6685-40b4-9a3a-5e406dcb37f3", +] + # --- Helper Functions --- def validate_strings(result, expected_strings, exclude_strings=None): @@ -257,6 +262,10 @@ def 
test_markitdown_local() -> None: result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_outlook_msg.msg")) validate_strings(result, MSG_TEST_STRINGS) + # Test JSON processing + result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.json")) + validate_strings(result, JSON_TEST_STRINGS) + # Test input with leading blank characters input_data = b" \n\n\n

<html><body><h1>Test</h1></body></html>

" result = markitdown.convert_stream(io.BytesIO(input_data)) From 265aea2edf31bf1b022992e59f0ade1e54903aee Mon Sep 17 00:00:00 2001 From: afourney Date: Mon, 6 Jan 2025 09:06:21 -0800 Subject: [PATCH 7/8] Removed the holiday away message from README.md (#266) --- README.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/README.md b/README.md index d2314c3..6bc91e6 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,3 @@ -> [!IMPORTANT] -> (12/19/24) Hello! MarkItDown team members will be resting and recharging with family and friends over the holiday period. Activity/responses on the project may be delayed during the period of Dec 21-Jan 06. We will be excited to engage with you in the new year! - # MarkItDown [![PyPI](https://img.shields.io/pypi/v/markitdown.svg)](https://pypi.org/project/markitdown/) From f58a864951da6c720d3e10987371133c67db296a Mon Sep 17 00:00:00 2001 From: afourney Date: Mon, 6 Jan 2025 12:43:47 -0800 Subject: [PATCH 8/8] Set exiftool path explicitly. (#267) --- src/markitdown/_markitdown.py | 39 ++++++++++++++++++++++++++--------- tests/test_markitdown.py | 32 ++++++++++++++++++++++------ 2 files changed, 55 insertions(+), 16 deletions(-) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index b6acfe8..33806e1 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -892,14 +892,25 @@ class MediaConverter(DocumentConverter): Abstract class for multi-modal media (e.g., images and audio) """ - def _get_metadata(self, local_path): - exiftool = shutil.which("exiftool") - if not exiftool: + def _get_metadata(self, local_path, exiftool_path=None): + if not exiftool_path: + which_exiftool = shutil.which("exiftool") + if which_exiftool: + warn( + f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown consructor. 
E.g., + + md = MarkItDown(exiftool_path="{which_exiftool}") + +This warning will be removed in future releases. +""", + DeprecationWarning, + ) + return None else: try: result = subprocess.run( - [exiftool, "-json", local_path], capture_output=True, text=True + [exiftool_path, "-json", local_path], capture_output=True, text=True ).stdout return json.loads(result)[0] except Exception: @@ -920,7 +931,7 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: md_content = "" # Add metadata - metadata = self._get_metadata(local_path) + metadata = self._get_metadata(local_path, kwargs.get("exiftool_path")) if metadata: for f in [ "Title", @@ -975,7 +986,7 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: md_content = "" # Add metadata - metadata = self._get_metadata(local_path) + metadata = self._get_metadata(local_path, kwargs.get("exiftool_path")) if metadata: for f in [ "Title", @@ -1036,7 +1047,7 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: md_content = "" # Add metadata - metadata = self._get_metadata(local_path) + metadata = self._get_metadata(local_path, kwargs.get("exiftool_path")) if metadata: for f in [ "ImageSize", @@ -1325,6 +1336,7 @@ def __init__( llm_client: Optional[Any] = None, llm_model: Optional[str] = None, style_map: Optional[str] = None, + exiftool_path: Optional[str] = None, # Deprecated mlm_client: Optional[Any] = None, mlm_model: Optional[str] = None, @@ -1334,6 +1346,9 @@ def __init__( else: self._requests_session = requests_session + if exiftool_path is None: + exiftool_path = os.environ.get("EXIFTOOL_PATH") + # Handle deprecation notices ############################# if mlm_client is not None: @@ -1366,6 +1381,7 @@ def __init__( self._llm_client = llm_client self._llm_model = llm_model self._style_map = style_map + self._exiftool_path = exiftool_path self._page_converters: List[DocumentConverter] = [] @@ -1549,12 +1565,15 @@ def _convert( if 
"llm_model" not in _kwargs and self._llm_model is not None: _kwargs["llm_model"] = self._llm_model - # Add the list of converters for nested processing - _kwargs["_parent_converters"] = self._page_converters - if "style_map" not in _kwargs and self._style_map is not None: _kwargs["style_map"] = self._style_map + if "exiftool_path" not in _kwargs and self._exiftool_path is not None: + _kwargs["exiftool_path"] = self._exiftool_path + + # Add the list of converters for nested processing + _kwargs["_parent_converters"] = self._page_converters + # If we hit an error log it and keep trying try: res = converter.convert(local_path, **_kwargs) diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py index 3333bcb..689d6f3 100644 --- a/tests/test_markitdown.py +++ b/tests/test_markitdown.py @@ -277,9 +277,29 @@ def test_markitdown_local() -> None: reason="do not run if exiftool is not installed", ) def test_markitdown_exiftool() -> None: - markitdown = MarkItDown() + # Test the automatic discovery of exiftool throws a warning + # and is disabled + try: + with catch_warnings(record=True) as w: + markitdown = MarkItDown() + result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg")) + assert len(w) == 1 + assert w[0].category is DeprecationWarning + assert result.text_content.strip() == "" + finally: + resetwarnings() - # Test JPG metadata processing + # Test explicitly setting the location of exiftool + which_exiftool = shutil.which("exiftool") + markitdown = MarkItDown(exiftool_path=which_exiftool) + result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg")) + for key in JPG_TEST_EXIFTOOL: + target = f"{key}: {JPG_TEST_EXIFTOOL[key]}" + assert target in result.text_content + + # Test setting the exiftool path through an environment variable + os.environ["EXIFTOOL_PATH"] = which_exiftool + markitdown = MarkItDown() result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg")) for key in JPG_TEST_EXIFTOOL: target = f"{key}: 
{JPG_TEST_EXIFTOOL[key]}" @@ -341,8 +361,8 @@ def test_markitdown_llm() -> None: if __name__ == "__main__": """Runs this file's tests from the command line.""" - test_markitdown_remote() - test_markitdown_local() + # test_markitdown_remote() + # test_markitdown_local() test_markitdown_exiftool() - test_markitdown_deprecation() - test_markitdown_llm() + # test_markitdown_deprecation() + # test_markitdown_llm()