From 3fe5c094fa452314729c693a51163219f46aedf1 Mon Sep 17 00:00:00 2001 From: Steve Canny Date: Tue, 23 Jul 2024 16:18:48 -0700 Subject: [PATCH] rfctr(file): refactor detect_filetype() (#3429) **Summary** In preparation for fixing a cluster of bugs with automatic file-type detection and paving the way for some reliability improvements, refactor `unstructured.file_utils.filetype` module and improve thoroughness of tests. **Additional Context** Factor type-recognition process into three distinct strategies that are attempted in sequence. Attempted in order of preference, type-recognition falls to the next strategy when the one before it is not applicable or cannot determine the file-type. This provides a clear basis for organizing the code and tests at the top level. Consolidate the existing tests around these strategies, adding additional cases to achieve better coverage. Several bugs were uncovered in the process. Small ones were just fixed, bigger ones will be remedied in following PRs. 
--- CHANGELOG.md | 6 +- example-docs/simple.pptx | Bin 0 -> 34865 bytes test_unstructured/file_utils/test_filetype.py | 1191 +++++++++++------ test_unstructured/file_utils/test_model.py | 72 +- .../metrics/test_element_type.py | 23 +- test_unstructured/partition/test_auto.py | 8 +- test_unstructured/partition/test_json.py | 38 +- unstructured/__version__.py | 2 +- unstructured/file_utils/filetype.py | 790 +++++++---- unstructured/file_utils/model.py | 7 +- unstructured/metrics/element_type.py | 39 +- unstructured/partition/auto.py | 17 +- unstructured/staging/base.py | 9 +- 13 files changed, 1466 insertions(+), 736 deletions(-) create mode 100644 example-docs/simple.pptx diff --git a/CHANGELOG.md b/CHANGELOG.md index dbe51c5bac..4999a4975f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.15.1-dev1 +## 0.15.1-dev2 ### Enhancements @@ -7,6 +7,10 @@ ### Fixes * **Update import of Pinecone exception** Adds compatibility for pinecone-client>=5.0.0 +* **File-type detection catches non-existent file-path.** `detect_filetype()` no longer silently falls back to detecting a file-type based on the extension when no file exists at the path provided. Instead `FileNotFoundError` is raised. This provides consistent user notification of a mis-typed path rather than an unpredictable exception from a file-type specific partitioner when the file cannot be opened. +* **EML files specified as a file-path are detected correctly.** Resolved a bug where an EML file submitted to `partition()` as a file-path was identified as TXT and partitioned using `partition_text()`. EML files specified by path are now identified and processed correctly, including processing any attachments. 
+* **A DOCX, PPTX, or XLSX file specified by path and ambiguously identified as MIME-type "application/octet-stream" is identified correctly.** Resolves a shortcoming where a file specified by path immediately fell back to filename-extension based identification when misidentified as "application/octet-stream", either by asserted content type or a mis-guess by libmagic. An MS Office file misidentified in this way is now correctly identified regardless of its filename and whether it is specified by path or file-like object. +* **Textual content retrieved from a URL with gzip transport compression now partitions correctly.** Resolves a bug where a textual file-type (such as Markdown) retrieved by passing a URL to `partition()` would raise when `gzip` compression was used for transport by the server. ## 0.15.0 diff --git a/example-docs/simple.pptx b/example-docs/simple.pptx new file mode 100644 index 0000000000000000000000000000000000000000..ab165bb5bccb0f5b81d23454791e7ed642483350 GIT binary patch literal 34865 zcmeFZRahn4k}ZsT;qLD4?oQ#Zg}b{$;qF>M;qLD4?z(VymjVj7?0vfXKYh;r@9F#6 z-TAQc%bY78W@N;eBSyqb1sPB|POF@mG(w;O-Piu9SLY(Dlt@*=c&ATP7*+Vhacg+zp&8d@{!e^KY zvcYz*0-1Me*~}{D@t?Lr3^6Fo2vfRMN5|T7P1v7%`yrh8-ld9m;sRzgWn!zLEG9_4=|1B!>SK{4C3K zizTBhe*8#(iPDtLc=N-A+SCPi$XQdks2rJDH#M$QQMf1rnVC&y#93RoNUJeQp?Gan z@#oJD_dE6)f&TL!nI$yX(LJ19&%7DxqGy4SUq-+12AGT4{CE#-@cRttj}K5Fh5r=l z=D(bCK|e=!KO-6TGt|F1npitA(EsuKzajoVIIjP0>J-$8yL_9bWThN5NwfZ+CAE$UM1Ci(d(@%6N+2z)!3qO zwn6u!~y?ZLe&nD8VFZH0;VK5-`E+IRfg z8C9}XL(y7xhLi5goRiKtxI`(b?F633C;CF5&27j!Lg^ye5LWt?qR9kf*6=hOEiH#Y zhKm42l_)S1Ln})?EHumO#(Wqnxmxc5yYxfW{Mjvc#zk%3XNE`O=TS#dxLq-(=8$MB zYV3(JhqI3y(He_~$l-myK~%Q5<1I@D!lxPjIc8)Y_|7{&M>sy?hVb)jZ|}_DWNl$= z;>7T`@cHwC`HyAt&p?Wr?u7ZoNXTp5YrMV-@FwkFf4k#c@$O>Wxo|K-$;bVwR{K1crV zQ)l}7say8{F?GHfH^gT3I-VdzS`#X8k>WW19-=H8MQvp6)&V<$XeQ^8KW4wIdpS<~ zPFWu-BtnCOh-&T3ABF8}`|XMXQHPN;(}?g34wKFVAh&%ksu2HXA0@I3Y>6>BVM@9- z%UmdhNwFl6=n!0n?ydxR?HjI?59_4jff?w;6l&*uoY!(di%qy*?H;4MO!jOQZ1Amo 
z(C#YF%Me{vVO^R8HFkmbU@KSWxt*liZ6>YN#f&xH`P z1PGg&Z5U;RZ8IOYaM^v$KUspg^FH+Xf{ zLGlS@$r{-xb0h*(9ekU-#py(F^VGNunFizh*}>d~xsvHz6Ohkl29noas1M07nawQa~oovC!86Cw=5zs1_f!1R$*h|Q~~6uoHMK~^M-}= zla^?bl}xLD8%@oSRqbKWJh#$$x6bo#=f1D9ydSK5j?iw71SGKX^jLlr_0kqFVxK7V zJ&N;>u92UZsHXAp?>kE#jfmXcBFsZ9T-|DBv|}kHb4B-YM%ODMlc=yYvXlg|rcDQY z6m0Ql%MM9V@YJm~JFdk`RC+91J_tpSu%iZ}SYtkQa2~LZ4yEI9iV%GV^X*x3fB2mvw0e^>8?=)U`E6*S!##wI$J+SR^P<9e>4T^vhRhsVEgbTf=%^RAD z9Hm*`yf|zkW<6$)0dlVSTjk1D;JgY}K-3~sW@lFGQp})Yvs5@=IZZR$P2sV`XisiA zLH!_acm2z`(07Wek?%W}hwhtM?Uc>bOvMlYhHHtNd~{G-Ytjsmb`<}tVBsz3U|WX{ z4cy-3PzjVq^IZo^*vn^5%AeCE2TtE-?rIfyvs(^OV&)i6fOSjU1^3`{XvcIO_y}bN zL8Y|>6_dy=>1IIdRCse5Kq;do-tc@1cY#gl2xW@93#vcwNo5%gz687imPXwPxlD7j8qV`-RKH|}Cm3|?L zL1C-hUHO{W)~KX)@zrKdqy7s>0(sfX1|%j2U{0#ldTkH^|LOz5$)nRQLw~rS%!31A zM+_IDg2qGDP5$nB=chj_TvCy)7|v#`hr2#?N7u`*9i<5sW$>YPWfpD(4y{N{)KbTO ztup!+{VXnRr&T4Jrdz5qQXh6j0J*S3rYK!y`^@XO)9+_b-BjA6gC>)#%RtE`1siJh z46PEPADk3F;=ZOcPQ9ikRDCr&C@ZZ{+0|n+Sg6`^?&y18(Q zc~Xa#UD|~7=z5Ys_Xc`eCZgq;lH=g=OWuB^1m8L)D5Rm^H|t~uMxsd0?_8K@Czu5& zQC}a8f$PL@F;1N0^^v<@fYW|Jp0tH|cX!WCrP);2O&7T;<{34a3%;3c=L7$;d@a${xzA>%(2N-lGxpzy`i{GcORG=| zHr_C@4JK#EI{Crx)aY}bo{{$`k?J?}u3gCi{3WT_1>KJ##LXHrT-&2YlU;U(c~TI9 z379imt|x%hNN|WGDpj*hBL)UmP0$-=iE@8ZYT9lzY&zluZ6`Ldae2I@ zd-QIlW#QgiZOd!4&5|4ogV-;l3_k4)Z+1sXpthga==OJpPB(E%$xsasE?AI!I7~!! 
za-hiAtIpsFAHzL0$^D_CfIfZ+&)s-fzsVZ#7WrI;F>2nNl&nd_(SW0^@VnCkU>bYL zI<_`>UXy{=OyOKl%#~~N1bm(y@S{S@e)z=QhdWYQ)|_s<_93ZA50~7+ zUpdZxdjL2lQ9n$1Iu`kgH_P*VyS>&GGXLvb(b)3z#x8seX4%+R$HCMW+C5bXmn|m2 z7#!y}Lk0fHK2Eqj%Aupg9XK92#$H9vYX|PELKcoMoeROoHo!nm-Rznxx?Q@vd`Qyk z*XN44KR^{jmQ$L|qEo)sM}VAk`eR!t=gU>|SqF(Ex6{wGryY5r>;o^ECPt;rIdz&f zJR=@1_QVN-k(l0_M_(PGkI9rGu%3;J0fQ?_Iu!_citc}}ZXB(iobxyTy&Br$I{oXc zepddD0?FKsYx8e6LG!-S4xG!QsOy(*P zDN%MiF+wygBAdMe5?m5qU6&y}lA>LqMtQSK{(d_rXm?Y`SP=KeRA9r2E3|Keb8f~Y zE>38}{ogr)KeW#wsNhZeop_}niR45Cpo$z=(_g~VX?7rnoxn5fGH47akvgm|m6lJ$ ztwb=_K`$92AaYT}biJDIoCK{x^4dvQ$h#38+AS=>dRdd_O=Z$@ZB!Ac$=zyR|8iki zoK3jt_>8MR2tMuKFAQG|JnUSY|5JwZw_EK0!FT^5RyV|795?G?KRj@e&>0FdtI z9Ro6E0|NJa-rjZU{bcjql?*XDLbTKfM>6!YuGrSAh8NMn%Lx25_sFdr`*XLZa6 zbYk4?a11$NzUL zaQv?oKHY=!zqu9Y{!06{3kKw?{%R6QTX`3R`_H2{QuPi z|3~=(%YSoV_^-hu>wiNlSmb>pLi=Qn!;t^L9%T)joJ}166MOvIt^W@f1{P)ikcz># zeRACx>to4y!V{B5R)d6d$`kMfHTDQBWyF=FeO@C{gy9U#iE^TQ5*`lsY#Ic-;eBqu zFS&@rjwr!HpN^Htae!=2O-Is^PRUdn+p)r=&GhDad#`jGGWfLYuh}|FNzTd%p1u7V zNGsREEs&WE-EilWk1s+0Y?8j@i0%xA&BK8SVd zb!0Y4IjoEIVM9rQ6G277Lt{7wQ&-HuqC|){HZZ`Wj3n;pgzrH=$Dn~8Tet~`+=PhS7sTLY)fNeq>tGJ@)G`cNY{XNz$<~Fr>ijd({l^iUJ zqv&kp&G@4fV46=K)p?LQZ^90s_n?y)iD}9@6WmSy&9;xrnS0Q0+?6q@ZuYctSiX;M zW#swY;jECkthcq_{u*^kPsw;LNu;6yluh*&@6L9Zg-~0miWuxEHa8$#;toT1#R#r4$0P& z$i62SNdzCt^9oUnjj}-W$wM0SwFkB;>Ja9ywC~~ZT|72-)16BVJ!z!wgx5oc`l>M- zmZ%wVTyu>nsf43KUVCM|D!S(hehhaD{R143ayHAgHBRbaD}yS0sr%dubtSS-RQNgV z=_?^N4rJ(RvQfiPZWf-sys!T5PqXY-2kbBgLctLE(dxkY=6pE^c*=)0x z6_mk`{Ab{MKO(TT;Bq7xLiWeZ(Q;~?L_8jAF3&|aMti2qY0b);sJC5yHA>Gk&k;fy z8n4oeS$xGsP4Lg5?9H`#L1LQ!5DG{*<^vD#0&MJ3P|gQfjLO z5b67Bd-kcT?XPU*5Bu0Br-t@o(ApWWl&plPN)YZ2BQdCv3Er}*ulS&Y# zj6(utt1TLropPd_%P|3}8`5giXD~N|c30dKrNcvi&*Y6D&`RqFz@u1pdNW}tbi92- zlv1NBX2yTKBUq~CwmnY;Ai!8dOAw&Zh5w%6a0sY5a36V;iVz+T(w>Jl8gI@Tc&I8y zQ3b}0s57TIs+E?jMi1_u|^5 z9AmBV=e&hks(U zG01aze+W%o9_ZK@POqEdCXgHKDvFk75GEu5&4vv@IT9FU4i$;)Ww6i7NaI{zu3X-} z0k*omKOU!9oH5wGZh0v$EiRU1G2ZLq&LXyvHb^7C;=?E{3zZt7AP2LCor_u)*N^17 
z>fdr+IA354wKu5{jHYmnnI-nVUuaaFtk6G7q(Fq9XC`+r82@5a`|Y!oWs@Xs=^`Al zphDrom3H~n_;@}WgyQ2Qfjx6LQOd41O3{w2WunPp9=18mpX5q%hD{16w+ zwDLqZ&}%_;BSGY>ojH8WNUPMtYztd5HozcmwJ(nLWw;ASt1nzKW%V(9k*FegGt-?+ zB|AMkD`{-{-M=+Wmd40`)isjcL^pAADeKlY9$9YGn<>gs1=J|}412yktX6k-oSOQm zX3p!jqhsr{zD4~!r6FodH@JU}SZolm;92TvHW{~~-cJ681zt=5++GDY{t0hICi$^v z&rD~4gSrXLxx=#`Y=%HP6?q`%SL!Cq^5z>%=6Gg&Jtd27vdHc59lKz*Ld88|e5H5NviL+cORX(BPNTYIYu}@dj`n@a2f!rR@|bX5 zG-Vr;FJ^J+vS~i|Q0IZVa_Ur*jNUkUUY?D-3!xoFb_gygNrN}$#8dwB>F)Q|XKEk) zCMFyG^)(ewHcYqU7`9dxFtH)&Rfo;U(L6Q=`G;vuQ!%+8NJ&BnNG!yZ(o+7+@)T#rQyVR^yF=K44+_LCUwPYy`G$v=9d_J)$Nn*6B?~Jmr3blbYYJyP)}$T1 zhRiI!*Rhj-dY;c)CTN}Cv@$BYDN+t`QgJ${b532P$Nj8eYEPJ#$LyjRxH$>wIH;+U z{0u4EfK4Nv+nL*`ucQ1bVhbAKY;Uiul?xR8HZ&yq+#KFav3s@e@2WYQS-Do`e^G%! z4Y4fmGyTLU9k3$4LIHj2Lm_CYFJW+=5l%2P%>W@)S=HDPWXYO@XR+zp{YwIs_>3!u zk$M%A)xIQEPTE9yBgudGK5;W8npCc|$WM^{sWrA3K~DhuyLO*M`IbzjhKI|ndm(c?Z(*_ns<8Hi1QI1G4=!T_W__m7 z`dko#-){;n&ghNeI!JAt3J~ISB?WVfTm?0zjSQ)%uSHc?vq!|^zenCj-tj{&X}iZ& zX}SHUEiSk{<=79CgT){9k&PXaxMh-OR!Hhm>>MXFQ@s6kaFOQ+U=WmZ8M_INFGsnE ze&W_sq)inzCR{Ca9Jj&;A&HzZwFJb8Ej3<(w5UOfK^-e#Ubx5s(9;afh1-Ff?|RDc zxf!HSENJgI30_)d8{wxOs_0+6+N-vD;b>@Ip^RQQ z4M8o>l{vPi%&A<5t!^~g!VR4THx9rzVqSbz8_atcQLVPIUa9HU%d{t z((V`qly?5at9qBvi6!XGa*L=Z8_C-?<@O7YcbsV$0CR|8en^f_+2c=!vOdc?tuM6h zRU33RBf%a3H;e$M4J-EU2;12#*u8sKC;E!InZ&56CYCPD$qA!+CcLAV*Mo&gxvG@B zrOJp97q)sdi{+Q)S)Kj-gwpmvO8d27&lXX+a9CDT|aWP#0;1A%hj^F5&8>M88B|!c8j`8 zx1mb6$G4IlK8_~TXk?XEv5((>8WsCRWh=<37NNk{^}G(&gz!d}Gk=(mF71j@RdQ%D zwAOreRvA)EJcfou4jJXC4~1@Do!YCrZ>n2RWZjm z^Q_~}HiE>k+CPpsugJsSfxRD=dU;JeFaf#!>)JAoM9&YQI<_+~7Er&tNGQ=^9jYG4 z_q&6SLhp;?0M7YN;ef!@U6ByKPP+O1n7%fQfLbZ(D|p0vT>4AF$0Z23Jop%I;%~L@ zyLcAg13-}X1_puhf&zdHU^I|}%YwwI3W5{|4&}e)pw6)?X_+iKQma7Nc-Jd2VU$aZ zL*MBa)T+4g)NA8HDsJb25lVV*gO}k(=9J^>B2OA@#|OP|%!zaxnfxe-Op^lRzhlqz?&e<3E_9*2t@4zXUZ$6d}wW zClC<{^flGbz!6wJL`7tev;m^%pF2ZMNrp23g0rtBGqMS;C;PzRckF;eT%U<0QM1>a>W% zk|F{m!VtZbwp5AiCnP_Vvb3NyZ`58`aBZsjg)`#Tatt=_S03Hh@W?8<$t}MbyaS^M zFgV>IZ)`-T6KA-l(hW=F+^0{k>=96o1)fkwry5&I 
zh3e9jd7^IWyEfsNn{ z<&U1WzAXimtkHUhHtEY9CQeS%o_7}~(*m6*rarnq54^hKA&$*oHTgM~m0VfIPb1l^ zZr_94DuVrJ=zr++Dd=byjtG3eENZH;c~whPujH!27_K-fS5>V6XJG$gb&w-k!=0+{ z*Im`-JGX?FUx`uy$>#a+L_X>Rxp`YI@KpphoDL4}4)r&Eii<^q2PjHvaz#4uER)iv z^h^Qx%cT-yk>|yu`Yc^LZ0>UBmL60-b@ES#PK#&(aV9?N<~E}ZZC^f@yH#5<>_uV5 zsEu{7iN{l`Qm&>%ai*{6p{s^I?_t)pQ~2>sMPDj-GN`K|_VC*RDcnNrODHXPraZYc z8;};)e+yH|bwnYH=7xr4Ato7<3Zu3ggn<4qT<%sZ$MaKzDl&Mk%sf8$g+ORiCVc&~ zdS9wHBOV835J)yO%AA^}yTt;z-2gEk3V7wtPy`Wth>9{qh@Jq8`TI*U388%Vn!_789Q< zE)SG0t%@MCUd&m*fE16YjWi?qX(8Kh#N?veE>7t5mf{;N=DVm4kJLb{$8h~`? z9g|roB)12SurTH*Z`+*#3Vb7wR&|Lf1z8bZB*N^!wX*-RIFM1ZF9~7;)V)AO{~E?- zSWf2D%Y8KpPO=NqA*hVU&8^kq%f0R{L`k2sh?jZPrEjffP2F9HLLJ-_y6!w=YP+C>Xez zg3&tS=O;PvA;NA)jB}uq9N6wmQk`VWDQkf_4+IYEl4FVTt3Wfrkio@=2X?oIgh9SJ z85MNveSIH9m_zXr%j)G$_(EQR;jGMf=vCIq1;PF*83f;?N=Di3d&fw499{xqmGurt zJSV(0gA-rEb}A$;J~v{$1k-Zp3QHZAIr^88|=KP9~~vrW{1P`U=H)gCR#@ps|$vvdJ&u6e%EIVS8H!w z6A1I;rgUF=N?zZbO{)s5pXkb`)$*iYJB!fR*0tGnlcihErz$&S1VCL?wFa7W%cUdN zaG%^_SGQmXm&n#T47vB6}#cu$Z_Y7p!K!PHw@wA zr*~Cwcp7f3`SQ!ozeSp;mm(n>=uCXwK2v(rvXs@4B*8(izp!lL?iyy-9(>Z!o(Eo< z%BB8P;o5iNs4|I=s0G$y`}O4TE87iCeWj3*xgGixKv)LO<8m2I`K6%_m~Iln6rQ9n zhAw@Vpk+r1o=NG>H*@Q$>#9yN3fISfSS}2bGUy}M$4ea?q_)ax9=8+qLA%8#G9o-P z4UgOy3fNo60@%B-QGod6^JDt3_wYbJ`5&O4{B;4x1yq#kGgqzp!)<;jEpNCKNk&DGE+@kU34FEV2leG)P8uGww*5Qz>?(0Ih?vD>L~H=puO2^P7LCKW zdM_>lt)^u<7I>Lm#z)TG^=Kp+4S#nqF8}puXsN@dd==jd90jdFx7bSht*-YwKy&p+ zm$1uh|9z!^iSD%cMbLGp6fq(FfCBaE`jbLj3%oS$mluGnC_2}Av)yY!`sfzi5>oh}Z9u0| zA{gR^98bAKAGrWeT?d^(#w(8}OV0Bd#YbN;JJD{4t(X5~bE}TXUWsQ#_8|TKVqYZk zcYYs6Hf(*^w@*Z~VYsNFlc03g%LZadrJD{ujQqZ!Zt|WlA)`@BNd+fJck5vH<9t2) zwyECFE$?UAxZekaCB|n=;zVS^h6{IU+?!hAtFG4J-|9h?^0lwB1-~HR4Zbe*83@qC zqfuAE2sj#4{>gB3N4qo6tV!oVghLc?^flo9fw=rsPU`{HB6*uFY*zlN#2MmDhW<`P z*KB$6soLAt_=HYlv}${2oX6G7|HwE(}WoTj%G8|{oh%tccd z>MbPp3>KJKsK5f%8v0Qc`jthRb5scOz`^k?`vQV$4wV2uWjbuwnG{HNCL68)I&MPcD7)y^yR=t zqsT472}0eUz!93an{tq1enbHEVFV75=-Rf_4(fq%J?wqL+D!Z~k$L%jI09@xYC;sI zQU*AlM{m^eXCpRSwTiL_9m9d{)I%H1L*Xb0>R7t8Nk@%uyY&R|G`UC0I%XZK`wyvn 
zS;MvaxHG|sgbl(9pcxQIK;h{k!K0tq-Um!z*XtgMi=Q8^S1wKv`ryq9Oc80*kQ|~W zsw|#BQ!Pt$U#goQc!`oE*Tn}i_{i`AA1=f|?uf(hJ%m6rFW?L&r>|`9$Ru$)AKw83c&G zI#`96n)1)n3tr^wRi)gF?jBf zrC4iyx8`=hqfEQk;|b>5HHS!+Z8CX6TU6=Hr1v|%;g%R%*7W_7@m+@|{7&MBz6ZeBoteg1N&1OHOwG}XimOQO)s+0HH%rMKf7C*IcZ0OCLLIV5ZCP^;-ENF(>y_PKt zaT$9!Ny`CKE3_f(1HW5#~ESGp6#?wvjT)w3=0IHVsE@c$&)~ydmo#fY{6oH#!^k46&d+wHj zTQ)E@mBM8}SMHWRZ=b5C->38`u$9|&D&sb{?z~|$wck-5HH!~__&K{bn*me*S3Hp7o*cB+U{%nhA}X=w|pZIH;=(KHmg0yZG+oV@F)7 zy&7x!tb3;3cFE^&I6A;37?oklNkf<_F_6eC;s`+r9OT`t@+Fnq3JAnFS^KJf4@~$=g)OL##DJit#@- zK2m^Qtk#||`!dhGzsWuc_$PjUJckh6P)^`q;AvmrAv@WStKZT&^;2;cSYYqoWB>Rf z@XGL)d;+S3%(V717t=!d2R`xd&C7q+AMt1V&AG*#5DB{%edW3eNXa}<0JR)KI=_e*WFoNP%?>lu@lKy3$BSH>gyB_}@!NV? zwp>KDolO>h60}uMGxzA2DR^ifx-3VCJW0b{M&^I>^VlSSbMeUh=7Muzj(1@nN6=UHr zOE2~@-PF3`B3L$=laz}*`J@QN%)@>EIt_Ho1;Q(aG@?JEBw3FcvLCpL7Z)fD6ZzS0 zFj48h-dA4=v-)L2nvbTNqld9Hr&g4A7>r9vH*{v*LTh|TI75(|)bvm%HK^%Vo!zEzeJUnBoHZx5CjtxK@gXdGnC85 zIfkR08uU^7KA24C%Dbpcq;lwfr`HQOJ+?cdPjhALOMA!H9G8;QiZS(4L13P-yn!3~ zCxSyzMlpUPO&W;f^(|MWep>3jSlM}xTe2!-09w*UmEgTKfYJA-f+9%*2IvbC+>wR} zXCl8Cbjc@Si>R^@mrrphWm-=p@32Q^<|;bS-9B>rEFwJ+-V?6PO5x}W*^STZ9QtjN z?>@yvPZ*E4O(g|RpM+#nZUg|ml4=P8VMu`mC_Md&dLfboY0DwH@j#@A{nVWjNVeWL z_mx6zi`GH$Hk%bLhlOR$N>W`^VU8AyJe)x7Z$*KJqfW^AVTIDjF%hf0+~tfmu#Gd7 z!x*FUk8gkpPL+nw!ZD4lBpV>&?mHM zba_=vl_&UkzL+2O^AmKb?f(sK9MhI1yS~Fq%(&!ckAJKbo)8uwCeAowAEq#|N(`^u zpT#fO+?tQ^v?sTv3$^N&{3eR}3&p3Y)k3R}G9ZX3`zvEK?y9Dh|2OQ92aWMm55Kxb zQVZ)yR6yY-q7&*yIOf)oouR~EYfrYZCOrtBJNN+j{~GVu{%AKm*VO(L!cpHCX854y z?Mw%XDM?&&%oKaVqt`V3X)Smy@5>6X0AJvYwJ#j&67-Z3%zj514hMgKF%z{I>6iKG zgm7iP*~P=)Xq3fnqn6yx@_Q%~%K}SXO6}+66hD5b5xuxQw)#nyjMwJI|n=$V(Cu-)eyYFQ9 z!%$CDlM82vLwQI&QVhM=7p*#?;oLKhd@>(?*Dh6aBgXszwV_Q)+kZB{I&?vIp=Rx} zf}3A}zZ_ofJq@ePsXk=by$SLxHuiRaG9LurT~$#+c1f-9y{IxagH^TC@y$w^U@{g1^M%ospFE@LB)}5Gg3C($kLl+O|9R0$P4xh|456LCL zP)701?RHGp4RcHWlXs?m7h9>8D#7~2L7W5Eux4s;$QnN0urDAQsq%-l+aniT_;_Gk zE}~XgI=zT(k2NYXHK<`Qe(tVP^pGKit=mR_~LAd$Z8}<7F 
zwh8#B9OboH<#})X*_wyTR`_UsyzEzslcC#{XeebPBRz?ShKu5xWo_*^5zJK+Mzy0e zlcDoC^C0JTsWuvn?F*t_*=SB-*Pee#EdL-|$?uH$MIlpS3~mnDhxgR%;<1A_QUFWW zq(Ev3>nguV21leCZXHBnq2mpr?7^bv+0+nkygwmNJwZu^cOsutSIq;YJ`SUdeitm0 z^JjR?nRXa=6!CBod>!pD6Cb^7Zs-X&a0qYHFya5r@C0Thgp>Y`6_kvK|gCf8gZa-sw*bkozQO$IJn(`tD4Ilq}iCF2m!K<0tH(1_Ks>bZ7 z@eW}upizi8hsctyM1@H}35=y+A&v?T3tlg(3awo}bqR2?jRI3+johA8P1})+Mlf5? zEuDr`LCj8V4UCmP8jgl=!3@r$#vcP(+BN9;iCFEhY6f3w96`$&iH;5h?+jmrJ2`9cLIM1^ zxA@>V@ET#(Q+WF=g8uzJkd9AWof?XrQMZHJ1D!JbbK%e&tri=cSJhlhXbBF-Zw-6z z9s{l@I#+=HZ3zpVfjyI@uaLO#ouNSemW$5gqhZz{hE483*RM(` zTiQIe7(c43YH~^*L{NibqQQdKq}F1>zh)zcdNmDWzE@jPB)9ujl?t)^6+ykJH4&|Ok6Ad-ygmper}87GAVHc}KtxGO|$>MvR!Ya=TUn%6UV72cIMncvPv4nRzZ-m5Vey)vnG)=6kr9SC z6n`d++jXHpZY~)_m)-qCx~(f^?FBFBb$R^hu*&z9kEs{kgSlhz!Frip${*@Cx$pu+ zsADa3Gr60OwK6``nWJ}ZHc+UpT7iLUo8?@fYB5zuDRDj>H-misEwxWOpOxH5W=(V^ zA&{(RIM zh=cI7QJNtVua)c!%H|>T`D#|C-=5ESzI8}Fqdlk$G4w&4RzqL`g%6n56$Mmgx-be|d#M`fLx7+TB_|A*$%Iqk z-mIWS@z8mb>-G-o#loWI+Jb7)NB-a;VYW*_1iETKI${vp%z~9MN+t2TWF%D>Jh|MEbU^2^RB@sw$}jOMm(^Km7CMn=C-X zi)gs*O0CIlcP-_3tgEd3EcxxEOt97_I>HUj?nSQRJP`BZ$xPtm-j)+D<1~}tC9AGS zg5&3Ha;K^#r%Pt-dPGVr#Ty~_ujUhzaWvY4MrEmYH1Fo zs@tyh#ysUj6e{d0!gmL4guPuG_n2m(2&8q)Gr!ORcu?bV%f+zBDz23j#bhr;&qpld zZ&ev%r|z%;G?gY;Y;R_`RRxHiOBa60{hlpJL)&}rN?I<3coQK_EFWo$6Lo-$@VlPh zp6R#q8PV1cP>nUVYV3lpGv6e}2cql7q%`mg4<2`+Hyp=*&bX$0$@I!Ntti^YFAhv~ zF5*7EP2-%BXcTQod>DIILrsIIAT066BBmz`V@}8cd1dNXj5iSISX|O`*OE3+c&^l1 zIS7tLv^i6HH0oBQFxi?~(7=Sy_cVr3mUynr@i=Xxm7qG0VwUy?Xkrti7js_cPvi$} zOo98SrOq1fOgvA8GYCfYdwlc7s;q*{VXZ>3b;CwU^Tcg>!eTp<6`0)DI7kOS4S8G? 
zvb)6+C)qEaELl38fvU&69}Xe{4huO0rQyD};QL-aRaZpctx?u2s!DSPN{}3vvPvMw z>OKoCXK_jyC34hKTQB&&)$6S$Th=3UK}_j6ULjNpj~_3b;hpCXCryfu zy8r?n`dnw&19SjJ&ic489<4%kMjP_du=XCR6xu-H3=q2=HMLB5FOOAlHyX z)JD>?*v*T7e?Oe%BkAEUw6(8-Fyfl1`JC`r#NbjphS5%7jiDWp>tA+%S4~D?&%x(; z*}6U7yj6em_5OGTZ;QlADPC1_VaaF}u|VFX4dLkxRFk?T6}A9AM*A)=DZa;gmwq_4 z9Y4GdymQyWa{nzD_hqd^kI$`W`X`5dZb*FS+W_nc-26biQ3Lj`#wmg!ao_F!!Oge= zwp1@u(LnOU0#B9MR)*<^`oHe9fELZ<3H*eFTj+lc34iW@olyJxgj$Q^v}uG}L)uWW zCpy~p7Pw*H0cJM%Z;dMGIKMS#>;R8NO zBKO*qvd4P4pvm_b$yqVvKLy0#d>SBtVWL7tDuYP$JvISIol>4jh%4Ous|>P1cZaqo zFdJYo=NkQ327#6$u?6!iHjyQ&C%wqM=z7!W@KclTHZN1Wj(vfRtFit3yaIkeI`Ek_ zI|3Ljh+?y`wcN{`l}2=YWt)`EV(H!?;k^;$9ILrQeBwF+4Jw2~L>{g(Sk2DyZ~@#c zWmUrc6ig>DyehzHZ4;NYo0n1~7pzQZixK*lKfiVEr5+ldS$}8Bg2!oaE{5Sod8?6v zKar1lRhqG%cdFrK6(b=X@$}gF#~`|9?W}_eVXfT(`g@M!ILVK_8fk?TGpt*qmb4ja zK~=2+saC2xW95$j)81K!RkgGYf5WDcZlpuHyHk)3>5@*7?h-+|Lllti?ha{?kWOht zkdW^9Hu#)_hsWb{-tT+<`L637SbGn9ulcd=nOQTlX70J~rq7O&1Fq-$AIK^OBnLX1 zUasi8F@K&r=V;mX-pT*!jHfl7FvePd%0r{oy=O@#C=c#DfK1)H6(NkV^RasS*ajM6 zvC-O8bfSMTqp`j;jbbLw$FdCJw@U~gg9=0k&zT>Xt#_%r2pB(-SqF^qIn{wW^kW8) z#?(IeNC)i^f@S>+$6!a8p6HTp7-vEFNxlzpl@if{_`DtyL-D&ZR=*o(4~OxcOvIz^N? z4+P*ayV@|cJQ??RUa>^QMgisE#56&+T$z-lG+tNeYz5U`S6viR_>-hIH;G2~$|6Y% z^13F|Z_BWrdUCW1CNesss$J1eSg=cI`Eh3NvyGkxFZWcgQ54I1UVbrh8;v_PbDgEl zgv4|3iXo80RICyoF-xWsjXh$BR~g-zZ&=S$YPED?>#simTC0AEnXKX|98OCL*{T z6cmhmn+<)1+=UB5lcT8E1xT;8)bysQABC>Z1k1u;jOEYQA2sl8*pG1Jmk&_eK4w#t z&RQlUm7X7`!I!R%fA)@%PvcX)q_&<|!Qn21Z@o(M(?jWk%vc1Z@|ocT%2=D6!@bvxc6uK4F%+u@zWZleMD>j3cd%DPoz0oKhkRuZtLqC9I+E);! 
z3=fR+^{q=*ve+ardOtXQ5&1A&?>MPQInqI9zVAzLL9Un&cCvTI+t8>y9Z&oBzMbq{ z0lLXV+l*nxOg6}EyfDHK>rtHeL(u>p?5cc0>zs2o3t|CHe3oU$jmjKEF~bgnTTn~^ zxkX$02eC?atV@0Z4%R%qJmjmR$D9+>OkKe8IgEs}802pVB>EFo;_X>6Ksz_G38Cm` z`Qvfckgy}%nye+@4A28*abNf9(7APDoj2v_SG~Y%uA%1~dp&{jyJBrBmursrtkh3q z6-7hm-XndFpwLq{{7yHeM#7MB$i0=hM$5+$VfRuL1rhbZN+E&IdOg8FTUJJAjPbjq zShGi%GVj`gyICr8+DJdixg)s{F9xuH#<>v?9q2lVeNmHLJx?_%x+kh5+Gf{vWp_>G zlHEc$ownBX!zn+Rx-oAnI0@qEfBL?Yf!O0gyO^K`d>OI*p-iR#Ot9NgckAYTZLnV|8=gk#+ZPi-a%2yJL zZ=b0lXw2BlDLk^KMjn-R zxeEv>glMmD_xXVnc2FX!q3S0;dU1gU*C45h)@Y1r#-*yQ8GhvYyn~`@In->8hCJCe z1Ygk0M7A=of@dtDC2b(pI8VM>qsbfX<%`Nl)KUKBd@835Tn-xl-C2oBO*5C3eWs^5 zZ716?zJ;TFdwSp2_3*?_S z4cf9+AjsqL8+2$8585k-AF0k%>dF|Uj9milYWQ^62SJg6BS&eACm>wa3ll%(?XM+i z@M zU`Z&6WH()t`=DRpvBo?mk6JRH3apmOCI9RgGp5Q+`&eThTQ14@sc@F>>@+NQ%H!hw z@(QWg@@nZ*?DE>q=Mp27q*l}YE=6Anq4o(GgHSp1Nmp}8xkE|NnWRg_fyT>g>s#z` zvkTSDac0m@O=zpX>K@R&IWJXo6Z^Jb*8Td>SY7)C>^A=dohCM9sg&i)q<5$=G-3mb zEI2E?L;w)A^{U0qwdI`IpKB)fG%0i@eK@2oiZg(ZbHR>7j8>1OT?BERO&2Bch;(Lg zNG5(gHiPY8dvJ5_%YJjS$LYm9QaAmXKEF}C`oz%Md!ij})IC5MS%xI$DM37W5imcF z857DKb6CwKyJNG-X<<=s&MW~b%_2eAYro^jG+8+hQT?+W{*u9qyq1N?nK|mkPh%&G z8ji&yM_ZB!8N(fXqv0AxePFKe3I-g6wQ^UWJ+}Ye-cJ}_2z%W01epK1y}vck(5JQm zYVWac+k4Yy^?hnZp=jX=J;7-xsWU(xS(mUwf@u9f5Wu5w%jS%Xl&U9AOM}57_=1V* zY#Bu$sZB(Y;aOq+u*MhdXRL?0p*n&yA2%v`g3iuHJ)JOsdYX;W-)N~4xCJV$_CQ+_ zjy4z=W}=(=2BH&;spwuS98>h^qmKMmTAyU3y{Dk3 z`sIPhT3=p9&Q__H)01mseuiP*=fg&^6Sjc$J?2RBi_0|AbJ#*Yh{CNsotC$A=OR%Y zg&(WvhVv^e-<7<`#EtOCVwPj*?aPanWYa>=)pD#oERBGtd7=?&sDE~p*K4PpuiEUi zW#l&WuE%FoeuDNfC$ugnATbBaWEav)=OV_+d6dNw8>x%sWmHd?$HXLV8Su=Z; z6~n1avsGEa)ky7G!E4@YQwOi62BaKR+7`C6eX{L~;M7;&BvG=2?d!;rYB+Y}UQ&gX zeoA}$N^K1Anjg|rR2$iVgrsj7fc918SZqLI`iozJK$(O=cWsx2a)$N@hutZOWu5|2 z^ZK-1y58GM~O`mWM*h^Tm>lJXN(K&F{Kf3-6ZxrvM+J>mEviQ4Vny=&`>#dC%6UVMx??wGYPe`_ZB}#0CPKzu9i5K}%p8Zxft_GRrx$VL#z^E8X(yKSkWNyGwg1}L7R3ED zsVF~dVlQ;MM!MK4XF_Qg(_G5{dQq=5I9o`Jx2(esB@=piy&EA33TJl>6A7hA_0;>N zHI{sO$jj%@^17+}&fqCxIAHJTh&o$`^yC1el%z!SU@N1H#}I;{%-}(EDPlZ{F?=`i 
zI-Zw^%W`Co(|I>jUEXVxX{XiC+6-mCC7InEb>-XUCUMCM!?xZgj?24@r(rp$$#d!3 z1xpQ+udjwl<2>07%em}gZ=j2^<&P)5O47y?>Fj!a)y*yoAuRp)8q`4-*^1hUx)~o` zt_B?Sox`jKgjt52zZ7Te^7ugdcA0+xrO$?N2WGVf>7<6s+!>>!_LKY82b`@Wmcv&p zU42*epWZz$8)b|<_m@7Cro#{?J6M{895g_$@r4qE7=&hQpTzLQ6WyWs?8EHxNq$K1 zEIrKpMc1=KKF|&aCe;c1XgUO~`PZD_*i484QU(j8IRwP}A}gqvTo;aY2PQ!x>PjwJ zTKP0>GnuUDbILa@Ipu!F9E@DjE>0(lbhWJ_kn`$lOIklTK@9#z9q!#nI>!h(2~qDI zJShD%7;A$iuvStr4^a=iMM8$^+D)j&%Svf(HizhyCY!cXfhZpOG555yGTO4wuH^-D z^TeNBvV2889b<7O${wL2pEI?q|d*JF$wN-D4f z4znowvUBul7PWB7s?L--p75jA(Efw?VU5sjd8gC8)8zRk_p4)=ZT+D2{T0@YEOzs* zV{K^YR#p036>0=+#0E76Eip<2_eDsu;b}9J#$x<2=eLg2s|ItGGI~wRB%B{jVsjc3 zg*r&XDa2ewu0t5Al_MRY|11IM?RHV$vj1#uviy0OKA~YU^!;)336An>? zZsAPDuUepiYcotntsW=s7UuagfbcAmF7hmhI3 zs1exNgS#4N6e&}O+J-z?;)V?p+6wLQ4Jon9QL4jI{RoZTgi1B~Az%-fiTe;0mnTVO zEL<(%rR`Vjp)mOUkf|QxCJJqT65>UnuEMo&%_is62F$cFL3Rp`(d5j}in_doW?TdW zg^+xoj&WxzJrPoDA99!?;`(%IBtxYP!KlRBGC8nYcvrWt3Yv|i)z5EEgQ}f2y@)ABXvu^t%(?HCZXQpYp)&hBF`Lxaj~cVQCsG2U4C`JOorJoU zXl&s!>)Cg~S)I?7sB5;eC~;wF=em;w>(hxj;Shog@?c3saoRGqp|OH%y`d?-QN01YdCSCp~I| zKpdR44S3N=$ceTrd@H*rSnLa5PPKh@gxbGGslA*?n*?k#iMfSnPT0%z$5yxU!fr+H zyAU{`r5en9Zt;tQ-p6U@E6sgC@_g`f_JS??5`>gEpTW}*I(;V?rp-EWu|!z*$N~JN z1LV^9l?XhXp%~b$2<(fp5)=rphgDPcTL;*cx!;!!&|uww3JH?Z2rN2cc7ggd2IwQa zwpcygVtcwR;(jf!A>{nuGMg}V3Qroo*m*5pd{R^M}ltKZ1Ma|H4(xTP?YbT-OQU0eV@bO zxrHXP8hQ;#I~=)6#l)=$a8nzDnI2P^hLqZE1R;Df_UmUERweDG=9x53$nOqO9+NA1 zsUd;Ef!T$s=er4ps1BCg*dey0oO*tUj-~)NsC%i z(6Lv4l9^O(x0{3BJXdr z+!0gaBa_&Eb)$W+g6Ba;`|5mlDxVU^SO^P~dL*$uP%7%ShcuR2>yu$q4+QBc=P>F{ zP)d%wM6gHk=_;Dc;~x)d$(k$(huP08N_z?!+9RjpNIxB8@zu}eMjr@)rA$63QLd(M zUfm^ln4LcM?VBSSL_}X_>V-zANa>@IAu*QvPR;y@M)_yeMKF|8)x?nrePjyN5vD~@ z&qRcaHX=x8-0J&55i%Mal2WDymymYd?3Us?M^5YPx@?$LZts>==zF{7_HR@c$A z;lr1ENM^fe9M%0=55m7|b%*M_?tSfOO15;VO#0Lra+8XLAt}dT`z9Af$=7fEIXJ26 zUGF~#$PR{>q{!t+zczhq|2a|%_e`6)Hw1o4oJ3OZOrh@T?UO!yN&SGshKHt6Hu>Z6 z=Ap@ywT@p#$`h}8cyQz%Ig`PQmC^G`#i}JYQK≫yoc0T}JyFFmNgZ;kTEdZ1jN% z#n!T)X|Z-;pr9tQ=#A!(^G1}Vf~X0KMiS+zVN$Dun+!!YV!yqevawg~+jM!Z^+vhb 
z33yAg{#QWJ_wh#;yNigYc=@#79~KOcHon`K&eu}4mu$H9j!1taV=NE4fkuM{!k*H? z*p3v6l&^(wZjI=&n2&*#=FvDIv*d{S1}sDnK7war~CH6|<^HG#$Fx@Hz?r{A26 zbow`zntxJVg9F8H0x>|c&v)De^yke1nyLdBS{caOT3OqHIz7-fj2C)l&*{yrjZCg5 zuNMHwl425K05Fh=AYIT8;Cd18^0Bj-F#sSV1E2<_g#kc-;Q~Mpfw!jT`N1Ck85aYi z1c0XVL7#)nZ}^)5K)Q0b=g5NIZ-Or90Ac_24om_2*V&-YQo#Q>8w`*F{38yC0G$oE z-U4vSn_Agh*_m2d6S2@U0yu>wWFT(npl}-}zl|b~;)hIy1Ii%QU&6lKJQ4Q1{r~`N z0EhtnA21RCI0_gL1?;*VWLqBq1_^c>Zj*stz`%hJkWkPtuyF986DpAb;9x)?I0O(9 z^5)-wd4RqLK%hXP5;F-wp(*G=lh~j$dq<|ikP4T!U?>jlkg@37zJi6r#KOkGC4WRg zNkz@d#?HaX#VsQGSWNthgyd5tWffI5bqxbUBV!X&Gjlt82S+Do7grzO*M9y1fk9Ex zF|lvr-oA@ZOV7y6%FfBnD=V+4tg5c5t!r&-@96C6?)f-8GCDRsF*!B8xU{_TW%cXY z`nTP^{e#1!I0`he^h|NWOa`+O1^&Ot4q~SCaiH*qvN601*KAZ2*FU z10jJxAS4tdC_uqL-2@monA-sNukheDAl?L|Kf^T$0tRA0KtMo)ev#o};gSEx;d&NS zEs?LM0S`bPZNO20C;$P#r)gHaPPC&>Sh}8}u@&QA z*5Ol#kMKS^suK?QS{rt*Vv*z73)i{Ik!048a}trNi?~p0e%S0!h(*WiQE1_bHp8{iHZmNEM27~A}fJRf8po6M>TH*ow^761Jux+1wn3NFfLR_GkjWX zoa@^;{IoR{ce^848TlH<&-l}=BQMiT+9Tnel|?j`gOBV6Il`D$t4F}b}fuF)U0rX|&1;?_b zQ(L_2iI?)QeepxsnYQW7$tuk0x3}{RM6jKAXT%VbS8?rkH z!5SoT)lP<&MaSt##vGcl)(PlFJck~HTH(b@yE<#Sy$r!F2sQjrhOER%gv#idSwz~^ zjC{egGxGQv;Oo;--dGv_y~#c_a~0m@g_<51m*MMIAw~&s{KkNn?)%I0Q~9bV@2r-k zlF`gvUCjxot9BYnv!`%ZKjMqGl}p&2Pa5fc)##^rZQAp|;4>`5D?W;a>Ncrs0D!*G zChx?qRfjO?8qlNs41RsftWke9+JMA_L47OIsUlz&27aHw?xmcd zDX@*#U?+R&xZMifbXR4)R`H=B$ur`@ZS;ek%+LC&#e3awcD47CTTQM=3aW=WIXYlr+ z%%9Y2CQ_mXj-D(_kn-%1t4;DCrh@mQVRnIy9Jn~6 zHuMpF3kWyJj~x*L{jpz=KxENt>Kd@rI`7Uo+~Bli`RE$ZCV)UxSNe*FhbygNG$lbM zK_<*1kU-^0hT6)*C^qW$@)v~Efu2yF{qg>-_;SLf36FQQJKrxRI967TTiV-M6@TuG z#MW~$f;O~6Uou0lPT1FbYST43hr4U#=Wb;1D0@*`7C~j0N(1`MYO_IQ)Y$}t2u+ARDAv>y zg`KN;_2CO4sHoC!xO53>>L1JccoU2fGtsSPhQ%KS&@x+F46dahp%a!@w|ohCOyE02 zDdW@ISRdjYqyM4cgTECWkn{8{r3$$ijoJn3iH=(GCL5rhe~5PxYMF@=UBVlO<)eEE zyHVgwqQRJH3ZhA?C*sb$cZGM#iAuMc8cm#<^7}CffS%JEA^-r66VzE_jQrxx{o`)- zx*q%cEc%DN5Ab^YzrdLIFEFP5>LV`Qkkld+Gz4S@4INQHIq`>kn%e>DjaqTRumZ?Y z2r^*dSmGQB^E(71dsrc7EIyvuFhN${C0KVSY&;XplbPIaUW9hT)U&L)@kjPV_FYca 
z2?-Q@MFd|E#3&`e1B%;g0wNb4>ANBh~Kg$2<%146!U z&-%$r2RCUHd<$}N%f?oN>?)_gtI$|LajP!eDZ}!)BLC}Rel3-ro?KjBC&(EGqj5pMbwpz_ zDLqX`I>W?4aD!I~70#!zGG*rSO;|5Z(mI43ec-epc7sT#2zd)#O&W08@;J_hDauwA zxGhG9saxWXU6G$6Fbd8fXjp#_O^kvnuke0c`$AW21eKlcYl&ro!+=_l-K+T!Tl8!y zTCo>nUMoWtx*hWYZb?q(S&PU{^@zq?Te`Jb5XJ>EW-}dv}_!1D#B-Jm^Fe$k1DfXq>)*QO6Lk3!0{*F3%dB?nN>53rMEL zscjmv00`x zuB@FlO5UFdP{m~YsbV^s89Dt~F+&u4WkEJ&8%TGN-h5{@ zOMNXSLrn}3CrMOk%`w2vB>B8|qspu~sXv`zEwy@cdvmjoGuliU^Em=y>=4uUW8nco zsc7o=9T#@HuE*^-uBlHIuo0N%!S~E;-B?P8G+I{Mm`Db(1K)m~o1uh}UzLa3#DR}D zmRgov%(YjK;4URzDuaR2=R=My6UP zjado!eA`^n6A5!l)pF3oG83`PobW44;P+RRaP3@^s@$)yz}6!N*r*NtwWB4e;3kht z>7xPzO7suU5zi7)WCWh0E4sO*;^Q4XH%$B(W31dqML1B!)J%xPE$>v}XGEhYATp1D zxTINTXEE*k0aEFzeM4=Z5pVugF;A@wnZBe zy5D!`bEKB@cK6NvC^a?lVijyw0|TeD>_S>xg^7x5Pl@Rax{Zz(5FW|<*1Yj@PI+)u zy6+1dyrU?^z)W@Eb0;8O8NM!}!6lk#l;n-er|@yIW>#}>arjPP&ONvM;0jA`Dl?7p z0Iqa-o{xtRKOrf?x+X;szPaL8$%L{V7&Qi1|oRrs@&5T)Vo2wIQ*tuvft!*oYj6`~JKXS_hFXT`l&d8wnb z1)N0j@UjgD7=vY+K1Sq_E6%)IwlBO-r-w8B=^$9L4x1?a_Fr^j*m(}|cS*i-o_m(& zwOrKNs^VBI5uJoxeQLuB%@>f;h|2Xq_X&U5loXdZYv@QlqvwN>3Tc)o*(Yy29Nue> zCi{My{Lmb#F1$@)(kUc(9>&m&ae1>W{%;OY&*B|=AA+hP8|Yi)pR1#uwe|nj#ve8D z<~32N<;MS9V{yf`?iU?!``6TlTFOcRB2y$N)8&B_?HXNSn^c-d8%|3x=o+IKZXXlg z#c@QN$enCi7)>I&4-Uvi2^A+Zmef9-enHX0ouEBSMw?P0iEE-4wMk4?y>!mebWj8p zjtfo-`|y!Opj!9{>k7FN^Fhw@Tt&`j5*dS5PgT*m{oUf4m2{c#kIvK4bx;bx9h9A3 zN#t|REnWvv$8qex&VK$ZI%(mZh#9lCH}>8E-@>PXe4-*N=V=m8)d{h$s6B|KgSAdA z7~Po66PVhRQ^TNXK6~`tgSS$if+<%_@g2MEXKerKpfzyE;7yjCP_+1tH5&MNRae1k~0{-fh_-DWdsHgwQ zMe+TF_dT=TB_#xD%iN4xZ+8Iy#SZXTNI~D+d z4brZ@q5Z~{=zZ9IkCAscYfz#5F+l&>Bk6s@eGh?Ut$@>m*4jQfgiTGpx_KzI8#r^o`&iT|oCxe0Pwf<-FokOR8PKE|4 zUHxbBos*>Bk#D@q+yw;CQ0hOEe{wwZJM@izio1Z0{ZG($&U}7H{_92K;`|fx4@U*J zN({e4|MjAYasLterUU;Uk|`ek6Y}3T8h>A->G1vu`OZ1YKhF&o{C`5eb2xIJeBW)y zUFF~c?JWMY-u+>f>8HE!C)-TF%;)b;$f_FGtmfz!k*D&f4p$@i6Ydvfds`FlzApakqWSecb(>PZP)MP z{=N-!diD$4Pwl~v!oE{WyHCEa+;w-M9=bos_f@@q5dMCl{ElE}^0&+8_b=oAjOHB# z$@a%(+_>NVa~kP>^81r&cgcJ0{w2j(^0|45N 
tKjpz66Ds$U|Mijg=j5BtKPCUy1F*alBuE<%0H6T<%z*}%eC{`Y{U0lO8zTS! literal 0 HcmV?d00001 diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py index 545f441a0d..fac88ab2db 100644 --- a/test_unstructured/file_utils/test_filetype.py +++ b/test_unstructured/file_utils/test_filetype.py @@ -6,550 +6,971 @@ import io import os -import pathlib -import magic import pytest from test_unstructured.unit_utils import ( FixtureRequest, LogCaptureFixture, Mock, - MonkeyPatch, - call, example_doc_path, - method_mock, + patch, + property_mock, ) -from unstructured.file_utils import filetype from unstructured.file_utils.filetype import ( - _detect_filetype_from_octet_stream, - _is_code_mime_type, - _is_text_file_a_csv, - _is_text_file_a_json, + _FileTypeDetectionContext, + _TextFileDifferentiator, + _ZipFileDifferentiator, detect_filetype, ) from unstructured.file_utils.model import FileType is_in_docker = os.path.exists("/.dockerenv") - -@pytest.mark.parametrize( - ("file_name", "expected_value"), - [ - ("layout-parser-paper-fast.pdf", FileType.PDF), - ("fake.docx", FileType.DOCX), - ("example.jpg", FileType.JPG), - ("fake-text.txt", FileType.TXT), - ("eml/fake-email.eml", FileType.EML), - ("factbook.xml", FileType.XML), - ("example-10k.html", FileType.HTML), - ("fake-html.html", FileType.HTML), - ("stanley-cups.xlsx", FileType.XLSX), - ("stanley-cups.csv", FileType.CSV), - ("stanley-cups.tsv", FileType.TSV), - ("fake-power-point.pptx", FileType.PPTX), - ("winter-sports.epub", FileType.EPUB), - ("spring-weather.html.json", FileType.JSON), - ("README.org", FileType.ORG), - ("README.rst", FileType.RST), - ("README.md", FileType.MD), - ("fake.odt", FileType.ODT), - ("fake-incomplete-json.txt", FileType.TXT), - ], -) -def test_detect_filetype_from_filename(file_name: str, expected_value: FileType): - assert detect_filetype(example_doc_path(file_name)) == expected_value +# 
================================================================================================ +# STRATEGY #1 - CONTENT-TYPE ASSERTED IN CALL +# ================================================================================================ @pytest.mark.parametrize( - ("file_name", "expected_value"), + ("expected_value", "file_name", "content_type"), [ - ("layout-parser-paper-fast.pdf", FileType.PDF), - ("fake.docx", FileType.DOCX), - ("example.jpg", FileType.JPG), - ("fake-text.txt", FileType.TXT), - ("eml/fake-email.eml", FileType.EML), - ("factbook.xml", FileType.XML), - ("example-10k.html", FileType.HTML), - ("fake-html.html", FileType.HTML), - ("stanley-cups.xlsx", FileType.XLSX), - ("stanley-cups.csv", FileType.CSV), - ("stanley-cups.tsv", FileType.TSV), - ("fake-power-point.pptx", FileType.PPTX), - ("winter-sports.epub", FileType.EPUB), - ("fake-doc.rtf", FileType.RTF), - ("spring-weather.html.json", FileType.JSON), - ("fake.odt", FileType.ODT), - ("fake-incomplete-json.txt", FileType.TXT), + (FileType.BMP, "img/bmp_24.bmp", "image/bmp"), + (FileType.CSV, "stanley-cups.csv", "text/csv"), + (FileType.DOC, "simple.doc", "application/msword"), + ( + FileType.DOCX, + "simple.docx", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + ), + (FileType.EML, "eml/fake-email.eml", "message/rfc822"), + (FileType.EPUB, "winter-sports.epub", "application/epub+zip"), + (FileType.HEIC, "img/DA-1p.heic", "image/heic"), + (FileType.HTML, "example-10k-1p.html", "text/html"), + (FileType.JPG, "img/example.jpg", "image/jpeg"), + (FileType.JSON, "spring-weather.html.json", "application/json"), + (FileType.MD, "README.md", "text/markdown"), + (FileType.MSG, "fake-email.msg", "application/vnd.ms-outlook"), + (FileType.ODT, "simple.odt", "application/vnd.oasis.opendocument.text"), + (FileType.ORG, "README.org", "text/org"), + (FileType.PDF, "pdf/layout-parser-paper-fast.pdf", "application/pdf"), + (FileType.PNG, "img/DA-1p.png", "image/png"), + 
(FileType.PPT, "fake-power-point.ppt", "application/vnd.ms-powerpoint"), + ( + FileType.PPTX, + "fake-power-point.pptx", + "application/vnd.openxmlformats-officedocument.presentationml.presentation", + ), + (FileType.RST, "README.rst", "text/x-rst"), + (FileType.RTF, "fake-doc.rtf", "text/rtf"), + (FileType.TIFF, "img/layout-parser-paper-fast.tiff", "image/tiff"), + (FileType.TSV, "stanley-cups.tsv", "text/tsv"), + (FileType.TXT, "norwich-city.txt", "text/plain"), + (FileType.WAV, "CantinaBand3.wav", "audio/wav"), + (FileType.XLS, "tests-example.xls", "application/vnd.ms-excel"), + ( + FileType.XLSX, + "stanley-cups.xlsx", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + ), + (FileType.XML, "factbook.xml", "application/xml"), + (FileType.ZIP, "simple.zip", "application/zip"), ], ) -def test_detect_filetype_from_filename_with_extension( - file_name: str, expected_value: FileType, monkeypatch: MonkeyPatch +def test_it_detects_correct_file_type_from_file_path_with_correct_asserted_content_type( + file_name: str, content_type: str, expected_value: FileType, ctx_mime_type_: Mock ): - """File-type is detected from extension when libmagic not available or file does not exist.""" - # -- when libmagic is not available -- - monkeypatch.setattr(filetype, "LIBMAGIC_AVAILABLE", False) - assert detect_filetype(example_doc_path(file_name)) == expected_value - # -- when file does not exist -- - monkeypatch.setattr(filetype, "LIBMAGIC_AVAILABLE", True) - extension = pathlib.Path(file_name).suffix - assert detect_filetype(example_doc_path("not-on-disk" + extension)) == expected_value + # -- disable strategy #2, leaving only asserted content-type and extension -- + ctx_mime_type_.return_value = None + + file_type = detect_filetype(example_doc_path(file_name), content_type=content_type) + + # -- Strategy 1 should not need to refer to guessed MIME-type and detection should not + # -- fall back to strategy 2 for any of these test cases. 
+ ctx_mime_type_.assert_not_called() + assert file_type == expected_value @pytest.mark.parametrize( - ("file_name", "expected_value"), + ("expected_value", "file_name", "content_type"), [ - ("pdf/layout-parser-paper-fast.pdf", [FileType.PDF]), - ("fake.docx", [FileType.DOCX]), - ("img/example.jpg", [FileType.JPG]), - ("fake-text.txt", [FileType.TXT]), - ("eml/fake-email.eml", [FileType.EML]), - ("factbook.xml", [FileType.XML]), - # NOTE(robinson]) - For the document, some operating systems return - # */xml and some return */html. Either could be acceptable depending on the OS - ("example-10k.html", [FileType.HTML, FileType.XML]), - ("fake-html.html", [FileType.HTML]), - ("stanley-cups.xlsx", [FileType.XLSX]), - ("stanley-cups.csv", [FileType.CSV]), - ("stanley-cups.tsv", [FileType.TSV]), - ("fake-power-point.pptx", [FileType.PPTX]), - ("winter-sports.epub", [FileType.EPUB]), - ("fake-incomplete-json.txt", [FileType.TXT]), + (FileType.BMP, "img/bmp_24.bmp", "image/bmp"), + (FileType.CSV, "stanley-cups.csv", "text/csv"), + (FileType.DOC, "simple.doc", "application/msword"), + ( + FileType.DOCX, + "simple.docx", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + ), + (FileType.EML, "eml/fake-email.eml", "message/rfc822"), + (FileType.EPUB, "winter-sports.epub", "application/epub+zip"), + (FileType.HEIC, "img/DA-1p.heic", "image/heic"), + (FileType.HTML, "example-10k-1p.html", "text/html"), + (FileType.JPG, "img/example.jpg", "image/jpeg"), + (FileType.JSON, "spring-weather.html.json", "application/json"), + (FileType.MD, "README.md", "text/markdown"), + (FileType.MSG, "fake-email.msg", "application/vnd.ms-outlook"), + (FileType.ODT, "simple.odt", "application/vnd.oasis.opendocument.text"), + (FileType.ORG, "README.org", "text/org"), + (FileType.PDF, "pdf/layout-parser-paper-fast.pdf", "application/pdf"), + (FileType.PNG, "img/DA-1p.png", "image/png"), + (FileType.PPT, "fake-power-point.ppt", "application/vnd.ms-powerpoint"), + ( + 
FileType.PPTX, + "fake-power-point.pptx", + "application/vnd.openxmlformats-officedocument.presentationml.presentation", + ), + (FileType.RST, "README.rst", "text/x-rst"), + (FileType.RTF, "fake-doc.rtf", "text/rtf"), + (FileType.TIFF, "img/layout-parser-paper-fast.tiff", "image/tiff"), + (FileType.TSV, "stanley-cups.tsv", "text/tsv"), + (FileType.TXT, "norwich-city.txt", "text/plain"), + (FileType.WAV, "CantinaBand3.wav", "audio/wav"), + (FileType.XLS, "tests-example.xls", "application/vnd.ms-excel"), + ( + FileType.XLSX, + "stanley-cups.xlsx", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + ), + (FileType.XML, "factbook.xml", "application/xml"), + (FileType.ZIP, "simple.zip", "application/zip"), ], ) -def test_detect_filetype_from_file(file_name: str, expected_value: list[FileType]): - with open(example_doc_path(file_name), "rb") as f: - assert detect_filetype(file=f) in expected_value - - -def test_detect_filetype_from_file_warns_when_libmagic_is_not_installed( - monkeypatch: MonkeyPatch, caplog: LogCaptureFixture +def test_it_detects_correct_file_type_from_file_no_name_with_correct_asserted_content_type( + file_name: str, content_type: str, expected_value: FileType, ctx_mime_type_: Mock ): - monkeypatch.setattr(filetype, "LIBMAGIC_AVAILABLE", False) - with open(example_doc_path("fake-text.txt"), "rb") as f: - detect_filetype(file=f) - - assert "WARNING" in caplog.text - - -def test_detect_XML_from_application_xml_file_path(magic_from_file_: Mock): - magic_from_file_.return_value = "application/xml" - file_path = example_doc_path("factbook.xml") - - filetype = detect_filetype(file_path) - - magic_from_file_.assert_called_once_with(file_path, mime=True) - assert filetype == FileType.XML - + # -- disable strategy #2 (guessed mime-type) -- + ctx_mime_type_.return_value = None + # -- disable strategy #3 (filename extension) by supplying no source of file name -- + with open(example_doc_path(file_name), "rb") as f: + file = 
io.BytesIO(f.read()) -def test_detect_CSV_from_text_csv_file_path(magic_from_file_: Mock): - magic_from_file_.return_value = "text/csv" - file_path = example_doc_path("stanley-cups.csv") + file_type = detect_filetype(file=file, content_type=content_type) - filetype = detect_filetype(file_path) + # -- Strategy 1 should not need to refer to guessed MIME-type and detection should not + # -- fall-back to strategy 2 for any of these test cases. + ctx_mime_type_.assert_not_called() + assert file_type is expected_value - magic_from_file_.assert_called_once_with(file_path, mime=True) - assert filetype == FileType.CSV +# ================================================================================================ +# STRATEGY #2 - GUESS MIME-TYPE WITH LIBMAGIC +# ================================================================================================ -def test_detect_TXT_from_text_x_script_python_file_path(magic_from_file_: Mock): - magic_from_file_.return_value = "text/x-script.python" - file_path = example_doc_path("logger.py") - filetype = detect_filetype(file_path) +@pytest.mark.parametrize( + ("expected_value", "file_name", "mime_type"), + [ + (FileType.BMP, "img/bmp_24.bmp", "image/bmp"), + (FileType.CSV, "stanley-cups.csv", "text/csv"), + (FileType.CSV, "stanley-cups.csv", "application/csv"), + (FileType.CSV, "stanley-cups.csv", "application/x-csv"), + (FileType.DOC, "simple.doc", "application/msword"), + ( + FileType.DOCX, + "simple.docx", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + ), + (FileType.EML, "eml/fake-email.eml", "message/rfc822"), + (FileType.EPUB, "winter-sports.epub", "application/epub"), + (FileType.EPUB, "winter-sports.epub", "application/epub+zip"), + (FileType.HEIC, "img/DA-1p.heic", "image/heic"), + (FileType.HTML, "example-10k-1p.html", "text/html"), + (FileType.JPG, "img/example.jpg", "image/jpeg"), + (FileType.JSON, "spring-weather.html.json", "application/json"), + (FileType.MD, "README.md", 
"text/markdown"), + (FileType.MD, "README.md", "text/x-markdown"), + (FileType.MSG, "fake-email.msg", "application/vnd.ms-outlook"), + (FileType.ODT, "simple.odt", "application/vnd.oasis.opendocument.text"), + (FileType.ORG, "README.org", "text/org"), + (FileType.PDF, "pdf/layout-parser-paper-fast.pdf", "application/pdf"), + (FileType.PNG, "img/DA-1p.png", "image/png"), + (FileType.PPT, "fake-power-point.ppt", "application/vnd.ms-powerpoint"), + ( + FileType.PPTX, + "fake-power-point.pptx", + "application/vnd.openxmlformats-officedocument.presentationml.presentation", + ), + (FileType.RST, "README.rst", "text/x-rst"), + (FileType.RTF, "fake-doc.rtf", "text/rtf"), + (FileType.RTF, "fake-doc.rtf", "application/rtf"), + (FileType.TIFF, "img/layout-parser-paper-fast.tiff", "image/tiff"), + (FileType.TSV, "stanley-cups.tsv", "text/tsv"), + (FileType.TXT, "norwich-city.txt", "text/plain"), + (FileType.TXT, "simple.yaml", "text/yaml"), + (FileType.WAV, "CantinaBand3.wav", "audio/wav"), + (FileType.XLS, "tests-example.xls", "application/vnd.ms-excel"), + ( + FileType.XLSX, + "stanley-cups.xlsx", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + ), + (FileType.XML, "factbook.xml", "application/xml"), + (FileType.XML, "factbook.xml", "text/xml"), + (FileType.ZIP, "simple.zip", "application/zip"), + ], +) +def test_it_detects_correct_file_type_using_strategy_2_when_libmagic_guesses_recognized_mime_type( + file_name: str, mime_type: str, expected_value: FileType, ctx_mime_type_: Mock +): + # -- libmagic guesses a MIME-type mapped to a `FileType` -- + ctx_mime_type_.return_value = mime_type + # -- disable strategy #3 (filename extension) by not providing filename -- + with open(example_doc_path(file_name), "rb") as f: + file = io.BytesIO(f.read()) - magic_from_file_.assert_called_once_with(file_path, mime=True) - assert filetype == FileType.TXT + # -- disable strategy #1 by not asserting a content_type in the call -- + file_type = 
detect_filetype(file=file) + # -- ctx.mime_type may be referenced multiple times, but at least once -- + ctx_mime_type_.assert_called_with() + assert file_type is expected_value -def test_detect_TXT_from_text_x_script_python_file(magic_from_buffer_: Mock): - magic_from_buffer_.return_value = "text/x-script.python" - file_path = example_doc_path("logger.py") - with open(file_path, "rb") as f: - head = f.read(4096) - f.seek(0) - filetype = detect_filetype(file=f) +@pytest.mark.parametrize( + ("expected_value", "file_name"), + [ + (FileType.BMP, "img/bmp_24.bmp"), + (FileType.CSV, "stanley-cups.csv"), + (FileType.DOCX, "simple.docx"), + (FileType.EML, "eml/fake-email.eml"), + (FileType.EPUB, "winter-sports.epub"), + (FileType.HEIC, "img/DA-1p.heic"), + (FileType.HTML, "ideas-page.html"), + (FileType.JPG, "img/example.jpg"), + (FileType.JSON, "spring-weather.html.json"), + (FileType.ODT, "simple.odt"), + (FileType.PDF, "pdf/layout-parser-paper-fast.pdf"), + (FileType.PNG, "img/DA-1p.png"), + (FileType.PPTX, "fake-power-point.pptx"), + (FileType.RTF, "fake-doc.rtf"), + (FileType.TIFF, "img/layout-parser-paper-fast.tiff"), + (FileType.TXT, "norwich-city.txt"), + (FileType.WAV, "CantinaBand3.wav"), + (FileType.XLSX, "stanley-cups.xlsx"), + (FileType.XML, "factbook.xml"), + (FileType.ZIP, "simple.zip"), + ], +) +def test_it_detects_most_file_types_using_strategy_2_when_libmagic_guesses_mime_type_for_itself( + file_name: str, expected_value: FileType +): + """Does not work for all types, in particular: + + TODOs: + - DOC is misidentified as MSG, TODO on that below. + - MSG is misidentified as UNK, but only on CI. + - PPT is misidentified as MSG, same fix as DOC. + - TSV is identified as TXT, maybe need an `.is_tsv` predicate in `_TextFileDifferentiator` + - XLS is misidentified as MSG, same fix as DOC. + + NOCANDOs: w/o an extension I think these are the best we can do. 
+ - MD is identified as TXT + - ORG is identified as TXT + - RST is identified as TXT + """ + # -- disable strategy #1 by not asserting a content_type in the call -- + # -- disable strategy #3 (extension) by passing file-like object with no `.name` attribute -- + with open(example_doc_path(file_name), "rb") as f: + file = io.BytesIO(f.read()) - magic_from_buffer_.assert_called_once_with(head, mime=True) - assert filetype == FileType.TXT + assert detect_filetype(file=file) is expected_value -def test_is_code_mime_type_for_Go(): - assert _is_code_mime_type("text/x-go") is True +# NOTE(scanny): magic gets this wrong ("application/x-ole-storage") but filetype lib gets it right +# ("application/msword"). Need a differentiator for "application/x-ole-storage". +@pytest.mark.xfail(reason="TODO: FIX", raises=AssertionError, strict=True) +@pytest.mark.parametrize( + ("expected_value", "file_name"), + [ + (FileType.DOC, "simple.doc"), + (FileType.PPT, "fake-power-point.ppt"), + (FileType.XLS, "tests-example.xls"), + # -- only fails on CI, maybe different libmagic version or "magic-files" -- + # (FileType.MSG, "fake-email.msg"), + ], +) +def test_it_detects_MS_Office_file_types_using_strategy_2_when_libmagic_guesses_mime_type( + file_name: str, expected_value: FileType +): + with open(example_doc_path(file_name), "rb") as f: + file = io.BytesIO(f.read()) + assert detect_filetype(file=file) is expected_value -def test_detect_TXT_from_text_go_file(magic_from_buffer_: Mock): - magic_from_buffer_.return_value = "text/x-go" - file_path = example_doc_path("fake.go") +@pytest.mark.parametrize( + ("expected_value", "file_name"), + [ + # -- `filetype` lib recognizes all these binary file-types -- + (FileType.BMP, "img/bmp_24.bmp"), + (FileType.DOC, "simple.doc"), + (FileType.DOCX, "simple.docx"), + (FileType.EPUB, "winter-sports.epub"), + (FileType.HEIC, "img/DA-1p.heic"), + (FileType.JPG, "img/example.jpg"), + (FileType.ODT, "simple.odt"), + (FileType.PDF, 
"pdf/layout-parser-paper-fast.pdf"), + (FileType.PNG, "img/DA-1p.png"), + (FileType.PPT, "fake-power-point.ppt"), + (FileType.PPTX, "fake-power-point.pptx"), + (FileType.RTF, "fake-doc.rtf"), + (FileType.TIFF, "img/layout-parser-paper-fast.tiff"), + (FileType.WAV, "CantinaBand3.wav"), + (FileType.XLS, "tests-example.xls"), + (FileType.XLSX, "stanley-cups.xlsx"), + (FileType.ZIP, "simple.zip"), + # -- but it doesn't recognize textual file-types at all -- + (FileType.UNK, "stanley-cups.csv"), + (FileType.UNK, "eml/fake-email.eml"), + (FileType.UNK, "example-10k-1p.html"), + (FileType.UNK, "spring-weather.html.json"), + (FileType.UNK, "README.md"), + (FileType.UNK, "README.org"), + (FileType.UNK, "README.rst"), + (FileType.UNK, "stanley-cups.tsv"), + (FileType.UNK, "norwich-city.txt"), + (FileType.UNK, "factbook.xml"), + # -- and it doesn't recognize MSG files -- + (FileType.UNK, "fake-email.msg"), + ], +) +def test_strategy_2_can_detect_only_binary_file_types_when_libmagic_is_unavailable( + file_name: str, expected_value: FileType, LIBMAGIC_AVAILABLE_False: bool +): + """File-type is detected using `filetype` library when libmagic is not available. - with open(file_path, "rb") as f: - head = f.read(4096) - f.seek(0) - filetype = detect_filetype(file=f) + `filetype.guess_mime()` does a good job on binary file types (PDF, images, legacy MS-Office), + but doesn't even try to guess textual file-types. 
+ """ + # -- disable strategy #3 (extension) by passing file-like object with no `.name` attribute -- + with open(example_doc_path(file_name), "rb") as f: + file = io.BytesIO(f.read()) + # -- simulate libmagic is not available -- + assert LIBMAGIC_AVAILABLE_False is False - magic_from_buffer_.assert_called_once_with(head, mime=True) - assert filetype == FileType.TXT + # -- disable strategy #1 by not asserting a content_type in the call -- + file_type = detect_filetype(file=file) + assert file_type is expected_value -def test_detect_RTF_from_application_rtf_file_path(magic_from_file_: Mock): - magic_from_file_.return_value = "application/rtf" - file_path = example_doc_path("fake-doc.rtf") - filetype = detect_filetype(file_path) +def test_detect_filetype_from_file_warns_when_libmagic_is_not_installed( + caplog: LogCaptureFixture, LIBMAGIC_AVAILABLE_False: bool +): + with open(example_doc_path("fake-text.txt"), "rb") as f: + detect_filetype(file=f) - magic_from_file_.assert_called_once_with(file_path, mime=True) - assert filetype == FileType.RTF + assert "WARNING" in caplog.text + assert "libmagic is unavailable but assists in filetype detection. 
Please cons" in caplog.text -def test_detect_XML_from_text_xml_file_path(magic_from_file_: Mock): - magic_from_file_.return_value = "text/xml" - file_path = example_doc_path("factbook.xml") +# ================================================================================================ +# STRATEGY #3 - MAP FILENAME EXTENSION TO FILETYPE +# ================================================================================================ - filetype = detect_filetype(file_path) - magic_from_file_.assert_called_once_with(file_path, mime=True) - assert filetype == FileType.XML +@pytest.mark.parametrize( + ("expected_value", "file_name"), + [ + (FileType.BMP, "img/bmp_24.bmp"), + (FileType.CSV, "stanley-cups.csv"), + (FileType.DOC, "simple.doc"), + (FileType.DOCX, "simple.docx"), + (FileType.EML, "eml/fake-email.eml"), + (FileType.EPUB, "winter-sports.epub"), + (FileType.HEIC, "img/DA-1p.heic"), + (FileType.HTML, "example-10k-1p.html"), + (FileType.JPG, "img/example.jpg"), + (FileType.JSON, "spring-weather.html.json"), + (FileType.MD, "README.md"), + (FileType.MSG, "fake-email.msg"), + (FileType.ODT, "simple.odt"), + (FileType.ORG, "README.org"), + (FileType.PDF, "pdf/layout-parser-paper-fast.pdf"), + (FileType.PNG, "img/DA-1p.png"), + (FileType.PPT, "fake-power-point.ppt"), + (FileType.PPTX, "fake-power-point.pptx"), + (FileType.RST, "README.rst"), + (FileType.RTF, "fake-doc.rtf"), + (FileType.TIFF, "img/layout-parser-paper-fast.tiff"), + (FileType.TSV, "stanley-cups.tsv"), + (FileType.TXT, "norwich-city.txt"), + (FileType.WAV, "CantinaBand3.wav"), + (FileType.XLS, "tests-example.xls"), + (FileType.XLSX, "stanley-cups.xlsx"), + (FileType.XML, "factbook.xml"), + (FileType.ZIP, "simple.zip"), + ], +) +def test_it_detects_correct_file_type_from_strategy_3_when_extension_maps_to_file_type( + file_name: str, expected_value: FileType, ctx_mime_type_: Mock +): + # -- disable strategy #2 by making libmagic always guess `None` -- + ctx_mime_type_.return_value = None + # -- 
disable strategy #1 by not asserting a content_type in the call -- + # -- enable strategy #3 by passing filename as source for extension -- + file_type = detect_filetype(example_doc_path(file_name)) -def test_detect_HTML_from_application_xml_file_path_with_html_extension(magic_from_file_: Mock): - magic_from_file_.return_value = "application/xml" - file_path = example_doc_path("fake-html.html") + # -- ctx.mime_type may be referenced multiple times, but at least once -- + ctx_mime_type_.assert_called_with() + assert file_type is expected_value - filetype = detect_filetype(file_path) - magic_from_file_.assert_called_once_with(file_path, mime=True) - assert filetype == FileType.HTML +@pytest.mark.parametrize( + ("expected_value", "file_name", "mime_type"), + [ + (FileType.BMP, "img/bmp_24.bmp", "application/zip"), + (FileType.DOC, "simple.doc", None), + (FileType.MSG, "fake-email.msg", "application/octet-stream"), + ], +) +def test_it_falls_back_to_extension_strategy_when_prior_strategies_fail( + file_name: str, mime_type: str | None, expected_value: FileType, ctx_mime_type_: Mock +): + ctx_mime_type_.return_value = mime_type + file_type = detect_filetype(example_doc_path(file_name)) -def test_detect_HTML_from_text_xml_file_path_with_html_extension(magic_from_file_: Mock): - magic_from_file_.return_value = "text/xml" - file_path = example_doc_path("fake-html.html") + ctx_mime_type_.assert_called_with() + assert file_type is expected_value - filetype = detect_filetype(file_path) - magic_from_file_.assert_called_once_with(file_path, mime=True) - assert filetype == FileType.HTML +# ================================================================================================ +# SPECIAL CASES +# ================================================================================================ -def test_detect_DOCX_from_application_octet_stream_file_no_extension(magic_from_buffer_: Mock): - magic_from_buffer_.return_value = "application/octet-stream" - with 
open(example_doc_path("simple.docx"), "rb") as f: +@pytest.mark.parametrize("mime_type", ["application/xml", "text/xml"]) +@pytest.mark.parametrize("extension", [".html", ".htm"]) +def test_it_detects_HTML_from_guessed_mime_type_ending_with_xml_and_html_extension( + mime_type: str, extension: str, ctx_mime_type_: Mock +): + ctx_mime_type_.return_value = mime_type + with open(example_doc_path("example-10k-1p.html"), "rb") as f: file = io.BytesIO(f.read()) + file.name = f"a/b/page{extension}" - filetype = detect_filetype(file=file) - - magic_from_buffer_.assert_called_once_with(file.getvalue()[:4096], mime=True) - assert filetype == FileType.DOCX - - -def test_detect_DOCX_from_application_octet_stream_file_path(magic_from_file_: Mock): - magic_from_file_.return_value = "application/octet-stream" - file_path = example_doc_path("simple.docx") - - filetype = detect_filetype(file_path) - - magic_from_file_.assert_called_once_with(file_path, mime=True) - assert filetype == FileType.DOCX - - -def test_detect_DOCX_from_application_zip_file_path(magic_from_file_: Mock): - magic_from_file_.return_value = "application/zip" - file_path = example_doc_path("simple.docx") - - filetype = detect_filetype(file_path) - - magic_from_file_.assert_called_once_with(file_path, mime=True) - assert filetype == FileType.DOCX - + file_type = detect_filetype(file=file) -def test_detect_ZIP_from_application_zip_file_path(magic_from_file_: Mock): - magic_from_file_.return_value = "application/zip" - file_path = example_doc_path("simple.zip") + ctx_mime_type_.assert_called_with() + assert file_type is FileType.HTML - filetype = detect_filetype(file_path) - magic_from_file_.assert_called_once_with(file_path, mime=True) - assert filetype == FileType.ZIP - - -def test_detect_DOC_from_application_msword_file_path(magic_from_file_: Mock): - magic_from_file_.return_value = "application/msword" - file_path = example_doc_path("fake.doc") - - filetype = detect_filetype(file_path) - - 
magic_from_file_.assert_called_once_with(file_path, mime=True) - assert filetype == FileType.DOC - - -def test_detect_PPT_from_application_vnd_ms_powerpoint_file_path(magic_from_file_: Mock): - magic_from_file_.return_value = "application/vnd.ms-powerpoint" - file_path = example_doc_path("fake-power-point.ppt") +@pytest.mark.parametrize( + "mime_type", + [ + "application/octet-stream", + "application/zip", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "application/vnd.openxmlformats-officedocument.presentationml.presentation", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + ], +) +@pytest.mark.parametrize( + ("expected_value", "file_name"), + [ + (FileType.DOCX, "simple.docx"), + (FileType.PPTX, "fake-power-point.pptx"), + (FileType.XLSX, "stanley-cups.xlsx"), + (FileType.ZIP, "simple.zip"), + ], +) +def test_it_differentiates_files_when_libmagic_guesses_octet_stream_zip_or_modern_ms_office( + mime_type: str, file_name: str, expected_value: FileType, ctx_mime_type_: Mock +): + ctx_mime_type_.return_value = mime_type + # -- disable extension-based strategy #3 -- + with open(example_doc_path(file_name), "rb") as f: + file = io.BytesIO(f.read()) - filetype = detect_filetype(file_path) + file_type = detect_filetype(file=file) - magic_from_file_.assert_called_once_with(file_path, mime=True) - assert filetype == FileType.PPT + ctx_mime_type_.assert_called_with() + assert file_type is expected_value -def test_detect_XLS_from_application_vnd_ms_excel_file_path(magic_from_file_: Mock): - magic_from_file_.return_value = "application/vnd.ms-excel" - file_path = example_doc_path("tests-example.xls") +@pytest.mark.parametrize( + ("mime_type", "file_name"), + [ + ("text/x-script.python", "logger.py"), + ("text/x-go", "fake.go"), + ("application/x-javascript", "fake-text.txt"), + ], +) +def test_it_detects_TXT_for_source_code_files(mime_type: str, file_name: str, ctx_mime_type_: Mock): + ctx_mime_type_.return_value = 
mime_type + # -- disable extension-based strategy #3 -- + with open(example_doc_path(file_name), "rb") as f: + file = io.BytesIO(f.read()) - filetype = detect_filetype(file_path) + file_type = detect_filetype(file=file) - magic_from_file_.assert_called_once_with(file_path, mime=True) - assert filetype == FileType.XLS + ctx_mime_type_.assert_called_with() + assert file_type is FileType.TXT -def test_detect_XLSX_from_application_octet_stream_file_no_extension(magic_from_buffer_: Mock): - magic_from_buffer_.return_value = "application/octet-stream" - with open(example_doc_path("stanley-cups.xlsx"), "rb") as f: +def test_detects_TXT_from_an_unknown_guessed_text_subtype(ctx_mime_type_: Mock): + ctx_mime_type_.return_value = "text/new-type" + with open(example_doc_path("fake-text.txt"), "rb") as f: file = io.BytesIO(f.read()) filetype = detect_filetype(file=file) - magic_from_buffer_.assert_called_once_with(file.getvalue()[:4096], mime=True) - assert filetype == FileType.XLSX + ctx_mime_type_.assert_called_with() + assert filetype == FileType.TXT -def test_detect_XLSX_from_application_octet_stream_file_path(magic_from_file_: Mock): - magic_from_file_.return_value = "application/octet-stream" - file_path = example_doc_path("stanley-cups.xlsx") +def test_detect_filetype_raises_with_neither_path_or_file_like_object_specified(): + with pytest.raises(ValueError, match="either `file_path` or `file` argument must be provided"): + detect_filetype() - filetype = detect_filetype(file_path) - magic_from_file_.assert_called_once_with(file_path, mime=True) - assert filetype == FileType.XLSX +def test_it_detects_EMPTY_from_file_path_to_empty_file(): + assert detect_filetype(example_doc_path("empty.txt")) == FileType.EMPTY -def test_detect_PPTX_from_application_octet_stream_file_no_extension(magic_from_buffer_: Mock): - magic_from_buffer_.return_value = "application/octet-stream" - with open(example_doc_path("fake-power-point.pptx"), "rb") as f: - file = io.BytesIO(f.read()) - - 
filetype = detect_filetype(file=file) +def test_it_detects_EMPTY_from_empty_file_like_object(): + with open(example_doc_path("empty.txt"), "rb") as f: + assert detect_filetype(file=f) == FileType.EMPTY - magic_from_buffer_.assert_called_once_with(file.getvalue()[:4096], mime=True) - assert filetype == FileType.PPTX +def test_it_detect_CSV_from_path_and_file_when_content_contains_escaped_commas(): + file_path = example_doc_path("csv-with-escaped-commas.csv") -def test_detect_PPTX_from_application_octet_stream_file_path(magic_from_file_: Mock): - magic_from_file_.return_value = "application/octet-stream" - file_path = example_doc_path("fake-power-point.pptx") + assert detect_filetype(file_path) == FileType.CSV + with open(file_path, "rb") as f: + assert detect_filetype(file=f) == FileType.CSV - filetype = detect_filetype(file_path) - magic_from_file_.assert_called_once_with(file_path, mime=True) - assert filetype == FileType.PPTX +# ================================================================================================ +# MODULE-LEVEL FIXTURES +# ================================================================================================ -def test_detect_UNK_from_application_octet_stream_text_file_no_extension(magic_from_buffer_: Mock): - magic_from_buffer_.return_value = "application/octet-stream" - with open(example_doc_path("fake-text.txt"), "rb") as f: - file = io.BytesIO(f.read()) +@pytest.fixture() +def LIBMAGIC_AVAILABLE_False(): + with patch("unstructured.file_utils.filetype.LIBMAGIC_AVAILABLE", False) as m: + yield m - filetype = detect_filetype(file=file) - assert magic_from_buffer_.call_args_list == [ - call(file.getvalue()[:4096], mime=True), - call(b"", mime=True), - ] - assert filetype == FileType.UNK +@pytest.fixture() +def ctx_mime_type_(request: FixtureRequest): + return property_mock(request, _FileTypeDetectionContext, "mime_type") -def test_detect_ZIP_from_application_zip_not_a_zip_file(magic_from_buffer_: Mock): - 
magic_from_buffer_.return_value = "application/zip" +# ================================================================================================ +# UNIT-TESTS +# ================================================================================================ - with open(example_doc_path("fake-text.txt"), "rb") as f: - head = f.read(4096) - f.seek(0) - filetype = detect_filetype(file=f) - assert magic_from_buffer_.call_args_list == [ - call(head, mime=True), - call(b"", mime=True), - ] - assert filetype == FileType.ZIP +class Describe_FileTypeDetectionContext: + """Unit-test suite for `unstructured.file_utils.filetype._FileTypeDetectionContext`.""" + + # -- .new() ------------------------------------------------- + + def it_provides_a_validating_alternate_constructor(self): + ctx = _FileTypeDetectionContext.new( + file_path=example_doc_path("simple.docx"), + file=None, + encoding="utf-8", + content_type="text/plain", + metadata_file_path="a/b/foo.bar", + ) + assert isinstance(ctx, _FileTypeDetectionContext) + + def and_the_validating_constructor_raises_on_an_invalid_context(self): + with pytest.raises(ValueError, match="either `file_path` or `file` argument must be pro"): + _FileTypeDetectionContext.new( + file_path=None, + file=None, + encoding=None, + content_type=None, + metadata_file_path=None, + ) + + # -- .content_type ------------------------------------------ + + def it_knows_the_content_type_asserted_by_the_caller(self): + assert _FileTypeDetectionContext(content_type="TEXT/hTmL").content_type == "text/html" + + # -- .encoding ---------------------------------------------- + + @pytest.mark.parametrize( + ("encoding", "expected_value"), + [ + ("utf-8", "utf-8"), + ("UTF_8", "utf-8"), + ("UTF_16LE", "utf-16le"), + ("ISO_8859_6_I", "iso-8859-6"), + # -- default value is utf-8 -- + (None, "utf-8"), + ], + ) + def it_knows_the_encoding_asserted_by_the_caller_and_normalizes_it( + self, encoding: str | None, expected_value: str + ): + assert 
_FileTypeDetectionContext(encoding=encoding).encoding == expected_value + + # -- .extension --------------------------------------------- + + def it_derives_the_filename_extension_from_the_file_path_when_one_is_provided(self): + ctx = _FileTypeDetectionContext(file_path=example_doc_path("simple.docx")) + assert ctx.extension == ".docx" + + def and_it_derives_the_extension_from_a_file_opened_from_a_path(self): + with open(example_doc_path("picture.pptx"), "rb") as f: + assert _FileTypeDetectionContext(file=f).extension == ".pptx" + + @pytest.mark.parametrize( + "file_name", + [ + # -- case 1: file-like object has no `.name` attribute + None, + # -- case 2: file-like object has `.name` attribute but it's value is the empty string + "", + ], + ) + def and_it_derives_the_extension_from_metadata_file_path_when_file_object_has_no_name( + self, file_name: str | None + ): + with open(example_doc_path("ideas-page.html"), "rb") as f: + file = io.BytesIO(f.read()) + if file_name is not None: + file.name = file_name + + ctx = _FileTypeDetectionContext(file=file, metadata_file_path="a/b/c.html") + + assert ctx.extension == ".html" + + @pytest.mark.parametrize( + "file_name", + [ + # -- case 1: file-like object has no `.name` attribute + None, + # -- case 2: file-like object has `.name` attribute but it's value is the empty string + "", + ], + ) + def and_it_returns_the_empty_string_as_the_extension_when_there_are_no_file_name_sources( + self, file_name: str | None + ): + with open(example_doc_path("ideas-page.html"), "rb") as f: + file = io.BytesIO(f.read()) + if file_name is not None: + file.name = file_name + + assert _FileTypeDetectionContext(file=file).extension == "" + + # -- .file_head --------------------------------------------- + + def it_grabs_the_first_4k_bytes_of_the_file_for_use_by_magic(self): + ctx = _FileTypeDetectionContext(file_path=example_doc_path("norwich-city.txt")) + + head = ctx.file_head + + assert isinstance(head, bytes) + assert len(head) == 4096 + 
assert head.startswith(b"Iwan Roberts\nRoberts celebrating after") + + # -- .file_path --------------------------------------------- + + @pytest.mark.parametrize("file_path", [None, "a/b/c.pdf"]) + def it_knows_the_file_path_provided_by_the_caller(self, file_path: str | None): + assert _FileTypeDetectionContext(file_path=file_path).file_path == file_path + + # -- .has_code_mime_type ------------------------------------ + + @pytest.mark.parametrize( + ("mime_type", "expected_value"), + [ + ("text/plain", False), + ("text/x-csharp", True), + ("text/x-go", True), + ("text/x-java", True), + ("text/x-python", True), + ("application/xml", False), + (None, False), + ], + ) + def it_knows_whether_its_mime_type_indicates_programming_language_source_code( + self, mime_type_prop_: Mock, mime_type: str | None, expected_value: bool + ): + mime_type_prop_.return_value = mime_type + assert _FileTypeDetectionContext().has_code_mime_type is expected_value + + # -- .is_zipfile -------------------------------------------- + + @pytest.mark.parametrize( + ("file_name", "expected_value"), + [ + ("README.md", False), + ("emoji.xlsx", True), + ("simple.doc", False), + ("simple.docx", True), + ("simple.odt", True), + ("simple.zip", True), + ("winter-sports.epub", True), + ], + ) + def it_knows_whether_it_is_a_zipfile(self, file_name: str, expected_value: bool): + assert _FileTypeDetectionContext(example_doc_path(file_name)).is_zipfile is expected_value + # -- .mime_type --------------------------------------------- -def test_detect_DOCX_from_docx_mime_type_file_no_extension(magic_from_buffer_: Mock): - magic_from_buffer_.return_value = ( - "application/vnd.openxmlformats-officedocument.wordprocessingml.document" - ) - with open(example_doc_path("simple.docx"), "rb") as f: - file = io.BytesIO(f.read()) + def it_provides_the_MIME_type_detected_by_libmagic_from_a_file_path(self): + ctx = _FileTypeDetectionContext(file_path=example_doc_path("norwich-city.txt")) + assert ctx.mime_type == 
"text/plain" - filetype = detect_filetype(file=file) + def and_it_provides_the_MIME_type_from_path_using_filetype_lib_when_magic_is_unavailable(self): + with patch("unstructured.file_utils.filetype.LIBMAGIC_AVAILABLE", False): + ctx = _FileTypeDetectionContext(file_path=example_doc_path("simple.doc")) + assert ctx.mime_type == "application/msword" - magic_from_buffer_.assert_called_once_with(file.getvalue()[:4096], mime=True) - assert filetype == FileType.DOCX + def but_it_warns_to_install_libmagic_when_the_filetype_lib_cannot_detect_the_MIME_type( + self, caplog: LogCaptureFixture + ): + with patch("unstructured.file_utils.filetype.LIBMAGIC_AVAILABLE", False): + ctx = _FileTypeDetectionContext(file_path=example_doc_path("norwich-city.txt")) + assert ctx.mime_type is None + assert "WARNING" in caplog.text + assert "libmagic is unavailable" in caplog.text + assert "consider installing libmagic" in caplog.text + def it_provides_the_MIME_type_detected_by_libmagic_from_a_file_like_object(self): + with open(example_doc_path("norwich-city.txt"), "rb") as f: + ctx = _FileTypeDetectionContext(file=f) + assert ctx.mime_type == "text/plain" -def test_detect_XLSX_from_xlsx_mime_type_file_no_extension(magic_from_buffer_: Mock): - magic_from_buffer_.return_value = ( - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" - ) - with open(example_doc_path("stanley-cups.xlsx"), "rb") as f: - file = io.BytesIO(f.read()) + def and_it_provides_the_MIME_type_from_file_using_filetype_lib_when_magic_is_unavailable(self): + with patch("unstructured.file_utils.filetype.LIBMAGIC_AVAILABLE", False): + file_path = example_doc_path("simple.doc") + with open(file_path, "rb") as f: + ctx = _FileTypeDetectionContext(file=f) + assert ctx.mime_type == "application/msword" - filetype = detect_filetype(file=file) + # -- .open() ------------------------------------------------ - magic_from_buffer_.assert_called_once_with(file.getvalue()[:4096], mime=True) - assert filetype == 
FileType.XLSX + def it_provides_transparent_access_to_the_source_file_when_it_is_a_file_like_object(self): + with open(example_doc_path("norwich-city.txt"), "rb") as f: + ctx = _FileTypeDetectionContext(file=f) + with ctx.open() as file: + assert file is f + assert file.read(38) == b"Iwan Roberts\nRoberts celebrating after" + def it_provides_transparent_access_to_the_source_file_when_it_is_a_file_path(self): + ctx = _FileTypeDetectionContext(file_path=example_doc_path("norwich-city.txt")) + with ctx.open() as file: + assert file.read(38) == b"Iwan Roberts\nRoberts celebrating after" -def test_detect_UNK_from_extension_of_non_existent_file_path(): - assert detect_filetype(example_doc_path("made_up.fake")) == FileType.UNK + # -- .text_head --------------------------------------------- + def it_grabs_the_first_4k_chars_from_file_path_for_textual_type_differentiation(self): + ctx = _FileTypeDetectionContext(file_path=example_doc_path("norwich-city.txt")) -def test_detect_PNG_from_extension_of_non_existent_file_path(): - assert detect_filetype(example_doc_path("made_up.png")) == FileType.PNG + text_head = ctx.text_head + assert isinstance(text_head, str) + assert len(text_head) == 4096 + assert text_head.startswith("Iwan Roberts\nRoberts celebrating after") -def test_detect_TXT_from_unknown_text_subtype_file_no_extension(magic_from_buffer_: Mock): - magic_from_buffer_.return_value = "text/new-type" - with open(example_doc_path("fake-text.txt"), "rb") as f: - file = io.BytesIO(f.read()) + def and_it_uses_character_detection_to_correct_a_wrong_encoding_arg_for_file_path(self): + ctx = _FileTypeDetectionContext( + file_path=example_doc_path("norwich-city.txt"), encoding="utf_32_be" + ) - filetype = detect_filetype(file=file) + text_head = ctx.text_head - magic_from_buffer_.assert_called_once_with(file.getvalue()[:4096], mime=True) - assert filetype == FileType.TXT + assert isinstance(text_head, str) + assert len(text_head) == 4096 + assert text_head.startswith("Iwan 
Roberts\nRoberts celebrating after") + def but_not_to_correct_a_wrong_encoding_arg_for_a_file_like_object_open_in_binary_mode(self): + """Fails silently in this case, returning empty string.""" + with open(example_doc_path("norwich-city.txt"), "rb") as f: + file = io.BytesIO(f.read()) + ctx = _FileTypeDetectionContext(file=file, encoding="utf_32_be") -def test_detect_BMP_from_file_path(): - assert detect_filetype(example_doc_path("bmp_24.bmp")) == FileType.BMP + text_head = ctx.text_head + assert text_head == "" -def test_detect_BMP_from_file_no_extension(): - with open(example_doc_path("img/bmp_24.bmp"), "rb") as f: - file = io.BytesIO(f.read()) - assert detect_filetype(file=file) == FileType.BMP + def and_it_grabs_the_first_4k_chars_from_binary_file_for_textual_type_differentiation(self): + with open(example_doc_path("norwich-city.txt"), "rb") as f: + ctx = _FileTypeDetectionContext(file=f) + text_head = ctx.text_head -def test_detect_filetype_raises_when_both_path_and_file_like_object_are_specified(): - file_path = example_doc_path("fake-email.eml") - with open(example_doc_path(file_path), "rb") as f: - file = io.BytesIO(f.read()) + assert isinstance(text_head, str) + # -- some characters consume multiple bytes, so shorter than 4096 -- + assert len(text_head) == 4063 + assert text_head.startswith("Iwan Roberts\nRoberts celebrating after") - with pytest.raises(ValueError, match="Exactly one of filename and file must be specified."): - detect_filetype(filename=file_path, file=file) + def and_it_grabs_the_first_4k_chars_from_text_file_for_textual_type_differentiation(self): + """Not a documented behavior to accept IO[str], but support is implemented.""" + with open(example_doc_path("norwich-city.txt")) as f: + ctx = _FileTypeDetectionContext(file=f) # pyright: ignore[reportArgumentType] + text_head = ctx.text_head -def test_detect_filetype_raises_with_neither_path_or_file_like_object_specified(): - with pytest.raises(ValueError, match="Exactly one of filename and 
file must be specified."): - detect_filetype() + assert isinstance(text_head, str) + assert len(text_head) == 4096 + assert text_head.startswith("Iwan Roberts\nRoberts celebrating after") + def it_accommodates_a_utf_32_encoded_file_path(self): + ctx = _FileTypeDetectionContext(example_doc_path("fake-text-utf-32.txt")) -def test_FileType_is_ordererd_by_name(): - """FileType is a total order on name, e.g. FileType.A < FileType.B.""" - assert FileType.EML < FileType.HTML < FileType.XML + text_head = ctx.text_head + assert isinstance(text_head, str) + # -- test document is short -- + assert len(text_head) == 188 + assert text_head.startswith("This is a test document to use for unit tests.\n\n Doyle") -@pytest.mark.parametrize( - ("content", "expected_value"), - [ - (b"d\xe2\x80", False), # Invalid JSON - (b'[{"key": "value"}]', True), # Valid JSON - (b"", False), # Empty content - (b'"This is not a JSON"', False), # Serializable as JSON, but we want to treat it as txt - ], -) -def test_is_text_file_a_json_distinguishes_JSON_from_text(content: bytes, expected_value: bool): - with io.BytesIO(content) as f: - assert _is_text_file_a_json(file=f) == expected_value + # TODO: this fails because `.text_head` ignores decoding errors on a file open for binary + # reading. Probably better if it used chardet in that case as it does for a file-path. 
+ @pytest.mark.xfail(reason="WIP", raises=AssertionError, strict=True) + def and_it_accommodates_a_utf_32_encoded_file_like_object(self): + with open(example_doc_path("fake-text-utf-32.txt"), "rb") as f: + file = io.BytesIO(f.read()) + ctx = _FileTypeDetectionContext(file=file) + text_head = ctx.text_head -@pytest.mark.parametrize( - ("content", "expected_value"), - [ - (b"d\xe2\x80", False), # Invalid CSV - (b'[{"key": "value"}]', False), # Invalid CSV - (b"column1,column2,column3\nvalue1,value2,value3\n", True), # Valid CSV - (b"", False), # Empty content - ], -) -def test_is_text_file_a_csv_distinguishes_CSV_from_text(content: bytes, expected_value: bool): - with io.BytesIO(content) as f: - assert _is_text_file_a_csv(file=f) == expected_value + assert isinstance(text_head, str) + # -- test document is short -- + assert len(text_head) == 188 + assert text_head.startswith("This is a test document to use for unit tests.\n\n Doyle") + # -- .validate() -------------------------------------------- -def test_csv_and_json_checks_with_filename_accommodate_utf_32_encoded_file(): - file_path = example_doc_path("fake-text-utf-32.txt") - assert _is_text_file_a_csv(filename=file_path) is False - assert _is_text_file_a_json(filename=file_path) is False + def it_raises_when_no_file_exists_at_the_specified_file_path(self): + with pytest.raises(FileNotFoundError, match="no such file a/b/c.foo"): + _FileTypeDetectionContext(file_path="a/b/c.foo")._validate() + def it_raises_when_neither_file_path_nor_file_is_provided(self): + with pytest.raises(ValueError, match="either `file_path` or `file` argument must be pro"): + _FileTypeDetectionContext()._validate() -def test_csv_and_json_checks_with_file_accommodate_utf_32_encoded_content(): - with open(example_doc_path("fake-text-utf-32.txt"), "rb") as f: - file = io.BytesIO(f.read()) + # -- fixtures -------------------------------------------------------------------------------- - assert _is_text_file_a_csv(file=file) is False - 
file.seek(0) - assert _is_text_file_a_json(file=file) is False + @pytest.fixture + def mime_type_prop_(self, request: FixtureRequest): + return property_mock(request, _FileTypeDetectionContext, "mime_type") -def test_detect_EMPTY_from_file_path_to_empty_file(): - assert detect_filetype(example_doc_path("empty.txt")) == FileType.EMPTY +class Describe_TextFileDifferentiator: + """Unit-test suite for `unstructured.file_utils.filetype._TextFileDifferentiator`.""" + # -- .applies() --------------------------------------------- -def test_detect_EMPTY_from_file_that_is_empty(): - with open(example_doc_path("empty.txt"), "rb") as f: - assert detect_filetype(file=f) == FileType.EMPTY + def it_provides_a_qualifying_alternate_constructor_which_constructs_when_applicable(self): + """The constructor determines whether this differentiator is applicable. + It returns an instance only when differentiating a text file-type is required, which it can + judge from the context (`ctx`). + """ + ctx = _FileTypeDetectionContext(example_doc_path("norwich-city.txt")) -def test_detect_CSV_from_path_and_file_when_content_contains_escaped_commas(): - file_path = example_doc_path("csv-with-escaped-commas.csv") + differentiator = _TextFileDifferentiator.applies(ctx) - assert detect_filetype(filename=file_path) == FileType.CSV - with open(file_path, "rb") as f: - assert detect_filetype(file=f) == FileType.CSV + assert isinstance(differentiator, _TextFileDifferentiator) + def and_it_returns_None_when_text_differentiation_does_not_apply_to_the_detection_context(self): + ctx = _FileTypeDetectionContext(example_doc_path("simple.docx")) + assert _TextFileDifferentiator.applies(ctx) is None -def test_detect_filetype_from_octet_stream(): - with open(example_doc_path("emoji.xlsx"), "rb") as f: - assert _detect_filetype_from_octet_stream(file=f) == FileType.XLSX + # -- ._is_csv ----------------------------------------------- + @pytest.mark.parametrize( + ("content", "expected_value"), + [ + # -- no 
commas, too few lines -- + (b"d\xe2\x80", False), + (b'[{"key": "value"}]', False), + # -- at least a header and one data row, at least two columns -- + (b"column1,column2,column3\nvalue1,value2,value3\n", True), + # -- no content -- + (b"", False), + ], + ) + def it_distinguishes_a_CSV_file_from_other_text_files( + self, content: bytes, expected_value: bool + ): + ctx = _FileTypeDetectionContext(file=io.BytesIO(content)) + differentiator = _TextFileDifferentiator(ctx) -def test_detect_WAV_from_filename(): - assert detect_filetype(example_doc_path("CantinaBand3.wav")) == FileType.WAV + assert differentiator._is_csv is expected_value + # -- ._is_eml ----------------------------------------------- -def test_detect_wav_from_file(): - with open(example_doc_path("CantinaBand3.wav"), "rb") as f: - assert detect_filetype(file=f) == FileType.WAV + @pytest.mark.parametrize( + ("file_name", "expected_value"), [("fake-email.eml", True), ("norwich-city.txt", False)] + ) + def it_distinguishes_an_EML_file_from_other_text_files( + self, file_name: str, expected_value: bool + ): + ctx = _FileTypeDetectionContext(example_doc_path(file_name)) + assert _TextFileDifferentiator(ctx)._is_eml is expected_value + + # -- ._is_json ---------------------------------------------- + + @pytest.mark.parametrize( + ("content", "expected_value"), + [ + (b"d\xe2\x80", False), + (b'[{"key": "value"}]', True), + (b"", False), + # -- valid JSON, but not for our purposes -- + (b'"This is not a JSON"', False), + ], + ) + def it_distinguishes_a_JSON_file_from_other_text_files( + self, content: bytes, expected_value: bool + ): + ctx = _FileTypeDetectionContext(file=io.BytesIO(content)) + differentiator = _TextFileDifferentiator(ctx) + assert differentiator._is_json is expected_value -def test_detect_TXT_from_file_path_to_yaml(): - assert detect_filetype(example_doc_path("simple.yaml")) == FileType.TXT +class Describe_ZipFileDifferentiator: + """Unit-test suite for 
`unstructured.file_utils.filetype._ZipFileDifferentiator`.""" -def test_detect_TXT_from_yaml_file(magic_from_buffer_: Mock): - magic_from_buffer_.return_value = "text/yaml" + # -- .applies() --------------------------------------------- - with open(example_doc_path("simple.yaml"), "rb") as f: - head = f.read(4096) - f.seek(0) - file_type = detect_filetype(file=f) + def it_provides_a_qualifying_alternate_constructor_which_constructs_when_applicable(self): + """The constructor determines whether this differentiator is applicable. - magic_from_buffer_.assert_called_once_with(head, mime=True) - assert file_type == FileType.TXT + It returns an instance only when differentiating a zip file-type is required, which it can + judge from the mime-type provided by the context (`ctx`). + """ + ctx = _FileTypeDetectionContext(example_doc_path("simple.docx")) + differentiator = _ZipFileDifferentiator.applies(ctx, "application/zip") -# ================================================================================================ -# MODULE-LEVEL FIXTURES -# ================================================================================================ + assert isinstance(differentiator, _ZipFileDifferentiator) + def and_it_returns_None_when_zip_differentiation_does_not_apply_to_the_detection_context(self): + ctx = _FileTypeDetectionContext(example_doc_path("norwich-city.txt")) + assert _ZipFileDifferentiator.applies(ctx, "application/epub") is None -# -- `from_buffer()` and `from_file()` are not "methods" on `magic` per-se (`magic` is a module) -# -- but they behave like methods for mocking purposes. 
-@pytest.fixture() -def magic_from_buffer_(request: FixtureRequest): - return method_mock(request, magic, "from_buffer") + # -- .file_type --------------------------------------------- + @pytest.mark.parametrize( + ("file_name", "expected_value"), + [ + ("simple.docx", FileType.DOCX), + ("picture.pptx", FileType.PPTX), + ("vodafone.xlsx", FileType.XLSX), + ("simple.zip", FileType.ZIP), + ("README.org", None), + ], + ) + def it_distinguishes_the_file_type_of_applicable_zip_files( + self, file_name: str, expected_value: FileType | None + ): + ctx = _FileTypeDetectionContext(example_doc_path(file_name)) + differentiator = _ZipFileDifferentiator(ctx) -@pytest.fixture() -def magic_from_file_(request: FixtureRequest): - return method_mock(request, magic, "from_file") + assert differentiator.file_type is expected_value diff --git a/test_unstructured/file_utils/test_model.py b/test_unstructured/file_utils/test_model.py index 91d2b8bec1..98088ee75b 100644 --- a/test_unstructured/file_utils/test_model.py +++ b/test_unstructured/file_utils/test_model.py @@ -10,6 +10,14 @@ class DescribeFileType: """Unit-test suite for `unstructured.file_utils.model.Filetype`.""" + # -- .__lt__() ---------------------------------------------- + + def it_is_a_collection_ordered_by_name_and_can_be_sorted(self): + """FileType is a total order on name, e.g. 
FileType.A < FileType.B.""" + assert FileType.EML < FileType.HTML < FileType.XML + + # -- .from_extension() -------------------------------------- + @pytest.mark.parametrize( ("ext", "file_type"), [ @@ -23,10 +31,12 @@ class DescribeFileType: def it_can_recognize_a_file_type_from_an_extension(self, ext: str, file_type: FileType | None): assert FileType.from_extension(ext) is file_type - @pytest.mark.parametrize("ext", [".foobar", ".xyz", ".mdx", "", "."]) - def but_not_when_that_extension_is_empty_or_not_registered(self, ext: str): + @pytest.mark.parametrize("ext", [".foobar", ".xyz", ".mdx", "", ".", None]) + def but_not_when_that_extension_is_empty_or_None_or_not_registered(self, ext: str | None): assert FileType.from_extension(ext) is None + # -- .from_mime_type() -------------------------------------- + @pytest.mark.parametrize( ("mime_type", "file_type"), [ @@ -46,29 +56,13 @@ def it_can_recognize_a_file_type_from_a_mime_type( ): assert FileType.from_mime_type(mime_type) is file_type - @pytest.mark.parametrize("mime_type", ["text/css", "image/gif", "audio/mpeg", "foo/bar"]) - def but_not_when_that_mime_type_is_not_registered_by_a_file_type(self, mime_type: str): + @pytest.mark.parametrize("mime_type", ["text/css", "image/gif", "audio/mpeg", "foo/bar", None]) + def but_not_when_that_mime_type_is_not_registered_by_a_file_type_or_None( + self, mime_type: str | None + ): assert FileType.from_mime_type(mime_type) is None - @pytest.mark.parametrize( - ("file_type", "expected_value"), - [ - (FileType.BMP, ("unstructured_inference",)), - (FileType.CSV, ("pandas",)), - (FileType.DOC, ("docx",)), - (FileType.EMPTY, ()), - (FileType.HTML, ()), - (FileType.ODT, ("docx", "pypandoc")), - (FileType.PDF, ("pdf2image", "pdfminer", "PIL")), - (FileType.UNK, ()), - (FileType.WAV, ()), - (FileType.ZIP, ()), - ], - ) - def it_knows_which_importable_packages_its_partitioner_depends_on( - self, file_type: FileType, expected_value: tuple[str, ...] 
- ): - assert file_type.importable_package_dependencies == expected_value + # -- .extra_name -------------------------------------------- @pytest.mark.parametrize( ("file_type", "expected_value"), @@ -91,6 +85,30 @@ def and_it_knows_which_pip_extra_needs_to_be_installed_to_get_those_dependencies ): assert file_type.extra_name == expected_value + # -- .importable_package_dependencies ----------------------- + + @pytest.mark.parametrize( + ("file_type", "expected_value"), + [ + (FileType.BMP, ("unstructured_inference",)), + (FileType.CSV, ("pandas",)), + (FileType.DOC, ("docx",)), + (FileType.EMPTY, ()), + (FileType.HTML, ()), + (FileType.ODT, ("docx", "pypandoc")), + (FileType.PDF, ("pdf2image", "pdfminer", "PIL")), + (FileType.UNK, ()), + (FileType.WAV, ()), + (FileType.ZIP, ()), + ], + ) + def it_knows_which_importable_packages_its_partitioner_depends_on( + self, file_type: FileType, expected_value: tuple[str, ...] + ): + assert file_type.importable_package_dependencies == expected_value + + # -- .is_partitionable -------------------------------------- + @pytest.mark.parametrize( ("file_type", "expected_value"), [ @@ -112,6 +130,8 @@ def it_knows_whether_files_of_its_type_are_directly_partitionable( ): assert file_type.is_partitionable is expected_value + # -- .mime_type --------------------------------------------- + @pytest.mark.parametrize( ("file_type", "mime_type"), [ @@ -131,6 +151,8 @@ def it_knows_whether_files_of_its_type_are_directly_partitionable( def it_knows_its_canonical_MIME_type(self, file_type: FileType, mime_type: str): assert file_type.mime_type == mime_type + # -- .partitioner_function_name ----------------------------- + @pytest.mark.parametrize( ("file_type", "expected_value"), [ @@ -155,6 +177,8 @@ def but_it_raises_on_partitioner_function_name_access_when_the_file_type_is_not_ with pytest.raises(ValueError, match="`.partitioner_function_name` is undefined because "): file_type.partitioner_function_name + # -- .partitioner_module_qname 
------------------------------ + @pytest.mark.parametrize( ("file_type", "expected_value"), [ @@ -181,6 +205,8 @@ def but_it_raises_on_partitioner_module_qname_access_when_the_file_type_is_not_p with pytest.raises(ValueError, match="`.partitioner_module_qname` is undefined because "): file_type.partitioner_module_qname + # -- .partitioner_shortname --------------------------------- + @pytest.mark.parametrize( ("file_type", "expected_value"), [ diff --git a/test_unstructured/metrics/test_element_type.py b/test_unstructured/metrics/test_element_type.py index 54939af0bb..183efb8c69 100644 --- a/test_unstructured/metrics/test_element_type.py +++ b/test_unstructured/metrics/test_element_type.py @@ -1,6 +1,10 @@ +from __future__ import annotations + import pytest +from test_unstructured.unit_utils import example_doc_path from unstructured.metrics.element_type import ( + FrequencyDict, calculate_element_type_percent_match, get_element_type_frequency, ) @@ -14,10 +18,9 @@ ( "fake-email.txt", { - ("UncategorizedText", None): 6, + ("NarrativeText", None): 1, + ("Title", None): 1, ("ListItem", None): 2, - ("Title", None): 5, - ("NarrativeText", None): 2, }, ), ( @@ -34,8 +37,8 @@ ), ], ) -def test_get_element_type_frequency(filename, frequency): - elements = partition(filename=f"example-docs/{filename}") +def test_get_element_type_frequency(filename: str, frequency: dict[tuple[str, int | None], int]): + elements = partition(example_doc_path(filename)) elements_freq = get_element_type_frequency(elements_to_json(elements)) assert elements_freq == frequency @@ -46,11 +49,11 @@ def test_get_element_type_frequency(filename, frequency): ( "fake-email.txt", { - ("UncategorizedText", None): 14, + ("Title", None): 1, ("ListItem", None): 2, ("NarrativeText", None): 2, }, - (0.56, 0.56, 0.56), + (0.8, 0.8, 0.80), ), ( "sample-presentation.pptx", @@ -92,8 +95,10 @@ def test_get_element_type_frequency(filename, frequency): ), ], ) -def test_calculate_element_type_percent_match(filename, 
expected_frequency, percent_matched): - elements = partition(filename=f"example-docs/{filename}") +def test_calculate_element_type_percent_match( + filename: str, expected_frequency: FrequencyDict, percent_matched: tuple[float, float, float] +): + elements = partition(example_doc_path(filename)) elements_frequency = get_element_type_frequency(elements_to_json(elements)) assert ( round(calculate_element_type_percent_match(elements_frequency, expected_frequency), 2) diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py index a09d45f2d0..3e3d4c6b96 100644 --- a/test_unstructured/partition/test_auto.py +++ b/test_unstructured/partition/test_auto.py @@ -929,7 +929,11 @@ def test_auto_partition_raises_with_bad_type(request: FixtureRequest): partition(filename="made-up.fake", strategy=PartitionStrategy.HI_RES) detect_filetype_.assert_called_once_with( - content_type=None, encoding=None, file=None, file_filename=None, filename="made-up.fake" + file_path="made-up.fake", + file=None, + encoding=None, + content_type=None, + metadata_file_path=None, ) @@ -1305,7 +1309,7 @@ def test_auto_partition_that_requires_extras_raises_when_dependencies_are_not_in ) match = r"partition_pdf\(\) is not available because one or more dependencies are not installed" with pytest.raises(ImportError, match=match): - partition(example_doc_path("layout-parser-paper-fast.pdf")) + partition(example_doc_path("pdf/layout-parser-paper-fast.pdf")) dependency_exists_.assert_called_once_with("pdf2image") diff --git a/test_unstructured/partition/test_json.py b/test_unstructured/partition/test_json.py index 5b08a23e84..f5a5e0b56a 100644 --- a/test_unstructured/partition/test_json.py +++ b/test_unstructured/partition/test_json.py @@ -9,8 +9,8 @@ import pytest from pytest_mock import MockFixture +from test_unstructured.unit_utils import example_doc_path from unstructured.documents.elements import CompositeElement -from unstructured.file_utils.filetype import 
detect_filetype from unstructured.file_utils.model import FileType from unstructured.partition.email import partition_email from unstructured.partition.html import partition_html @@ -43,9 +43,9 @@ def test_it_chunks_elements_when_a_chunking_strategy_is_specified(): @pytest.mark.parametrize("filename", test_files) def test_partition_json_from_filename(filename: str): - path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename) + path = example_doc_path(filename) elements = [] - filetype = detect_filetype(filename=path) + filetype = FileType.from_extension(os.path.splitext(path)[1]) if filetype == FileType.TXT: elements = partition_text(filename=path) if filetype == FileType.HTML: @@ -72,9 +72,9 @@ def test_partition_json_from_filename(filename: str): @pytest.mark.parametrize("filename", test_files) def test_partition_json_from_filename_with_metadata_filename(filename: str): - path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename) + path = example_doc_path(filename) elements = [] - filetype = detect_filetype(filename=path) + filetype = FileType.from_extension(os.path.splitext(path)[1]) if filetype == FileType.TXT: elements = partition_text(filename=path) if filetype == FileType.HTML: @@ -97,9 +97,9 @@ def test_partition_json_from_filename_with_metadata_filename(filename: str): @pytest.mark.parametrize("filename", test_files) def test_partition_json_from_file(filename: str): - path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename) + path = example_doc_path(filename) elements = [] - filetype = detect_filetype(filename=path) + filetype = FileType.from_extension(os.path.splitext(path)[1]) if filetype == FileType.TXT: elements = partition_text(filename=path) if filetype == FileType.HTML: @@ -126,9 +126,9 @@ def test_partition_json_from_file(filename: str): @pytest.mark.parametrize("filename", test_files) def test_partition_json_from_file_with_metadata_filename(filename: str): - path = os.path.join(DIRECTORY, "..", "..", 
"example-docs", filename) + path = example_doc_path(filename) elements = [] - filetype = detect_filetype(filename=path) + filetype = FileType.from_extension(os.path.splitext(path)[1]) if filetype == FileType.TXT: elements = partition_text(filename=path) if filetype == FileType.HTML: @@ -150,9 +150,9 @@ def test_partition_json_from_file_with_metadata_filename(filename: str): @pytest.mark.parametrize("filename", test_files) def test_partition_json_from_text(filename: str): - path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename) + path = example_doc_path(filename) elements = [] - filetype = detect_filetype(filename=path) + filetype = FileType.from_extension(os.path.splitext(path)[1]) if filetype == FileType.TXT: elements = partition_text(filename=path) if filetype == FileType.HTML: @@ -192,9 +192,9 @@ def test_partition_json_works_with_empty_list(): def test_partition_json_raises_with_too_many_specified(): - path = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt") + path = example_doc_path("fake-text.txt") elements = [] - filetype = detect_filetype(filename=path) + filetype = FileType.from_extension(os.path.splitext(path)[1]) if filetype == FileType.TXT: elements = partition_text(filename=path) if filetype == FileType.HTML: @@ -225,9 +225,9 @@ def test_partition_json_raises_with_too_many_specified(): @pytest.mark.parametrize("filename", test_files) def test_partition_json_from_filename_exclude_metadata(filename: str): - path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename) + path = example_doc_path(filename) elements = [] - filetype = detect_filetype(filename=path) + filetype = FileType.from_extension(os.path.splitext(path)[1]) if filetype == FileType.TXT: elements = partition_text(filename=path) if filetype == FileType.HTML: @@ -249,9 +249,9 @@ def test_partition_json_from_filename_exclude_metadata(filename: str): @pytest.mark.parametrize("filename", test_files) def 
test_partition_json_from_file_exclude_metadata(filename: str): - path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename) + path = example_doc_path(filename) elements = [] - filetype = detect_filetype(filename=path) + filetype = FileType.from_extension(os.path.splitext(path)[1]) if filetype == FileType.TXT: elements = partition_text(filename=path) if filetype == FileType.HTML: @@ -274,9 +274,9 @@ def test_partition_json_from_file_exclude_metadata(filename: str): @pytest.mark.parametrize("filename", test_files) def test_partition_json_from_text_exclude_metadata(filename: str): - path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename) + path = example_doc_path(filename) elements = [] - filetype = detect_filetype(filename=path) + filetype = FileType.from_extension(os.path.splitext(path)[1]) if filetype == FileType.TXT: elements = partition_text(filename=path) if filetype == FileType.HTML: diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 648e215035..d9462c4cfd 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.15.1-dev1" # pragma: no cover +__version__ = "0.15.1-dev2" # pragma: no cover diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py index c11d81c6c2..5e930c366e 100644 --- a/unstructured/file_utils/filetype.py +++ b/unstructured/file_utils/filetype.py @@ -1,18 +1,48 @@ +"""Automatically detect file-type based on inspection of the file's contents. + +Auto-detection proceeds via a sequence of strategies. The first strategy to confidently determine a +file-type returns that value. A strategy that is not applicable, either because it lacks the input +required or fails to determine a file-type, returns `None` and execution continues with the next +strategy. + +`_FileTypeDetector` is the main object and implements the three strategies. 
+ +The three strategies are: + +- Use MIME-type asserted by caller in the `content_type` argument. +- Guess a MIME-type using libmagic, falling back to the `filetype` package when libmagic is + unavailable. +- Map filename-extension to a `FileType` member. + +A file that fails all three strategies is assigned the value `FileType.UNK`, for "unknown". + +`_FileTypeDetectionContext` encapsulates the various arguments received by `detect_filetype()` and +provides values derived from them. This object is immutable and can be passed to delegates of +`_FileTypeDetector` to provide whatever context they need on the current detection instance. + +`_FileTypeDetector` delegates to _differentiator_ objects like `_ZipFileDifferentiator` for +specialized discrimination and/or confirmation of ambiguous or frequently mis-identified +MIME-types. Additional differentiators are planned, one for `application/x-ole-storage` +(DOC, PPT, XLS, and MSG file-types) and perhaps others. +""" + from __future__ import annotations +import contextlib import functools import importlib.util import json import os import re import zipfile -from typing import IO, Callable, List, Optional +from typing import IO, Callable, Iterator, Optional +import filetype as ft from typing_extensions import ParamSpec from unstructured.documents.elements import Element from unstructured.file_utils.encoding import detect_file_encoding, format_encoding_str -from unstructured.file_utils.model import PLAIN_TEXT_EXTENSIONS, FileType +from unstructured.file_utils.model import FileType from unstructured.logger import logger from unstructured.nlp.patterns import EMAIL_HEAD_RE, LIST_OF_DICTS_PATTERN from unstructured.partition.common import ( @@ -21,320 +51,546 @@ remove_element_metadata, set_element_hierarchy, ) -from unstructured.utils import get_call_args_applying_defaults +from unstructured.utils import get_call_args_applying_defaults, lazyproperty LIBMAGIC_AVAILABLE = bool(importlib.util.find_spec("magic")) def 
detect_filetype( + file_path: str | None = None, + file: IO[bytes] | None = None, + encoding: str | None = None, + content_type: str | None = None, + metadata_file_path: Optional[str] = None, +) -> FileType: + """Determine file-type of specified file using libmagic and/or fallback methods. + + One of `file_path` or `file` must be specified. A `file_path` that does not + correspond to a file on the filesystem raises `FileNotFoundError`. + + Args: + content_type: MIME-type of document-source, when already known. Providing + a value for this argument disables auto-detection unless it does not map + to a FileType member or is ambiguous, in which case it is ignored. + encoding: Only used for textual file-types. When omitted, `utf-8` is + assumed. Should generally be omitted except to resolve a problem with + textual file-types like HTML. + metadata_file_path: Only used when `file` is provided and then only as a + source for a filename-extension that may be needed as a secondary + content-type indicator. Ignored when the document is specified using + `file_path`. + + Returns: + A member of the `FileType` enumeration, `FileType.UNK` when the file type + could not be determined or is not supported. + + Raises: + FileNotFoundError: when `file_path` is specified but does not correspond to a + file on the filesystem. + ValueError: when neither `file_path` nor `file` is + specified. + """ + ctx = _FileTypeDetectionContext.new( + file_path=file_path, + file=file, + encoding=encoding, + content_type=content_type, + metadata_file_path=metadata_file_path, + ) + return _FileTypeDetector.file_type(ctx) + + +def is_json_processable( filename: Optional[str] = None, - content_type: Optional[str] = None, file: Optional[IO[bytes]] = None, - file_filename: Optional[str] = None, + file_text: Optional[str] = None, encoding: Optional[str] = "utf-8", -) -> FileType: - """Use libmagic to determine a file's type. +) -> bool: + """True when file looks like a JSON array of objects.
- Helps determine which partition brick to use for a given file. A return value of None indicates - a non-supported file type. + Uses regex on a file prefix, so not entirely reliable but good enough if you already know the + file is JSON. """ - mime_type = None - exactly_one(filename=filename, file=file) + exactly_one(filename=filename, file=file, file_text=file_text) + if file_text is None: + file_text = _read_file_start_for_type_check( + file=file, + filename=filename, + encoding=encoding, + ) + return re.match(LIST_OF_DICTS_PATTERN, file_text) is not None + - # first check (content_type) - if content_type: - file_type = FileType.from_mime_type(content_type) - if file_type: +class _FileTypeDetector: + """Determines file type from a variety of possible inputs.""" + + def __init__(self, ctx: _FileTypeDetectionContext): + self._ctx = ctx + + @classmethod + def file_type(cls, ctx: _FileTypeDetectionContext) -> FileType: + """Detect file-type of document-source described by `ctx`.""" + return cls(ctx)._file_type + + @property + def _file_type(self) -> FileType: + """FileType member corresponding to this document source.""" + # -- strategy 1: use content-type asserted by caller -- + if file_type := self._file_type_from_content_type: return file_type - # second check (filename/file_name/file) - # continue if successfully define mime_type - if filename or file_filename: - _filename = filename or file_filename or "" - _, extension = os.path.splitext(_filename) - extension = extension.lower() - if os.path.isfile(_filename) and LIBMAGIC_AVAILABLE: - import magic + # -- strategy 2: guess MIME-type using libmagic and use that -- + if file_type := self._file_type_from_guessed_mime_type: + return file_type + + # -- strategy 3: use filename-extension, like ".docx" -> FileType.DOCX -- + if file_type := self._file_type_from_file_extension: + return file_type + + # -- strategy 4: give up and report FileType.UNK -- + return FileType.UNK + + # == STRATEGIES 
============================================================ + + @property + def _file_type_from_content_type(self) -> FileType | None: + """Map passed content-type argument to a file-type, subject to certain rules.""" + content_type = self._ctx.content_type + + # -- when no content-type was asserted by caller, this strategy is not applicable -- + if not content_type: + return None + + # -- otherwise we trust the passed `content_type` as long as `FileType` recognizes it -- + return FileType.from_mime_type(content_type) + + @property + def _file_type_from_guessed_mime_type(self) -> FileType | None: + """FileType based on auto-detection of MIME-type by libmagic. - mime_type = magic.from_file(_resolve_symlink(_filename), mime=True) - elif os.path.isfile(_filename): - import filetype as ft + In some cases refinements are necessary on the magic-derived MIME-types. This process + includes applying those rules, most of which are accumulated through practical experience. + """ + mime_type = self._ctx.mime_type + extension = self._ctx.extension - mime_type = ft.guess_mime(_filename) + # -- when libmagic is not installed, the `filetype` package is used instead. + # -- `filetype.guess()` returns `None` for file-types it does not support, which + # -- unfortunately includes all the textual file-types like CSV, EML, HTML, MD, RST, RTF, + # -- TSV, and TXT. When we have no guessed MIME-type, this strategy is not applicable. 
if mime_type is None: - return FileType.from_extension(extension) or FileType.UNK + return None - elif file is not None: - if hasattr(file, "name"): - _, extension = os.path.splitext(file.name) - else: - extension = "" - extension = extension.lower() - # NOTE(robinson) - the python-magic docs recommend reading at least the first 2048 bytes - # Increased to 4096 because otherwise .xlsx files get detected as a zip file - # ref: https://github.com/ahupp/python-magic#usage + # NOTE(Crag): older magic lib does not differentiate between xls and doc + if mime_type == "application/msword" and extension == ".xls": + return FileType.XLS + + if mime_type.endswith("xml"): + return FileType.HTML if extension in (".html", ".htm") else FileType.XML + + if differentiator := _TextFileDifferentiator.applies(self._ctx): + return differentiator.file_type + + # -- applicable to "application/octet-stream", "application/zip", and all Office 2007+ + # -- document MIME-types, i.e. those for DOCX, PPTX, and XLSX. Note however it does NOT + # -- apply to EPUB or ODT documents, even though those are also Zip archives. The zip and + # -- octet-stream MIME-types are fed in because they are ambiguous. The MS-Office types are + # -- differentiated because they are sometimes mistaken for each other, like DOCX mime-type + # -- is actually a PPTX file etc. + if differentiator := _ZipFileDifferentiator.applies(self._ctx, mime_type): + return differentiator.file_type + + # -- All source-code files (e.g. *.py, *.js) are classified as plain text for the moment -- + if self._ctx.has_code_mime_type: + return FileType.TXT + + if mime_type.endswith("empty"): + return FileType.EMPTY + + # -- if no more-specific rules apply, use the MIME-type -> FileType mapping when present -- + if file_type := FileType.from_mime_type(mime_type): + return file_type + + logger.warning( + f"The MIME type{f' of {self._ctx.file_path!r}' if self._ctx.file_path else ''} is" + f" {mime_type!r}. 
This file type is not currently supported in unstructured.", + ) + return None + + @lazyproperty + def _file_type_from_file_extension(self) -> FileType | None: + """Determine file-type from filename extension. + + Returns `None` when no filename is available or when the extension does not map to a + supported file-type. + """ + return FileType.from_extension(self._ctx.extension) + + +class _FileTypeDetectionContext: + """Provides all arguments to auto-file detection and values derived from them. + + This keeps computation of derived values out of the file-detection code but more importantly + allows the main filetype-detector to pass the full context to any delegates without coupling + itself to which values it might need. + """ + + def __init__( + self, + file_path: str | None = None, + *, + file: IO[bytes] | None = None, + encoding: str | None = None, + content_type: str | None = None, + metadata_file_path: str | None = None, + ): + self._file_path = file_path + self._file_arg = file + self._encoding_arg = encoding + self._content_type = content_type + self._metadata_file_path = metadata_file_path + + @classmethod + def new( + cls, + *, + file_path: str | None, + file: IO[bytes] | None, + encoding: str | None, + content_type: str | None, + metadata_file_path: str | None, + ): + self = cls( + file_path=file_path, + file=file, + encoding=encoding, + content_type=content_type, + metadata_file_path=metadata_file_path, + ) + self._validate() + return self + + @lazyproperty + def content_type(self) -> str | None: + """MIME-type asserted by caller; not based on inspection of file by this process. + + Would commonly occur when the file was downloaded via HTTP and a `"Content-Type:` header was + present on the response. These are often ambiguous and sometimes just wrong so get some + further verification. All lower-case when not `None`. 
+ """ + return self._content_type.lower() if self._content_type else None + + @lazyproperty + def encoding(self) -> str: + """Character-set used to encode text of this file. + + Relevant for textual file-types only, like HTML, TXT, JSON, etc. + """ + return format_encoding_str(self._encoding_arg or "utf-8") + + @lazyproperty + def extension(self) -> str: + """Best filename-extension we can muster, "" when there is no available source.""" + # -- get from file_path, or file when it has a name (path) -- + with self.open() as file: + if hasattr(file, "name") and file.name: + return os.path.splitext(file.name)[1].lower() + + # -- otherwise use metadata file-path when provided -- + if file_path := self._metadata_file_path: + return os.path.splitext(file_path)[1].lower() + + # -- otherwise empty str means no extension, same as a path like "a/b/name-no-ext" -- + return "" + + @lazyproperty + def file_head(self) -> bytes: + """The initial bytes of the file to be recognized, for use with libmagic detection.""" + with self.open() as file: + return file.read(4096) + + @lazyproperty + def file_path(self) -> str | None: + """Filesystem path to file to be inspected, when provided on call. + + None when the caller specified the source as a file-like object instead. Useful for user + feedback on an error, but users of context should have little use for it otherwise. 
+ """ + return self._file_path + + @lazyproperty + def is_zipfile(self) -> bool: + """True when file is a Zip archive.""" + with self.open() as file: + return zipfile.is_zipfile(file) + + @lazyproperty + def has_code_mime_type(self) -> bool: + """True when `mime_type` plausibly indicates a programming language source-code file.""" + mime_type = self.mime_type + + if mime_type is None: + return False + + # -- check Go separately to avoid matching other MIME type containing "go" -- + if mime_type == "text/x-go": + return True + + return any( + lang in mime_type + for lang in "c# c++ cpp csharp java javascript php python ruby swift typescript".split() + ) + + @lazyproperty + def mime_type(self) -> str | None: + """The best MIME-type we can get from `magic` (or `filetype` package). + + A `str` return value is always in lower-case. + """ if LIBMAGIC_AVAILABLE: import magic - mime_type = magic.from_buffer(file.read(4096), mime=True) - else: - import filetype as ft + mime_type = ( + magic.from_file(_resolve_symlink(self._file_path), mime=True) + if self._file_path + else magic.from_buffer(self.file_head, mime=True) + ) + return mime_type.lower() if mime_type else None + + mime_type = ( + ft.guess_mime(self._file_path) if self._file_path else ft.guess_mime(self.file_head) + ) - mime_type = ft.guess_mime(file.read(4096)) if mime_type is None: logger.warning( - "libmagic is unavailable but assists in filetype detection on file-like objects. " - "Please consider installing libmagic for better results.", + "libmagic is unavailable but assists in filetype detection. Please consider" + " installing libmagic for better results." 
) - return FileType.from_extension(extension) or FileType.UNK + return None - else: - raise ValueError("No filename, file, nor file_filename were specified.") + return mime_type.lower() - """Mime type special cases.""" - # third check (mime_type) + @contextlib.contextmanager + def open(self) -> Iterator[IO[bytes]]: + """Encapsulates complexity of dealing with file-path or file-like-object. - # NOTE(Crag): older magic lib does not differentiate between xls and doc - if mime_type == "application/msword" and extension == ".xls": - return FileType.XLS + Provides an `IO[bytes]` object as the "common-denominator" document source. - elif mime_type.endswith("xml"): - if extension == ".html" or extension == ".htm": - return FileType.HTML + Must be used as a context manager using a `with` statement: + + with self.open() as file: + do things with file + + File is guaranteed to be at read position 0 when called. + """ + if self._file_path: + with open(self._file_path, "rb") as f: + yield f else: - return FileType.XML - - # -- ref: https://www.rfc-editor.org/rfc/rfc822 -- - elif mime_type == "message/rfc822" or mime_type.startswith("text"): - if not encoding: - encoding = "utf-8" - formatted_encoding = format_encoding_str(encoding) - - if extension in [ - ".eml", - ".p7s", - ".md", - ".rtf", - ".html", - ".rst", - ".org", - ".csv", - ".tsv", - ".json", - ]: + file = self._file_arg + assert file is not None # -- guaranteed by `._validate()` -- + file.seek(0) + yield file + + @lazyproperty + def text_head(self) -> str: + """The initial characters of the text file for use with text-format differentiation. + + Raises: + UnicodeDecodeError if file cannot be read as text. + """ + # TODO: only attempts fallback character-set detection for file-path case, not for + # file-like object case. Seems like we should do both.
+ + if file := self._file_arg: + file.seek(0) + content = file.read(4096) + file.seek(0) + return ( + content + if isinstance(content, str) + else content.decode(encoding=self.encoding, errors="ignore") + ) + + file_path = self._file_path + assert file_path is not None # -- guaranteed by `._validate` -- + + try: + with open(file_path, encoding=self.encoding) as f: + return f.read(4096) + except UnicodeDecodeError: + encoding, _ = detect_file_encoding(filename=file_path) + with open(file_path, encoding=encoding) as f: + return f.read(4096) + + def _validate(self) -> None: + """Raise if the context is invalid.""" + if self._file_path and not os.path.isfile(self._file_path): + raise FileNotFoundError(f"no such file {self._file_path}") + if not self._file_path and not self._file_arg: + raise ValueError("either `file_path` or `file` argument must be provided") + + +class _TextFileDifferentiator: + """Refine a textual file-type that may not be as specific as it could be.""" + + def __init__(self, ctx: _FileTypeDetectionContext): + self._ctx = ctx + + @classmethod + def applies(cls, ctx: _FileTypeDetectionContext) -> _TextFileDifferentiator | None: + """Constructs an instance, but only if this differentiator applies in `ctx`.""" + mime_type = ctx.mime_type + return ( + cls(ctx) + if mime_type and (mime_type == "message/rfc822" or mime_type.startswith("text")) + else None + ) + + @lazyproperty + def file_type(self) -> FileType: + """Differentiated file-type for textual content. + + Always produces a file-type, worst case that's `FileType.TXT` when nothing more specific + applies. + """ + extension = self._ctx.extension + + if extension in ".csv .eml .html .json .md .org .p7s .rst .rtf .tab .tsv".split(): return FileType.from_extension(extension) or FileType.TXT # NOTE(crag): for older versions of the OS libmagic package, such as is currently # installed on the Unstructured docker image, .json files resolve to "text/plain" # rather than "application/json". 
this corrects for that case. - if _is_text_file_a_json( - file=file, - filename=filename, - encoding=formatted_encoding, - ): + if self._is_json: return FileType.JSON - if _is_text_file_a_csv( - file=file, - filename=filename, - encoding=formatted_encoding, - ): + if self._is_csv: return FileType.CSV - if file and _check_eml_from_buffer(file=file) is True: + if self._is_eml: return FileType.EML - if extension in PLAIN_TEXT_EXTENSIONS: - return FileType.from_extension(extension) or FileType.UNK + if extension in (".text", ".txt"): + return FileType.TXT # Safety catch - if file_type := FileType.from_mime_type(mime_type): + if file_type := FileType.from_mime_type(self._ctx.mime_type): return file_type return FileType.TXT - elif mime_type == "application/octet-stream": - if extension == ".docx": - return FileType.DOCX - elif file: - return _detect_filetype_from_octet_stream(file=file) - else: - return FileType.from_extension(extension) or FileType.UNK - - elif mime_type == "application/zip": - file_type = FileType.UNK - if file: - file_type = _detect_filetype_from_octet_stream(file=file) - elif filename is not None: - with open(filename, "rb") as f: - file_type = _detect_filetype_from_octet_stream(file=f) - - extension = extension if extension else "" + @lazyproperty + def _is_csv(self) -> bool: + """True when file is plausibly in Comma Separated Values (CSV) format.""" + + def count_commas(text: str): + """Counts the number of commas in a line, excluding commas in quotes.""" + pattern = r"(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)," + matches = re.findall(pattern, text) + return len(matches) + + lines = self._ctx.text_head.strip().splitlines() + if len(lines) < 2: + return False + # -- check at most the first 10 lines -- + lines = lines[: len(lines)] if len(lines) < 10 else lines[:10] + # -- any lines without at least one comma disqualifies the file -- + if any("," not in line for line in lines): + return False + header_count = count_commas(lines[0]) + return 
all(count_commas(line) == header_count for line in lines[1:]) + + @lazyproperty + def _is_eml(self) -> bool: + """Checks if a text/plain file is actually a .eml file. + + Uses a regex pattern to see if the start of the file matches the typical pattern for a .eml + file. + """ + return EMAIL_HEAD_RE.match(self._ctx.text_head) is not None + + @lazyproperty + def _is_json(self) -> bool: + """True when file is JSON collection. + + A JSON file that contains only a string, number, or boolean, while valid JSON, will fail + this test since it is not partitionable. + """ + text_head = self._ctx.text_head + + # -- an empty file is not JSON -- + if not text_head: + return False + + # -- has to be a list or object, no string, number, or bool -- + if text_head.lstrip()[0] not in "[{": + return False + + try: + with self._ctx.open() as file: + json.load(file) + return True + except json.JSONDecodeError: + return False + + +class _ZipFileDifferentiator: + """Refine a Zip-packaged file-type that may be ambiguous or swapped.""" + + def __init__(self, ctx: _FileTypeDetectionContext): + self._ctx = ctx + + @classmethod + def applies( + cls, ctx: _FileTypeDetectionContext, mime_type: str + ) -> _ZipFileDifferentiator | None: + """Constructs an instance, but only if this differentiator applies for `mime_type`. + + Separate `mime_type` argument allows it to be applied to either asserted content-type or + guessed mime-type. + """ return ( - FileType.ZIP - if file_type in (FileType.UNK, FileType.ZIP) - else FileType.from_extension(extension) or file_type + cls(ctx) + if mime_type + in ( + "application/octet-stream", + "application/zip", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "application/vnd.openxmlformats-officedocument.presentationml.presentation", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + ) + else None ) - elif _is_code_mime_type(mime_type): - # NOTE(robinson) - we'll treat all code files as plain text for now. 
- # we can update this logic and add filetypes for specific languages - # later if needed. - return FileType.TXT + @lazyproperty + def file_type(self) -> FileType | None: + """Differentiated file-type for a Zip archive. - elif mime_type.endswith("empty"): - return FileType.EMPTY + Returns `None` if the file is not a Zip archive. Otherwise it returns `FileType.DOCX`, + `FileType.PPTX`, or `FileType.XLSX` when one of those applies and `FileType.ZIP` otherwise. + """ + if not self._ctx.is_zipfile: + return None - # For everything else - elif file_type := FileType.from_mime_type(mime_type): - return file_type + with self._ctx.open() as file: + zip = zipfile.ZipFile(file) - logger.warning( - f"The MIME type{f' of {filename!r}' if filename else ''} is {mime_type!r}. " - "This file type is not currently supported in unstructured.", - ) - return FileType.from_extension(extension) or FileType.UNK + # NOTE(robinson) - .docx and .xlsx files are actually a zip file with a .docx/.xlsx + # extension. If the MIME type is application/octet-stream, we check if it's a + # .docx/.xlsx file by looking for expected filenames within the zip file. + filenames = [f.filename for f in zip.filelist] + if all(f in filenames for f in ("word/document.xml",)): + return FileType.DOCX -def is_json_processable( - filename: Optional[str] = None, - file: Optional[IO[bytes]] = None, - file_text: Optional[str] = None, - encoding: Optional[str] = "utf-8", -) -> bool: - """True when file looks like a JSON array of objects. - - Uses regex on a file prefix, so not entirely reliable but good enough if you already know the - file is JSON.
- """ - exactly_one(filename=filename, file=file, file_text=file_text) - if file_text is None: - file_text = _read_file_start_for_type_check( - file=file, - filename=filename, - encoding=encoding, - ) - return re.match(LIST_OF_DICTS_PATTERN, file_text) is not None + if all(f in filenames for f in ("xl/workbook.xml",)): + return FileType.XLSX + if all(f in filenames for f in ("ppt/presentation.xml",)): + return FileType.PPTX -def _check_eml_from_buffer(file: IO[bytes] | IO[str]) -> bool: - """Checks if a text/plain file is actually a .eml file. - - Uses a regex pattern to see if the start of the file matches the typical pattern for a .eml - file. - """ - file.seek(0) - file_content = file.read(4096) - if isinstance(file_content, bytes): - file_head = file_content.decode("utf-8", errors="ignore") - else: - file_head = file_content - return EMAIL_HEAD_RE.match(file_head) is not None - - -def _detect_filetype_from_octet_stream(file: IO[bytes]) -> FileType: - """Detects the filetype, given a file with an application/octet-stream MIME type.""" - file.seek(0) - if zipfile.is_zipfile(file): - file.seek(0) - archive = zipfile.ZipFile(file) - - # NOTE(robinson) - .docx.xlsx files are actually zip file with a .docx/.xslx extension. - # If the MIME type is application/octet-stream, we check if it's a .docx/.xlsx file by - # looking for expected filenames within the zip file. 
- archive_filenames = [f.filename for f in archive.filelist] - if all(f in archive_filenames for f in ("docProps/core.xml", "word/document.xml")): - return FileType.DOCX - elif all(f in archive_filenames for f in ("xl/workbook.xml",)): - return FileType.XLSX - elif all(f in archive_filenames for f in ("docProps/core.xml", "ppt/presentation.xml")): - return FileType.PPTX - - if LIBMAGIC_AVAILABLE: - import magic - - # Infer mime type using magic if octet-stream is not zip file - mime_type = magic.from_buffer(file.read(4096), mime=True) - return FileType.from_mime_type(mime_type) or FileType.UNK - logger.warning( - "Could not detect the filetype from application/octet-stream MIME type.", - ) - return FileType.UNK - - -def _is_code_mime_type(mime_type: str) -> bool: - """True when `mime_type` plausibly indicates a programming language source-code file.""" - PROGRAMMING_LANGUAGES = [ - "javascript", - "python", - "java", - "c++", - "cpp", - "csharp", - "c#", - "php", - "ruby", - "swift", - "typescript", - ] - mime_type = mime_type.lower() - # NOTE(robinson) - check this one explicitly to avoid conflicts with other - # MIME types that contain "go" - if mime_type == "text/x-go": - return True - return any(language in mime_type for language in PROGRAMMING_LANGUAGES) - - -def _is_text_file_a_csv( - filename: Optional[str] = None, - file: Optional[IO[bytes]] = None, - encoding: Optional[str] = "utf-8", -): - """Detects if a file that has a text/plain MIME type is a CSV file.""" - - def count_commas(text: str): - """Counts the number of commas in a line, excluding commas in quotes.""" - pattern = r"(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)," - matches = re.findall(pattern, text) - return len(matches) - - file_text = _read_file_start_for_type_check( - file=file, - filename=filename, - encoding=encoding, - ) - lines = file_text.strip().splitlines() - if len(lines) < 2: - return False - lines = lines[: len(lines)] if len(lines) < 10 else lines[:10] - header_count = 
count_commas(lines[0]) - if any("," not in line for line in lines): - return False - return all(count_commas(line) == header_count for line in lines[1:]) - - -def _is_text_file_a_json( - filename: Optional[str] = None, - file: Optional[IO[bytes]] = None, - encoding: Optional[str] = "utf-8", -): - """Detects if a file that has a text/plain MIME type is a JSON file.""" - file_text = _read_file_start_for_type_check( - file=file, - filename=filename, - encoding=encoding, - ) - try: - output = json.loads(file_text) - # NOTE(robinson) - Per RFC 4627 which defines the application/json media type, - # a string is a valid JSON. For our purposes, however, we want to treat that - # as a text file even if it is serializable as json. - # References: - # https://stackoverflow.com/questions/7487869/is-this-simple-string-considered-valid-json - # https://www.ietf.org/rfc/rfc4627.txt - return not isinstance(output, str) - except json.JSONDecodeError: - return False + return FileType.ZIP def _read_file_start_for_type_check( @@ -379,9 +635,9 @@ def _resolve_symlink(file_path: str) -> str: _P = ParamSpec("_P") -def add_metadata(func: Callable[_P, List[Element]]) -> Callable[_P, List[Element]]: +def add_metadata(func: Callable[_P, list[Element]]) -> Callable[_P, list[Element]]: @functools.wraps(func) - def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> List[Element]: + def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> list[Element]: elements = func(*args, **kwargs) call_args = get_call_args_applying_defaults(func, *args, **kwargs) include_metadata = call_args.get("include_metadata", True) @@ -412,7 +668,7 @@ def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> List[Element]: def add_filetype( filetype: FileType, -) -> Callable[[Callable[_P, List[Element]]], Callable[_P, List[Element]]]: +) -> Callable[[Callable[_P, list[Element]]], Callable[_P, list[Element]]]: """Post-process element-metadata for list[Element] from partitioning. 
This decorator adds a post-processing step to a document partitioner. @@ -423,9 +679,9 @@ def add_filetype( """ - def decorator(func: Callable[_P, List[Element]]) -> Callable[_P, List[Element]]: + def decorator(func: Callable[_P, list[Element]]) -> Callable[_P, list[Element]]: @functools.wraps(func) - def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> List[Element]: + def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> list[Element]: elements = func(*args, **kwargs) params = get_call_args_applying_defaults(func, *args, **kwargs) include_metadata = params.get("include_metadata", True) @@ -447,10 +703,10 @@ def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> List[Element]: def add_metadata_with_filetype( filetype: FileType, -) -> Callable[[Callable[_P, List[Element]]], Callable[_P, List[Element]]]: +) -> Callable[[Callable[_P, list[Element]]], Callable[_P, list[Element]]]: """...""" - def decorator(func: Callable[_P, List[Element]]) -> Callable[_P, List[Element]]: + def decorator(func: Callable[_P, list[Element]]) -> Callable[_P, list[Element]]: return add_filetype(filetype=filetype)(add_metadata(func)) return decorator diff --git a/unstructured/file_utils/model.py b/unstructured/file_utils/model.py index 6c285a704a..0fe0caa63f 100644 --- a/unstructured/file_utils/model.py +++ b/unstructured/file_utils/model.py @@ -76,12 +76,14 @@ def from_extension(cls, extension: str | None) -> FileType | None: return None @classmethod - def from_mime_type(cls, mime_type: str) -> FileType | None: + def from_mime_type(cls, mime_type: str | None) -> FileType | None: """Select a FileType member based on a MIME-type. Returns `None` when `mime_type` is `None` or does not map to the canonical MIME-type of a `FileType` member or one of its alias MIME-types. """ + if mime_type is None: + return None # -- not super efficient but plenty fast enough for once-or-twice-per-file use and avoids # -- limitations on defining a class variable on an Enum. 
for m in cls.__members__.values(): @@ -434,6 +436,3 @@ def partitioner_shortname(self) -> str | None: "inode/x-empty", cast(list[str], []), ) - - -PLAIN_TEXT_EXTENSIONS = ".csv .eml .html .json .md .org .p7s .rst .rtf .tab .text .tsv .txt".split() diff --git a/unstructured/metrics/element_type.py b/unstructured/metrics/element_type.py index 3e4e8cbf86..6511900a4b 100644 --- a/unstructured/metrics/element_type.py +++ b/unstructured/metrics/element_type.py @@ -1,10 +1,23 @@ +from __future__ import annotations + import json -from typing import Dict, Optional, Tuple, Union + +from typing_extensions import TypeAlias + +FrequencyDict: TypeAlias = "dict[tuple[str, int | None], int]" +"""Like: + { + ("ListItem", 0): 2, + ("NarrativeText", None): 2, + ("Title", 0): 5, + ("UncategorizedText", None): 6, + } +""" def get_element_type_frequency( elements: str, -) -> Union[Dict[Tuple[str, Optional[int]], int], Dict]: +) -> FrequencyDict: """ Calculate the frequency of Element Types from a list of elements. @@ -13,7 +26,7 @@ def get_element_type_frequency( Returns: Element type and its frequency in dictionary format. """ - frequency: Dict = {} + frequency: dict[tuple[str, int | None], int] = {} if len(elements) == 0: return frequency for element in json.loads(elements): @@ -28,14 +41,14 @@ def get_element_type_frequency( def calculate_element_type_percent_match( - output: Dict, - source: Dict, + output: FrequencyDict, + source: FrequencyDict, category_depth_weight: float = 0.5, ) -> float: - """ - Calculate the percent match between two frequency dictionary. Intended to use with - `get_element_type_frequency` function. The function counts the absolute exact match - (type and depth), and counts the weighted match (correct type but different depth), + """Calculate the percent match between two frequency dictionary. + + Intended to use with `get_element_type_frequency` function. 
The function counts the absolute + exact match (type and depth), and counts the weighted match (correct type but different depth), then normalized with source's total elements. """ if len(output) == 0 or len(source) == 0: @@ -46,8 +59,8 @@ def calculate_element_type_percent_match( total_source_element_count = 0 total_match_element_count = 0 - unmatched_depth_output = {} - unmatched_depth_source = {} + unmatched_depth_output: dict[str, int] = {} + unmatched_depth_source: dict[str, int] = {} # loop through the output list to find match with source for k, _ in output_copy.items(): @@ -80,12 +93,12 @@ def calculate_element_type_percent_match( return min(max(total_match_element_count / total_source_element_count, 0.0), 1.0) -def _convert_to_frequency_without_depth(d: Dict) -> Dict: +def _convert_to_frequency_without_depth(d: FrequencyDict) -> dict[str, int]: """ Takes in element frequency with depth of format (type, depth): value and converts to dictionary without depth of format type: value """ - res = {} + res: dict[str, int] = {} for k, v in d.items(): element_type = k[0] if element_type not in res: diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py index b7cad8055e..21c15d2f44 100644 --- a/unstructured/partition/auto.py +++ b/unstructured/partition/auto.py @@ -184,11 +184,11 @@ def partition( "The headers kwarg will be ignored.", ) file_type = detect_filetype( - filename=filename, + file_path=filename, file=file, - file_filename=metadata_filename, - content_type=content_type, encoding=encoding, + content_type=content_type, + metadata_file_path=metadata_filename, ) if file is not None: @@ -471,12 +471,13 @@ def file_and_type_from_url( response = requests.get(url, headers=headers, verify=ssl_verify, timeout=request_timeout) file = io.BytesIO(response.content) - content_type = ( - content_type or response.headers.get("Content-Type", "").split(";")[0].strip().lower() - ) - encoding = response.headers.get("Content-Encoding", "utf-8") + if 
content_type := content_type or response.headers.get("Content-Type", None): + content_type = content_type.split(";")[0].strip().lower() + + # -- non-None when response is textual -- + encoding = response.encoding - filetype = detect_filetype(file=file, content_type=content_type, encoding=encoding) + filetype = detect_filetype(file=file, encoding=encoding, content_type=content_type) return file, filetype diff --git a/unstructured/staging/base.py b/unstructured/staging/base.py index 331c860a9f..429195f681 100644 --- a/unstructured/staging/base.py +++ b/unstructured/staging/base.py @@ -133,10 +133,12 @@ def elements_to_json( filename: Optional[str] = None, indent: int = 4, encoding: str = "utf-8", -) -> Optional[str]: - """Saves a list of elements to a JSON file if filename is specified. +) -> str: + """Serialize `elements` to a JSON array. + + Also writes the JSON to `filename` if it is provided, encoded using `encoding`. - Otherwise, return the list of elements as a string. + The JSON is returned as a string. """ # -- serialize `elements` as a JSON array (str) -- precision_adjusted_elements = _fix_metadata_field_precision(elements) @@ -146,7 +148,6 @@ def elements_to_json( if filename is not None: with open(filename, "w", encoding=encoding) as f: f.write(json_str) - return None return json_str