From e0db77fbf7bb5f9bd9c12ccdbcea558980156607 Mon Sep 17 00:00:00 2001 From: Elias Dabbas Date: Tue, 16 Jul 2024 01:11:52 +0300 Subject: [PATCH] Update docs for v0.15.0 --- .../doctrees/advertools.crawlytics.doctree | Bin 200452 -> 222171 bytes docs/_build/doctrees/environment.pickle | Bin 935144 -> 943545 bytes .../html/_modules/advertools/crawlytics.html | 79 +++++++-- docs/_build/html/advertools.crawlytics.html | 155 ++++++++++++++++-- docs/_build/html/genindex.html | 6 +- docs/_build/html/objects.inv | Bin 2657 -> 2666 bytes docs/_build/html/searchindex.js | 2 +- 7 files changed, 205 insertions(+), 37 deletions(-) diff --git a/docs/_build/doctrees/advertools.crawlytics.doctree b/docs/_build/doctrees/advertools.crawlytics.doctree index 425bf907df970896747a2c8db48de1f90aaa844f..54d8bd7c3d04d8eebf1358f3e6502e24d908b9c3 100644 GIT binary patch delta 13385 zcmeHOdw5gFmFHYpG8o%D3=V`xu4QcO)RrY12sW5Uff5pMY+knU^IDdXge(bN31bps zI|WLpDGwQjF9DK#o6sfg7f4g0FLV=3+ca&rTObXaWH%7_(oHw(ghxZercK&Ca~~sJ zUmNW0w)tb>4|M08Ilnn)=FFMjoeNtIC;s~zNypEO26OoA=v4gsV9KI!PD+K^e(7M! zmOzr(+ZuE=J4B1!*K7{wZd;|5ytHduTPtuUM!=OqI7I$j2-6bzCYQ(I7lO`Wa~_3` zH|Hc+EEb+r7C~k@A8@R7Y%I3>JfUW9oz#%k*#eQ;|!H;Fkbz5d0k%`N?7c4*ogv`VzA?(N2t4+LDK#R_mV@EILNUE9HMm%)a0GftXAL}yKek$Qt%sbgg||!}{v~PeH@F%YYeYzR> zbqnkP_#@o|vnSBHQ)dqdEgp+s2&@k|g68ZhbFkI#Fq6pbIwWs`3iv|a8b99zPk`|+ z=-{tO$9^ns*K9+HWizhB7Z1Z^a^ewq0RHmwW=H`O0QO2ILI+OQf=6*dCBYO~@WG?t z1|TJEaGqR!42)1kuXV zk&^ksPDqawb%F?znEYTT?3FthbEOW!@4ykVYcFEovkQ(&Pf$eoU9c=X(mRe6?8c!b zb>nV07e9=S<(Dl_?M3MAd$5?nFtU3u?3cBNnASw9!#D*AkuSqA9pV$H?VGYTrwh_0 z{6qU7N7j5KX(FHRgL;6AWDSAsFoJwQppqK_WYlZi%*ciVP?Rys>tDzpul7nU7Ik^tgpf9|CMFLbPDowN5&2+fq(un9yEJCh6c=J4`KDi&9nLjjLDhc`DnfxYq#ZL?eSc|DD0l2^q|y=||^hAOTEaDN=GeZno7)BOk?4|#g*ko%Bw-hFl?VB)t5^A{}S-RyS&u-%m_bL5G_|Udt zk!QZey|PlTcfZFy-=}nWWWxpS5%aKp$L-Sc4+&pBJ#JruA(ia^1DAE8%y2%AoDjL$ z$@zhh*X#1Gl}jTzegURpxh=a0sYx15*vj3-b-GAdkgL*a$nhYTJ#wU`x0pqqc5q|! 
zleJQnko6XC?ds^-+=c1+S~)p3pDRo}S&kW-?KC*=?R&`f_aQZo%gXK|&s>9yF>5W@ zeS$k`<^EMU*?kGzTz5Hn;S4u{J6TR_W!yM}`-kP^#napps<55q?oRp@9VzEQS`{MY zq~a_$)tSB65#$>&=UpC=_cihMP$1y&20g935)Ci}<9wbNC? ze3H_)U%!i^eh(IIBug(bFV1LoG|Q9icLeO1MZ#JK-GC_e1zP3Q zk+Mt)0Q1f6^NOKn+#I^uUm1;JV8Byo{`QH?;#sYYXDG(Ay#IKLGVK0PU-2v`T{XyH z)Ecu}GS=kqpB~T>VF~5#QCD1+TIbl!hEa3vPfACLG=QQj(*X&12t% zAt9QBSt2bCF%^Nrtn>O>c%cCama?dAtUz>rC7!fJT}k~aJQ%G^23x%?+SN3xZJ`xxpPa-UdHkBFeWPfJJvu z8(ktsccB01=14v|8tag(igI|$fz5kE%?-E(u!4A8UWdrLynKCq%w*KpTN32cGn!z< zgM_2Gp}q$oo&=YLcTTy3!ikp5mvCOi26t}qkRj@+*g2w?*XoHs9LFA`4jzU0QAIe- zoJOw9hM{*+xKIQ712JX!rHb&y_Kaja=OpfvgcE$M&k5#FLK-P~9>%69UFm_T(gCum zoSVkIFAw(gdTyvw)j01}p-1A_(YYO8?!>nxq&9L|VVO_}72`0s=)uzC6K#5ol3bE* z#b92zZFCy>ul3xB@mk`EOi`f~(Jm8pUDyzRsvsQ~xZD(dKx3pk04}jIeD+&~^^eq(jLo@XX4dmKW}EN2h8Eu#!i(zeOws}gGX;PzI|S4rQGIZtI+u%%DH zb6btWK%bezTPiiY<}L*9V3ya%rKci8z5$wqF}0|+WWc+|JkGxBXk(`-ey zX)3}{CSVv3>oGPmjI@UId#u(Xn@t5U$}MKSS8u&bwI;Ud+?2a5%+_GTXJ&a+Z+Rrv zGGly!I;g>BEr=R+>kVI34a3*RrL|Q&j&~_}K7TMT=y5yqkoE-f8eHBye{0a`^YVRY zL|kpZ^xmit%U*pvt_H=rz+$uJiLRg{*Du)DA@LRSgvJeyK+xy&hmE^m7W;3H0?4Ps2uEv*7ZI#W5=R)e$Yd>%`)ffsljtcWktK zLX8f3de-5+sL(7Hm-xJr3f~trsLS*u0TzLPWs3U zja8aTOlu%lB~{}4*MzTQ@mxe<@ZNDQ7f%al*?!0zK8zm4A7a;GmEm*$mRX}C03{ki zGiH8M5DBKhH#u7PW?wVDvWkq$^sQw6L5C`yB$2V-wN>8z}cg5j=w zp-QGSn0gvY6V@80qv3AZaA);^D9uqlz!4Q-pp>RtkMSzQP$*6QeQGVElm=TYo8H!2 zzol9$lqO%bMM{$&wY;LY>`^VHTq>MlVZ+8r4K}K=Ml<7d;CDz66f%U{Fr5{8+fuD<|4hfNw_VF@2gGzb^azi} zAPk1-9MD7Ui-j5#(>bOG`+gJ*cj~7#GSk7-L#88e?`REjF%8bk25;65h~Zq(1N5i> z17$eII!!K8C%8H(7vHQ6OANn>!5W&b`f`?`;JAo~l*%)PNYyMm<1?0Z8|^)7j;*!G_Ol zJMYq4cB+=r#m$*o0#&e#JErYu!CLiwx#Q8gXu+IOs8$^5mK&RFq}TNTKUM))!OSnR zG6dub!Ke(|wdqO(MwnPoz~rcl%1;GLG~oTn6|#Iw`2Y@0agFCSKPO{ z;cn~mV*IbU;zs9tU2*GgQf03Tti<2dtCzK zxBe-ldoDLLDWd#)Kj||*LHSryRww6g?IPFaa+6aJ)(;ut9x?;}Iz*p9ej*p+40E}c z%Om#~4d*!SY<=YS@rDl!@keN%f2b$#4L4MBN9xIaiH2z$wwn_TJGoSWe3584F%PMSyv2q+7#uDC1_6P6FMn^3nCv%4q+he&-X2j=94fn(dlnbiODuVX1+qV(GWBNi zwy$#aa-hC9+DD_7ejkkyy%sKZ+TAvfBS{{rW|m}gh{a+dBw{eF9fLk0@wE<9cCDlD 
zrvDf~;}S6q?^a`#x&z}VopH~#?b8jd04o{0OY>oL5DM)HlrSz{AxL9}Kv z5-BhNVf4XxyoC43HUXz8X99vS>;};{5smmjd2$lWzyT_o3@;jJ|Ko^MLZ;v?;3I{N zdjx!@oS24K;E?j$bdZ3~qD7GGYw zvNq0Pq~;Xtr-Lc@x&J9{&l+nmIN}{5yEI>O*_2GJeZ>em{}jR@6H*Z&$nn%K716l0 zRGf+&oS|K9u+g$PNK^{vAP>@6F7QIEKRG6za?>z~EM>Tl#;5ZJ&nm8TL_qqUJ-&*i z&YyXF)8}F!Sv%36+B0C*q>k%Sp$4SEy6?HUcopZ!mx=AXn6t3Mc#c!0kTnZd`XdVg z{6K+hydwR;_?f|44~Me3hu#H zj_0IyuF)vdu5~<%sD(TWt<}p5xm8s7#A^dttL@Aj+WpxT%Umh*NC*07sKi*BHy<;VrdUCz@kEQb-VQ(zsJU9lEb zI%wzyQFOWrk>pwjvnJ8bb!di2MHPH#iPU>NKcb>q_5mudeQ0oewz|uM_ zWaXPW_<$C25vur*zf8zLn3DWFN@DMl3%wq<*E3(ej2^P&J7s#K0XO4e8g6aC@Nl_g zaj9MIzL!4tukJINHZ&mO_VP%29;JUHh9!jG*n(Cnv<=hf)t`{wLsD4k#pk{tnCjw$ zJA>;C1{l;26?F*F)F=uA{pSWZDI+0FOi%c0Vsms}q0?L7DoM(ey~R%194!~vJ+}D^ z>?L_Jy_O-w-Cl@tB}0%gq-U98`y%_YBsssN(B)ikeOzPpTPu1DLZbgINU6>gW&&B) zL#Fl&`G@Ufo1_1xiyKmo{eLN`K3jN#Z^N2zF^V$JB1XxYCphAI{A+ZyQrK}zI>rA| zWw6q`L3sD6p8B@0g}t|=vz6T~LRpM|R#If$M6qvh-N`WRAu!MbZ9-gqxp*vyHV26z z%5NEBlOUW(QyR0xGoooh#A7ZGzzkbBLud`*u>32QRxPQe@jpSjmj0L KC{?*)3jPE{Yj`UF diff --git a/docs/_build/doctrees/environment.pickle b/docs/_build/doctrees/environment.pickle index 6011344facfeb95fb34bc0ab93f5da885cdf44e5..56919d3124c1ac8dd26fb8541fbfac8bc512f093 100644 GIT binary patch delta 18341 zcmeHvd3;nw_Ak{*x|4J#q_gkoY!J4rYzc&P0NFx=2ndKFG)Y6!B;7IH5l}JU%!r8% z1}a*L%A%;_0>VUX5EXF21s51cMA-xtMFoQ6zQ8-@)a^~?z0aH9y!ZZjKR#ObobNex zZq=z%r|Q2jKT!|%%HwJI6qlr z-_BzAZ3XcnYXytruNM@E_jhSyMb~sFxx#x3fgvQxu{uL_VEY<|k{U@(`X{oXPU}sN!T+%I6fPsK=m6 zrszDw(iEKmd_!?YaS`JFuA@fwK{Up%Ar4DRvou}8FBGSXq$v34wJPC3mhQpWGr&9Fkvwaiz!&(uvn*JfdgW_ z8jB%VOv54{i)*kb#bP=ZI4af|SPZ~oCKfm%)@!lo6VwLfEF?>?n2p6?EaqT=-nU+d z#Xv0PVu7x=UXKMj-CB)B85Z-fK#yB%uo#WSd@TB7Va8%O77MUIuUl)Oh_|A@t#w$S zo2?cs^14q92@h(+AnDLVMb4RUb()&|+MzkwBsuhLhvw85l0&a{XrBK~a_H?2%`ZW7 zsUG^dL-V%}B!{l<&{S7elMLP6p{b0SPaL|uLlb+(Ox&yzG?B(d#G(H? 
zG?r8K#NilpXdW2VKpc((-2SrUa4b4BuT(ophU3wpalI)y9Fq=B_w-hh;ka~Yl7GID zxDp(j4$bm)HxY}Y(xK_Q=T_oyWI8n8oo*uzN2fz`@7ddl!x8GxJoBC8aFjYUbuYG) z3`eR%(-hW09FA6p=JXcH;fQr;wx7R~WH4$|fXz>J5sO3Dq3NkwLmUoYhi20QYl*`V z?9jBlAvqky4o&vlbtJ=y0)F}sINn`0k@ewg%j5a|BlWyxWHP^TL~mX=B7(<^EatN+ zqWS%!l6mWhDE`~%2>#m0-hA?iFura?7C$~Ri$66oi9a)<5C36A?=W*+O)X4Er-c`d z>;qO(K|2v7X*{DmonIPh2_-0ZH;i zc@Z1LqehjnTwXP*56j_GM#uBrBf~%&QX%L)F)AVq#~!S>M)JL*3V3NnZ@y>LBt8^c zIIChdE95&zP2E)S_MAuaAVPoL-nQQ=W9-YsMpab#z+0ngO3CQC4(b3_M70-;$<~3ug zRIcfYX-#B{<%U*^!}XSSQVG?16b)HssT% z$7T1!&_ElJ`S}GCCQhF*^_n@9G6Pd;r!|lNI)~@$vtFwQuG0 zES_TR$A6z-WZA^cpLhrFb4@naPO9a}GqSneIfU<@WaQsnoz44I+4znL+5BXek^fR< zOy*sDkV9 ziG0IUBY$K{HqZOCh!hojy+-XLBVrn*jX<9v>3wD3M zcokoBjgemlySAUFa^rL(-+OH~pZDu9{xQ&pXJv~E!LU*;o?+z0Q1RUvja&^kte$D% zWs|e{=QHPV+oWvqZZ=EiE3P&2vYFYUJ(msPiL;D+#q?}`^Q^gi+I4Asg-I=bD*``d z%r^4bv$FYZv*+=0XydJO5_!rTBYzXj-8yGB9|QMCOn;q`KRF|<+jiYFM#LQnWtZgA}b#476-d#tv>GW|mewOhtI}gg(W>+eQhDpDV_Bc> zxawTSH-4q(d)LHt&zl#IOfwdOD zY~R2`e#zn6t@|OqnbXGSPn^I9HBR7V^K!cHYfNIie|>cKt4)iLi0!Vh-yo5fn{Q@( z&BdX7{-1~QCtDWtj&E~#nPb0L5zOo`DV7UkD3i^KYMsfDi|$@>;~|B*Zt439RdLfx z3f23TR}|{A<*zH$i)}{~>amq?DAdY7ZC9uvE7$%e?q}_-*Hl*p6b}FP&KZ_ zjGsO?blsY#`LXjk-S4l7V?gNn&v$*mD!Sk6)T@x~zPPT9@vRM@R|<3Z4(_t%bl-XR zL*UixMRUu!FVbD`DSQ*D+6kSUK>tcM)aXTzj14UO7o51Is$ayZAr2u+D?;e-} zUEy!7rnnFhX{?EVvW~`CY)E7K`6D~*u%xAni|J5v`%aR$t20<411{RV$^h*p>c>ph zz}7jX9a#JpaQmO6of$c7zu1_{hKj3m*>mEiH1J9PJ}j9bsdvxpi_RqSqdc}kA+7zv zQ~bg&c33V`MC(8{L0neJ5ptgz$oey6qupuwY_39f6tG~GSku8|6JTYviGy9Tida(& zZTNZs$a-EV6|a;qi+FJ$%Mp_YLmzKkY{&Ig+&2XK*eqKeGL*U52=|LaStWF^IocgH zoQ)$&d@_~=yBkZ{96$DGSpYj}WB|LXJb=xs2w;UVfK`tUU|Yv1tQ+oIEpNGZ9<*Tk zICh;M`^xwLwq#-e`|uc5p$A-9f&hWGz zZWQ9qyEcHmcUAx!Iwyc#dL5hX?NNyPi@5>pwbcsi2FpH2gt!OK572wq?8gSX^|gNY z2zIZi3t%r;0@!(r0@!!z1K5$)0QSkn0c>(p0NZX0V1Kd)u#0X8U=OzhuvaR^VQ=jQxed_1sPDalDFKm?a3Vzqc`q3a+)c zu^lR^koy;Q9DtzP+09JM&z1`y8V|VxrdMGO%!eE9V9&9B0-`?KUDeLUF!6IDYjFSA zfivz{63pDITnyGSAocFX)ohVMezt}|t|YX&6WlM!{cG_~gkHnhtqOUni`6LPw0qcX 
zaw}UEO#9d>Mw?jRa0d$t$tDB87Fh0>N4K-{lZh0$geD&S7FN=4gL zkl_{;K-`~Jh0x{{INX`S1-GSeqY^y902X(oRchLfR_SOj3TAL43NqYh8`4`6V%)2mmf^)^3_ZqjM}^OevYS3(_ELar;JHGWW= zV!zW5Q=oNMLS0uv_go3x>j!mO*ZXmb9l^CE+MJ#wTDY3 z;du{B+|~?5)xG!uHb(^!_1Xqj&iaa=jqGWbEwl$%CV#uOxBJtLESs@nq1g;^H&Rq> zW>sn#cjDEj3tYc53vYL5pejNo*y80bk&znaduf~Mpk#_iFe&_kNQX0Q4LN`M6TYInDVt?XDZ8zLV38+(uCyPLXMT@Wi3 zQO|?IKc8bCurcCIHyh6e2>niWiWRy4xs#CviZd@T z0WSe=eUVL##K{bE4JWgwN#eo%OymCjMfRkYm0>G(v2ro=1@Pjp`Qc*CTkIA#+6`WP zg|YtF>{m26&%L!H-i6w`-(h-nJdP#KYI(uo1ujpVexFUIncPnKt)0?aJ7u?aN^b2G z9qp9b+9|WOlN;M9ueDQJYp1N%PD!nua#}m3w06p9?Uc~kDWA2|BX&C_vv$g5?bd;q zsM;x&wNvNYDUr2P9&4vG)=pWhosw8P<)n7%Njt@9JM`niKiCuqw^1LkeEnV7y?QC< z;(vu;v382(AF%!!1s2;rU_(+QLkAMuNeAk*JG)kPx$^kF+9Yvd6N?J9Dq5o8Fzc_F zrIU+Sq*?1>maf2bjPbgJd(N^rvF$L6RAh9paX`x6In0J6O0#sz@t*(9=}l6!a#uR~ zcwg44{0J=GvYds-=hxbsn$0biE`P1VqVfof6>C0Z89Hg3Dt{#5`4?a=VP&!SWmR}& zK+R)PrS}omNAXXt7E`G%ND5ePhO=^QPbmHB=vM7CRYfFp8R=dp8mtkVt$1F+AKL-kLL#_~;9%GT7 z8x$n5Z}kCL)fq0sglR4Xl?GU{ouG8;-JEWsrsm{V}jY z{=9IMrTClDqqc$C7gN*P(&)Q2Ofze&wAFZwW&2l3knq#TSW2vnAUa!1$-?OaQ$y@I z0X7X!`npT`ULXggWFiD$DxFmiFZzV??gyB zYL54>Ym|*#_DP`P7zzLV6Yuy@{c#dbKg9<68<{AsJr!tnvUvOy8|<$$Rh&J=%KVXQ zM8Rn`(jS>AyM5mB3wK7}=G+ow#Uj7q+^1iFIqN6j-V&f`O>)*!7xQ0l~O3nR5Z^kn;&YbZZ~bOj_v0^^E}_xxo0D`p+$Np&@#o%wCHOXUvB&9z{6$)U^rREE zGfbnzAssBYOCyx7Pt=CSc!q_}npaEV-ZS2@CAd@COZ|c+D60~kM|XLKMXpNOV#h6z zSpEua*TKsdj~|P~zArq9VB;6wL_ikSd%GL0w2_XtH^^rE{sqfboI~(tsSRt0PE7fd z#rVs%NZIo}AY1(Z}s_Dr7 zRS9=|>Wb@RM532UHE=xAGUp2c{pT*fM% z9wz%Yc%)MgtTyy#pKhi@fba7uX>0&C+_Lv-*0G`G|Oi zHr_Ynn-Qm71~fTTtnLXM8MV0hEsPpWksSYag-IAj)HB)013F>;Cp35J#lT+lmfDbT zW8)kab4zV~4Y<0+*9~#*BD8|yt_&5OZ!&#Agr-Oh zSWR_&gC)X~t>{fw2CVw9b9R5r@)e^5XJHl3UKkG=tbL>+<-E#&vRFl%Wc{$8g-TkZ zcVUA_8wzE!O<&6h_uDMW(=t*UEn5cr5tz{{S!9y3tE0DQLi&S@I=Pwd@F`YPA>q~5KgPkNhA>fKTKq^o^W?^ekt?aaaBEIEqa z&5}dP`C#{5-RJ!ENs6OcgpOjMNT+E|B@l>DKn>T&ZYkg90M)XOmeNviWCK3G`o7G0~ z-wku;_Acv5RC!j?gg@tp@ASiW`Qf|$@E82>0%$qdq;?icn(P<*;U(EMz8eqm6AbmM zFx(F>^}|Q_;UoRvK0jQ7PXk0>$|K&?w+NpWBGT#psHl0cVANVgLA<= 
zrkc)*;Qa?&O2k2?x}FUdgH)=k;2hAQQkBCw;10YP#RZkB7|#2$f>Z@?-gixqYBC!w zHU_DN!PD%AL8=jvIIb?uGJC7Db%CXeubdqtN`h6fZ9m|+Vo_fXBXSwJUet5o&;?YG zEF3v3h>JjuKGcZ>3Y7H-P-H|)APxb_WbqP6LV&VUvINrl^fD`7=cG#{6ICb)Wl10x z0ZJ@=B+w55$|e0JFbDxkANdj}LVz+xF#=-8%FqNjT&)OErL+w}86||FQn?fX$^m5( zC`W*%{U`}kB0vufqa|<^0t3OF)^QS;hyYFJNfIz2Kr?uX1g0SXlXfV4{wk4~h*11r zD}gx((A2z60@Vo6Y@8>0t5wN;{br*p#L;NS6EnY==n8UJngJ14t5u0|>dGnS3Ufjf zDmx6nDd!5cwKn;~B{Vr);m!3eW{0Jw(PC4ksGM=GkQU1#_~KkPTh2CDXiHPg5_=0i zr;uqm*<9iawJJhQNQL-C4FP+P+o(~M=tD4M9Ae7NEWv$OqN+NT?H1<-tI|WTCl0YT zQI#qN4pF75_PptyG(`1?nq`Te5vq8x?MP^Ecic$Ttw|ACwq?$SCW`}4HnZHDW~#z7 zbcTfujTZdExjd_YDrm9>hUe$!8^j@}s!x)EnqFROZ)|O{)es}a?yy!>xkg-mT@}v^ z?(1(+tyQIcW9gHweT1N{L2b_Vc>@ESl~;RIge=5 z#h%Aiq3&fal_}DX|7N}FS;@OWJ~UChx=ahg!m$y;P^wKDX82!NsUE2G-BX+Dp=bJu9R=!i zcf%{HyCSfix{#uLYh#w!@i@3U6c?2ZE-EV;47Ljk%ZiHVE?5-hOS-7A#9%0}H#-ZQ zjn?`Ce2^J9*%-q;$UjkmKvp1O=Y>sk2 z(Ly`Qpr&oHqkKk7s|EEfOU2%$+PEN~g?@uJMjTwK4I!>(y*5FODh)PqYN<9h3F%sg zqsC!rgx3M~7DxFMyA6$qr`K!aA`OG=2E3cK(J;ti5G$u@rI%z!|pEC(;NA!2P1v6QaB9}wD?II_w-qX)Y}XKRbikTs(owrB9)!d6?YvjO%Qb#{xxV6!_7j@IU8 zdyCVc)HN)#IP-yNY&0ye3^X|Gc3Zw-y1k`kS*APks46_TZ8to!nEEkOD_hm&3Zaca zmzBV!%+!sbx}gcT9t!##Gkt?-n6 zOqjipymJzXy{*KZ9Ka7(tT)8#LhtL+9T2LO;@z$4>}1l2pvAqWUh07iZ!fmNrx)%%+ti;FfN9MMwGl28 zPN@6BrTGMT{H_z~V%fIat6qe*ZGxYQgY?V^Xj`EOJE_i!#Uovpb|TJ7dH*hJp{O{i zP7FqgmVxJSEIlM#aFHV*jl zd7Aqzt>%$pY}hDG+$8eWv8o;z`q0HoYAj3XfQ-<8n$(mK0G6i^%B&1yrc@MYLE!r` zV7GX0lqLntemzQ)Dzi@&GtEOq9Vpr?i~Nn7%*^Bj8HE(yctMNZ4Br>vVUmACh{qc( zve+yw@EcA=CGg&)<+%%58tpOV0hNA{VuaEhPp1eB1JEC6rC;{1I#r=@pQ;G`Q?`L7 z!VxC3Sa0$;euaQ3v2Gzw<3HDiX2syymn+;~Q0m@Q8+suP9P{|fP@KbWt_;lz!BcX0 zqJh69;EuX2)Mx{IT2Ck*z}R|1QQ6fKnhB?k+j>IdL-35Ob)6-_{Yg(~#AOhkzYvxK zmzbWgK5!}P2}@0;<~IuUszZJCO@Q}@=BQ4J< zq_sqx=m|>tKIS`(kPB-mgrXO&WBk+N? 
z$=vMo$l3$p8Emk7$AR#WZ=j0Xt;N)HSYEz&YX<_CwNqOOm%5$WY`EO9Q;R-$WTzHA z_1aEt9$e1v)aJt_ZI>3O(^b2)xo}y$OFI}Yn|5gv*iiS*UD}yn02{ekmpGh)_x-A4 z(5A>H^I}U)o!Mz#*aE-2mjPUL(lnmJ(%kB-$9z;vakX2_+pH@DBX@1qWy`MJ%1mph zYl|$-nuX>Y?JdwnTrkDK%{p}E#m%}*8S5V~)Ay*_n^apGEVepF4Sg~|PWp+NVsM>{ z@)yhZj3kd&nTDWTkMeqW;H|+^HCVMh3*~T?sm5ynE7rgP$^v}593fZI1uD}jP!J~{ z(IrC5+z(L$k36J9zy9`+E>}l43JEeL!@3}*JgQ3q!T5)D=;=id>oVce{;)1Zsy!HF z+KOsB9@Sy&y!|j)JNK{-hbMfCE}0fPQ&O<0AQ*cM8<-^)3mo*+*cU>}q@zv6!KQ0b ztoAK-*KN_Iq)x%fpun5ZdUUZ-40=?T2A8Twb*V`{W6h1`T8IVsjuB6Fjba51TO5gX z*d4{~B--{Uc+4o?c~qCExTn!>U)SBvkn`FqZKgDghbKE+n3lNY|v|{>E7^|?uHN;`KYbBtD$uR0516NV>a-sD?!8noe1AChY33Ks8Pcv|;FIatvfA~N6`>mox$ zf)?0`10vEw@L=D;3kyCbTS(wN8;j|`4o@n}uC!*Fo{S^*#S>BdGTCsTEOL(yS-hZ delta 15443 zcmaKT30Raz^0zg>499>BC~_kVhadvt5f2bh)VJAjy zOedOX%zbNu$;7OQ8kJUNIgqr+4mCzpCn&nPL4upU-2zRllmP?yl~xuD9P| z$G?Vt;~2VaUB95fvdRV4h9@j#itjJ$dZ9eJg{pFWHDcyeEio*T&&vAK!RSB%JwhE!3K8>25E zl|-4i0}}kGgQLWz+=Sd*LcQI^watWvdv$}RO^qu}T+WS$ks@S3tUiy_B1M_Chd~MZ zh+>5E3679Odmvma8{nr;CO$@P{tzNn=hDRs1LE}q362vZa-y`_L&U`aW*96?1LGh? 
zWDksO$sv^ho_f_le*%KE$yU%!1O=*fq>v0mnw_DgmG+9dvRZ3tlfAUQtf{JPNn0OV zKa|ifk$@R1{G$i3*w%+hi(e zHf&R<$fRN_6@96=g^E-vrcpstVw+CIC@N-9K@(z|Nrl;`1`K{`sIlpB7ppt%g5eO?hE*oSyyKYS_{%l{xZlt6|8)iX*SK8a{bhapdh* z!~DZ(tVw=uHT-Z1e>W|2QUPW*Rxq|lG)E}X-| z#IBoT#D(EzF(EHiTpSrI&gR95qM{_RczCFIEH6xK8XhB(hewO=M<ii6Ia5UKt*Q^Z>D~V5-Vh$lh4vyuzx+_Kv~UKD6}v}d zi>!iZ@o`~}c(kC0I9xbJj42#~^)tn=qP_-N1P`_K(R!9DRu}Dp0&Fo6Mv3~7S&)l; zjSz2)><(!tix4M924cwjYNT1r7&SxZ950p?juaC|`J=%+@ye(q=pzn|N(rFpM6=G( z*nvJ`+vs?h#F~Yp$BKiaVzCKQTcGt45rxK89wHj7hjh2r<(Lh&AAvnJMvJ4(!=W!Fu@Y$+7iEoSlZ zzM*0l(g$xbi_u3%3G1Xn@j2G}V3JL|G##HO$BRvq3&qB%X3^)1{vvluAs$uBrzMF$ z&LxQLQwoJ1do|_KXmQ8XLh&Q^=H01P;+yei@z>Rx#p+uM#cSitqVBiJqHtQFxb0T6 znEl61;yBWE6U_3mFRqID(+kDa+vCJCi(VX@UL$tnP-e}rihoWti!W!)7N3@wGneL>8)|?b+=Cij*+&Wz8Ez5+{q$}&z<2!@o6)TC_aH^5XI+C zIZ=G<*oop}XE9NH>^O+xW9MO__}F=hC_Z+!5yi*OJHB*&@VWCLfqd>9A&SqPb42mE zbCoDQcdik|hfRI6FqWHxYWYVy!Ph?Riyqfv9gAEat2jf(!gEQTPno5Zi1WGHejoaixzF z{c6U^D@CBJKUxz7#D%r*tDUD&`SwPQdawRo5Pv@xCwiZlDqd=sFE)Rt3=V7TkaIA5 zkyBDIu+=v$mO06g+#cFI1Bf!UFIaF`qe>QjqERF6*`-k-i}z{N7fU|SsGTjxH0s&q z|J0~u_rIo5Bmee-M*X?+6^(j()oG1-sr5;Xn(eFsxj7aFuUoxU-V=v?`gnCuAj%|u zd+5I~qWz;aCLQzbSJ$1zy}e45e4gC?_x0lu_}tX~<-;yTyz zZB<0?P3slWv3WZaV$Ng1?T>8vw?dnr`j;Z6K66$Pqo2F3i0Lo9%Y^8?$f>+Gb766os#klS315Krg?Z1VowIPqnYmsHQg_(Wp3=(5O9c zbxLQxy+OlZ=Ytxx`JKB!_FVupw^8d>55-{CBL7%`_LFu;08ys)@q6ZL)c*Gtg81|Z zdUYs9y5-`dqsi@4_s0MdruHQr!8#^3e()Z-{9T!<`m_$#{Z@w&^zXv&_hj+I(GKzUS#Gw^@dmNwi{$oqk4FJywI4mP z9OQ42=$sEuu9crfVaJzz-V2B_wLf!eu}0;dSp}lwavjFpMdIlz$zuN(9T?xBDOT*B zJq=^;JTYu`a{J0Je*uwE71sXH*Gmv%J) zyWAnpUaZ4--ACMcWt{lzQnL8qN{7gA7%b~exIM02b&80_fQ(MTS0fB#GB^%uWl4rolbhn8Lxk?K z<9QY@ug1eTacx(!YgPh80#YW|%HB|_sGk$T26E{Gni)Lz%B5u$B+Kc^XyWqkb~^o* z$m=>~*_DwB(LijNYi1htfXU;1phY9+XP}$p^Z^uFhT>@%BbzgE1bqfVvg=GHWB`#S zS8NvCsgaIs@YBgU=+gm&{a^gUg>^Nqp z16Dpd2*7>w zQ_zK#gcY;O(uF-TwF^6Tnis2gy*1sdlX_R5nO)dNZtcSQmvmtl-VV39N2GUseTT-n zu+N$!kkY&M&+ei(a84KYsWLCt&t8wtk=NO1K+_fw=i$geT zE%xD*wK$aX)nb26RuQC3wK#y&RIETbYO#(JRFqM2iXdgC#eSTYB1btXJ}4o@hSKOt 
z0$IvMi}jp{78^MWMKhFwqKxuSv_{D%a+G;uManu6q?8ju$~W;r$!2jTXPPLZG!wxL zx&0Azr&<2`2(-pf20p9JmL1rwTN&#kvmuh0{_^ohp%tf~&j#qrx&0d32tB`e;bl#? zZfk7!X&h&=3R-a^wCYBv^+w2fBedEJYSEfpG|>0_iYF zUfcqUpwKn%NjRzVvD&ibtIxonP$cu-Me}jbfx-3hv*7P57rYH2u5Vw2?mmzuLq9~} zJFkP$mHH~2^o306+YTQ=U)Lw?Q0W7ya;OV_h60)M7M`A?7M<)jdCkMwS2ZtsHMG9f2WQH(C3b>So9l+yaG1A;PUV>nId>E1pt{JC8!VTj$JCDDu|%va7Km zS#k`5ycNDt3ZY+PYm>l7-#H$>bh+T}c)C`H;mfK{-`}v#L-?J@U-Ln>OgIiva?x?< zsSTemY<^Z&HXnyn%?iW6$^RUOVcy7}%I@Y%X!l>wL#XtB9)de(o$FkuIyn(lWsOx$ zW%C-_w1!!xlZlt1H&u-9QpZo0oq!xut+lSH+Fs|WBQKu7&f34%8tYltTw9KplU-~E zQQaQ$ahi*3FFEV(!%B-)Tmv^@o%Bf#Ls^$~EQ(dflvg#}MY%$!TjGzaN^3+#g(u}a| zJ|)9g6Rug}JK2@W5{JjFEBwPdL(nKZWNfWUSogJi>KJ}dnfmZ+=%s~czRF$O>3TXm zy$zHvH9=m5J#B}zU?Y$_kehV3zxA31{(U-WKzJnmGKHquoRITCR z*i*)zhX75+ci(%IY{+@{3SfAjGI#%ZxJm0E!|y8?T@We1KM!Hr5>J+U&Z{_SlsOmN zi;k-tP*rfcB4x`3=k?khxPGp;A7f3z2(F`5ZM`oljTX$#ElXuLoR}USWPwk z2te&sZsM3%6OUem?%oDJBU$fK zUY6@Gd3C;7{1RS0&s9#|>hXyWWRh9*0gQ9FOB3a`#m*c^eGS3}Uiplz&}? zP;a4;g$dGf9veFT0iK=S^V?mkigr7398PxpCC(f z1;Z8@`ZK=9yz~=%?`GO47%A0^75oC}-gvoL-gOKTr{tlfo1i-g1>iTby{tX)T}Z2V+i*QtL0a$j{CK*ccmt+ z5^jdX7PU!uM0ca>o&Iy1>nbQKZR3}nX4^7E=?6OwnwVBn39|5Syl$>*YFN}JP(-i) zwx=IbDwkUVSC%fIn zi6P~-p2|}_Qg`m>k>2K!_VqzClJX8caZc*$9bggNa&GH_DJ2~&m*1bk@9Y%Kv$J7_x}mEwD3JaJ1&@&ZMLJN z_lBYwf71(p%M0J(h5y3~-|2;CVk2miJKImuY(Luz&oSc_UT4F(UV;H$6$W|XgT3$} zUieTie7F~$$8@t(UsG0Yt!bp6|2h5VS?km40Ts@rWpBIPuy zDARj9|K9Y0w(es>nC4;a5W+dEy#g!S|E;>-aHhxEyyK zuigjAZP#HYWUn{qh8mp)>%yjnvI@+(iq_B2O@aaIpVnCd))$@TFO}WaU)6=<)tn#b zDxt5mfvyN|=3WL}I^N8E3c6{KD>HQXNRvx-x;ybM?u1TvGv37|`snhZk1VA}vV7b} zmy5^MAs<~f41CyEHxY(Ni?8k`ykh&CuWmTrtnKmD4Q#ngb4^86K2E@|#QoOE(Gaz9 zqRb~1IVxCFPh3AAEDT7Y7{9b?KWGV3KnMXG;<_s!i~tT>;R=XI?KVsM`ARQ^M3aho zyH`Lw0UU7>6p%y!2N|;hQVHO=lBR$R0yvWNmH+Y6#o$HeMSQem5zbK}TPfxez=2_a z0tOSnq2MM33?qPd|8ND2Ab>Y~fdWPnz%))NzYgXOdtR^@?iYos6r+Y!uw&e z0&XFI_v%WHaFq}s}A3pxXu zaIKZ91m9~5I{oop#~aS!4dM(`FTkZW);ev=svYR`ejTKHMlY`zf&%2TdAbnU@1OeauDyA>rBT6Yb&Hy+Ypso0753UR 
z*QA-cz=WW=)iqZ7d2fDNw%nAbi_oWK24-bt$>q(u)G!`FeucfJxwfv98Q1P+UA`eA z&EDKp-`rG6wX^EW8t!VgHl?MzdMwec)x}UVrTlj25#5p2w5Z;i=Gxt&+aKuK*s5Ep zlkxx5C%HyFsGAz1$Tv6Yc4*|<7j?Oj{kpVbt4Yi3t-8{fe@7br3=sLWT5D5TX{EKX zB28X-PdB~yjaAeb(rk_Px|+%~S-DrYpx{QW|BvBWnZ3K(NSAxUbn!0BKHWpX)N`X> z|19jC?Ej!HLi!xmg)1=E1LhyrB^uHitM9RxPiBhIx&{dQY#i_MY2ZQ{(#PxK0Q9-uN#m$+i0rU7&Bvc3kw9 zG_W)RhFQOw>bm*3uozqbme&Z>SJ#%!vv#7}!SWrU0k)db#^&-yE5=h+zX+DzIurz> zfPe4JGpY&AI!hi2yK56&syVBzMO7})S*DQOfXfP1@GgjLPSaT)AvmO=xeiZ_d1^^? zc4ec^vJ0inbgXzPQvT-YGjra3Wx&(NLRxlh{9(d9dRDoir%ETF!CG0}V6C7~Kzhw$ zcfZI73cv^d`??CiZ`g6%SNELoi;kfY6fY_M68mrDKO!jYJjAxkttb5AqbVpmgSAts zlvVJnCmH3xyKwz}!mr~Aw6$R7VFLU@LR%M$`XAtauXD1TcVFg&ap(_8y_z& z(Z>vB2k!H=MEPL{+^uMlgN3nvmR1xh|9K8ad2NZlm!48tXOD-Uhn{}?sYZq_(f9Br zROT$vM@6wbw6pV`3+*D=vQ!^2p2en4F*>+%y%IKXz<7xY-*4u1Z>q8BZ>W@XDW%gN! zO=cy-_}Y2no2<2{Ky{3&N1YolYki#|4zk{G8V>ZuCT`C(BpB#)Zq)u>$F(8T@N6#i ztQ3D&&t3Yg_zF(D|NOg4tqVEl*)D5K4ZUJ{;sd}EOM5s7#h&}z?ed9ILkwEkS!$re z;TUo%Jd}Z@3H5oQkV)+pneqYeCYUFinBrL*%9_y4ZphONLq^o9v(D>csT0#W%zCN8 znqub_F{1$pdIQZe^j?U1;Pp8M*Xwiq@26cdc!fW0h#4#Vlfo%YaR%1d=QUDbs+`+q zOLeub@V^{~3imwePkU>_lm1CC%Jt5Z{*`s{?4vPYX(t~Amp0hT@#HV9ZLVpm#<(O4 zOaWALmMI_+bHWx=K!hLVH_iB$QLfia0m0W%xGOY}cIUa!z*Kw$nF3>@+3Hq5%YNFA z0d%!nI?ryOM;41rf#mmDra;=gOHsQgB}h&qYc0Zqvbh|cnI^ZI0waA%Brk^s#^_lI zztrr62%Rm>J0uL zIVEL{BHJ3_dh9~r`SWOV)zKhw>vQVSaWp6sxgU-O72#vhu^{q%$*~|>JqwNnkz*e| z7SsnHyN(5A;p5V=Alg6?$AePvk#{_3AU^Io9@Go+T#n;GGrmUl)E;Bx5b9a+$Hkwa zt3qna=37fE%bLpOHsIe9sgt4jlw~Z9$XM3gR7D@tDtNhVm)Y+bbI{0~_l#yWk4wR_ zn#VrRin&JF0(%4cfKCv(Sr^R#Ohf;$OiMPUlppw^*frERWDx2BBQGbQpVPGBQMGDRtK%&C*({_Q{Xs2 z6Ty-jCxGGq!||Oh$oo5trYMhAS7#^7S2~O_37yI+X)kSTDr;(Pq-DZ_9&*oK=&7?M z%WEA*{Qb6!ecu@CWwO4etOEbYt$|hrR*_S2HhOYRBaN`Z+H|+oN*P14YC zZ;VW5jo=!)eSUMj)}+>8kOpEO8mQK(_l-S$NnOHzW4M_sx+h9Cq;iW=s=*CqePE2} z%c3rm#z-hnqy6qWYh~wTCcEq(81M4KX*_el80|-|Qut0_jdn#JG#(l_fEHWv+r{7F z8VI1jp3&kc!xt=JxHa%_=pq{Eit`EHm!HBaJ1rwor4dy^C;=&*+hE0%psr$(dQ)i> zc{#DJq@v(o0WG0{eKduO}cQ656O;Y4`|ZJ|HQLnt^#SAax42h2B>Za=cL-^f^elp-TVX}rqr?5N(*WV&q 
o)@~v7$?bs@ImH?hg%)krkl0AB(batfHS-?}>zmq|o6EEQA9N3^7XSbN diff --git a/docs/_build/html/_modules/advertools/crawlytics.html b/docs/_build/html/_modules/advertools/crawlytics.html index 507dab2a..d9b0fa1a 100644 --- a/docs/_build/html/_modules/advertools/crawlytics.html +++ b/docs/_build/html/_modules/advertools/crawlytics.html @@ -154,7 +154,7 @@

Source code for advertools.crawlytics

 
 >>> import advertools as adv
 >>> import pandas as pd
->>> crawldf = pd.read_json('path/to/output_file.jl', lines=True)
+>>> crawldf = pd.read_json("path/to/output_file.jl", lines=True)
 >>> img_df = adv.crawlytics.images(crawldf)
 >>> img_df
 
@@ -211,7 +211,7 @@ 

Source code for advertools.crawlytics

 The ``crawlytics.links`` function gives you a summary of the links, that is similar to
 the format of the ``crawlytics.images`` DataFrame.
 
->>> link_df = adv.crawlytics.links(crawldf, internal_url_regex='nytimes.com')
+>>> link_df = adv.crawlytics.links(crawldf, internal_url_regex="nytimes.com")
 >>> link_df
 
 ====  ===========================================================  ========================================================================  ==================  ==========  ==========
@@ -301,9 +301,10 @@ 

Source code for advertools.crawlytics

 columns of interest, write them to a new file, and delete the old large crawl file.
 
 >>> crawl_subset = adv.crawlytics.jl_subset(
-...    filepath='/path/to/output_file.jl',
-...    columns=[col1, col2, ...],
-...    regex=column_regex)
+...     filepath="/path/to/output_file.jl",
+...     columns=[col1, col2, ...],
+...     regex=column_regex,
+... )
 
 You can use the ``columns`` parameter to specify exactly which columns you want. You can
 also use a regular expression to specify a set of columns. Here are some examples of
@@ -346,7 +347,7 @@ 

Source code for advertools.crawlytics

 One of the main advantags of using parquet is that you can select which columns you want
 to read.
 
->>> adv.crawlytics.parquet_columns('output_file.parquet') # first 15 columns only
+>>> adv.crawlytics.parquet_columns("output_file.parquet")  # first 15 columns only
 
 ====  ==============  ======
   ..  column          type
@@ -370,7 +371,7 @@ 

Source code for advertools.crawlytics

 
 Check how many columns we have of each type.
 
->>> adv.crawlytics.parquet_columns('nyt_crawl.parquet')['type'].value_counts()
+>>> adv.crawlytics.parquet_columns("nyt_crawl.parquet")["type"].value_counts()
 
 ====  =========================================================================================================================================================  =======
   ..  type                                                                                                                                                         count
@@ -410,6 +411,7 @@ 

Source code for advertools.crawlytics

     "jl_to_parquet",
     "parquet_columns",
     "compare",
+    "running_crawls",
 ]
 
 
@@ -433,7 +435,7 @@ 

Source code for advertools.crawlytics

     --------
     >>> import advertools as adv
     >>> import pandas as pd
-    >>> crawldf = pd.read_json('output_file.jl', lines=True)
+    >>> crawldf = pd.read_json("output_file.jl", lines=True)
     >>> redirect_df = adv.crawlytics.redirects(crawldf)
     >>> redirect_df
 
@@ -516,7 +518,7 @@ 

Source code for advertools.crawlytics

     --------
     >>> import advertools as adv
     >>> import pandas as pd
-    >>> crawldf = pd.read_json('output_file.jl', lines=True)
+    >>> crawldf = pd.read_json("output_file.jl", lines=True)
     >>> link_df = adv.crawlytics.links(crawldf)
     >>> link_df
 
@@ -581,7 +583,7 @@ 

Source code for advertools.crawlytics

     --------
     >>> import advertools as adv
     >>> import pandas as pd
-    >>> crawldf = pd.read_json('output_file.jl', lines=True)
+    >>> crawldf = pd.read_json("output_file.jl", lines=True)
     >>> image_df = adv.crawlytics.images(crawldf)
     >>> image_df
 
@@ -643,15 +645,17 @@ 

Source code for advertools.crawlytics

 
     Read only the columns "url" and "meta_desc":
 
-    >>> adv.crawlytics.jl_subset('output_file.jl', columns=['url', 'meta_desc'])
+    >>> adv.crawlytics.jl_subset("output_file.jl", columns=["url", "meta_desc"])
 
     Read columns matching the regex "jsonld":
 
-    >>> adv.crawlytics.jl_subset('output_file.jl', regex='jsonld')
+    >>> adv.crawlytics.jl_subset("output_file.jl", regex="jsonld")
 
     Read the columns "url" and "meta_desc" as well as columns matching "jsonld":
 
-    >>> adv.crawlytics.jl_subset('output_file.jl', columns=['url', 'meta_desc'], regex='jsonld')
+    >>> adv.crawlytics.jl_subset(
+    ...     "output_file.jl", columns=["url", "meta_desc"], regex="jsonld"
+    ... )
 
     Returns
     -------
@@ -766,9 +770,9 @@ 

Source code for advertools.crawlytics

 
     >>> import advertools as adv
     >>> import pandas as pd
-    >>> df1 = pd.read_json('output_file1.jl', lines=True)
-    >>> df2 = pd.read_json('output_file2.jl', lines=True)
-    >>> adv.crawlytics.compare(df1, df1, 'size')
+    >>> df1 = pd.read_json("output_file1.jl", lines=True)
+    >>> df2 = pd.read_json("output_file2.jl", lines=True)
+    >>> adv.crawlytics.compare(df1, df1, "size")
 
     ====  ==========================  ========  ========  ======  ===========
       ..  url                           size_x    size_y    diff    diff_perc
@@ -806,6 +810,8 @@ 

Source code for advertools.crawlytics

 
 
 
+
+[docs] def running_crawls(): """Get details of currently running spiders. @@ -816,11 +822,47 @@

Source code for advertools.crawlytics

     * elapsed: The elapsed time since the spider started.
     * %mem: The percentage of memory that this spider is consuming.
     * %cpu: The percentage of CPU that this spider is consuming.
-    * args: The full command that was used to start this spider. Use this to identify
+    * command: The command that was used to start this spider. Use this to identify
       the spider(s) that you want to know about.
     * output_file: The path to the output file for each running crawl job.
     * crawled_urls: The current number of lines in ``output_file``.
+
+    Examples
+    --------
+    While a crawl is running:
+
+    >>> import advertools as adv
+    >>> adv.crawlytics.running_crawls()
+
+    ====  ======  =========  =========  ======  ======  =========================================================================================================================================================================================================================================================================================================================================================================================================  =============  ==============
+      ..     pid  started    elapsed      %mem    %cpu  command                                                                                                                                                                                                                                                                                                                                                                                                    output_file      crawled_urls
+    ====  ======  =========  =========  ======  ======  =========================================================================================================================================================================================================================================================================================================================================================================================================  =============  ==============
+       0  195720  21:41:14   00:11         1.1     103  /opt/tljh/user/bin/python /opt/tljh/user/bin/scrapy runspider /opt/tljh/user/lib/python3.10/site-packages/advertools/spider.py -a url_list=https://cnn.com -a allowed_domains=cnn.com -a follow_links=True -a exclude_url_params=None -a include_url_params=None -a exclude_url_regex=None -a include_url_regex=None -a css_selectors=None -a xpath_selectors=None -o cnn.jl -s CLOSESPIDER_PAGECOUNT=200  cnn.jl                     30
+    ====  ======  =========  =========  ======  ======  =========================================================================================================================================================================================================================================================================================================================================================================================================  =============  ==============
+
+    After a few moments:
+
+    >>> adv.crawlytics.running_crawls()
+
+    ====  ======  =========  =========  ======  ======  =========================================================================================================================================================================================================================================================================================================================================================================================================  =============  ==============
+      ..     pid  started    elapsed      %mem    %cpu  command                                                                                                                                                                                                                                                                                                                                                                                                    output_file      crawled_urls
+    ====  ======  =========  =========  ======  ======  =========================================================================================================================================================================================================================================================================================================================================================================================================  =============  ==============
+       0  195720  21:41:14   00:27         1.2    96.7  /opt/tljh/user/bin/python /opt/tljh/user/bin/scrapy runspider /opt/tljh/user/lib/python3.10/site-packages/advertools/spider.py -a url_list=https://cnn.com -a allowed_domains=cnn.com -a follow_links=True -a exclude_url_params=None -a include_url_params=None -a exclude_url_regex=None -a include_url_regex=None -a css_selectors=None -a xpath_selectors=None -o cnn.jl -s CLOSESPIDER_PAGECOUNT=200  cnn.jl                     72
+    ====  ======  =========  =========  ======  ======  =========================================================================================================================================================================================================================================================================================================================================================================================================  =============  ==============
+
+    After starting a new crawl:
+
+    >>> adv.crawlytics.running_crawls()
+
+    ====  ======  =========  =========  ======  ======  =================================================================================================================================================================================================================================================================================================================================================================================================================  =============  ==============
+      ..     pid  started    elapsed      %mem    %cpu  command                                                                                                                                                                                                                                                                                                                                                                                                            output_file      crawled_urls
+    ====  ======  =========  =========  ======  ======  =================================================================================================================================================================================================================================================================================================================================================================================================================  =============  ==============
+       0  195720  21:41:14   01:02         1.6    95.7  /opt/tljh/user/bin/python /opt/tljh/user/bin/scrapy runspider /opt/tljh/user/lib/python3.10/site-packages/advertools/spider.py -a url_list=https://cnn.com -a allowed_domains=cnn.com -a follow_links=True -a exclude_url_params=None -a include_url_params=None -a exclude_url_regex=None -a include_url_regex=None -a css_selectors=None -a xpath_selectors=None -o cnn.jl -s CLOSESPIDER_PAGECOUNT=200          cnn.jl                    154
+       1  195769  21:42:09   00:07         0.4    83.8  /opt/tljh/user/bin/python /opt/tljh/user/bin/scrapy runspider /opt/tljh/user/lib/python3.10/site-packages/advertools/spider.py -a url_list=https://nytimes.com -a allowed_domains=nytimes.com -a follow_links=True -a exclude_url_params=None -a include_url_params=None -a exclude_url_regex=None -a include_url_regex=None -a css_selectors=None -a xpath_selectors=None -o nyt.jl -s CLOSESPIDER_PAGECOUNT=200  nyt.jl                     17
+    ====  ======  =========  =========  ======  ======  =================================================================================================================================================================================================================================================================================================================================================================================================================  =============  ==============
     """
+    if platform.system() == "Windows":
+        return "This is function does not support Windows yet. Will be, soon. Sorry!"
     ps = run(["ps", "xo", "pid,start,etime,%mem,%cpu,args"])
     ps_stdout = ps.stdout.splitlines()
     df = pd.DataFrame(
@@ -842,7 +884,8 @@ 

Source code for advertools.crawlytics

         crawl_urls = crawl_urls[: min(len(crawl_urls), len(df_subset))]
         df_subset["crawled_urls"] = crawl_urls
     df_subset.columns = df_subset.columns.str.lower()
-    return df_subset.rename(columns={"args": "command"})
+    return df_subset.rename(columns={"args": "command"})
+
diff --git a/docs/_build/html/advertools.crawlytics.html b/docs/_build/html/advertools.crawlytics.html index 02d00e67..d0da7a71 100644 --- a/docs/_build/html/advertools.crawlytics.html +++ b/docs/_build/html/advertools.crawlytics.html @@ -156,7 +156,7 @@

Analyzing crawled images
>>> import advertools as adv
 >>> import pandas as pd
->>> crawldf = pd.read_json('path/to/output_file.jl', lines=True)
+>>> crawldf = pd.read_json("path/to/output_file.jl", lines=True)
 >>> img_df = adv.crawlytics.images(crawldf)
 >>> img_df
 
@@ -333,7 +333,7 @@

Analyzing links in a crawled website

The crawlytics.links function gives you a summary of the links that is similar in format to the crawlytics.images DataFrame.

-

You can use the columns parameter to specify exactly which columns you want. You can @@ -651,7 +652,7 @@

Exploring the columns and data types of parquet files

Another simple function gives us a DataFrame of the available columns in a parquet file. One of the main advantages of using parquet is that you can select which columns you want to read.

-
>>> adv.crawlytics.parquet_columns('output_file.parquet') # first 15 columns only
+
>>> adv.crawlytics.parquet_columns("output_file.parquet")  # first 15 columns only
 
@@ -725,7 +726,7 @@

Exploring the columns and data types of parquet files

Check how many columns we have of each type.

-
>>> adv.crawlytics.parquet_columns('nyt_crawl.parquet')['type'].value_counts()
+
>>> adv.crawlytics.parquet_columns("nyt_crawl.parquet")["type"].value_counts()
 
@@ -814,9 +815,9 @@

Module functionsExamples

>>> import advertools as adv
 >>> import pandas as pd
->>> df1 = pd.read_json('output_file1.jl', lines=True)
->>> df2 = pd.read_json('output_file2.jl', lines=True)
->>> adv.crawlytics.compare(df1, df1, 'size')
+>>> df1 = pd.read_json("output_file1.jl", lines=True)
+>>> df2 = pd.read_json("output_file2.jl", lines=True)
+>>> adv.crawlytics.compare(df1, df2, "size")
 

@@ -888,7 +889,7 @@

Module functionsExamples

>>> import advertools as adv
 >>> import pandas as pd
->>> crawldf = pd.read_json('output_file.jl', lines=True)
+>>> crawldf = pd.read_json("output_file.jl", lines=True)
 >>> image_df = adv.crawlytics.images(crawldf)
 >>> image_df
 
@@ -1030,15 +1031,17 @@

Module functions
>>> adv.crawlytics.jl_subset('output_file.jl', columns=['url', 'meta_desc'])
+
>>> adv.crawlytics.jl_subset("output_file.jl", columns=["url", "meta_desc"])
 

Read columns matching the regex "jsonld":

-
>>> adv.crawlytics.jl_subset('output_file.jl', regex='jsonld')
+
>>> adv.crawlytics.jl_subset("output_file.jl", regex="jsonld")
 

Read the columns "url" and "meta_desc" as well as columns matching "jsonld":

-
>>> adv.crawlytics.jl_subset('output_file.jl', columns=['url', 'meta_desc'], regex='jsonld')
+
>>> adv.crawlytics.jl_subset(
+...     "output_file.jl", columns=["url", "meta_desc"], regex="jsonld"
+... )
 
@@ -1092,7 +1095,7 @@

Module functionsExamples

+ + + + + + + + + + + + + + + + + + + + + + + + +

pid

started

elapsed

%mem

%cpu

command

output_file

crawled_urls

0

195720

21:41:14

00:11

1.1

103

/opt/tljh/user/bin/python /opt/tljh/user/bin/scrapy runspider /opt/tljh/user/lib/python3.10/site-packages/advertools/spider.py -a url_list=https://cnn.com -a allowed_domains=cnn.com -a follow_links=True -a exclude_url_params=None -a include_url_params=None -a exclude_url_regex=None -a include_url_regex=None -a css_selectors=None -a xpath_selectors=None -o cnn.jl -s CLOSESPIDER_PAGECOUNT=200

cnn.jl

30

+

After a few moments:

+
>>> adv.crawlytics.running_crawls()
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + +

pid

started

elapsed

%mem

%cpu

command

output_file

crawled_urls

0

195720

21:41:14

00:27

1.2

96.7

/opt/tljh/user/bin/python /opt/tljh/user/bin/scrapy runspider /opt/tljh/user/lib/python3.10/site-packages/advertools/spider.py -a url_list=https://cnn.com -a allowed_domains=cnn.com -a follow_links=True -a exclude_url_params=None -a include_url_params=None -a exclude_url_regex=None -a include_url_regex=None -a css_selectors=None -a xpath_selectors=None -o cnn.jl -s CLOSESPIDER_PAGECOUNT=200

cnn.jl

72

+

After starting a new crawl:

+
>>> adv.crawlytics.running_crawls()
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

pid

started

elapsed

%mem

%cpu

command

output_file

crawled_urls

0

195720

21:41:14

01:02

1.6

95.7

/opt/tljh/user/bin/python /opt/tljh/user/bin/scrapy runspider /opt/tljh/user/lib/python3.10/site-packages/advertools/spider.py -a url_list=https://cnn.com -a allowed_domains=cnn.com -a follow_links=True -a exclude_url_params=None -a include_url_params=None -a exclude_url_regex=None -a include_url_regex=None -a css_selectors=None -a xpath_selectors=None -o cnn.jl -s CLOSESPIDER_PAGECOUNT=200

cnn.jl

154

1

195769

21:42:09

00:07

0.4

83.8

/opt/tljh/user/bin/python /opt/tljh/user/bin/scrapy runspider /opt/tljh/user/lib/python3.10/site-packages/advertools/spider.py -a url_list=https://nytimes.com -a allowed_domains=nytimes.com -a follow_links=True -a exclude_url_params=None -a include_url_params=None -a exclude_url_regex=None -a include_url_regex=None -a css_selectors=None -a xpath_selectors=None -o nyt.jl -s CLOSESPIDER_PAGECOUNT=200

nyt.jl

17

+ +