From 62664d7252c38cf1e487ec6c1cdd55ca294191c1 Mon Sep 17 00:00:00 2001 From: allo Date: Fri, 17 Feb 2006 20:55:31 +0000 Subject: [PATCH] AJAX Check for robots.txt before crawling. Icons from herrlich TODO: Style it nicely ;-) git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1689 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/IndexCreate_p.html | 10 ++- htroot/env/grafics/failed.png | Bin 0 -> 6002 bytes htroot/env/grafics/ok.png | Bin 0 -> 7125 bytes htroot/js/Bookmarks.js | 2 +- .../{gettitle_p.java => getpageinfo_p.java} | 60 ++++++++++++------ htroot/xml/util/getpageinfo_p.xml | 5 ++ htroot/xml/util/gettitle_p.xml | 2 - 7 files changed, 56 insertions(+), 23 deletions(-) create mode 100644 htroot/env/grafics/failed.png create mode 100644 htroot/env/grafics/ok.png rename htroot/xml/util/{gettitle_p.java => getpageinfo_p.java} (64%) create mode 100644 htroot/xml/util/getpageinfo_p.xml delete mode 100644 htroot/xml/util/gettitle_p.xml diff --git a/htroot/IndexCreate_p.html b/htroot/IndexCreate_p.html index ef21cf9d8..4ddb89c45 100644 --- a/htroot/IndexCreate_p.html +++ b/htroot/IndexCreate_p.html @@ -3,6 +3,8 @@ YaCy '#[clientname]#': Index Creation #%env/templates/metas.template%# + + #%env/templates/header.template%# @@ -124,7 +126,13 @@ You can define URLs as start points for Web page crawling and start crawling her From URL: - + + + + + + + diff --git a/htroot/env/grafics/failed.png b/htroot/env/grafics/failed.png new file mode 100644 index 0000000000000000000000000000000000000000..82b6b1b1dfbc8524b2f8f97b78d11fe09361507a GIT binary patch literal 6002 zcmd5=c|6qJ_n(F|iBiIhrN|>>9cu;|DY8WNvP8xhdtn$^A1NhcDNCY32?;S-3RxbR zANd^_<-M~XUzdZec1 zqq;n{9Uk}8$;96NCFa=!PqicQrH+X$mJJS;x^<;i+DJ;OeYlc^to^_dz&O|B^w)Abd>7KeE3SAb-pLtBb(q0Lnj>*w+6Ik1hKTJb>sw z@ctW%>W=)&`G5TV-vvb3V$AB+=eLnxo1;UTwl7G1+`LKHwn7w!RA`@>ONumk(2<;6 zT0UmQMPM2P6>2wH#Z}qWW13XV=1P?H8+l2~PHLRVTX2W!~EyB(x#17k=my zo?POJIpaQAH_@o_d?k>eQ_jQhIFnSACXxPjqqc+x8rgoLOG!5NhE<%H@Oj1Kf3d?Y^DNR3busvT&t=(_VGdkt2{R3!2 zSe9O30$1%|`Zu4|Rl?-=z-muHL1ub_ahtB63aZwHjogkHx^Bncr-z&N=9E# z+2|Z;bXlZ18x{S^N=BnJ>n04ad`yK!W_+n!KE>+o-JNyoc>6>gdqa8Z(C$kfFRX7$ zg6)#s4zn-b<7-bknCSBNZ4Lh}bYv&ee(RF-A&fUv_mieQ#<#Y?B~Y#X^G(H~%d5kg zS!jzt;38KvPLh}X&Kys-R27BlCc-gW*RdDMR=Bh(*KW?!4l}1(o$HV_@1E(icH(~d z+qHGJu@`p2Mf7C-Sg4F(9i>X85z>L>S(-q%%z8TKNGlmnBJ4 z>B7KQr()5#aO%*D0)530EA82~baS&GI)Iq8)$w!W+b77ht&}m-wquheQD#%EYBU9a zWp-r#KC)&uN8vq7v}x!r>L%QppK101E@D8MOVaB@>gC|wxc@T9O&x-J$39sZUW$*D zrh)k2!g?3{i3)E#lfnaBqmRv0Pregm*aK_JECGyfT3`WBzCJB%PK(mt9S&&qSr_BW zgY~indhQ@Ns*(XGNS1NS8DfQiQ&V=x!jtsJ#fR8n#sk*MAFXdWmh0x0-!CKWMFm|W zCSuF7{To)FxOGrx&cx6}Tj5`tYS?bI>^BX8}JnJ5okV>*;IV zQ>M`OtCP>Z+2vK*2(dl!V2BfHnwF%6B*{>clq%IQ(>B(rR3O?)tB>t30yf_uv*Wz! zZ`9@H)8An7I4MdrhK4?eo8F7PaQ#g}v*_HRz*1C@F2F!QumG5xK=C4zRPIcc1<;RSfq-v$>c@MVZ=8uy9L-P_G# zXS}$F^lagd<&CC>*!qxRLH*qv5ZCG8LiojNN55uoWgAJ3SeradLaN7@bBuK)lF5kg zO1ej8zaQRF%NdZWz+o^30v=yiV>k?U@+l}rZg5*=l*yE?Oo(;o$xY83xqSN@v3U1L zhQ$Xl6u(nzj|@kPPQlngt**mNGa6jvJO6xFnXv@jD+Q_1P60JjWVtXjSEz#KCUklt zJULN`Ds@n;*08xlAaZ?*7+-oL^U=m3;7rxL_rg_P2Z2bHHqen$4Rqov^*O4h!q_=8 zroct2CZqoP{D9H&>?}Cbf0+3jMpMJnh*;bJT0Q9~fuoI2{2Z+#f{PP5Yi8EU@L^`xZKTF!mXJ4F(Sr&Xki0Vz)9#~j zif4dVu9mY=_t0%(9BR#Bb1?kRQ=?PPwp^KVx^keZ#z}5O-k%hk{ZKI>QO{;6m(0UV zc`KbG+ZPb7g(up_Y`Hj52|pFZJfh9LQ0Mb1Z%~JtH*SO7_RK^Kda6Lg*c$qGAn8`zvmHIut zHMMMC$M0yICy~+YKG`{+wkL*`GJhKQn2NAA$WsmN9a+jzY36rUqW1GKZP0^?L_(7` zO%_-Z1_4_boiY&PMA_c^`BqPcBO;NV0bA5l&OZ}_TG-i}20C~BSN6g1FOSZDY_j2c zxb>1Lr=Mr7RZsrI(`=3(Yty;=Et4pW2yndJ%$5teQH~JsrnIAh0@#TBCp;#|0z=kq1=Uf;7?I7b;$DyzRJZ4 zgbxA7vrgfEAEI~JJhl?m|MN1c44t)<7 zsSc+{X8^trOahZDs(>4J!Qwi`jNW^3y0H>y?XwH1KnlbZM0Q6UO+dI)1MOnDa~`mW!0F&u+gA6NBO;fOi-MVU%Xq2QLp0KWID z;{iYSbu^eiD8Tp<@>2fY;0MFf*?`FVj&*i0NP_#ThJQ(-l zhmvlOH%z=#p~E``CF#b}sMs|*jzq1Y=o%2QTS#?>kk`M6vFM_kgTVh`fn2|GfI@BF zGaB7_6%+;dmZ$d1H|q=f?BI@ZqPpXe)$?*3#-c2}C2_i!3BD@4{DQ{9<6kwvEw8y2 z53@v~i~ATGYuTU72VF!mXk^nYry#y76!;gmyMRaDUvxq5^Llyh`>f%3J2 zAuM;z(pe-GG=ye1vpGNP?_#wBxvHS=Jb_z>+aX+y4z5gsY;*l^MHX_UqQ2y6gE^w- zB6}1c^z$9NE#3FLXv~GGxYB&c9}mhEoayE`7Lv#!GXr#B1c9oV$=TQ${I!T3_NS->L<3d@vSe2Cq>~Ia7-ml8%h#W zE##tEaH3d{-?b8g`uWv3Q5NpS#_-0|Mk{YYq~z*6{kZl7E8n?=o{SfQeNjKF>(X4t zAA_D0DLuD$h5S$wO1fTQr`9h``z*)qSbZksZ2tb~+ypjQtXknuUD5tIhiFB>ML?{^ ze4b0y20OJxv~+PSB~!R++&9`BOfdsNddltvZ9D@<@8U+Aigz5x@IoMx3J$WHgo9O5 z)S&@z`la4Qs(TJDQTjAv%(=HlNCr#GZ7}>1`u+oR2|?RJ;4Jsr`c1QhyYU@Jj1xEx zvyOiMy23tDg|Xttc`KkKDjXa0?#ULE6gQ-bSVTGLkocKu5~KMfd#(%J@N9I^p5CcV zq6*qu!|e#uhL&+Z;-i?0498CO*A(`p0AB{T?qq#pg_K&|S#Q_cqnkb* zzwx%oke;V2A|moqnd8kNQFzO7Of(^4;FpOxp>CpIxf#~>2DZdE->cqE-k^*D7jVsujyfit{*{OQpO~JH6_q46yhLh% z`w_yG>1Z5UeyaWX9^EM+$gsd*l6~4!Uh05{yD`S6w`{YCvEZi{Q#HP@kzBOC=C)R5 z(djvHFwk)to=sLKFo~tyOa0FdvnK1;2^Mk^JBfe11fk4myKiw$as&w1-iQQA9+ETT z#%Hu=b=7uH>#jEm@&*!LNoD!$MVbETM)t7^shqa?h3OSNLnHaqs9HOD>QZzfxMkvyZtm81M#dM`bp|kl;%$YMl+*cW982+0 z@3yiQ1+#FIm8s!hq!AeJLmUhP)rtBjCfAWNT&xnNb4H2$XfqgFxMam9;p1#MlN1NF zKNwEU^FgoVy(r3D|4>pjJOJl!rci#~KV0})l;MY|a;+S0ZoG%gD9+g~uK(vJn5MZ- z$e_59>}Ck4-hv6WH#ap~67%(wKDvYx$@}phc$J1%JjIlVbs^LBNRY)TUQGcgiILFe zY$0dhurRbmJrW(w8^~F{!-=YYE+O?MbR;t3Lv-9$Ng~BYj;c;(IHoO40JyOD&?PR({wdZY-kM%RyrcNBVPVC2@bTw0W z^E7Uf-soN-s&P-a;%_G2`Lgd|*TgT48pZ9mL&>L0Lf`E;Az0cogLM;;dWFkwKU$8b zX~~CxYpruIH(lh;%DyUdPy$LJ9#NjmhYcG4*W2X*2-uM|w51!!I|v<$bS ztu4H@pI+Zf;L_4SVECE7-!3ByGp_h`mMZ#aImG?A5j}S{ukXv4SP7^cKar>MKJVdi zxay`j33|ZU=WCr-lXe9OqaV}1jiy}IG|00y#CRXO^22t?VG3D?Z%Iy35yt5}=Yi_h z!>#*l9@@{GL5%m~>eplI#n1E)4ywkmL=@sb4Zt*$0G=yw z)4ZbPumzfIW#+_g#nYx_E-jPji{Td$oM9(>1vU#+o(m)wS!k{jJ_0T+QTmh67Oj~M z{tKa-S7W2g9d}|x1ZTWq%STH~)4oV!mqY7p+4U1xxmW;R0fyUZ8-_!_GW@m=9ME2} zi7qq9)11LkSmz#B1N-JB0hOyUj90q9Dc>~jShbM#c(bvxs{YzyG96le{^-0NJ9z=uY)evT=x%fKF z;gtuPstSx}uiJT>DXcMySGt#oGYjrLLKbp+!wSY5da>9Ga<{f0s%$seg=Am|%l!M7 zPmvVzSs{V1Ij09Hqx18!24MjgBMd+g{f-a207D)XZ$e&qreM?_u{@0O?B8Pw-l?@D z7U?Hhy-PaE6zE-l;MnEtb6dnBRR<6pQK}cEu$c-zwH8z4>XC_tOQnOaK1&vb61OcnDll916P3I6i}#Bh z#=@5qDyl~qPY=Z)tKWgfl6~c#hF*p$-8Ro*`YFB4&-|EOj#)xng7$$zqm4N= z)qWW1@x(gta6~2XXy$W9IdEzWJm>6GHxXdfka{7|u1+-OZ*TicO6tum{x*h4eul^O z^YhJXZV%QEn)-5s`kx8-t4>% z#HeJODSTiqtxgVuMu!|x!e0l>DmS5B79iCVkC8cj6feJ*PSH>mRY#c?af~>+r|i$v zZr7#$V?rNSkZYZkABZn~JGf1dlJWIAG%`b?rCOq&5djge6~oML)<1{~H_S79uI~-e z8)-s(`g8r6q;d}F#1&~Zraar>xs8YLQ>DMx!^`i{!6J4n&HB1VSoW4z9_32vu0d=* zy+W?_QM!5mM*lpVV1Y$MS9DR@%{;=g%Dhqrz}ls)q;6qzr}Mc39H}M?EFL2+isxXj z@eGV8;ERC8PGAqSx15=O3I!1qAiro$EeYq;&#Oy!U4T$I(n^syQjF8dt-GC*z?ziR z?Y^$GXB)duq27rV8Ts^*g5H&fRZ=nNr?WfBT^`ZUcWxSWb!nf!P)D@deHwWYkjwtM znN!Xpbhdf){kMo4;tkEsIrzf+$wj(0^kQ%6QHEl~)+AC~bXa(#Cs2PreF@uogJ~e zdWp;%eO7#P{}_$3wEH9ZY`x*LF>yws|5*J~kFxPjrP7Fz2hmgPef`JA-Tm?VL@%+0 zn_0B9k*~MrVC~IEmu1wr7a(6!i}6v*#Zk?k`p)NOco8a%Z3&0TbRQ>HZ1U)h;&ChM zvr={aBY!IDjPmGzO_m_-42Lx9O)`L`2)nOIq^+_Bx|(|~3#@O1p?UJBzxNGQZAm=3 zrw@b;#41RzId_F$ig$c~r{*(RQ`45?=CFenv9~~b>gm_Psa_{EQ>(1*8XYW;o2z)q za(p^5EZk)QEVoi;Hs-$^eI0aJ3WO^MW@S-E&qYYeDgaA*4);YU@GXbMA0S+WI~q23 zD(Gi3ttTQ3tdj7~JHHiwhxPWKcQNIAxA!5$4%6Z)jgkh;J3}8OWoc34d^GUnjK1X` zyGPA+_IdQE_vU3QjXg1bJr|BeLX8@*WlOzErKFqeG$d}_!AjEHwEqHiO~Z3DPN5>9 z{U_p=C36X!v;w3EkJtM*%QmXe40CzdDEClIirMFYydJS>YWARqu|b7tp=oq)#}nS2 zLI>=Z$or)ejN(oqDj9&(s>|#~5I(5d*WoG_Jm%Gt-ROxt7G_k0k75q#xO4goeQzLD zBQs0X=LIui)+&qS+a>97HH9YJyM(mU9deN?nP<8Jm>y?bd%kr!)tZ2j(Zv}$MtSu% z>+^A0ixUh8mGkPlBW%gtw_Um@CqM-CmUb9tf9lxj1qfq&hm?iTN27o0qN$ip3UcPj zyb=I}(wR4LU3a5Fp1W$UW0ZxS(1M(`N+BtfHJmvc3Fs#|U3>nd;GG^M7=*|TCTQi+ z>$PPB_ofNATiO(W@a~?OeuG_3QOyI@@tI7^CQusg>Wdf! zZ2tB%JP39G;4`-Wp5w;m^Q2BPBHp1LfA4Gu&Bf5vqJj{2K8Wj2XcK~SRhag|y{8W0 z&O!z@4&|(`RG8$%5sA&OSlX=$L2~(1-R;X=gP2ha2&0!U?cANg_71u8yo`u`CktA; z#BrC4m19-&=&78#D+B?BqQ(zzz@!Qik$L|%tV&^c|B+ySNXDtm=3%IS3?UjKDVtdU@TJQ9lX?buOpUcf5QI(+gCKWHVj^)@uh0)G;fr*^4l zUi8Tw1RYGsLab_eebWAXau_{WO?^P-m_^%VrSUm%EU0Yc7!G;5?t3JkNg(xMA?XpU zo$aEbJ8E#I&Aq%MQ6O)$@*w;+ZxHYF!RBZd>0tX0$R=a3#GME0$=5bVv7{_CpD7f8 z`8ZyrW@+DqBblN@@SJq8h17JcGJyr>e;s476?h3lz^Vb1oDP+dda}4rPKK}cCBP|rdAQ-m2wMET*{iWpmW5xN220IL>9>hhNXB>zDaiJ4ePCXz4WYndaAcA ze&5fuCD7nVT`~J2_ks$hN54{IHBK(bUkh$8tRWA!i9*M+=8ysAxizE&_ngauigJa) zT7!h`jp$f?FD)$aL%mnle73pnypJL^v1)#-7#LKel3R)U?2-SYx3wWuNVDD zMOzdtWhi_Mefj2eCCrg2c;((78)5c^YTJiRalM}QiIPv9-iDbCy%b0pZ_*jx96w%$aZ=aLHgV&9HD){k8wAt zISpk%RIuxg5~UG|1-8cTmYdri{^HxnnapG1INLZ%>8PgYM5>gE7YOT#5lGklwI@ew zt3%E@m`p1M8}JG~um?wmKY6q;_{Xa?8;9M#Hsrk%rS(XnW0Ojh#k zN@kQ5KiQkaDB_*ewe+DiZM1r1-gYvvOn-O^IefwwAZ546JgUeU^LL;h%FgK!8 z5p*_$*UfsHfJV3>oOq)-nPgq|Ab-fs5M$sTFcA6LrOh^u0!imlr6lZR^I2o-I%wm@aNjZ%o9J8 zHXA~?>jkeLm>ZD)1zu2{U<`NsX5%i_kA`z2|(xBIxhwt@u%g5%6& z|B3Me$joi*BG-)L^06(VA=;bi&R>+y z&M~7p#2d1yhIpBlUW>$}r&?TB0(#>~$E}(Y$?p+87}}KZeoFQwyv{sG*MH<|M6q+d2dh3)s;REsl%<`E1fnTDWp;RRg=5fr{ph z>AZxPQwQ%jwbpFjT|cK~TGMlc1EYkuxIPV;6Mrde~^LjT5_A-q-4Ie|p~{bU%jMA`fnKoB>hz$OAT`UcN?B_uFlo@fD@heb5i@7rHP(4F?Kf$=;06I?Boa=tA+U!_yDHUm{@0+&-q)(Nh& znUU4j6IQh~&V>7^W54_TXE$Dg#^voWFI(E&xu@$ImhI*KV8pLz)P4*YZ{!mJ zoh4r$UZYz6@IL4GC}vqQVC*CvE>b;!*gC(VP`r>W&q^M2&BFD43ix;QJc-knZ9If5 z{$i@tVC^My3;$7ZL{nuuokf{ZE<%ry=WZXmQ$OCbhhN1OOJ}AFXW`f+jv zqN>Kza^&#RrN0t&1sfD~WfgL*vTugp;#J8@6Y+ni^)Vs5&#BI*x~mK632p@LRAifm z)%y+eF$7%=$tBsM53i%cMgIqP^;Q9~Vp251Jr72%Bw*_0{$+HYW zUamL#ik8x|JbjbvG_%{~O=@f{t2(%~+Gl9ihdrF0xG+8DKBIv7hG zmPUyTuyoSAB&bwBdnVzVW1BeDbgE}iQQ=Q%Vc9stJpC0gXJ(9GONPSjpEM%+2!8?& z{hqwY=Hjlf$z5?{iX?w(7h620d{0Y4rUUmjQu~= zG0G5de{gO)Vf7zxdg{opO$dw@)={89bRyXA<*qGCyqbodgY%gZT-)3*5H=TGr+&iV z8u42t3)l!XC&sySe_jw`RC}uXxIpWGP3IULZmx}mni1*(WxW$*hU z{$Bd!9)F+;Xt&Q2#9wgNVKx!Q&+KW(6iaBfN=F&Po#Bnx$GJbv-<+VGCAgott>I&a zafhQsv&oeJJzx{Cdg^?{T^J&th*5b=Sjl>xJF*=yUCI5F` z2L8HrwtR*MW*~_l4ady;j($aWmDVfvwkz=WJ6KKWCDzmoB$;iqgR4$GYrd#Bf8sYG z4be8=t$NgKIzSVn8j8bLE!hicP}a7Uz}i9GdO&n5&|df)2X|!?Eh+~~R|)^IQce!; za)AILdS(<4R=V%eu1I@ZoA#5eWi~UQl@zaR&jLPAA=7q?qt z&0e>o^FyB_JHQ1jYrL-QnvUm#tg?ly@T~nvl^B^(@e~&^11O*QarT9Nu7Nj*CIw><-h%c?Cai0sIb)o>3kg=G!eFk5zj$e9sIS4b~ z%zh;QNFU?Q&KkILPzdgabpix1<-rKkKO&fe!_xq{?Wt`gafMv@f|4IlX4FDT{Z^yl z`Pu2rF|{f1NcU?H8DR5Y8pDcK@A`Nlfk6EzH7s<=b1TErF~ETp5TerG``73{L_Ike zw`QS(?N3vn(feBH!zt@iU?mvT`Yok_$Z;454-n8@1&~S$NNDcF0LABc+#$&ORFDF; zIcbH8wEHRZScS=C-wB>|DrAyq{RirGff@BZ*D>|L$bs;C>dU&_Hn?Os6&EemcoJO! ze$-RM=_8>wmSWB89>3iaJOcvI<={R+S$vIMxqo_u+hHxymwn+K=QNLXwz2Z)hmjo@ zMG&x_8Y(HiW!>EsiLs*CBVKp@pYqtMwZ+ z9!_{2r4NFO3r*1+CuOPk->#*O8w$}H|5t5kCo1_X4&VC-RHQ?op)*vngAiVQ^c~#y zJ0y@%$>%L783!hQP&jyQ(%YqKmp?CLHLn;B!B&BS;EWq=NjYI*8D{8dLeGG>aVd3j zWY*hm{Opi;1b^1s~tC}q85b~jn5u|0&yW;kb3VgDXnE%MI|e!F+WG(e@&mlE_")+7, line.toLowerCase().indexOf("")); - prop.put("title", title); - return prop; - }catch(IndexOutOfBoundsException e){} + String url=(String) post.get("url"); + if (!url.toLowerCase().startsWith("http://")) { + url = "http://" + url; + } + if (actions.indexOf("title")>=0) { + try { + content = httpc.wget(new URL(url)); + + Iterator it = content.iterator(); + String line; + String title; + while (it.hasNext()) { + line = (String) it.next(); + try { + title = line.substring(line.toLowerCase().indexOf( + "") + 7, line.toLowerCase().indexOf( + "")); + prop.put("title", title); + } catch (IndexOutOfBoundsException e) { + } + } + + } catch (MalformedURLException e) { + } catch (IOException e) { } - } catch (MalformedURLException e) {} catch (IOException e) {} + } + if(actions.indexOf("robots")>=0){ + try { + if(robotsParser.isDisallowed(new URL(url))){ + prop.put("robots-allowed", 0); + }else{ + prop.put("robots-allowed", 1); + } + } catch (MalformedURLException e) {} + } + } // return rewrite properties return prop; diff --git a/htroot/xml/util/getpageinfo_p.xml b/htroot/xml/util/getpageinfo_p.xml new file mode 100644 index 000000000..8c7eb2d55 --- /dev/null +++ b/htroot/xml/util/getpageinfo_p.xml @@ -0,0 +1,5 @@ + + + #[title]# + #(robots-allowed)#0::1::#(/robots-allowed)# + \ No newline at end of file diff --git a/htroot/xml/util/gettitle_p.xml b/htroot/xml/util/gettitle_p.xml deleted file mode 100644 index 00111a470..000000000 --- a/htroot/xml/util/gettitle_p.xml +++ /dev/null @@ -1,2 +0,0 @@ - -#[title]# \ No newline at end of file