AJAX Check for robots.txt before crawling.

Icons from herrlich
TODO: Style it nicely ;-)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1689 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
allo 19 years ago
parent 0b5a736280
commit 62664d7252

@ -3,6 +3,8 @@
<head>
<title>YaCy '#[clientname]#': Index Creation</title>
#%env/templates/metas.template%#
<script src="/js/ajax.js"></script>
<script src="/js/IndexCreate.js"></script>
</head>
<body marginheight="0" marginwidth="0" leftmargin="0" topmargin="0">
#%env/templates/header.template%#
@ -124,7 +126,13 @@ You can define URLs as start points for Web page crawling and start crawling her
</tr>
<tr><td class="small">From&nbsp;URL:</td>
<td class="small"><input type="radio" name="crawlingMode" value="url" checked="checked"></td>
<td class="small"><input name="crawlingURL" type="text" size="41" maxlength="256" value="http://"></td>
<td class="small">
<input name="crawlingURL" type="text" size="41" maxlength="256" value="http://" onkeypress="changed()">
<span id="robotsOK"></span>
</td>
</tr>
<tr>
<td colspan="2"><span id="title"></span></td>
</tr>
</table>
</td>

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.9 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 7.0 KiB

@ -8,6 +8,6 @@ function handleResponse(){
/**
 * Asks the server for page information (the title) of the URL entered in the
 * form field named "url". The request is only sent while the form field named
 * "title" is still empty, so an already filled-in title is not overwritten.
 */
function loadTitle(){
	// NOTE(review): assigned without `var`, exactly as before, so `url` stays a
	// global; script code outside this function may read it - verify before
	// turning it into a local.
	url=document.getElementsByName("url")[0].value;
	if(document.getElementsByName("title")[0].value==""){
		// Query only getpageinfo_p.xml; the older gettitle_p.xml request is dropped
		// so the same lookup is not sent twice.
		// The user-supplied URL is escaped so that '&', '#', '+' etc. inside it
		// cannot terminate or corrupt the "url" query parameter.
		sndReq('/xml/util/getpageinfo_p.xml?actions=title&url='+encodeURIComponent(url));
	}
}

@ -51,36 +51,58 @@ import java.net.URL;
import java.util.ArrayList;
import java.util.Iterator;
import de.anomic.data.robotsParser;
import de.anomic.http.httpHeader;
import de.anomic.http.httpc;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
public class gettitle_p {
public class getpageinfo_p {
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
serverObjects prop = new serverObjects();
prop.put("title", "");
prop.put("robots-allowed", 3); //unknown
String actions="title";
if(post!=null && post.containsKey("url")){
if(post.containsKey("actions"))
actions=(String)post.get("actions");
ArrayList content;
String url;
try {
url=(String) post.get("url");
if(!url.toLowerCase().startsWith("http://")){
url="http://"+url;
}
content = httpc.wget(new URL(url));
Iterator it=content.iterator();
String line;
String title;
while(it.hasNext()){
line=(String) it.next();
try{
title=line.substring(line.toLowerCase().indexOf("<title>")+7, line.toLowerCase().indexOf("</title>"));
prop.put("title", title);
return prop;
}catch(IndexOutOfBoundsException e){}
String url=(String) post.get("url");
if (!url.toLowerCase().startsWith("http://")) {
url = "http://" + url;
}
if (actions.indexOf("title")>=0) {
try {
content = httpc.wget(new URL(url));
Iterator it = content.iterator();
String line;
String title;
while (it.hasNext()) {
line = (String) it.next();
try {
title = line.substring(line.toLowerCase().indexOf(
"<title>") + 7, line.toLowerCase().indexOf(
"</title>"));
prop.put("title", title);
} catch (IndexOutOfBoundsException e) {
}
}
} catch (MalformedURLException e) {
} catch (IOException e) {
}
} catch (MalformedURLException e) {} catch (IOException e) {}
}
if(actions.indexOf("robots")>=0){
try {
if(robotsParser.isDisallowed(new URL(url))){
prop.put("robots-allowed", 0);
}else{
prop.put("robots-allowed", 1);
}
} catch (MalformedURLException e) {}
}
}
// return rewrite properties
return prop;

@ -0,0 +1,5 @@
<?xml version='1.0' standalone='yes'?>
<pageinfo>
<title>#[title]#</title>
<robots>#(robots-allowed)#0::1::#(/robots-allowed)#</robots>
</pageinfo>

@ -1,2 +0,0 @@
<?xml version='1.0' standalone='yes'?>
<title>#[title]#</title>
Loading…
Cancel
Save