*) Bugfix for "Robots.txt wird immer wieder geladen" ("robots.txt is downloaded again and again")

See: http://www.yacy-forum.de/viewtopic.php?p=10241#10233

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@794 6c8d7289-2bf4-0310-a012-ef5d649a1542
theli 19 years ago
parent 35c6c5ead7
commit 023be89586

@@ -44,32 +44,24 @@
package de.anomic.data;
import java.lang.String;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashSet;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import de.anomic.http.httpHeader;
import de.anomic.http.httpc;
import de.anomic.plasma.plasmaCrawlRobotsTxt;
import de.anomic.plasma.plasmaParser;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaURL;
import de.anomic.server.logging.serverLog;
import de.anomic.tools.bitfield;
import de.anomic.yacy.yacyCore;
/*
* A class for parsing robots.txt files.
@@ -145,11 +137,32 @@ public final class robotsParser{
return deny;
}
public static boolean containsRobotsData(URL nexturl) {
// generating the hostname:port string needed to do a DB lookup
String urlHostPort = nexturl.getHost() + ":" + ((nexturl.getPort()==-1)?80:nexturl.getPort());
urlHostPort = urlHostPort.toLowerCase();
// doing a DB lookup to determine if the robots data is already available
plasmaCrawlRobotsTxt.Entry robotsTxt4Host = plasmaSwitchboard.robots.getEntry(urlHostPort);
// if we have not found any data or the data is older than 7 days, we need to load it from the remote server
if ((robotsTxt4Host == null) || (robotsTxt4Host.getLoadedDate() == null) ||
(System.currentTimeMillis() - robotsTxt4Host.getLoadedDate().getTime() > 7*24*60*60*1000)) {
return false;
}
return true;
}
// public static boolean enqueueRobotsCheck(String nexturlString, String referrerString, String initiatorHash, String name, Date loadDate, int currentdepth, plasmaCrawlProfile.entry profile) {
//
// }
public static boolean isDisallowed(URL nexturl) {
if (nexturl == null) throw new IllegalArgumentException();
// generating the hostname:port string needed to do a DB lookup
String urlHostPort = nexturl.getHost() + ":" + ((nexturl.getPort()==-1)?80:nexturl.getPort());
urlHostPort = urlHostPort.toLowerCase();
// doing a DB lookup to determine if the robots data is already available
plasmaCrawlRobotsTxt.Entry robotsTxt4Host = plasmaSwitchboard.robots.getEntry(urlHostPort);
@@ -259,18 +272,4 @@ public final class robotsParser{
}
return new Object[]{new Boolean(accessCompletelyRestricted),robotsTxt};
}
// /*
// * Test the class with a file robots.txt in the workdir and a testpath as argument.
// */
// public static void main(String[] args){
// robotsParser rp=new robotsParser(new File("robots.txt"));
// for(int i=0;i<args.length;i++){
// if(rp.isAllowedRobots(args[i])){
// System.out.println(args[i]+" is allowed.");
// }else{
// System.out.println(args[i]+" is NOT allowed.");
// }
// }
// }
}

@@ -156,7 +156,7 @@ public class plasmaCrawlRobotsTxt {
private String hostName;
public Entry(String hostName, Map mem) {
-        this.hostName = hostName;
+        this.hostName = hostName.toLowerCase();
this.mem = mem;
if (this.mem.containsKey(DISALLOW_PATH_LIST)) {
