*) Synchronizing robots.txt downloads to avoid parallel downloads of the same file by separate threads

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@998 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
theli 20 years ago
parent 8194fde340
commit 3b5d0eb053

@ -163,66 +163,69 @@ public final class robotsParser{
// generating the hostname:port string needed to do a DB lookup // generating the hostname:port string needed to do a DB lookup
String urlHostPort = nexturl.getHost() + ":" + ((nexturl.getPort()==-1)?80:nexturl.getPort()); String urlHostPort = nexturl.getHost() + ":" + ((nexturl.getPort()==-1)?80:nexturl.getPort());
urlHostPort = urlHostPort.toLowerCase(); urlHostPort = urlHostPort.toLowerCase().intern();
// doing a DB lookup to determine if the robots data is already available plasmaCrawlRobotsTxt.Entry robotsTxt4Host = null;
plasmaCrawlRobotsTxt.Entry robotsTxt4Host = plasmaSwitchboard.robots.getEntry(urlHostPort); synchronized(urlHostPort) {
// doing a DB lookup to determine if the robots data is already available
// if we have not found any data or the data is older than 7 days, we need to load it from the remote server robotsTxt4Host = plasmaSwitchboard.robots.getEntry(urlHostPort);
if (
(robotsTxt4Host == null) ||
(robotsTxt4Host.getLoadedDate() == null) ||
(System.currentTimeMillis() - robotsTxt4Host.getLoadedDate().getTime() > 7*24*60*60*1000)
) {
URL robotsURL = null;
// generating the proper url to download the robots txt
try {
robotsURL = new URL(nexturl.getProtocol(),nexturl.getHost(),(nexturl.getPort()==-1)?80:nexturl.getPort(),"/robots.txt");
} catch (MalformedURLException e) {
serverLog.logSevere("ROBOTS","Unable to generate robots.txt URL for URL '" + nexturl.toString() + "'.");
return false;
}
Object[] result = null; // if we have not found any data or the data is older than 7 days, we need to load it from the remote server
boolean accessCompletelyRestricted = false; if (
byte[] robotsTxt = null; (robotsTxt4Host == null) ||
String eTag = null; (robotsTxt4Host.getLoadedDate() == null) ||
Date modDate = null; (System.currentTimeMillis() - robotsTxt4Host.getLoadedDate().getTime() > 7*24*60*60*1000)
try { ) {
serverLog.logFine("ROBOTS","Trying to download the robots.txt file from URL '" + robotsURL + "'."); URL robotsURL = null;
result = downloadRobotsTxt(robotsURL,5,robotsTxt4Host); // generating the proper url to download the robots txt
try {
if (result != null) { robotsURL = new URL(nexturl.getProtocol(),nexturl.getHost(),(nexturl.getPort()==-1)?80:nexturl.getPort(),"/robots.txt");
accessCompletelyRestricted = ((Boolean)result[0]).booleanValue(); } catch (MalformedURLException e) {
robotsTxt = (byte[])result[1]; serverLog.logSevere("ROBOTS","Unable to generate robots.txt URL for URL '" + nexturl.toString() + "'.");
eTag = (String) result[2]; return false;
modDate = (Date) result[3];
} else if (robotsTxt4Host != null) {
robotsTxt4Host.setLoadedDate(new Date());
plasmaSwitchboard.robots.addEntry(robotsTxt4Host);
} }
} catch (Exception e) {
serverLog.logSevere("ROBOTS","Unable to download the robots.txt file from URL '" + robotsURL + "'. " + e.getMessage()); Object[] result = null;
} boolean accessCompletelyRestricted = false;
byte[] robotsTxt = null;
if ((robotsTxt4Host==null)||((robotsTxt4Host!=null)&&(result!=null))) { String eTag = null;
ArrayList denyPath = null; Date modDate = null;
if (accessCompletelyRestricted) { try {
denyPath = new ArrayList(); serverLog.logFine("ROBOTS","Trying to download the robots.txt file from URL '" + robotsURL + "'.");
denyPath.add("/"); result = downloadRobotsTxt(robotsURL,5,robotsTxt4Host);
} else {
// parsing the robots.txt Data and converting it into an arraylist if (result != null) {
try { accessCompletelyRestricted = ((Boolean)result[0]).booleanValue();
denyPath = robotsParser.parse(robotsTxt); robotsTxt = (byte[])result[1];
} catch (IOException e) { eTag = (String) result[2];
serverLog.logSevere("ROBOTS","Unable to parse the robots.txt file from URL '" + robotsURL + "'."); modDate = (Date) result[3];
} else if (robotsTxt4Host != null) {
robotsTxt4Host.setLoadedDate(new Date());
plasmaSwitchboard.robots.addEntry(robotsTxt4Host);
} }
} } catch (Exception e) {
serverLog.logSevere("ROBOTS","Unable to download the robots.txt file from URL '" + robotsURL + "'. " + e.getMessage());
}
// storing the data into the robots DB if ((robotsTxt4Host==null)||((robotsTxt4Host!=null)&&(result!=null))) {
robotsTxt4Host = plasmaSwitchboard.robots.addEntry(urlHostPort,denyPath,new Date(),modDate,eTag); ArrayList denyPath = null;
} if (accessCompletelyRestricted) {
} denyPath = new ArrayList();
denyPath.add("/");
} else {
// parsing the robots.txt Data and converting it into an arraylist
try {
denyPath = robotsParser.parse(robotsTxt);
} catch (IOException e) {
serverLog.logSevere("ROBOTS","Unable to parse the robots.txt file from URL '" + robotsURL + "'.");
}
}
// storing the data into the robots DB
robotsTxt4Host = plasmaSwitchboard.robots.addEntry(urlHostPort,denyPath,new Date(),modDate,eTag);
}
}
}
if (robotsTxt4Host.isDisallowed(nexturl.getPath())) { if (robotsTxt4Host.isDisallowed(nexturl.getPath())) {
return true; return true;

Loading…
Cancel
Save