*) Synchronizing robots.txt downloads to avoid parallel downloads of the same file by separate threads

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@998 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
theli 20 years ago
parent 8194fde340
commit 3b5d0eb053

@ -163,10 +163,12 @@ public final class robotsParser{
// generating the hostname:port string needed to do a DB lookup // generating the hostname:port string needed to do a DB lookup
String urlHostPort = nexturl.getHost() + ":" + ((nexturl.getPort()==-1)?80:nexturl.getPort()); String urlHostPort = nexturl.getHost() + ":" + ((nexturl.getPort()==-1)?80:nexturl.getPort());
urlHostPort = urlHostPort.toLowerCase(); urlHostPort = urlHostPort.toLowerCase().intern();
plasmaCrawlRobotsTxt.Entry robotsTxt4Host = null;
synchronized(urlHostPort) {
// doing a DB lookup to determine if the robots data is already available // doing a DB lookup to determine if the robots data is already available
plasmaCrawlRobotsTxt.Entry robotsTxt4Host = plasmaSwitchboard.robots.getEntry(urlHostPort); robotsTxt4Host = plasmaSwitchboard.robots.getEntry(urlHostPort);
// if we have not found any data or the data is older than 7 days, we need to load it from the remote server // if we have not found any data or the data is older than 7 days, we need to load it from the remote server
if ( if (
@ -223,6 +225,7 @@ public final class robotsParser{
robotsTxt4Host = plasmaSwitchboard.robots.addEntry(urlHostPort,denyPath,new Date(),modDate,eTag); robotsTxt4Host = plasmaSwitchboard.robots.addEntry(urlHostPort,denyPath,new Date(),modDate,eTag);
} }
} }
}
if (robotsTxt4Host.isDisallowed(nexturl.getPath())) { if (robotsTxt4Host.isDisallowed(nexturl.getPath())) {
return true; return true;

Loading…
Cancel
Save