*) Synchronize robots.txt fetching so that separate threads no longer download the same file in parallel

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@998 6c8d7289-2bf4-0310-a012-ef5d649a1542
theli 20 years ago
parent 8194fde340
commit 3b5d0eb053

@@ -163,64 +163,67 @@ public final class robotsParser{
         // generating the hostname:poart string needed to do a DB lookup
         String urlHostPort = nexturl.getHost() + ":" + ((nexturl.getPort()==-1)?80:nexturl.getPort());
-        urlHostPort = urlHostPort.toLowerCase();
+        urlHostPort = urlHostPort.toLowerCase().intern();
 
-        // doing a DB lookup to determine if the robots data is already available
-        plasmaCrawlRobotsTxt.Entry robotsTxt4Host = plasmaSwitchboard.robots.getEntry(urlHostPort);
+        plasmaCrawlRobotsTxt.Entry robotsTxt4Host = null;
+        synchronized(urlHostPort) {
+            // doing a DB lookup to determine if the robots data is already available
+            robotsTxt4Host = plasmaSwitchboard.robots.getEntry(urlHostPort);
 
             // if we have not found any data or the data is older than 7 days, we need to load it from the remote server
             if (
                 (robotsTxt4Host == null) ||
                 (robotsTxt4Host.getLoadedDate() == null) ||
                 (System.currentTimeMillis() - robotsTxt4Host.getLoadedDate().getTime() > 7*24*60*60*1000)
             ) {
                 URL robotsURL = null;
                 // generating the proper url to download the robots txt
                 try {
                     robotsURL = new URL(nexturl.getProtocol(),nexturl.getHost(),(nexturl.getPort()==-1)?80:nexturl.getPort(),"/robots.txt");
                 } catch (MalformedURLException e) {
                     serverLog.logSevere("ROBOTS","Unable to generate robots.txt URL for URL '" + nexturl.toString() + "'.");
                     return false;
                 }
 
                 Object[] result = null;
                 boolean accessCompletelyRestricted = false;
                 byte[] robotsTxt = null;
                 String eTag = null;
                 Date modDate = null;
                 try {
                     serverLog.logFine("ROBOTS","Trying to download the robots.txt file from URL '" + robotsURL + "'.");
                     result = downloadRobotsTxt(robotsURL,5,robotsTxt4Host);
                     if (result != null) {
                         accessCompletelyRestricted = ((Boolean)result[0]).booleanValue();
                         robotsTxt = (byte[])result[1];
                         eTag = (String) result[2];
                         modDate = (Date) result[3];
                     } else if (robotsTxt4Host != null) {
                         robotsTxt4Host.setLoadedDate(new Date());
                         plasmaSwitchboard.robots.addEntry(robotsTxt4Host);
                     }
                 } catch (Exception e) {
                     serverLog.logSevere("ROBOTS","Unable to download the robots.txt file from URL '" + robotsURL + "'. " + e.getMessage());
                 }
 
                 if ((robotsTxt4Host==null)||((robotsTxt4Host!=null)&&(result!=null))) {
                     ArrayList denyPath = null;
                     if (accessCompletelyRestricted) {
                         denyPath = new ArrayList();
                         denyPath.add("/");
                     } else {
                         // parsing the robots.txt Data and converting it into an arraylist
                         try {
                             denyPath = robotsParser.parse(robotsTxt);
                         } catch (IOException e) {
                             serverLog.logSevere("ROBOTS","Unable to parse the robots.txt file from URL '" + robotsURL + "'.");
                         }
                     }
                     // storing the data into the robots DB
                     robotsTxt4Host = plasmaSwitchboard.robots.addEntry(urlHostPort,denyPath,new Date(),modDate,eTag);
                 }
             }
+        }
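The locking trick is worth spelling out. String.intern() returns the one canonical instance of a string from the JVM pool, so every thread that independently builds the same lowercased host:port key ends up holding the same object, and synchronized(urlHostPort) therefore serializes all robots.txt work for that host: one thread does the DB lookup and, if needed, the download, while the others block and then find the freshly cached entry. The following is a minimal, self-contained sketch of that pattern, not YaCy's actual code; the in-memory cache and the fetchRobotsTxt() helper are hypothetical stand-ins for plasmaSwitchboard.robots and downloadRobotsTxt():

import java.util.Date;
import java.util.HashMap;
import java.util.Map;

public final class RobotsSync {

    /** Simplified stand-in for plasmaCrawlRobotsTxt.Entry. */
    static final class Entry {
        final Date loadedDate;
        Entry(Date loadedDate) { this.loadedDate = loadedDate; }
    }

    /** Simplified stand-in for the robots DB. */
    private static final Map<String, Entry> cache = new HashMap<String, Entry>();

    private static final long MAX_AGE = 7L * 24 * 60 * 60 * 1000; // seven days

    public static Entry getRobots(String host, int port) {
        // intern() guarantees that every thread building the same host:port
        // key gets the identical String object, so the key itself can serve
        // as the monitor for per-host mutual exclusion.
        String key = (host.toLowerCase() + ":" + ((port == -1) ? 80 : port)).intern();
        synchronized (key) {
            Entry entry;
            synchronized (cache) { entry = cache.get(key); }
            boolean stale = (entry == null) || (entry.loadedDate == null)
                    || (System.currentTimeMillis() - entry.loadedDate.getTime() > MAX_AGE);
            if (stale) {
                // network I/O happens while the per-host lock is held, so a
                // second thread asking about the same host blocks here
                // instead of starting a duplicate download
                entry = fetchRobotsTxt(host, port);
                synchronized (cache) { cache.put(key, entry); }
            }
            return entry;
        }
    }

    // hypothetical downloader; the real code calls downloadRobotsTxt(...)
    private static Entry fetchRobotsTxt(String host, int port) {
        return new Entry(new Date());
    }
}

One trade-off of this design: interned strings are JVM-global, so any unrelated code that synchronizes on an equal interned string shares the same monitor, and holding the lock across network I/O stalls every waiter for that host. The stall is exactly the intent here (one download instead of many parallel ones), but a private per-key lock map is the usual alternative when those monitors should not be shared globally.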
