From 3b5d0eb0536c9dd01f3495885b705dbc2415f6b6 Mon Sep 17 00:00:00 2001
From: theli
Date: Fri, 28 Oct 2005 09:10:48 +0000
Subject: [PATCH] *) Synchronizing robots.txt downloads to avoid parallel downloads of the same file by separate threads

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@998 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 source/de/anomic/data/robotsParser.java | 113 ++++++++++++------------
 1 file changed, 58 insertions(+), 55 deletions(-)

diff --git a/source/de/anomic/data/robotsParser.java b/source/de/anomic/data/robotsParser.java
index 672a2309d..c638df0dc 100644
--- a/source/de/anomic/data/robotsParser.java
+++ b/source/de/anomic/data/robotsParser.java
@@ -163,66 +163,69 @@ public final class robotsParser{
         // generating the hostname:port string needed to do a DB lookup
         String urlHostPort = nexturl.getHost() + ":" + ((nexturl.getPort()==-1)?80:nexturl.getPort());
-        urlHostPort = urlHostPort.toLowerCase();
+        urlHostPort = urlHostPort.toLowerCase().intern();
-        
-        // doing a DB lookup to determine if the robots data is already available
-        plasmaCrawlRobotsTxt.Entry robotsTxt4Host = plasmaSwitchboard.robots.getEntry(urlHostPort);
-        
-        // if we have not found any data or the data is older than 7 days, we need to load it from the remote server
-        if (
-                (robotsTxt4Host == null) ||
-                (robotsTxt4Host.getLoadedDate() == null) ||
-                (System.currentTimeMillis() - robotsTxt4Host.getLoadedDate().getTime() > 7*24*60*60*1000)
-        ) {
-            URL robotsURL = null;
-            // generating the proper url to download the robots txt
-            try {
-                robotsURL = new URL(nexturl.getProtocol(),nexturl.getHost(),(nexturl.getPort()==-1)?80:nexturl.getPort(),"/robots.txt");
-            } catch (MalformedURLException e) {
-                serverLog.logSevere("ROBOTS","Unable to generate robots.txt URL for URL '" + nexturl.toString() + "'.");
-                return false;
-            }
+        plasmaCrawlRobotsTxt.Entry robotsTxt4Host = null;
+        synchronized(urlHostPort) {
+            // doing a DB lookup to determine if the robots data is already available
+            robotsTxt4Host = plasmaSwitchboard.robots.getEntry(urlHostPort);
-        Object[] result = null;
-        boolean accessCompletelyRestricted = false;
-        byte[] robotsTxt = null;
-        String eTag = null;
-        Date modDate = null;
-        try {
-            serverLog.logFine("ROBOTS","Trying to download the robots.txt file from URL '" + robotsURL + "'.");
-            result = downloadRobotsTxt(robotsURL,5,robotsTxt4Host);
-            
-            if (result != null) {
-                accessCompletelyRestricted = ((Boolean)result[0]).booleanValue();
-                robotsTxt = (byte[])result[1];
-                eTag = (String) result[2];
-                modDate = (Date) result[3];
-            } else if (robotsTxt4Host != null) {
-                robotsTxt4Host.setLoadedDate(new Date());
-                plasmaSwitchboard.robots.addEntry(robotsTxt4Host);
+            // if we have not found any data or the data is older than 7 days, we need to load it from the remote server
+            if (
+                    (robotsTxt4Host == null) ||
+                    (robotsTxt4Host.getLoadedDate() == null) ||
+                    (System.currentTimeMillis() - robotsTxt4Host.getLoadedDate().getTime() > 7*24*60*60*1000)
+            ) {
+                URL robotsURL = null;
+                // generating the proper url to download the robots txt
+                try {
+                    robotsURL = new URL(nexturl.getProtocol(),nexturl.getHost(),(nexturl.getPort()==-1)?80:nexturl.getPort(),"/robots.txt");
+                } catch (MalformedURLException e) {
+                    serverLog.logSevere("ROBOTS","Unable to generate robots.txt URL for URL '" + nexturl.toString() + "'.");
+                    return false;
                 }
-        } catch (Exception e) {
-            serverLog.logSevere("ROBOTS","Unable to download the robots.txt file from URL '" + robotsURL + "'. " + e.getMessage());
-        }
-        
-        if ((robotsTxt4Host==null)||((robotsTxt4Host!=null)&&(result!=null))) {
-            ArrayList denyPath = null;
-            if (accessCompletelyRestricted) {
-                denyPath = new ArrayList();
-                denyPath.add("/");
-            } else {
-                // parsing the robots.txt Data and converting it into an arraylist
-                try {
-                    denyPath = robotsParser.parse(robotsTxt);
-                } catch (IOException e) {
-                    serverLog.logSevere("ROBOTS","Unable to parse the robots.txt file from URL '" + robotsURL + "'.");
+                
+                Object[] result = null;
+                boolean accessCompletelyRestricted = false;
+                byte[] robotsTxt = null;
+                String eTag = null;
+                Date modDate = null;
+                try {
+                    serverLog.logFine("ROBOTS","Trying to download the robots.txt file from URL '" + robotsURL + "'.");
+                    result = downloadRobotsTxt(robotsURL,5,robotsTxt4Host);
+                    
+                    if (result != null) {
+                        accessCompletelyRestricted = ((Boolean)result[0]).booleanValue();
+                        robotsTxt = (byte[])result[1];
+                        eTag = (String) result[2];
+                        modDate = (Date) result[3];
+                    } else if (robotsTxt4Host != null) {
+                        robotsTxt4Host.setLoadedDate(new Date());
+                        plasmaSwitchboard.robots.addEntry(robotsTxt4Host);
                     }
-            }
+                } catch (Exception e) {
+                    serverLog.logSevere("ROBOTS","Unable to download the robots.txt file from URL '" + robotsURL + "'. " + e.getMessage());
+                }
 
-            // storing the data into the robots DB
-            robotsTxt4Host = plasmaSwitchboard.robots.addEntry(urlHostPort,denyPath,new Date(),modDate,eTag);
-        }
-    }
+                if ((robotsTxt4Host==null)||((robotsTxt4Host!=null)&&(result!=null))) {
+                    ArrayList denyPath = null;
+                    if (accessCompletelyRestricted) {
+                        denyPath = new ArrayList();
+                        denyPath.add("/");
+                    } else {
+                        // parsing the robots.txt Data and converting it into an arraylist
+                        try {
+                            denyPath = robotsParser.parse(robotsTxt);
+                        } catch (IOException e) {
+                            serverLog.logSevere("ROBOTS","Unable to parse the robots.txt file from URL '" + robotsURL + "'.");
+                        }
+                    }
+                    
+                    // storing the data into the robots DB
+                    robotsTxt4Host = plasmaSwitchboard.robots.addEntry(urlHostPort,denyPath,new Date(),modDate,eTag);
+                }
+            }
+        }
 
         if (robotsTxt4Host.isDisallowed(nexturl.getPath())) {
             return true;