diff --git a/source/de/anomic/crawler/RobotsTxt.java b/source/de/anomic/crawler/RobotsTxt.java
index a9e6d5e4a..fed0e3474 100644
--- a/source/de/anomic/crawler/RobotsTxt.java
+++ b/source/de/anomic/crawler/RobotsTxt.java
@@ -1,4 +1,4 @@
-//plasmaCrawlRobotsTxt.java 
+//plasmaCrawlRobotsTxt.java
 //-------------------------------------
 //part of YACY
 //(C) by Michael Peter Christen; mc@yacy.net
@@ -35,8 +35,6 @@ import java.util.Set;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.regex.Pattern;
 
-import org.apache.log4j.Logger;
-
 import net.yacy.cora.document.MultiProtocolURI;
 import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.HeaderFramework;
@@ -47,79 +45,81 @@ import net.yacy.kelondro.blob.BEncodedHeap;
 import net.yacy.kelondro.index.RowSpaceExceededException;
 import net.yacy.kelondro.io.ByteCount;
 
+import org.apache.log4j.Logger;
+
 public class RobotsTxt {
-    
+
     private static Logger log = Logger.getLogger(RobotsTxt.class);
     protected static final String ROBOTS_DB_PATH_SEPARATOR = ";";
     protected static final Pattern ROBOTS_DB_PATH_SEPARATOR_MATCHER = Pattern.compile(ROBOTS_DB_PATH_SEPARATOR);
-    
-    BEncodedHeap robotsTable;
+
+    private final BEncodedHeap robotsTable;
     private final ConcurrentHashMap<String, DomSync> syncObjects;
     //private static final HashSet<String> loadedRobots = new HashSet<String>(); // only for debugging
-    
+
     private static class DomSync {
         private DomSync() {}
     }
-    
+
     public RobotsTxt(final BEncodedHeap robotsTable) {
         this.robotsTable = robotsTable;
-        syncObjects = new ConcurrentHashMap<String, DomSync>();
+        this.syncObjects = new ConcurrentHashMap<String, DomSync>();
         log.info("initiated robots table: " + robotsTable.getFile());
     }
-    
+
     public void clear() {
         log.info("clearing robots table");
         this.robotsTable.clear();
-        syncObjects.clear();
+        this.syncObjects.clear();
     }
-    
+
     public int size() {
         return this.robotsTable.size();
     }
-    
+
     public RobotsTxtEntry getEntry(final MultiProtocolURI theURL, final Set<String> thisAgents) throws IOException {
         if (theURL == null) throw new IllegalArgumentException();
         if (!theURL.getProtocol().startsWith("http")) return null;
         return getEntry(theURL, thisAgents, true);
     }
-    
+
     private RobotsTxtEntry getEntry(final MultiProtocolURI theURL, final Set<String> thisAgents, final boolean fetchOnlineIfNotAvailableOrNotFresh) throws IOException {
         // this method will always return a non-null value
-        String urlHostPort = getHostPort(theURL);
+        final String urlHostPort = getHostPort(theURL);
         RobotsTxtEntry robotsTxt4Host = null;
         Map<String, byte[]> record;
         try {
             record = this.robotsTable.get(this.robotsTable.encodedKey(urlHostPort));
-        } catch (RowSpaceExceededException e) {
+        } catch (final RowSpaceExceededException e) {
             log.warn("memory exhausted", e);
             record = null;
         }
         if (record != null) robotsTxt4Host = new RobotsTxtEntry(urlHostPort, record);
-        
+
         if (fetchOnlineIfNotAvailableOrNotFresh && (
-                robotsTxt4Host == null || 
+                robotsTxt4Host == null ||
                 robotsTxt4Host.getLoadedDate() == null ||
                 System.currentTimeMillis() - robotsTxt4Host.getLoadedDate().getTime() > 7*24*60*60*1000
                )) {
-            
+
             // make or get a synchronization object
             DomSync syncObj = this.syncObjects.get(urlHostPort);
             if (syncObj == null) {
                 syncObj = new DomSync();
                 this.syncObjects.put(urlHostPort, syncObj);
             }
-            
+
             // we can now synchronize for each host separately
             synchronized (syncObj) {
-                
+
                 // if we have not found any data or the data is older than 7 days, we need to load it from the remote server
-                
+
                 // check the robots table again for all threads that come here because they waited for another one
                 // to complete a download
                 try {
                     record = this.robotsTable.get(this.robotsTable.encodedKey(urlHostPort));
-                } catch (RowSpaceExceededException e) {
+                } catch (final RowSpaceExceededException e) {
                     log.warn("memory exhausted", e);
                     record = null;
                 }
@@ -129,16 +129,16 @@ public class RobotsTxt {
                         System.currentTimeMillis() - robotsTxt4Host.getLoadedDate().getTime() <= 1*24*60*60*1000) {
                     return robotsTxt4Host;
                 }
-                
+
                 // generating the proper url to download the robots txt
                 MultiProtocolURI robotsURL = null;
-                try {                 
+                try {
                     robotsURL = new MultiProtocolURI("http://" + urlHostPort + "/robots.txt");
                 } catch (final MalformedURLException e) {
                     log.fatal("Unable to generate robots.txt URL for host:port '" + urlHostPort + "'.", e);
                     robotsURL = null;
                 }
-                
+
                 Object[] result = null;
                 if (robotsURL != null) {
                     if (log.isDebugEnabled()) log.debug("Trying to download the robots.txt file from URL '" + robotsURL + "'.");
@@ -155,15 +155,15 @@ public class RobotsTxt {
                             ", robotsTxt4Host=" + ((robotsTxt4Host == null) ? "NULL" : robotsTxt4Host.getLoadedDate().toString());
                 loadedRobots.add(robotsURL.toNormalform(false, false));
                 */
-                
+
                 if (result == null) {
                     // no robots.txt available, make an entry to prevent that the robots loading is done twice
                     if (robotsTxt4Host == null) {
                         // generate artificial entry
                         robotsTxt4Host = new RobotsTxtEntry(
-                                robotsURL, 
-                                new ArrayList<String>(), 
-                                new ArrayList<String>(), 
+                                robotsURL,
+                                new ArrayList<String>(),
+                                new ArrayList<String>(),
                                 new Date(),
                                 new Date(),
                                 null,
@@ -173,13 +173,13 @@ public class RobotsTxt {
                     } else {
                         robotsTxt4Host.setLoadedDate(new Date());
                     }
-                    
+
                     // store the data into the robots DB
-                    int sz = this.robotsTable.size();
+                    final int sz = this.robotsTable.size();
                     addEntry(robotsTxt4Host);
                     if (this.robotsTable.size() <= sz) {
                         log.fatal("new entry in robots.txt table failed, resetting database");
-                        this.clear();
+                        clear();
                         addEntry(robotsTxt4Host);
                     }
                 } else {
@@ -189,7 +189,7 @@ public class RobotsTxt {
                         denyPath = new ArrayList<String>();
                         denyPath.add("/");
                     }
-                    
+
                     // store the data into the robots DB
                     robotsTxt4Host = addEntry(
                             robotsURL,
@@ -207,14 +207,14 @@
 
         return robotsTxt4Host;
     }
-    
+
     private RobotsTxtEntry addEntry(
-            final MultiProtocolURI theURL, 
-            final ArrayList<String> allowPathList, 
-            final ArrayList<String> denyPathList, 
-            final Date loadedDate, 
-            final Date modDate, 
-            final String eTag, 
+            final MultiProtocolURI theURL,
+            final ArrayList<String> allowPathList,
+            final ArrayList<String> denyPathList,
+            final Date loadedDate,
+            final Date modDate,
+            final String eTag,
             final String sitemap,
             final long crawlDelayMillis,
             final String agentName
@@ -226,7 +226,7 @@ public class RobotsTxt {
         addEntry(entry);
         return entry;
     }
-    
+
     private String addEntry(final RobotsTxtEntry entry) {
         // writes a new page and returns key
         try {
@@ -236,24 +236,24 @@ public class RobotsTxt {
             log.warn("cannot write robots.txt entry", e);
             return null;
         }
-    }    
-    
+    }
+
     // methods that had been in robotsParser.java:
-    
+
     private static final int DOWNLOAD_ACCESS_RESTRICTED = 0;
     private static final int DOWNLOAD_ROBOTS_TXT = 1;
     private static final int DOWNLOAD_ETAG = 2;
     private static final int DOWNLOAD_MODDATE = 3;
-    
+
     static final String getHostPort(final MultiProtocolURI theURL) {
         String urlHostPort = null;
         final int port = getPort(theURL);
         urlHostPort = theURL.getHost() + ":" + port;
-        urlHostPort = urlHostPort.toLowerCase().intern();    
-        
+        urlHostPort = urlHostPort.toLowerCase().intern();
+
         return urlHostPort;
     }
-    
+
     private static final int getPort(final MultiProtocolURI theURL) {
         int port = theURL.getPort();
         if (port == -1) {
@@ -262,41 +262,41 @@ public class RobotsTxt {
             } else if (theURL.getProtocol().equalsIgnoreCase("https")) {
                 port = 443;
             }
-            
+
         }
         return port;
     }
 
     private static Object[] downloadRobotsTxt(final MultiProtocolURI robotsURL, int redirectionCount, final RobotsTxtEntry entry) throws Exception {
         if (robotsURL == null || !robotsURL.getProtocol().startsWith("http")) return null;
-        
+
         if (redirectionCount < 0) return new Object[]{Boolean.FALSE,null,null};
         redirectionCount--;
-        
+
         boolean accessCompletelyRestricted = false;
         byte[] robotsTxt = null;
         long downloadStart, downloadEnd;
         String eTag=null, oldEtag = null;
         Date lastMod=null;
         downloadStart = System.currentTimeMillis();
-        
+
        // if we previously have downloaded this robots.txt then we can set the if-modified-since header
        RequestHeader reqHeaders = new RequestHeader();
-        
+
        // add yacybot user agent
        reqHeaders.put(HeaderFramework.USER_AGENT, ClientIdentification.getUserAgent());
-        
+
        // adding referer
        reqHeaders.put(RequestHeader.REFERER, (MultiProtocolURI.newURL(robotsURL,"/")).toNormalform(true, true));
-        
+
        if (entry != null) {
            oldEtag = entry.getETag();
            reqHeaders = new RequestHeader();
            final Date modDate = entry.getModDate();
            if (modDate != null) reqHeaders.put(RequestHeader.IF_MODIFIED_SINCE, HeaderFramework.formatRFC1123(entry.getModDate()));
-            
+
        }
-        
+
        // setup http-client
        //TODO: adding Traffic statistic for robots download?
        final HTTPClient client = new HTTPClient();
@@ -304,7 +304,7 @@ public class RobotsTxt {
         try {
             // check for interruption
             if (Thread.currentThread().isInterrupted()) throw new InterruptedException("Shutdown in progress.");
-            
+
             // sending the get request
             robotsTxt = client.GETbytes(robotsURL);
             // statistics:
@@ -313,7 +313,7 @@ public class RobotsTxt {
             }
             final int code = client.getHttpResponse().getStatusLine().getStatusCode();
             final ResponseHeader header = new ResponseHeader(client.getHttpResponse().getAllHeaders());
-            
+
             // check the response status
             if (code > 199 && code < 300) {
                 if (!header.mime().startsWith("text/plain")) {
@@ -324,15 +324,15 @@ public class RobotsTxt {
                     // getting some metadata
                     eTag = header.containsKey(HeaderFramework.ETAG)?(header.get(HeaderFramework.ETAG)).trim():null;
                     lastMod = header.lastModified();
-                    
+
                     // if the robots.txt file was not changed we break here
                     if ((eTag != null) && (oldEtag != null) && (eTag.equals(oldEtag))) {
                         if (log.isDebugEnabled()) log.debug("Robots.txt from URL '" + robotsURL + "' was not modified. Abort downloading of new version.");
                         return null;
                     }
-                    
-                    
-                    downloadEnd = System.currentTimeMillis();                    
+
+
+                    downloadEnd = System.currentTimeMillis();
                     if (log.isDebugEnabled()) log.debug("Robots.txt successfully loaded from URL '" + robotsURL + "' in " + (downloadEnd-downloadStart) + " ms.");
                 }
             } else if (code == 304) {
@@ -343,16 +343,16 @@ public class RobotsTxt {
                 if (redirectionUrlString==null) {
                     if (log.isDebugEnabled()) log.debug("robots.txt could not be downloaded from URL '" + robotsURL + "' because of missing redirecton header. [" + client.getHttpResponse().getStatusLine() + "].");
-                    robotsTxt = null;                    
+                    robotsTxt = null;
                 } else {
-                
+
                     redirectionUrlString = redirectionUrlString.trim();
-                
+
                     // generating the new URL object
-                    final MultiProtocolURI redirectionUrl = MultiProtocolURI.newURL(robotsURL, redirectionUrlString);      
-                
+                    final MultiProtocolURI redirectionUrl = MultiProtocolURI.newURL(robotsURL, redirectionUrlString);
+
                     // following the redirection
-                    if (log.isDebugEnabled()) log.debug("Redirection detected for robots.txt with URL '" + robotsURL + "'." + 
+                    if (log.isDebugEnabled()) log.debug("Redirection detected for robots.txt with URL '" + robotsURL + "'." +
                             "\nRedirecting request to: " + redirectionUrl);
                     return downloadRobotsTxt(redirectionUrl,redirectionCount,entry);
                 }
@@ -363,7 +363,7 @@ public class RobotsTxt {
                 if (log.isDebugEnabled()) log.debug("robots.txt could not be downloaded from URL '" + robotsURL + "'. [" + client.getHttpResponse().getStatusLine() + "].");
                 robotsTxt = null;
-            }            
+            }
         } catch (final Exception e) {
             throw e;
         }
diff --git a/source/net/yacy/kelondro/util/FileUtils.java b/source/net/yacy/kelondro/util/FileUtils.java
index 0704a0bae..476674f9d 100644
--- a/source/net/yacy/kelondro/util/FileUtils.java
+++ b/source/net/yacy/kelondro/util/FileUtils.java
@@ -535,6 +535,7 @@ public final class FileUtils {
                 key = escaped_backslash.matcher(key).replaceAll("\\");
                 String value = escaped_newline.matcher(line.substring(pos + 1).trim()).replaceAll("\n");
                 value = value.replace("\\\\", "\\"); // does not work: escaped_backslashbackslash.matcher(value).replaceAll("\\");
+                //System.out.println("key = " + key + ", value = " + value);
                 props.put(key, value);
             }
         }
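Reviewer note, not part of the patch: the concurrency pattern this cleanup touches is the per-host download lock in getEntry(). Each host:port string maps to one DomSync object, threads synchronize on that object, and the robots table is re-checked after the lock is acquired so that waiters reuse the entry the first thread just stored instead of downloading again. The stand-alone Java sketch below restates that idiom with invented names (PerHostFetchSketch, CachedRobots, download()); it is a sketch under those assumptions, not the YaCy implementation. One deliberate difference: it uses ConcurrentHashMap.putIfAbsent(), which closes the small window left in the patched code where two threads that both miss the syncObjects lookup can install different DomSync instances and end up locking on different objects.

import java.util.concurrent.ConcurrentHashMap;

// Hypothetical stand-alone sketch of the per-host "download once" idiom used in
// RobotsTxt.getEntry(); class and method names are invented for illustration.
public class PerHostFetchSketch {

    // 7 days, the freshness window used by the patched getEntry()
    private static final long MAX_AGE_MILLIS = 7L * 24 * 60 * 60 * 1000;

    private final ConcurrentHashMap<String, Object> locks =
            new ConcurrentHashMap<String, Object>();
    private final ConcurrentHashMap<String, CachedRobots> cache =
            new ConcurrentHashMap<String, CachedRobots>();

    // stand-in for RobotsTxtEntry: a load timestamp plus the fetched body
    static final class CachedRobots {
        final long loadedTime;
        final String body;
        CachedRobots(final long loadedTime, final String body) {
            this.loadedTime = loadedTime;
            this.body = body;
        }
        boolean isFresh() {
            return System.currentTimeMillis() - this.loadedTime <= MAX_AGE_MILLIS;
        }
    }

    public CachedRobots get(final String hostPort) {
        CachedRobots entry = this.cache.get(hostPort);
        if (entry != null && entry.isFresh()) return entry;

        // one lock object per host:port, so fetches for different hosts never
        // block each other; putIfAbsent guarantees that concurrent callers for
        // the same host always synchronize on the same object
        Object lock = this.locks.get(hostPort);
        if (lock == null) {
            final Object fresh = new Object();
            lock = this.locks.putIfAbsent(hostPort, fresh);
            if (lock == null) lock = fresh;
        }

        synchronized (lock) {
            // re-check: another thread may have finished the download while we waited
            entry = this.cache.get(hostPort);
            if (entry != null && entry.isFresh()) return entry;

            entry = new CachedRobots(System.currentTimeMillis(), download(hostPort));
            this.cache.put(hostPort, entry);
            return entry;
        }
    }

    // placeholder; the real code issues a conditional HTTP GET (If-Modified-Since / ETag)
    private String download(final String hostPort) {
        return "";
    }
}

The double lookup inside the synchronized block mirrors the second robotsTable.get(...) in the patched getEntry(): without it, every thread that waited on the lock would repeat the download the first thread already completed.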