diff --git a/source/net/yacy/crawler/robots/RobotsTxt.java b/source/net/yacy/crawler/robots/RobotsTxt.java
index 1070100ad..dce3ab687 100644
--- a/source/net/yacy/crawler/robots/RobotsTxt.java
+++ b/source/net/yacy/crawler/robots/RobotsTxt.java
@@ -132,9 +132,7 @@ public class RobotsTxt {
 
         // we can now synchronize for each host separately
         synchronized (syncObj) {
 
-            // if we have not found any data or the data is older than 7 days, we need to load it from the remote server
-
             // check the robots table again for all threads that come here because they waited for another one
             // to complete a download
             try {
@@ -156,7 +154,7 @@ public class RobotsTxt {
             // generating the proper url to download the robots txt
             DigestURI robotsURL = null;
             try {
-                robotsURL = new DigestURI("http://" + urlHostPort + "/robots.txt");
+                robotsURL = new DigestURI((urlHostPort.endsWith(":443") ? "https://" : "http://") + urlHostPort + "/robots.txt");
             } catch (final MalformedURLException e) {
                 log.fatal("Unable to generate robots.txt URL for host:port '" + urlHostPort + "'.", e);
                 robotsURL = null;
             }
@@ -175,58 +173,9 @@ public class RobotsTxt {
                 }
 
                 if (response == null) {
-                    // no robots.txt available, make an entry to prevent that the robots loading is done twice
-                    if (robotsTxt4Host == null) {
-                        // generate artificial entry
-                        robotsTxt4Host = new RobotsTxtEntry(
-                                robotsURL,
-                                new ArrayList<String>(),
-                                new ArrayList<String>(),
-                                new Date(),
-                                new Date(),
-                                null,
-                                null,
-                                Integer.valueOf(0),
-                                null);
-                    } else {
-                        robotsTxt4Host.setLoadedDate(new Date());
-                    }
-
-                    // store the data into the robots DB
-                    final int sz = robotsTable.size();
-                    addEntry(robotsTxt4Host);
-                    if (robotsTable.size() <= sz) {
-                        log.fatal("new entry in robots.txt table failed, resetting database");
-                        try {clear();} catch (IOException e) {}
-                        addEntry(robotsTxt4Host);
-                    }
+                    processOldEntry(robotsTxt4Host, robotsURL, robotsTable);
                 } else {
-                    final byte[] robotsTxt = response.getContent();
-                    //Log.logInfo("RobotsTxt", "robots of " + robotsURL.toNormalform(true, true) + ":\n" + ((robotsTxt == null) ? "null" : UTF8.String(robotsTxt))); // debug TODO remove
-                    RobotsTxtParser parserResult;
-                    ArrayList<String> denyPath;
-                    if (response.getResponseHeader().getStatusCode() == 401 || response.getResponseHeader().getStatusCode() == 403) {
-                        parserResult = new RobotsTxtParser(thisAgents);
-                        // create virtual deny path
-                        denyPath = new ArrayList<String>();
-                        denyPath.add("/");
-                    } else {
-                        parserResult = new RobotsTxtParser(thisAgents, robotsTxt);
-                        denyPath = parserResult.denyList();
-                    }
-
-                    // store the data into the robots DB
-                    String etag = response.getResponseHeader().containsKey(HeaderFramework.ETAG) ? (response.getResponseHeader().get(HeaderFramework.ETAG)).trim() : null;
-                    robotsTxt4Host = addEntry(
-                            robotsURL,
-                            parserResult.allowList(),
-                            denyPath,
-                            new Date(),
-                            response.getResponseHeader().lastModified(),
-                            etag,
-                            parserResult.sitemap(),
-                            parserResult.crawlDelayMillis(),
-                            parserResult.agentName());
+                    processNewEntry(robotsURL, response, thisAgents);
                 }
             }
         }
@@ -246,118 +195,107 @@ public class RobotsTxt {
             return;
         }
         if (robotsTable == null || robotsTable.containsKey(robotsTable.encodedKey(urlHostPort))) return;
+        Thread t = new Thread() {
+            public void run(){
+                // make or get a synchronization object
+                DomSync syncObj = RobotsTxt.this.syncObjects.get(urlHostPort);
+                if (syncObj == null) {
+                    syncObj = new DomSync();
+                    RobotsTxt.this.syncObjects.put(urlHostPort, syncObj);
+                }
+                // we can now synchronize for each host separately
+                synchronized (syncObj) {
+                    if (robotsTable.containsKey(robotsTable.encodedKey(urlHostPort))) return;
-        if (concurrent)
-            new Thread() {public void run(){ensureExist(urlHostPort, robotsTable, thisAgents);}}.start();
-        else
-            ensureExist(urlHostPort, robotsTable, thisAgents);
-    }
-
-    private void ensureExist(final String urlHostPort, BEncodedHeap robotsTable, final Set<String> thisAgents) {
-
-        // make or get a synchronization object
-        DomSync syncObj = RobotsTxt.this.syncObjects.get(urlHostPort);
-        if (syncObj == null) {
-            syncObj = new DomSync();
-            RobotsTxt.this.syncObjects.put(urlHostPort, syncObj);
-        }
-        // we can now synchronize for each host separately
-        synchronized (syncObj) {
-            if (robotsTable.containsKey(robotsTable.encodedKey(urlHostPort))) return;
+                    // generating the proper url to download the robots txt
+                    DigestURI robotsURL = null;
+                    try {
+                        robotsURL = new DigestURI((urlHostPort.endsWith(":443") ? "https://" : "http://") + urlHostPort + "/robots.txt");
+                    } catch (final MalformedURLException e) {
+                        log.fatal("Unable to generate robots.txt URL for host:port '" + urlHostPort + "'.", e);
+                        robotsURL = null;
+                    }
-            // generating the proper url to download the robots txt
-            DigestURI robotsURL = null;
-            try {
-                robotsURL = new DigestURI("http://" + urlHostPort + "/robots.txt");
-            } catch (final MalformedURLException e) {
-                log.fatal("Unable to generate robots.txt URL for host:port '" + urlHostPort + "'.", e);
-                robotsURL = null;
-            }
+                    Response response = null;
+                    if (robotsURL != null) {
+                        if (log.isDebugEnabled()) log.debug("Trying to download the robots.txt file from URL '" + robotsURL + "'.");
+                        Request request = new Request(robotsURL, null);
+                        try {
+                            response = RobotsTxt.this.loader.load(request, CacheStrategy.NOCACHE, null, 0);
+                        } catch (IOException e) {
+                            response = null;
+                        }
+                    }
-            Response response = null;
-            if (robotsURL != null) {
-                if (log.isDebugEnabled()) log.debug("Trying to download the robots.txt file from URL '" + robotsURL + "'.");
-                Request request = new Request(robotsURL, null);
-                try {
-                    response = RobotsTxt.this.loader.load(request, CacheStrategy.NOCACHE, null, 0);
-                } catch (IOException e) {
-                    response = null;
+                    if (response == null) {
+                        processOldEntry(null, robotsURL, robotsTable);
+                    } else {
+                        processNewEntry(robotsURL, response, thisAgents);
+                    }
                 }
             }
+        };
+        if (concurrent) t.start(); else t.run();
+    }
 
-            RobotsTxtEntry robotsTxt4Host = null;
-            if (response == null) {
-                // no robots.txt available, make an entry to prevent that the robots loading is done twice
-                // generate artificial entry
-                robotsTxt4Host = new RobotsTxtEntry(
-                        robotsURL,
-                        new ArrayList<String>(),
-                        new ArrayList<String>(),
-                        new Date(),
-                        new Date(),
-                        null,
-                        null,
-                        Integer.valueOf(0),
-                        null);
-
-                // store the data into the robots DB
-                final int sz = robotsTable.size();
-                addEntry(robotsTxt4Host);
-                if (robotsTable.size() <= sz) {
-                    log.fatal("new entry in robots.txt table failed, resetting database");
-                    try {clear();} catch (IOException e) {}
-                    addEntry(robotsTxt4Host);
-                }
-            } else {
-                final byte[] robotsTxt = response.getContent();
-                //Log.logInfo("RobotsTxt", "robots of " + robotsURL.toNormalform(true, true) + ":\n" + ((robotsTxt == null) ? "null" : UTF8.String(robotsTxt))); // debug TODO remove
-                RobotsTxtParser parserResult;
-                ArrayList<String> denyPath;
-                if (response.getResponseHeader().getStatusCode() == 401 || response.getResponseHeader().getStatusCode() == 403) {
-                    parserResult = new RobotsTxtParser(thisAgents);
-                    // create virtual deny path
-                    denyPath = new ArrayList<String>();
-                    denyPath.add("/");
-                } else {
-                    parserResult = new RobotsTxtParser(thisAgents, robotsTxt);
-                    denyPath = parserResult.denyList();
-                }
+    private void processOldEntry(RobotsTxtEntry robotsTxt4Host, DigestURI robotsURL, BEncodedHeap robotsTable) {
+        // no robots.txt available, make an entry to prevent that the robots loading is done twice
+        if (robotsTxt4Host == null) {
+            // generate artificial entry
+            robotsTxt4Host = new RobotsTxtEntry(
+                    robotsURL,
+                    new ArrayList<String>(),
+                    new ArrayList<String>(),
+                    new Date(),
+                    new Date(),
+                    null,
+                    null,
+                    Integer.valueOf(0),
+                    null);
+        } else {
+            robotsTxt4Host.setLoadedDate(new Date());
+        }
-
-                // store the data into the robots DB
-                String etag = response.getResponseHeader().containsKey(HeaderFramework.ETAG) ? (response.getResponseHeader().get(HeaderFramework.ETAG)).trim() : null;
-                robotsTxt4Host = addEntry(
-                        robotsURL,
-                        parserResult.allowList(),
-                        denyPath,
-                        new Date(),
-                        response.getResponseHeader().lastModified(),
-                        etag,
-                        parserResult.sitemap(),
-                        parserResult.crawlDelayMillis(),
-                        parserResult.agentName());
-            }
+        // store the data into the robots DB
+        final int sz = robotsTable.size();
+        addEntry(robotsTxt4Host);
+        if (robotsTable.size() <= sz) {
+            log.fatal("new entry in robots.txt table failed, resetting database");
+            try {clear();} catch (IOException e) {}
+            addEntry(robotsTxt4Host);
         }
     }
 
-    private RobotsTxtEntry addEntry(
-            final MultiProtocolURI theURL,
-            final ArrayList<String> allowPathList,
-            final ArrayList<String> denyPathList,
-            final Date loadedDate,
-            final Date modDate,
-            final String eTag,
-            final String sitemap,
-            final long crawlDelayMillis,
-            final String agentName
-            ) {
-        final RobotsTxtEntry entry = new RobotsTxtEntry(
-                theURL, allowPathList, denyPathList,
-                loadedDate, modDate,
-                eTag, sitemap, crawlDelayMillis, agentName);
-        addEntry(entry);
-        return entry;
-    }
+    private void processNewEntry(DigestURI robotsURL, Response response, final Set<String> thisAgents) {
+        final byte[] robotsTxt = response.getContent();
+        //Log.logInfo("RobotsTxt", "robots of " + robotsURL.toNormalform(true, true) + ":\n" + ((robotsTxt == null) ? "null" : UTF8.String(robotsTxt))); // debug TODO remove
+        RobotsTxtParser parserResult;
+        ArrayList<String> denyPath;
+        if (response.getResponseHeader().getStatusCode() == 401 || response.getResponseHeader().getStatusCode() == 403) {
+            parserResult = new RobotsTxtParser(thisAgents);
+            // create virtual deny path
+            denyPath = new ArrayList<String>();
+            denyPath.add("/");
+        } else {
+            parserResult = new RobotsTxtParser(thisAgents, robotsTxt);
+            denyPath = parserResult.denyList();
+        }
+        // store the data into the robots DB
+        String etag = response.getResponseHeader().containsKey(HeaderFramework.ETAG) ? (response.getResponseHeader().get(HeaderFramework.ETAG)).trim() : null;
+        final RobotsTxtEntry robotsTxt4Host = new RobotsTxtEntry(
+                robotsURL,
+                parserResult.allowList(),
+                denyPath,
+                new Date(),
+                response.getResponseHeader().lastModified(),
+                etag,
+                parserResult.sitemap(),
+                parserResult.crawlDelayMillis(),
+                parserResult.agentName());
+        addEntry(robotsTxt4Host);
+    }
+
     private String addEntry(final RobotsTxtEntry entry) {
         // writes a new page and returns key
         try {
@@ -371,25 +309,21 @@ public class RobotsTxt {
 
     static final String getHostPort(final MultiProtocolURI theURL) {
-        final int port = getPort(theURL);
-        String host = theURL.getHost();
-        if (host == null) return null;
-        StringBuilder sb = new StringBuilder(host.length() + 6);
-        sb.append(host).append(':').append(Integer.toString(port));
-        return sb.toString();
-    }
-
-    private static final int getPort(final MultiProtocolURI theURL) {
         int port = theURL.getPort();
         if (port == -1) {
             if (theURL.getProtocol().equalsIgnoreCase("http")) {
                 port = 80;
            } else if (theURL.getProtocol().equalsIgnoreCase("https")) {
                port = 443;
+            } else {
+                port = 80;
            }
-        }
-        return port;
+        }
+        String host = theURL.getHost();
+        if (host == null) return null;
+        StringBuilder sb = new StringBuilder(host.length() + 6);
+        sb.append(host).append(':').append(Integer.toString(port));
+        return sb.toString();
     }
 }
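
Editor's note: the sketch below is not part of the patch and uses illustrative names (RobotsUrlSketch, hostPort, robotsUrl) that do not exist in YaCy. It restates the two rules the patch introduces: getHostPort() normalizes every URL to a "host:port" key, defaulting the port to 80 for http and unknown schemes and to 443 for https, and the robots.txt fetch then uses https exactly when that key ends in ":443".

    import java.net.URI;

    public class RobotsUrlSketch {

        // Rule 1 (mirrors getHostPort): normalize a URL to "host:port",
        // defaulting the port to 443 for https and to 80 for http and
        // for any other scheme.
        static String hostPort(URI url) {
            String host = url.getHost();
            if (host == null) return null;
            int port = url.getPort();
            if (port == -1) {
                port = "https".equalsIgnoreCase(url.getScheme()) ? 443 : 80;
            }
            return host + ":" + port;
        }

        // Rule 2 (mirrors the patched DigestURI construction): fetch the
        // robots.txt over https exactly when the key ends in ":443",
        // otherwise over http.
        static String robotsUrl(String urlHostPort) {
            String scheme = urlHostPort.endsWith(":443") ? "https://" : "http://";
            return scheme + urlHostPort + "/robots.txt";
        }

        public static void main(String[] args) {
            // prints http://example.org:80/robots.txt
            System.out.println(robotsUrl(hostPort(URI.create("http://example.org/page"))));
            // prints https://example.org:443/robots.txt
            System.out.println(robotsUrl(hostPort(URI.create("https://example.org/page"))));
        }
    }

One consequence of encoding the scheme in the port suffix is that the robots table keeps its existing "host:port" keys unchanged while https is still recovered at fetch time; an https server on a non-standard port, however, would still have its robots.txt requested over http under this rule.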