From c25d7bcb80bd25007340c566831a3dbd8b856fb9 Mon Sep 17 00:00:00 2001
From: Michael Peter Christen
Date: Mon, 29 Oct 2012 21:08:45 +0100
Subject: [PATCH] - added concurrency for robots.txt loading
 - changed data model for domain counter

---
 htroot/CrawlProfileEditor_p.java              |   3 -
 source/net/yacy/crawler/Balancer.java         |  10 +-
 source/net/yacy/crawler/CrawlStacker.java     |  29 ++--
 .../net/yacy/crawler/data/CrawlProfile.java   | 127 ++++++------------
 source/net/yacy/crawler/data/NoticedURL.java  |  19 +--
 source/net/yacy/crawler/robots/RobotsTxt.java | 103 ++++++++++++++
 source/net/yacy/search/Switchboard.java       |   6 +-
 7 files changed, 172 insertions(+), 125 deletions(-)
diff --git a/htroot/CrawlProfileEditor_p.java b/htroot/CrawlProfileEditor_p.java
index 3a5630921..7ab881a48 100644
--- a/htroot/CrawlProfileEditor_p.java
+++ b/htroot/CrawlProfileEditor_p.java
@@ -75,9 +75,6 @@ public class CrawlProfileEditor_p {
         labels.add(new eentry(CrawlProfile.INDEX_MEDIA, "Index Media", false, eentry.BOOLEAN));
         labels.add(new eentry(CrawlProfile.STORE_HTCACHE, "Store in HTCache", false, eentry.BOOLEAN));
         labels.add(new eentry(CrawlProfile.REMOTE_INDEXING, "Remote Indexing", false, eentry.BOOLEAN));
-        labels.add(new eentry(CrawlProfile.XSSTOPW, "Static stop-words", false, eentry.BOOLEAN));
-        labels.add(new eentry(CrawlProfile.XDSTOPW, "Dynamic stop-words", false, eentry.BOOLEAN));
-        labels.add(new eentry(CrawlProfile.XPSTOPW, "Parent stop-words", false, eentry.BOOLEAN));
     }

     public static serverObjects respond(
diff --git a/source/net/yacy/crawler/Balancer.java b/source/net/yacy/crawler/Balancer.java
index be3ea0f28..4926d3ffa 100644
--- a/source/net/yacy/crawler/Balancer.java
+++ b/source/net/yacy/crawler/Balancer.java
@@ -256,7 +256,7 @@ public class Balancer {
      * @throws IOException
      * @throws SpaceExceededException
      */
-    public String push(final Request entry) throws IOException, SpaceExceededException {
+    public String push(final Request entry, final RobotsTxt robots) throws IOException, SpaceExceededException {
         assert entry != null;
         final byte[] hash = entry.url().hash();
         synchronized (this) {
@@ -275,8 +275,9 @@ public class Balancer {

             // add the hash to a queue
             pushHashToDomainStacks(entry.url().getHost(), entry.url().hash());
-            return null;
         }
+        robots.ensureExist(entry.url(), Balancer.this.myAgentIDs, true); // concurrently load all robots.txt
+        return null;
     }

     /**
@@ -319,8 +320,7 @@ public class Balancer {
      * @param crawlURL
      * @return
      */
-    private long getRobotsTime(final RobotsTxt robots, final CrawlProfile profileEntry, final DigestURI crawlURL) {
-        if (profileEntry == null) return 0;
+    private long getRobotsTime(final RobotsTxt robots, final DigestURI crawlURL) {
         long sleeptime = Latency.waitingRobots(crawlURL, robots, this.myAgentIDs); // this uses the robots.txt database and may cause a loading of robots.txt from the server
         return sleeptime < 0 ? 0 : sleeptime;
     }
@@ -450,7 +450,7 @@ public class Balancer {
             }
             if (crawlEntry == null) return null;

-            long robotsTime = getRobotsTime(robots, profileEntry, crawlEntry.url());
+            long robotsTime = getRobotsTime(robots, crawlEntry.url());
             Latency.updateAfterSelection(crawlEntry.url(), profileEntry == null ? 0 : robotsTime);
             if (delay && sleeptime > 0) {
                 // force a busy waiting here
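Note on the Balancer change above: `return null` moves out of the `synchronized` block, and the new `robots.ensureExist(..., true)` call runs after the lock is released, so the robots.txt of a freshly stacked host is fetched in the background at push time instead of stalling the crawler at pop time. A minimal, self-contained sketch of that pattern, assuming nothing beyond the JDK; the names (`UrlQueue`, `ensureRobotsExist`, `fetchRobots`) are illustrative stand-ins, not YaCy API:

```java
import java.util.ArrayDeque;
import java.util.Queue;
import java.util.concurrent.ConcurrentHashMap;

public class UrlQueue {
    private final Queue<String> queue = new ArrayDeque<String>();
    // hosts whose robots.txt is already known (stand-in for the robots table)
    private final ConcurrentHashMap<String, Boolean> robotsKnown =
            new ConcurrentHashMap<String, Boolean>();

    public String push(final String url, final String hostPort) {
        synchronized (this) {
            if (this.queue.contains(url)) return "double occurrence";
            this.queue.add(url);
        } // lock released before any network work starts
        ensureRobotsExist(hostPort, true); // concurrently load robots.txt
        return null;
    }

    private void ensureRobotsExist(final String hostPort, final boolean concurrent) {
        if (this.robotsKnown.containsKey(hostPort)) return; // cheap fast path
        if (concurrent) {
            // Java 6 style anonymous thread, matching the idiom used in the patch
            new Thread() {
                @Override
                public void run() { fetchRobots(hostPort); }
            }.start();
        } else {
            fetchRobots(hostPort);
        }
    }

    private void fetchRobots(final String hostPort) {
        // stand-in for downloading http://<hostPort>/robots.txt
        this.robotsKnown.putIfAbsent(hostPort, Boolean.TRUE);
    }
}
```

The important property is that the slow network fetch never runs while the queue lock is held; by the time the URL is popped, the robots.txt lookup usually finds the entry already cached.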
diff --git a/source/net/yacy/crawler/CrawlStacker.java b/source/net/yacy/crawler/CrawlStacker.java
index 1ffae79a9..2a918bfd8 100644
--- a/source/net/yacy/crawler/CrawlStacker.java
+++ b/source/net/yacy/crawler/CrawlStacker.java
@@ -37,6 +37,7 @@ import java.util.Locale;
 import java.util.Map;
 import java.util.Properties;
 import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.atomic.AtomicInteger;

 import net.yacy.cora.document.ASCII;
 import net.yacy.cora.document.Classification.ContentDomain;
@@ -50,13 +51,13 @@ import net.yacy.crawler.data.CrawlQueues;
 import net.yacy.crawler.data.NoticedURL;
 import net.yacy.crawler.data.ResultURLs;
 import net.yacy.crawler.data.ZURL;
-import net.yacy.crawler.data.CrawlProfile.DomProfile;
 import net.yacy.crawler.data.ResultURLs.EventOrigin;
 import net.yacy.crawler.data.ZURL.FailCategory;
 import net.yacy.crawler.retrieval.FTPLoader;
 import net.yacy.crawler.retrieval.HTTPLoader;
 import net.yacy.crawler.retrieval.Request;
 import net.yacy.crawler.retrieval.SMBLoader;
+import net.yacy.crawler.robots.RobotsTxt;
 import net.yacy.interaction.contentcontrol.ContentControlFilterUpdateThread;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.data.meta.URIMetadataNode;
@@ -75,7 +76,7 @@ public final class CrawlStacker {

     private final Log log = new Log("STACKCRAWL");
-
+    private final RobotsTxt robots;
     private final WorkflowProcessor<Request> fastQueue, slowQueue;
     private final CrawlQueues nextQueue;
     private final CrawlSwitchboard crawler;
@@ -87,6 +88,7 @@ public final class CrawlStacker {

     // this is the process that checks url for double-occurrences and for allowance/disallowance by robots.txt
     public CrawlStacker(
+            final RobotsTxt robots,
             final CrawlQueues cq,
             final CrawlSwitchboard cs,
             final Segment indexSegment,
@@ -94,6 +96,7 @@ public final class CrawlStacker {
             final boolean acceptLocalURLs,
             final boolean acceptGlobalURLs,
             final FilterEngine domainList) {
+        this.robots = robots;
         this.nextQueue = cq;
         this.crawler = cs;
         this.indexSegment = indexSegment;
@@ -366,32 +369,30 @@ public final class CrawlStacker {
             entry.url().getContentDomain() == ContentDomain.AUDIO ||
             entry.url().getContentDomain() == ContentDomain.VIDEO ||
             entry.url().getContentDomain() == ContentDomain.CTRL) {
-            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.NOLOAD, entry);
+            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.NOLOAD, entry, this.robots);
             //if (warning != null) this.log.logWarning("CrawlStacker.stackCrawl of URL " + entry.url().toNormalform(true, false) + " - not pushed: " + warning);
             return null;
         }
-
-        final DigestURI referrerURL = (entry.referrerhash() == null || entry.referrerhash().length == 0) ? null : this.nextQueue.getURL(entry.referrerhash());
-
+
         // add domain to profile domain list
         if (profile.domMaxPages() != Integer.MAX_VALUE && profile.domMaxPages() > 0) {
-            profile.domInc(entry.url().getHost(), (referrerURL == null) ? null : referrerURL.getHost().toLowerCase(), entry.depth());
+            profile.domInc(entry.url().getHost());
         }

         if (global) {
             // it may be possible that global == true and local == true, so do not check an error case against it
             if (proxy) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, proxy = true, initiator = proxy" + ", profile.handle = " + profile.handle());
             if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, remote = true, initiator = " + ASCII.String(entry.initiator()) + ", profile.handle = " + profile.handle());
-            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.GLOBAL, entry);
+            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.GLOBAL, entry, this.robots);
         } else if (local) {
             if (proxy) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, proxy = true, initiator = proxy" + ", profile.handle = " + profile.handle());
             if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, remote = true, initiator = " + ASCII.String(entry.initiator()) + ", profile.handle = " + profile.handle());
-            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.LOCAL, entry);
+            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.LOCAL, entry, this.robots);
         } else if (proxy) {
             if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: proxy = true, remote = true, initiator = " + ASCII.String(entry.initiator()) + ", profile.handle = " + profile.handle());
-            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.LOCAL, entry);
+            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.LOCAL, entry, this.robots);
         } else if (remote) {
-            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.REMOTE, entry);
+            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.REMOTE, entry, this.robots);
         }
         if (warning != null) this.log.logWarning("CrawlStacker.stackCrawl of URL " + entry.url().toNormalform(true) + " - not pushed: " + warning);
@@ -479,13 +480,13 @@ public final class CrawlStacker {
         // deny urls that exceed allowed number of occurrences
         final int maxAllowedPagesPerDomain = profile.domMaxPages();
         if (maxAllowedPagesPerDomain < Integer.MAX_VALUE && maxAllowedPagesPerDomain > 0) {
-            final DomProfile dp = profile.getDom(url.getHost());
-            if (dp != null && dp.count >= maxAllowedPagesPerDomain) {
+            final AtomicInteger dp = profile.getCount(url.getHost());
+            if (dp != null && dp.get() >= maxAllowedPagesPerDomain) {
                 if (this.log.isFine()) this.log.logFine("URL '" + urlstring + "' appeared too often in crawl stack, a maximum of " + profile.domMaxPages() + " is allowed.");
                 return "crawl stack domain counter exceeded";
             }
-            if (ResultURLs.domainCount(EventOrigin.LOCAL_CRAWLING, url.getHost()) >= profile.domMaxPages()) {
+            if (ResultURLs.domainCount(EventOrigin.LOCAL_CRAWLING, url.getHost()) >= maxAllowedPagesPerDomain) {
                 if (this.log.isFine()) this.log.logFine("URL '" + urlstring + "' appeared too often in result stack, a maximum of " + profile.domMaxPages() + " is allowed.");
                 return "result stack domain counter exceeded";
             }
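The acceptance check above now reads a plain `AtomicInteger` from the profile instead of the removed `DomProfile` record, and the second comparison is tightened to reuse the cached `maxAllowedPagesPerDomain` rather than re-reading `profile.domMaxPages()`. A minimal sketch of the consumer side of this check; `DomainQuota` and its methods are illustrative stand-ins, not YaCy API:

```java
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;

public class DomainQuota {
    private final ConcurrentHashMap<String, AtomicInteger> doms =
            new ConcurrentHashMap<String, AtomicInteger>();
    private final int domMaxPages; // Integer.MAX_VALUE or <= 0 means "no limit"

    public DomainQuota(final int domMaxPages) {
        this.domMaxPages = domMaxPages;
    }

    /** @return null if the host may be stacked, otherwise a rejection reason */
    public String checkAcceptance(final String host) {
        if (this.domMaxPages < Integer.MAX_VALUE && this.domMaxPages > 0) {
            final AtomicInteger dp = this.doms.get(host);
            if (dp != null && dp.get() >= this.domMaxPages) {
                return "crawl stack domain counter exceeded";
            }
        }
        return null; // accepted
    }
}
```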
diff --git a/source/net/yacy/crawler/data/CrawlProfile.java b/source/net/yacy/crawler/data/CrawlProfile.java
index 2f6aa3d92..6b8ade6be 100644
--- a/source/net/yacy/crawler/data/CrawlProfile.java
+++ b/source/net/yacy/crawler/data/CrawlProfile.java
@@ -31,6 +31,7 @@ import java.util.Iterator;
 import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.atomic.AtomicInteger;
 import java.util.regex.Pattern;
 import java.util.regex.PatternSyntaxException;

@@ -53,55 +54,34 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String>
     public static final Pattern MATCH_NEVER_PATTERN = Pattern.compile(MATCH_NEVER_STRING);

     // this is a simple record structure that hold all properties of a single crawl start
-    public static final String HANDLE           = "handle";
-    public static final String NAME             = "name";
-    public static final String DEPTH            = "generalDepth";
-    public static final String DIRECT_DOC_BY_URL= "directDocByURL";
-    public static final String RECRAWL_IF_OLDER = "recrawlIfOlder";
-    public static final String DOM_MAX_PAGES    = "domMaxPages";
-    public static final String CRAWLING_Q       = "crawlingQ";
-    public static final String PUSH_SOLR        = "pushSolr";
-    public static final String INDEX_TEXT       = "indexText";
-    public static final String INDEX_MEDIA      = "indexMedia";
-    public static final String STORE_HTCACHE    = "storeHTCache";
-    public static final String REMOTE_INDEXING  = "remoteIndexing";
-    public static final String XSSTOPW          = "xsstopw";
-    public static final String XDSTOPW          = "xdstopw";
-    public static final String XPSTOPW          = "xpstopw";
-    public static final String CACHE_STRAGEGY   = "cacheStrategy";
-    public static final String CRAWLER_URL_MUSTMATCH = "crawlerURLMustMatch";
-    public static final String CRAWLER_URL_MUSTNOTMATCH = "crawlerURLMustNotMatch";
-    public static final String CRAWLER_IP_MUSTMATCH = "crawlerIPMustMatch";
-    public static final String CRAWLER_IP_MUSTNOTMATCH = "crawlerIPMustNotMatch";
-    public static final String CRAWLER_COUNTRY_MUSTMATCH = "crawlerCountryMustMatch";
-    public static final String CRAWLER_URL_NODEPTHLIMITMATCH = "crawlerNoLimitURLMustMatch";
-    public static final String INDEXING_URL_MUSTMATCH = "indexURLMustMatch";
-    public static final String INDEXING_URL_MUSTNOTMATCH = "indexURLMustNotMatch";
-    public static final String COLLECTIONS      = "collections";
+    private static final String HANDLE          = "handle";
+    public static final String NAME             = "name";
+    public static final String DEPTH            = "generalDepth";
+    private static final String DIRECT_DOC_BY_URL= "directDocByURL";
+    public static final String RECRAWL_IF_OLDER = "recrawlIfOlder";
+    public static final String DOM_MAX_PAGES    = "domMaxPages";
+    public static final String CRAWLING_Q       = "crawlingQ";
+    public static final String INDEX_TEXT       = "indexText";
+    public static final String INDEX_MEDIA      = "indexMedia";
+    public static final String STORE_HTCACHE    = "storeHTCache";
+    public static final String REMOTE_INDEXING  = "remoteIndexing";
+    private static final String CACHE_STRAGEGY  = "cacheStrategy";
+    public static final String CRAWLER_URL_MUSTMATCH = "crawlerURLMustMatch";
+    public static final String CRAWLER_URL_MUSTNOTMATCH = "crawlerURLMustNotMatch";
+    private static final String CRAWLER_IP_MUSTMATCH = "crawlerIPMustMatch";
+    private static final String CRAWLER_IP_MUSTNOTMATCH = "crawlerIPMustNotMatch";
+    private static final String CRAWLER_COUNTRY_MUSTMATCH = "crawlerCountryMustMatch";
+    private static final String CRAWLER_URL_NODEPTHLIMITMATCH = "crawlerNoLimitURLMustMatch";
+    private static final String INDEXING_URL_MUSTMATCH = "indexURLMustMatch";
+    private static final String INDEXING_URL_MUSTNOTMATCH = "indexURLMustNotMatch";
+    private static final String COLLECTIONS     = "collections";

     private Pattern crawlerurlmustmatch = null, crawlerurlmustnotmatch = null;
     private Pattern crawleripmustmatch = null, crawleripmustnotmatch = null;
     private Pattern crawlernodepthlimitmatch = null;
     private Pattern indexurlmustmatch = null, indexurlmustnotmatch = null;

-    public final static class DomProfile {
-
-        public String referrer;
-        public int depth, count;
-
-        public DomProfile(final String ref, final int d) {
-            this.referrer = ref;
-            this.depth = d;
-            this.count = 1;
-        }
-
-        public void inc() {
-            this.count++;
-        }
-
-    }
-
-    private final Map<String, DomProfile> doms;
+    private final Map<String, AtomicInteger> doms;

     /**
      * Constructor which creates CrawlPofile from parameters.
@@ -156,7 +136,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String>
             throw new NullPointerException("name must not be null or empty");
         }
         if (name.length() > 256) name = name.substring(256);
-        this.doms = new ConcurrentHashMap<String, DomProfile>();
+        this.doms = new ConcurrentHashMap<String, AtomicInteger>();
         final String handle = Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(name)).substring(0, Word.commonHashLength);
         put(HANDLE, handle);
         put(NAME, name);
@@ -177,9 +157,6 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String>
         put(INDEX_MEDIA, indexMedia);
         put(STORE_HTCACHE, storeHTCache);
         put(REMOTE_INDEXING, remoteIndexing);
-        put(XSSTOPW, xsstopw); // exclude static stop-words
-        put(XDSTOPW, xdstopw); // exclude dynamic stop-word
-        put(XPSTOPW, xpstopw); // exclude parent stop-words
         put(CACHE_STRAGEGY, cacheStrategy.toString());
         put(COLLECTIONS, collections.trim().replaceAll(" ", ""));
     }
@@ -191,25 +168,25 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String>
     public CrawlProfile(final Map<String, String> ext) {
         super(ext == null ? 1 : ext.size());
         if (ext != null) putAll(ext);
-        this.doms = new ConcurrentHashMap<String, DomProfile>();
+        this.doms = new ConcurrentHashMap<String, AtomicInteger>();
     }

-    public void domInc(final String domain, final String referrer, final int depth) {
-        final DomProfile dp = this.doms.get(domain);
+    public void domInc(final String domain) {
+        final AtomicInteger dp = this.doms.get(domain);
         if (dp == null) {
             // new domain
-            this.doms.put(domain, new DomProfile(referrer, depth));
+            this.doms.put(domain, new AtomicInteger(1));
         } else {
             // increase counter
-            dp.inc();
+            dp.incrementAndGet();
         }
     }

-    public String domName(final boolean attr, final int index){
-        final Iterator<Map.Entry<String, DomProfile>> domnamesi = this.doms.entrySet().iterator();
+    private String domName(final boolean attr, final int index){
+        final Iterator<Map.Entry<String, AtomicInteger>> domnamesi = this.doms.entrySet().iterator();
         String domname="";
-        Map.Entry<String, DomProfile> ey;
-        DomProfile dp;
+        Map.Entry<String, AtomicInteger> ey;
+        AtomicInteger dp;
         int i = 0;
         while ((domnamesi.hasNext()) && (i < index)) {
             ey = domnamesi.next();
@@ -218,16 +195,12 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String>
         if (domnamesi.hasNext()) {
             ey = domnamesi.next();
             dp = ey.getValue();
-            domname = ey.getKey() + ((attr) ? ("/r=" + dp.referrer + ", d=" + dp.depth + ", c=" + dp.count) : " ");
+            domname = ey.getKey() + ((attr) ? ("/c=" + dp.get()) : " ");
         }
         return domname;
     }

-    public void clearDoms() {
-        this.doms.clear();
-    }
-
-    public DomProfile getDom(final String domain) {
+    public AtomicInteger getCount(final String domain) {
         return this.doms.get(domain);
     }
@@ -245,7 +218,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String>
      * @param key name of the parameter
      * @param value values if the parameter
      */
-    public final void put(final String key, final int value) {
+    private final void put(final String key, final int value) {
         super.put(key, Integer.toString(value));
     }

@@ -254,7 +227,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String>
      * @param key name of the parameter
      * @param value values if the parameter
      */
-    public final void put(final String key, final long value) {
+    private final void put(final String key, final long value) {
         super.put(key, Long.toString(value));
     }

@@ -476,12 +449,6 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String>
         return (r.equals(Boolean.TRUE.toString()));
     }

-    public boolean pushSolr() {
-        final String r = get(PUSH_SOLR);
-        if (r == null) return true;
-        return (r.equals(Boolean.TRUE.toString()));
-    }
-
     public boolean indexText() {
         final String r = get(INDEX_TEXT);
         if (r == null) return true;
@@ -505,24 +472,6 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String>
         return (r.equals(Boolean.TRUE.toString()));
     }

-    public boolean excludeStaticStopwords() {
-        final String r = get(XSSTOPW);
-        if (r == null) return false;
-        return (r.equals(Boolean.TRUE.toString()));
-    }
-
-    public boolean excludeDynamicStopwords() {
-        final String r = get(XDSTOPW);
-        if (r == null) return false;
-        return (r.equals(Boolean.TRUE.toString()));
-    }
-
-    public boolean excludeParentStopwords() {
-        final String r = get(XPSTOPW);
-        if (r == null) return false;
-        return (r.equals(Boolean.TRUE.toString()));
-    }
-
     public static long getRecrawlDate(final long oldTimeMinutes) {
         return System.currentTimeMillis() - (60000L * oldTimeMinutes);
     }
@@ -535,7 +484,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String>
         return new StringBuilder(host.length() + 20).append(protocol).append("://(www.)?").append(Pattern.quote(host)).append(".*").toString();
     }

-    public static String mustMatchSubpath(final MultiProtocolURI uri) {
+    private static String mustMatchSubpath(final MultiProtocolURI uri) {
         String u = uri.toNormalform(true);
         if (!u.endsWith("/")) {int p = u.lastIndexOf("/"); if (p > 0) u = u.substring(0, p + 1);}
         return new StringBuilder(u.length() + 5).append(Pattern.quote(u)).append(".*").toString();
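A remark on `domInc()` above: the get-then-put sequence is a check-then-act window, so two threads stacking the first two URLs of a host simultaneously can each install their own `AtomicInteger(1)`, losing one count. For a soft per-domain page limit that is acceptable; if exact counts mattered, a `putIfAbsent` variant would close the window. The sketch below is a suggested alternative under that assumption, not the patch's own code:

```java
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;

public class DomainCounter {
    private final ConcurrentHashMap<String, AtomicInteger> doms =
            new ConcurrentHashMap<String, AtomicInteger>();

    /** race-free variant of CrawlProfile.domInc (Java 6 compatible) */
    public void domInc(final String domain) {
        AtomicInteger dp = this.doms.get(domain);
        if (dp == null) {
            // publish a zero counter; if another thread won the race, use its counter
            dp = new AtomicInteger(0);
            final AtomicInteger prev = this.doms.putIfAbsent(domain, dp);
            if (prev != null) dp = prev;
        }
        dp.incrementAndGet(); // every call is counted exactly once
    }

    public AtomicInteger getCount(final String domain) {
        return this.doms.get(domain);
    }
}
```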
("/c=" + dp.get()) : " "); } return domname; } - public void clearDoms() { - this.doms.clear(); - } - - public DomProfile getDom(final String domain) { + public AtomicInteger getCount(final String domain) { return this.doms.get(domain); } @@ -245,7 +218,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M * @param key name of the parameter * @param value values if the parameter */ - public final void put(final String key, final int value) { + private final void put(final String key, final int value) { super.put(key, Integer.toString(value)); } @@ -254,7 +227,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M * @param key name of the parameter * @param value values if the parameter */ - public final void put(final String key, final long value) { + private final void put(final String key, final long value) { super.put(key, Long.toString(value)); } @@ -476,12 +449,6 @@ public class CrawlProfile extends ConcurrentHashMap implements M return (r.equals(Boolean.TRUE.toString())); } - public boolean pushSolr() { - final String r = get(PUSH_SOLR); - if (r == null) return true; - return (r.equals(Boolean.TRUE.toString())); - } - public boolean indexText() { final String r = get(INDEX_TEXT); if (r == null) return true; @@ -505,24 +472,6 @@ public class CrawlProfile extends ConcurrentHashMap implements M return (r.equals(Boolean.TRUE.toString())); } - public boolean excludeStaticStopwords() { - final String r = get(XSSTOPW); - if (r == null) return false; - return (r.equals(Boolean.TRUE.toString())); - } - - public boolean excludeDynamicStopwords() { - final String r = get(XDSTOPW); - if (r == null) return false; - return (r.equals(Boolean.TRUE.toString())); - } - - public boolean excludeParentStopwords() { - final String r = get(XPSTOPW); - if (r == null) return false; - return (r.equals(Boolean.TRUE.toString())); - } - public static long getRecrawlDate(final long oldTimeMinutes) { return System.currentTimeMillis() - (60000L * oldTimeMinutes); } @@ -535,7 +484,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M return new StringBuilder(host.length() + 20).append(protocol).append("://(www.)?").append(Pattern.quote(host)).append(".*").toString(); } - public static String mustMatchSubpath(final MultiProtocolURI uri) { + private static String mustMatchSubpath(final MultiProtocolURI uri) { String u = uri.toNormalform(true); if (!u.endsWith("/")) {int p = u.lastIndexOf("/"); if (p > 0) u = u.substring(0, p + 1);} return new StringBuilder(u.length() + 5).append(Pattern.quote(u)).append(".*").toString(); diff --git a/source/net/yacy/crawler/data/NoticedURL.java b/source/net/yacy/crawler/data/NoticedURL.java index 6052a4be0..b78315c11 100644 --- a/source/net/yacy/crawler/data/NoticedURL.java +++ b/source/net/yacy/crawler/data/NoticedURL.java @@ -173,19 +173,14 @@ public class NoticedURL { * @param entry * @return null if this was successful or a String explaining what went wrong in case of an error */ - public String push(final StackType stackType, final Request entry) { + public String push(final StackType stackType, final Request entry, final RobotsTxt robots) { try { switch (stackType) { - case LOCAL: - return this.coreStack.push(entry); - case GLOBAL: - return this.limitStack.push(entry); - case REMOTE: - return this.remoteStack.push(entry); - case NOLOAD: - return this.noloadStack.push(entry); - default: - return "stack type unknown"; + case LOCAL: return this.coreStack.push(entry, robots); + case GLOBAL: return this.limitStack.push(entry, robots); + 
diff --git a/source/net/yacy/crawler/robots/RobotsTxt.java b/source/net/yacy/crawler/robots/RobotsTxt.java
index b0817c2a3..aaeb188cd 100644
--- a/source/net/yacy/crawler/robots/RobotsTxt.java
+++ b/source/net/yacy/crawler/robots/RobotsTxt.java
@@ -234,6 +234,109 @@ public class RobotsTxt {
         return robotsTxt4Host;
     }

+    public void ensureExist(final MultiProtocolURI theURL, final Set<String> thisAgents, boolean concurrent) {
+        final String urlHostPort = getHostPort(theURL);
+        final BEncodedHeap robotsTable;
+        try {
+            robotsTable = this.tables.getHeap(WorkTables.TABLE_ROBOTS_NAME);
+        } catch (IOException e1) {
+            log.fatal("tables not available", e1);
+            return;
+        }
+        if (robotsTable == null || robotsTable.containsKey(robotsTable.encodedKey(urlHostPort))) return;
+
+        if (concurrent)
+            new Thread() {public void run(){ensureExist(urlHostPort, robotsTable, thisAgents);}}.start();
+        else
+            ensureExist(urlHostPort, robotsTable, thisAgents);
+    }
+
+    private void ensureExist(final String urlHostPort, BEncodedHeap robotsTable, final Set<String> thisAgents) {
+
+        // make or get a synchronization object
+        DomSync syncObj = RobotsTxt.this.syncObjects.get(urlHostPort);
+        if (syncObj == null) {
+            syncObj = new DomSync();
+            RobotsTxt.this.syncObjects.put(urlHostPort, syncObj);
+        }
+        // we can now synchronize for each host separately
+        synchronized (syncObj) {
+            if (robotsTable.containsKey(robotsTable.encodedKey(urlHostPort))) return;
+
+            // generating the proper url to download the robots txt
+            DigestURI robotsURL = null;
+            try {
+                robotsURL = new DigestURI("http://" + urlHostPort + "/robots.txt");
+            } catch (final MalformedURLException e) {
+                log.fatal("Unable to generate robots.txt URL for host:port '" + urlHostPort + "'.", e);
+                robotsURL = null;
+            }
+
+            Response response = null;
+            if (robotsURL != null) {
+                if (log.isDebugEnabled()) log.debug("Trying to download the robots.txt file from URL '" + robotsURL + "'.");
+                Request request = new Request(robotsURL, null);
+                try {
+                    response = RobotsTxt.this.loader.load(request, CacheStrategy.NOCACHE, null, 0);
+                } catch (IOException e) {
+                    response = null;
+                }
+            }
+
+            RobotsTxtEntry robotsTxt4Host = null;
+            if (response == null) {
+                // no robots.txt available, make an entry to prevent that the robots loading is done twice
+                // generate artificial entry
+                robotsTxt4Host = new RobotsTxtEntry(
+                        robotsURL,
+                        new ArrayList<String>(),
+                        new ArrayList<String>(),
+                        new Date(),
+                        new Date(),
+                        null,
+                        null,
+                        Integer.valueOf(0),
+                        null);
+
+                // store the data into the robots DB
+                final int sz = robotsTable.size();
+                addEntry(robotsTxt4Host);
+                if (robotsTable.size() <= sz) {
+                    log.fatal("new entry in robots.txt table failed, resetting database");
+                    try {clear();} catch (IOException e) {}
+                    addEntry(robotsTxt4Host);
+                }
+            } else {
+                final byte[] robotsTxt = response.getContent();
+                //Log.logInfo("RobotsTxt", "robots of " + robotsURL.toNormalform(true, true) + ":\n" + ((robotsTxt == null) ? "null" : UTF8.String(robotsTxt))); // debug TODO remove
+                RobotsTxtParser parserResult;
+                ArrayList<String> denyPath;
+                if (response.getResponseHeader().getStatusCode() == 401 || response.getResponseHeader().getStatusCode() == 403) {
+                    parserResult = new RobotsTxtParser(thisAgents);
+                    // create virtual deny path
+                    denyPath = new ArrayList<String>();
+                    denyPath.add("/");
+                } else {
+                    parserResult = new RobotsTxtParser(thisAgents, robotsTxt);
+                    denyPath = parserResult.denyList();
+                }
+
+                // store the data into the robots DB
+                String etag = response.getResponseHeader().containsKey(HeaderFramework.ETAG) ? (response.getResponseHeader().get(HeaderFramework.ETAG)).trim() : null;
+                robotsTxt4Host = addEntry(
+                        robotsURL,
+                        parserResult.allowList(),
+                        denyPath,
+                        new Date(),
+                        response.getResponseHeader().lastModified(),
+                        etag,
+                        parserResult.sitemap(),
+                        parserResult.crawlDelayMillis(),
+                        parserResult.agentName());
+            }
+        }
+    }
+
     private RobotsTxtEntry addEntry(
             final MultiProtocolURI theURL,
             final ArrayList<String> allowPathList,
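The core of `ensureExist()` above is per-host serialization: one `DomSync` object per host:port, a cheap unsynchronized pre-check, and a re-check inside the lock, so a robots.txt is downloaded at most once per host no matter how many of its URLs are pushed concurrently. A self-contained sketch of that locking discipline; `RobotsStore` and `HostLock` are illustrative names, and the sketch uses `putIfAbsent` where the patch uses plain get/put on `syncObjects` (which can briefly create two lock objects for one host):

```java
import java.util.concurrent.ConcurrentHashMap;

public class RobotsStore {
    private static final class HostLock {} // plays the role of RobotsTxt.DomSync

    private final ConcurrentHashMap<String, HostLock> syncObjects =
            new ConcurrentHashMap<String, HostLock>();
    private final ConcurrentHashMap<String, byte[]> robotsTable =
            new ConcurrentHashMap<String, byte[]>();

    public void ensureExist(final String hostPort) {
        if (this.robotsTable.containsKey(hostPort)) return; // fast path, no lock

        // make or get the per-host synchronization object
        HostLock syncObj = this.syncObjects.get(hostPort);
        if (syncObj == null) {
            syncObj = new HostLock();
            final HostLock prev = this.syncObjects.putIfAbsent(hostPort, syncObj);
            if (prev != null) syncObj = prev;
        }
        // we can now synchronize for each host separately
        synchronized (syncObj) {
            // re-check: another thread may have loaded it while we waited
            if (this.robotsTable.containsKey(hostPort)) return;
            this.robotsTable.put(hostPort, download(hostPort)); // slow call, serialized per host
        }
    }

    private byte[] download(final String hostPort) {
        // stand-in for fetching http://<hostPort>/robots.txt; an empty array marks
        // "no robots.txt", mirroring the artificial entry the patch writes on failure
        return new byte[0];
    }
}
```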
"null" : UTF8.String(robotsTxt))); // debug TODO remove + RobotsTxtParser parserResult; + ArrayList denyPath; + if (response.getResponseHeader().getStatusCode() == 401 || response.getResponseHeader().getStatusCode() == 403) { + parserResult = new RobotsTxtParser(thisAgents); + // create virtual deny path + denyPath = new ArrayList(); + denyPath.add("/"); + } else { + parserResult = new RobotsTxtParser(thisAgents, robotsTxt); + denyPath = parserResult.denyList(); + } + + // store the data into the robots DB + String etag = response.getResponseHeader().containsKey(HeaderFramework.ETAG) ? (response.getResponseHeader().get(HeaderFramework.ETAG)).trim() : null; + robotsTxt4Host = addEntry( + robotsURL, + parserResult.allowList(), + denyPath, + new Date(), + response.getResponseHeader().lastModified(), + etag, + parserResult.sitemap(), + parserResult.crawlDelayMillis(), + parserResult.agentName()); + } + } + } + private RobotsTxtEntry addEntry( final MultiProtocolURI theURL, final ArrayList allowPathList, diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 9c2f06b17..a0407fbe5 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -804,6 +804,7 @@ public final class Switchboard extends serverSwitch // initializing the stackCrawlThread this.crawlStacker = new CrawlStacker( + this.robots, this.crawlQueues, this.crawler, this.index, @@ -1318,6 +1319,7 @@ public final class Switchboard extends serverSwitch this.crawlStacker = new CrawlStacker( + this.robots, this.crawlQueues, this.crawler, this.index, @@ -2802,9 +2804,9 @@ public final class Switchboard extends serverSwitch } final String s; if (asglobal) { - s = sb.crawlQueues.noticeURL.push(StackType.GLOBAL, request); + s = sb.crawlQueues.noticeURL.push(StackType.GLOBAL, request, this.robots); } else { - s = sb.crawlQueues.noticeURL.push(StackType.LOCAL, request); + s = sb.crawlQueues.noticeURL.push(StackType.LOCAL, request, this.robots); } if (s != null) {