diff --git a/defaults/yacy.init b/defaults/yacy.init
index a1c8a4de3..0dff8cfc4 100644
--- a/defaults/yacy.init
+++ b/defaults/yacy.init
@@ -1087,6 +1087,9 @@ thumbnailProgram =
 # - wiki : the wiki-page
 httpd.robots.txt = locked,dirs,bookmarks,network,news,status,profile
 
+# maximum number of robots.txt loading threads
+robots.txt.MaxActiveThreads = 200
+
 # class to use for parsing wikicode
 wikiParser.class = de.anomic.data.wikiCode
diff --git a/htroot/CrawlCheck_p.java b/htroot/CrawlCheck_p.java
index d66ccb8e2..29b51f836 100644
--- a/htroot/CrawlCheck_p.java
+++ b/htroot/CrawlCheck_p.java
@@ -71,8 +71,7 @@ public class CrawlCheck_p {
 
             // mass check
             final ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
-            final int concurrency = Math.min(rootURLs.size(), 20);
-            Collection<CheckEntry> out = sb.robots.massCrawlCheck(rootURLs, agent, concurrency);
+            Collection<CheckEntry> out = sb.robots.massCrawlCheck(rootURLs, agent);
             // evaluate the result from the concurrent computation
 
             // make a string that is used to fill the starturls field again
diff --git a/htroot/PerformanceQueues_p.java b/htroot/PerformanceQueues_p.java
index 8d0539bdd..df8a94bef 100644
--- a/htroot/PerformanceQueues_p.java
+++ b/htroot/PerformanceQueues_p.java
@@ -29,7 +29,6 @@
 import java.util.Iterator;
 import java.util.Map;
 
 import net.yacy.cora.protocol.ConnectionInfo;
-import net.yacy.cora.protocol.HeaderFramework;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.kelondro.data.word.WordReference;
 import net.yacy.kelondro.rwi.IndexCell;
@@ -235,7 +234,15 @@ public class PerformanceQueues_p {
         // storing the new values into configfile
         sb.setConfig(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX,maxBusy);
-        //switchboard.setConfig("crawler.MinIdleThreads",minIdle);
+
+        /*
+         * configuring the robots.txt loading pool
+         */
+        // get the current robots.txt pool configuration
+        maxBusy = post.getInt("Robots.txt Pool_maxActive", SwitchboardConstants.ROBOTS_TXT_THREADS_ACTIVE_MAX_DEFAULT);
+
+        // storing the new values into configfile
+        sb.setConfig(SwitchboardConstants.ROBOTS_TXT_THREADS_ACTIVE_MAX, maxBusy);
 
         /*
          * configuring the http pool
@@ -278,12 +285,16 @@ public class PerformanceQueues_p {
         prop.put("pool_0_name","Crawler Pool");
         prop.put("pool_0_maxActive", sb.getConfigLong(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, 0));
         prop.put("pool_0_numActive", sb.crawlQueues.activeWorkerEntries().size());
+
+        prop.put("pool_1_name","Robots.txt Pool");
+        prop.put("pool_1_maxActive", sb.getConfigInt(SwitchboardConstants.ROBOTS_TXT_THREADS_ACTIVE_MAX, SwitchboardConstants.ROBOTS_TXT_THREADS_ACTIVE_MAX_DEFAULT));
+        prop.put("pool_1_numActive", sb.robots.getActiveThreads());
 
-        prop.put("pool_1_name", "httpd Session Pool");
-        prop.put("pool_1_maxActive", ConnectionInfo.getServerMaxcount());
-        prop.put("pool_1_numActive", ConnectionInfo.getServerCount());
+        prop.put("pool_2_name", "httpd Session Pool");
+        prop.put("pool_2_maxActive", ConnectionInfo.getServerMaxcount());
+        prop.put("pool_2_numActive", ConnectionInfo.getServerCount());
 
-        prop.put("pool", "2");
+        prop.put("pool", "3");
 
         // parse initialization memory settings
         final String Xmx = sb.getConfig("javastart_Xmx", "Xmx600m").substring(3);
diff --git a/source/net/yacy/crawler/robots/RobotsTxt.java b/source/net/yacy/crawler/robots/RobotsTxt.java
index 2d07c403b..dba0a55f0 100644
--- a/source/net/yacy/crawler/robots/RobotsTxt.java
+++ b/source/net/yacy/crawler/robots/RobotsTxt.java
@@ -31,11 +31,16 @@
 import java.net.MalformedURLException;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Date;
+import java.util.List;
 import java.util.Map;
-import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.Callable;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.Future;
 import java.util.concurrent.LinkedBlockingQueue;
+import java.util.concurrent.ThreadPoolExecutor;
+import java.util.concurrent.TimeUnit;
 import java.util.regex.Pattern;
 import net.yacy.cora.document.id.DigestURL;
@@ -49,8 +54,9 @@
 import net.yacy.crawler.retrieval.Request;
 import net.yacy.crawler.retrieval.Response;
 import net.yacy.data.WorkTables;
 import net.yacy.kelondro.blob.BEncodedHeap;
-import net.yacy.repository.LoaderDispatcher;
+import net.yacy.kelondro.util.NamePrefixThreadFactory;
 import net.yacy.repository.Blacklist.BlacklistType;
+import net.yacy.repository.LoaderDispatcher;
 
 public class RobotsTxt {
@@ -64,12 +70,24 @@ public class RobotsTxt {
     //private static final HashSet loadedRobots = new HashSet(); // only for debugging
     private final WorkTables tables;
     private final LoaderDispatcher loader;
+    /** Thread pool used to launch concurrent tasks */
+    private ThreadPoolExecutor threadPool;
 
     private static class DomSync {
        private DomSync() {}
     }
 
-    public RobotsTxt(final WorkTables worktables, LoaderDispatcher loader) {
+    /**
+     * Creates a new RobotsTxt instance backed by a dedicated thread pool for robots.txt loading.
+     * @param worktables the work tables holding the robots.txt heap
+     * @param loader the loader used to fetch robots.txt files
+     * @param maxActiveThreads maximum number of active threads this instance is allowed to run for its concurrent tasks
+     */
+    public RobotsTxt(final WorkTables worktables, LoaderDispatcher loader, final int maxActiveThreads) {
+        this.threadPool = new ThreadPoolExecutor(maxActiveThreads, maxActiveThreads,
+                0L, TimeUnit.MILLISECONDS,
+                new LinkedBlockingQueue<Runnable>(),
+                new NamePrefixThreadFactory(RobotsTxt.class.getSimpleName()));
         this.syncObjects = new ConcurrentHashMap<String, DomSync>();
         this.tables = worktables;
         this.loader = loader;
@@ -89,6 +107,13 @@ public class RobotsTxt {
         this.tables.getHeap(WorkTables.TABLE_ROBOTS_NAME).clear();
         this.syncObjects.clear();
     }
+
+    public void close() {
+        /* Shutdown all active robots.txt loading threads */
+        if (this.threadPool != null) {
+            this.threadPool.shutdownNow();
+        }
+    }
 
     public int size() throws IOException {
         return this.tables.getHeap(WorkTables.TABLE_ROBOTS_NAME).size();
@@ -246,7 +271,19 @@ public class RobotsTxt {
                 }
             }
         };
-        if (concurrent) t.start(); else t.run();
+        if (concurrent) {
+            this.threadPool.execute(t);
+        } else {
+            t.run();
+        }
+    }
+
+    /**
+     * @return the approximate number of threads that are actively
+     *         executing robots.txt loading tasks
+     */
+    public int getActiveThreads() {
+        return this.threadPool != null ? this.threadPool.getActiveCount() : 0;
     }
 
     private void processOldEntry(RobotsTxtEntry robotsTxt4Host, DigestURL robotsURL, BEncodedHeap robotsTable) {
@@ -374,45 +411,59 @@
         }
     }
 
-    public Collection<CheckEntry> massCrawlCheck(final Collection<DigestURL> rootURLs, final ClientIdentification.Agent userAgent, final int concurrency) {
-        // put the rootURLs into a blocking queue as input for concurrent computation
-        final BlockingQueue<DigestURL> in = new LinkedBlockingQueue<DigestURL>();
-        try {
-            for (DigestURL u: rootURLs) in.put(u);
-            for (int i = 0; i < concurrency; i++) in.put(DigestURL.POISON);
-        } catch (InterruptedException e) {}
-        final BlockingQueue<CheckEntry> out = new LinkedBlockingQueue<CheckEntry>();
-        final Thread[] threads = new Thread[concurrency];
-        for (int i = 0; i < concurrency; i++) {
-            threads[i] = new Thread("RobotsTxt.massCrawlCheck-" + i) {
-                @Override
-                public void run() {
-                    DigestURL u;
-                    try {
-                        while ((u = in.take()) != DigestURL.POISON) {
-                            // try to load the robots
-                            RobotsTxtEntry robotsEntry = getEntry(u, userAgent);
-                            boolean robotsAllowed = robotsEntry == null ? true : !robotsEntry.isDisallowed(u);
-                            if (robotsAllowed) {
-                                try {
-                                    Request request = loader.request(u, true, false);
-                                    Response response = loader.load(request, CacheStrategy.NOCACHE,
-                                            BlacklistType.CRAWLER, userAgent);
-                                    out.put(new CheckEntry(u, robotsEntry, response, null));
-                                } catch (final IOException e) {
-                                    out.put(new CheckEntry(u, robotsEntry, null, "error response: " + e.getMessage()));
-                                }
-                            } else {
-                                out.put(new CheckEntry(u, robotsEntry, null, null));
-                            }
-                        }
-                    } catch (InterruptedException e) {}
-                }
-            };
-            threads[i].start();
+    /**
+     * A unit task to load a robots.txt entry
+     */
+    private class CrawlCheckTask implements Callable<CheckEntry> {
+
+        private final DigestURL url;
+        private final ClientIdentification.Agent userAgent;
+
+        public CrawlCheckTask(final DigestURL url, final ClientIdentification.Agent userAgent) {
+            this.url = url;
+            this.userAgent = userAgent;
+        }
+
+        @Override
+        public CheckEntry call() throws Exception {
+            // try to load the robots
+            RobotsTxtEntry robotsEntry = getEntry(this.url, this.userAgent);
+            boolean robotsAllowed = robotsEntry == null ? true : !robotsEntry.isDisallowed(this.url);
+            if (robotsAllowed) {
+                try {
+                    Request request = loader.request(this.url, true, false);
+                    Response response = loader.load(request, CacheStrategy.NOCACHE,
+                            BlacklistType.CRAWLER, userAgent);
+                    return new CheckEntry(this.url, robotsEntry, response, null);
+                } catch (final IOException e) {
+                    return new CheckEntry(this.url, robotsEntry, null, "error response: " + e.getMessage());
+                }
+            }
+            return new CheckEntry(this.url, robotsEntry, null, null);
+        }
+
+
+    }
+
+    public Collection<CheckEntry> massCrawlCheck(final Collection<DigestURL> rootURLs, final ClientIdentification.Agent userAgent) {
+        final List<Future<CheckEntry>> futures = new ArrayList<>();
+        for (DigestURL u: rootURLs) {
+            futures.add(this.threadPool.submit(new CrawlCheckTask(u, userAgent)));
+        }
+        final Collection<CheckEntry> results = new ArrayList<>();
+        /* Now collect the concurrently loaded results */
+        for (Future<CheckEntry> future : futures) {
+            try {
+                results.add(future.get());
+            } catch (InterruptedException e) {
+                log.warn("massCrawlCheck was interrupted before retrieving all results.");
+                break;
+            } catch (ExecutionException e) {
+                /* A robots.txt loading task failed: continue and try to get the next result
+                 * (most of the time this should not happen, as exceptions are caught inside the concurrent task) */
+                continue;
+            }
         }
-        // wait for termiation
-        try {for (Thread t: threads) t.join();} catch (InterruptedException e1) {}
-        return out;
+        return results;
     }
 }
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index e564250cf..fa79ef593 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -870,7 +870,8 @@ public final class Switchboard extends serverSwitch {
 
         // load the robots.txt db
         this.log.config("Initializing robots.txt DB");
-        this.robots = new RobotsTxt(this.tables, this.loader);
+        this.robots = new RobotsTxt(this.tables, this.loader,
+                this.getConfigInt(SwitchboardConstants.ROBOTS_TXT_THREADS_ACTIVE_MAX, SwitchboardConstants.ROBOTS_TXT_THREADS_ACTIVE_MAX_DEFAULT));
         try {
             this.log.config("Loaded robots.txt DB: " + this.robots.size() + " entries");
         } catch (final IOException e) {
@@ -1868,6 +1869,7 @@
         this.crawlStacker.announceClose();
         this.crawlStacker.close();
         this.crawlQueues.close();
+        this.robots.close();
         this.indexingDocumentProcessor.shutdown();
         this.indexingCondensementProcessor.shutdown();
         this.indexingAnalysisProcessor.shutdown();
diff --git a/source/net/yacy/search/SwitchboardConstants.java b/source/net/yacy/search/SwitchboardConstants.java
index b404c5828..865fc2cbd 100644
--- a/source/net/yacy/search/SwitchboardConstants.java
+++ b/source/net/yacy/search/SwitchboardConstants.java
@@ -342,6 +342,10 @@ public final class SwitchboardConstants {
     public static final String HTTPC_NAME_CACHE_CACHING_PATTERNS_NO = "httpc.nameCacheNoCachingPatterns";
     public static final String ROBOTS_TXT = "httpd.robots.txt";
     public static final String ROBOTS_TXT_DEFAULT = RobotsTxtConfig.LOCKED + "," + RobotsTxtConfig.DIRS;
+    /** Key of the setting configuring the maximum number of robots.txt loading threads that may be active at the same time */
+    public static final String ROBOTS_TXT_THREADS_ACTIVE_MAX = "robots.txt.MaxActiveThreads";
+    /** Default value of the setting configuring the maximum number of robots.txt loading threads that may be active at the same time */
+    public static final int ROBOTS_TXT_THREADS_ACTIVE_MAX_DEFAULT = 200;
 
     /**
      * public static final String BLACKLIST_CLASS_DEFAULT = "de.anomic.plasma.urlPattern.defaultURLPattern"
diff --git a/test/java/net/yacy/crawler/HostBalancerTest.java b/test/java/net/yacy/crawler/HostBalancerTest.java
index acf1bb48d..a7a5454a0 100644
--- a/test/java/net/yacy/crawler/HostBalancerTest.java
+++ b/test/java/net/yacy/crawler/HostBalancerTest.java
@@ -42,7 +42,7 @@ public class HostBalancerTest {
         assertEquals("After clear", 0, hb.size());
 
         WorkTables wt = new WorkTables(datadir);
-        RobotsTxt rob = new RobotsTxt(wt, null);
+        RobotsTxt rob = new RobotsTxt(wt, null, 10);
 
         String res = hb.push(req, null, rob); // push url
         assertNull(res); // should have no error text
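
The concurrency pattern introduced by this patch is a fixed-size ThreadPoolExecutor over an unbounded LinkedBlockingQueue: one Callable task is submitted per URL, the resulting Futures are collected in submission order, and close() maps to shutdownNow(). The following standalone sketch illustrates that pattern under simplified assumptions; the class and method names (RobotsPoolSketch, checkAll) and the pool size of 4 are hypothetical and are not YaCy code.

import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

public class RobotsPoolSketch {

    // Fixed-size pool mirroring the patch: core == max threads,
    // excess tasks simply wait in the unbounded queue.
    private final ThreadPoolExecutor pool = new ThreadPoolExecutor(
            4, 4, 0L, TimeUnit.MILLISECONDS, new LinkedBlockingQueue<Runnable>());

    /** Submit one task per URL, then collect results in submission order. */
    public Collection<String> checkAll(Collection<String> urls) {
        List<Future<String>> futures = new ArrayList<>();
        for (final String url : urls) {
            futures.add(pool.submit(new Callable<String>() {
                @Override
                public String call() {
                    // placeholder for the real robots.txt fetch and parse work
                    return url + " -> allowed";
                }
            }));
        }
        List<String> results = new ArrayList<>();
        for (Future<String> f : futures) {
            try {
                results.add(f.get()); // blocks until this task has finished
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt(); // stop waiting, keep partial results
                break;
            } catch (ExecutionException e) {
                // a single failed task should not abort the whole batch
            }
        }
        return results;
    }

    /** Equivalent of RobotsTxt.close(): interrupt workers and drop queued tasks. */
    public void close() {
        pool.shutdownNow();
    }
}

Because the core and maximum pool sizes are equal and the queue is unbounded, at most the configured number of robots.txt requests run at once and the remainder wait in the queue, which is the throttling behaviour the new robots.txt.MaxActiveThreads setting controls.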