From 4948c39e480f79b71da4aceabb675187eae57245 Mon Sep 17 00:00:00 2001
From: Michael Peter Christen
Date: Wed, 23 Oct 2013 11:27:19 +0200
Subject: [PATCH] added concurrency for mass crawl check

---
 htroot/CrawlCheck_p.java                      | 52 +++++++++----------
 .../net/yacy/cora/document/id/DigestURL.java  |  9 ++++
 source/net/yacy/crawler/robots/RobotsTxt.java | 52 +++++++++++++++++++
 3 files changed, 85 insertions(+), 28 deletions(-)

diff --git a/htroot/CrawlCheck_p.java b/htroot/CrawlCheck_p.java
index 09f342b94..3af63de8c 100644
--- a/htroot/CrawlCheck_p.java
+++ b/htroot/CrawlCheck_p.java
@@ -18,21 +18,17 @@
  * If not, see <http://www.gnu.org/licenses/>.
  */
 
-import java.io.IOException;
 import java.net.MalformedURLException;
-import java.util.HashSet;
+import java.util.Collection;
+import java.util.LinkedHashSet;
 import java.util.Set;
 import java.util.regex.Pattern;
 
 import net.yacy.cora.document.id.DigestURL;
-import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.util.ConcurrentLog;
-import net.yacy.crawler.retrieval.Request;
-import net.yacy.crawler.retrieval.Response;
-import net.yacy.crawler.robots.RobotsTxtEntry;
-import net.yacy.repository.Blacklist.BlacklistType;
+import net.yacy.crawler.robots.RobotsTxt.CheckEntry;
 import net.yacy.search.Switchboard;
 import net.yacy.server.serverObjects;
 import net.yacy.server.serverSwitch;
@@ -49,7 +45,7 @@ public class CrawlCheck_p {
 
         if (post.containsKey("crawlcheck")) {
             // get the list of rootURls for this crawl start
-            Set<DigestURL> rootURLs = new HashSet<DigestURL>();
+            Set<DigestURL> rootURLs = new LinkedHashSet<DigestURL>();
             String crawlingStart0 = post.get("crawlingURLs","").trim();
             String[] rootURLs0 = crawlingStart0.indexOf('\n') > 0 || crawlingStart0.indexOf('\r') > 0 ? crawlingStart0.split("[\\r\\n]+") : crawlingStart0.split(Pattern.quote("|"));
             for (String crawlingStart: rootURLs0) {
@@ -72,46 +68,46 @@ public class CrawlCheck_p {
                 prop.put("table", 0);
             } else {
                 prop.put("table", 1);
-                ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
-
+
+                // mass check
+                final ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
+                final int concurrency = Math.min(rootURLs.size(), 20);
+                Collection<CheckEntry> out = sb.robots.massCrawlCheck(rootURLs, agent, concurrency);
+
+                // evaluate the result from the concurrent computation
                 // make a string that is used to fill the starturls field again
                 // and analyze the urls to make the table rows
                 StringBuilder s = new StringBuilder(300);
                 int row = 0;
-                for (DigestURL u: rootURLs) {
-                    s.append(u.toNormalform(true)).append('\n');
-                    prop.put("table_list_" + row + "_url", u.toNormalform(true));
+                for (CheckEntry entry: out) {
+                    String u = entry.digestURL.toNormalform(true);
+                    s.append(u).append('\n');
+                    prop.put("table_list_" + row + "_url", u);
 
                     // try to load the robots
-                    RobotsTxtEntry robotsEntry;
                     boolean robotsAllowed = true;
-                    robotsEntry = sb.robots.getEntry(u, agent);
-                    if (robotsEntry == null) {
+                    if (entry.robotsTxtEntry == null) {
                         prop.put("table_list_" + row + "_robots", "no robots");
                         prop.put("table_list_" + row + "_crawldelay", agent.minimumDelta + " ms");
                         prop.put("table_list_" + row + "_sitemap", "");
                     } else {
-                        robotsAllowed = !robotsEntry.isDisallowed(u);
+                        robotsAllowed = !entry.robotsTxtEntry.isDisallowed(entry.digestURL);
                         prop.put("table_list_" + row + "_robots", "robots exist: " + (robotsAllowed ? "crawl allowed" : "url disallowed"));
-                        prop.put("table_list_" + row + "_crawldelay", Math.max(agent.minimumDelta, robotsEntry.getCrawlDelayMillis()) + " ms");
-                        prop.put("table_list_" + row + "_sitemap", robotsEntry.getSitemap() == null ? "-" : robotsEntry.getSitemap().toNormalform(true));
+                        prop.put("table_list_" + row + "_crawldelay", Math.max(agent.minimumDelta, entry.robotsTxtEntry.getCrawlDelayMillis()) + " ms");
+                        prop.put("table_list_" + row + "_sitemap", entry.robotsTxtEntry.getSitemap() == null ? "-" : entry.robotsTxtEntry.getSitemap().toNormalform(true));
                     }
 
                     // try to load the url
-                    if (robotsAllowed) try {
-                        Request request = sb.loader.request(u, true, false);
-                        final Response response = sb.loader.load(request, CacheStrategy.NOCACHE, BlacklistType.CRAWLER, agent);
-                        if (response == null) {
-                            prop.put("table_list_" + row + "_access", "no response");
+                    if (robotsAllowed) {
+                        if (entry.response == null) {
+                            prop.put("table_list_" + row + "_access", entry.error == null ? "no response" : entry.error);
                         } else {
-                            if (response.getResponseHeader().getStatusCode() == 200) {
-                                prop.put("table_list_" + row + "_access", "200 ok, last-modified = " + response.lastModified());
+                            if (entry.response.getResponseHeader().getStatusCode() == 200) {
+                                prop.put("table_list_" + row + "_access", "200 ok, last-modified = " + entry.response.lastModified());
                             } else {
-                                prop.put("table_list_" + row + "_access", response.getResponseHeader().getStatusCode() + " - load failed");
+                                prop.put("table_list_" + row + "_access", entry.response.getResponseHeader().getStatusCode() + " - load failed");
                             }
                         }
-                    } catch (final IOException e) {
-                        prop.put("table_list_" + row + "_access", "error response: " + e.getMessage());
                     } else {
                         prop.put("table_list_" + row + "_access", "not loaded - prevented by robots.txt");
                     }
diff --git a/source/net/yacy/cora/document/id/DigestURL.java b/source/net/yacy/cora/document/id/DigestURL.java
index b45a58553..a9baceee0 100644
--- a/source/net/yacy/cora/document/id/DigestURL.java
+++ b/source/net/yacy/cora/document/id/DigestURL.java
@@ -47,6 +47,8 @@ import net.yacy.cora.util.ConcurrentLog;
  */
 public class DigestURL extends MultiProtocolURL implements Serializable {
 
+    public static final DigestURL POISON = new DigestURL(); // poison pill for concurrent link generators
+
     private static final long serialVersionUID = -1173233022912141885L;
 
     // class variables
@@ -100,6 +102,13 @@ public class DigestURL extends MultiProtocolURL implements Serializable {
         return h;
     }
 
+    /**
+     * DigestURI to generate a poison pill
+     */
+    private DigestURL() {
+        super();
+        this.hash = null;
+    }
 
     /**
      * DigestURI from File
diff --git a/source/net/yacy/crawler/robots/RobotsTxt.java b/source/net/yacy/crawler/robots/RobotsTxt.java
index a109bf505..fb8b9f7f0 100644
--- a/source/net/yacy/crawler/robots/RobotsTxt.java
+++ b/source/net/yacy/crawler/robots/RobotsTxt.java
@@ -29,10 +29,13 @@ package net.yacy.crawler.robots;
 import java.io.IOException;
 import java.net.MalformedURLException;
 import java.util.ArrayList;
+import java.util.Collection;
 import java.util.Date;
 import java.util.Map;
+import java.util.concurrent.BlockingQueue;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.LinkedBlockingQueue;
 import java.util.regex.Pattern;
 
 import net.yacy.cora.document.id.DigestURL;
@@ -47,6 +50,7 @@ import net.yacy.crawler.retrieval.Response;
 import net.yacy.data.WorkTables;
 import net.yacy.kelondro.blob.BEncodedHeap;
 import net.yacy.repository.LoaderDispatcher;
+import net.yacy.repository.Blacklist.BlacklistType;
 
 public class RobotsTxt {
 
@@ -327,4 +331,52 @@ public class RobotsTxt {
         return sb.toString();
     }
 
+    public static class CheckEntry {
+        public final DigestURL digestURL;
+        public final RobotsTxtEntry robotsTxtEntry;
+        public final Response response;
+        public final String error;
+        public CheckEntry(DigestURL digestURL, RobotsTxtEntry robotsTxtEntry, Response response, String error) {
+            this.digestURL = digestURL;
+            this.robotsTxtEntry = robotsTxtEntry;
+            this.response = response;
+            this.error = error;
+        }
+    }
+
+    public Collection<CheckEntry> massCrawlCheck(final Collection<DigestURL> rootURLs, final ClientIdentification.Agent userAgent, final int concurrency) {
+        // put the rootURLs into a blocking queue as input for concurrent computation
+        final BlockingQueue<DigestURL> in = new LinkedBlockingQueue<DigestURL>();
+        try {
+            for (DigestURL u: rootURLs) in.put(u);
+            for (int i = 0; i < concurrency; i++) in.put(DigestURL.POISON);
+        } catch (InterruptedException e) {}
+        final BlockingQueue<CheckEntry> out = new LinkedBlockingQueue<CheckEntry>();
+        final Thread[] threads = new Thread[concurrency];
+        for (int i = 0; i < concurrency; i++) {
+            threads[i] = new Thread() {
+                public void run() {
+                    DigestURL u;
+                    try {
+                        while ((u = in.take()) != DigestURL.POISON) {
+                            // try to load the robots
+                            RobotsTxtEntry robotsEntry = getEntry(u, userAgent);
+                            boolean robotsAllowed = robotsEntry == null ? true : !robotsEntry.isDisallowed(u);
+                            if (robotsAllowed) try {
+                                Request request = loader.request(u, true, false);
+                                Response response = loader.load(request, CacheStrategy.NOCACHE, BlacklistType.CRAWLER, userAgent);
+                                out.put(new CheckEntry(u, robotsEntry, response, null));
+                            } catch (final IOException e) {
+                                out.put(new CheckEntry(u, robotsEntry, null, "error response: " + e.getMessage()));
+                            }
+                        }
+                    } catch (InterruptedException e) {}
+                }
+            };
+            threads[i].start();
+        }
+        // wait for termination
+        try {for (Thread t: threads) t.join();} catch (InterruptedException e1) {}
+        return out;
+    }
 }
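
The massCrawlCheck() method added above is a plain poison-pill fan-out: the root URLs plus one DigestURL.POISON marker per worker go into a LinkedBlockingQueue, each worker thread drains the queue until it takes the marker, every result is pushed onto a second queue, and that queue is returned once all workers have joined. The self-contained sketch below shows the same pattern in isolation so it can be compiled and run without the YaCy code base; the names PoisonPillCheckDemo, CheckResult, checkOne and massCheck are illustrative placeholders and are not part of this patch or of the YaCy API.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;

public class PoisonPillCheckDemo {

    // stand-in for RobotsTxt.CheckEntry: the URL plus whatever the check produced
    static class CheckResult {
        final String url;
        final String status;
        CheckResult(String url, String status) { this.url = url; this.status = status; }
    }

    // stand-in for the per-URL work (robots.txt lookup + page load) done in massCrawlCheck
    static CheckResult checkOne(String url) {
        return new CheckResult(url, "ok");
    }

    // distinct instance on purpose: workers compare by identity, like DigestURL.POISON
    static final String POISON = new String("POISON");

    static Collection<CheckResult> massCheck(Collection<String> urls, int concurrency) throws InterruptedException {
        final BlockingQueue<String> in = new LinkedBlockingQueue<String>();
        for (String u : urls) in.put(u);
        for (int i = 0; i < concurrency; i++) in.put(POISON); // one pill per worker
        final BlockingQueue<CheckResult> out = new LinkedBlockingQueue<CheckResult>();
        final List<Thread> workers = new ArrayList<Thread>();
        for (int i = 0; i < concurrency; i++) {
            Thread t = new Thread() {
                @Override public void run() {
                    try {
                        String u;
                        while ((u = in.take()) != POISON) { // identity check ends this worker
                            out.put(checkOne(u));
                        }
                    } catch (InterruptedException e) {
                        Thread.currentThread().interrupt();
                    }
                }
            };
            workers.add(t);
            t.start();
        }
        for (Thread t : workers) t.join(); // wait for termination before handing out results
        return out;
    }

    public static void main(String[] args) throws InterruptedException {
        for (CheckResult r : massCheck(Arrays.asList("http://a.example", "http://b.example"), 2)) {
            System.out.println(r.url + " -> " + r.status);
        }
    }
}

Because the result queue is returned as-is, entries come back in completion order rather than input order; this is why CheckEntry carries its DigestURL and why CrawlCheck_p.java now reads the URL from each entry instead of iterating over rootURLs.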