diff --git a/htroot/CrawlCheck_p.java b/htroot/CrawlCheck_p.java
index 09f342b94..3af63de8c 100644
--- a/htroot/CrawlCheck_p.java
+++ b/htroot/CrawlCheck_p.java
@@ -18,21 +18,17 @@
* If not, see <http://www.gnu.org/licenses/>.
*/
-import java.io.IOException;
import java.net.MalformedURLException;
-import java.util.HashSet;
+import java.util.Collection;
+import java.util.LinkedHashSet;
import java.util.Set;
import java.util.regex.Pattern;
import net.yacy.cora.document.id.DigestURL;
-import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
-import net.yacy.crawler.retrieval.Request;
-import net.yacy.crawler.retrieval.Response;
-import net.yacy.crawler.robots.RobotsTxtEntry;
-import net.yacy.repository.Blacklist.BlacklistType;
+import net.yacy.crawler.robots.RobotsTxt.CheckEntry;
import net.yacy.search.Switchboard;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
@@ -49,7 +45,7 @@ public class CrawlCheck_p {
if (post.containsKey("crawlcheck")) {
// get the list of rootURls for this crawl start
- Set<DigestURL> rootURLs = new HashSet<DigestURL>();
+ Set<DigestURL> rootURLs = new LinkedHashSet<DigestURL>();
String crawlingStart0 = post.get("crawlingURLs","").trim();
String[] rootURLs0 = crawlingStart0.indexOf('\n') > 0 || crawlingStart0.indexOf('\r') > 0 ? crawlingStart0.split("[\\r\\n]+") : crawlingStart0.split(Pattern.quote("|"));
for (String crawlingStart: rootURLs0) {
@@ -72,46 +68,46 @@ public class CrawlCheck_p {
prop.put("table", 0);
} else {
prop.put("table", 1);
- ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
+
+ // mass check
+ final ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
+ final int concurrency = Math.min(rootURLs.size(), 20);
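+ // check robots.txt and perform a test load for every root URL concurrently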
+ Collection<CheckEntry> out = sb.robots.massCrawlCheck(rootURLs, agent, concurrency);
+ // evaluate the result from the concurrent computation
// make a string that is used to fill the starturls field again
// and analyze the urls to make the table rows
StringBuilder s = new StringBuilder(300);
int row = 0;
- for (DigestURL u: rootURLs) {
- s.append(u.toNormalform(true)).append('\n');
- prop.put("table_list_" + row + "_url", u.toNormalform(true));
+ for (CheckEntry entry: out) {
+ String u = entry.digestURL.toNormalform(true);
+ s.append(u).append('\n');
+ prop.put("table_list_" + row + "_url", u);
// try to load the robots
- RobotsTxtEntry robotsEntry;
boolean robotsAllowed = true;
- robotsEntry = sb.robots.getEntry(u, agent);
- if (robotsEntry == null) {
+ if (entry.robotsTxtEntry == null) {
prop.put("table_list_" + row + "_robots", "no robots");
prop.put("table_list_" + row + "_crawldelay", agent.minimumDelta + " ms");
prop.put("table_list_" + row + "_sitemap", "");
} else {
- robotsAllowed = !robotsEntry.isDisallowed(u);
+ robotsAllowed = !entry.robotsTxtEntry.isDisallowed(entry.digestURL);
prop.put("table_list_" + row + "_robots", "robots exist: " + (robotsAllowed ? "crawl allowed" : "url disallowed"));
- prop.put("table_list_" + row + "_crawldelay", Math.max(agent.minimumDelta, robotsEntry.getCrawlDelayMillis()) + " ms");
- prop.put("table_list_" + row + "_sitemap", robotsEntry.getSitemap() == null ? "-" : robotsEntry.getSitemap().toNormalform(true));
+ prop.put("table_list_" + row + "_crawldelay", Math.max(agent.minimumDelta, entry.robotsTxtEntry.getCrawlDelayMillis()) + " ms");
+ prop.put("table_list_" + row + "_sitemap", entry.robotsTxtEntry.getSitemap() == null ? "-" : entry.robotsTxtEntry.getSitemap().toNormalform(true));
}
// try to load the url
- if (robotsAllowed) try {
- Request request = sb.loader.request(u, true, false);
- final Response response = sb.loader.load(request, CacheStrategy.NOCACHE, BlacklistType.CRAWLER, agent);
- if (response == null) {
- prop.put("table_list_" + row + "_access", "no response");
+ if (robotsAllowed) {
+ if (entry.response == null) {
+ prop.put("table_list_" + row + "_access", entry.error == null ? "no response" : entry.error);
} else {
- if (response.getResponseHeader().getStatusCode() == 200) {
- prop.put("table_list_" + row + "_access", "200 ok, last-modified = " + response.lastModified());
+ if (entry.response.getResponseHeader().getStatusCode() == 200) {
+ prop.put("table_list_" + row + "_access", "200 ok, last-modified = " + entry.response.lastModified());
} else {
- prop.put("table_list_" + row + "_access", response.getResponseHeader().getStatusCode() + " - load failed");
+ prop.put("table_list_" + row + "_access", entry.response.getResponseHeader().getStatusCode() + " - load failed");
}
}
- } catch (final IOException e) {
- prop.put("table_list_" + row + "_access", "error response: " + e.getMessage());
} else {
prop.put("table_list_" + row + "_access", "not loaded - prevented by robots.txt");
}
diff --git a/source/net/yacy/cora/document/id/DigestURL.java b/source/net/yacy/cora/document/id/DigestURL.java
index b45a58553..a9baceee0 100644
--- a/source/net/yacy/cora/document/id/DigestURL.java
+++ b/source/net/yacy/cora/document/id/DigestURL.java
@@ -47,6 +47,8 @@ import net.yacy.cora.util.ConcurrentLog;
*/
public class DigestURL extends MultiProtocolURL implements Serializable {
+ public static final DigestURL POISON = new DigestURL(); // poison pill for concurrent link generators
+
private static final long serialVersionUID = -1173233022912141885L;
// class variables
@@ -100,6 +102,13 @@ public class DigestURL extends MultiProtocolURL implements Serializable {
return h;
}
+ /**
+ * private DigestURL constructor, used only to create the POISON pill instance
+ */
+ private DigestURL() {
+ super();
+ this.hash = null;
+ }
/**
* DigestURI from File
diff --git a/source/net/yacy/crawler/robots/RobotsTxt.java b/source/net/yacy/crawler/robots/RobotsTxt.java
index a109bf505..fb8b9f7f0 100644
--- a/source/net/yacy/crawler/robots/RobotsTxt.java
+++ b/source/net/yacy/crawler/robots/RobotsTxt.java
@@ -29,10 +29,13 @@ package net.yacy.crawler.robots;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;
+import java.util.Collection;
import java.util.Date;
import java.util.Map;
+import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.LinkedBlockingQueue;
import java.util.regex.Pattern;
import net.yacy.cora.document.id.DigestURL;
@@ -47,6 +50,7 @@ import net.yacy.crawler.retrieval.Response;
import net.yacy.data.WorkTables;
import net.yacy.kelondro.blob.BEncodedHeap;
import net.yacy.repository.LoaderDispatcher;
+import net.yacy.repository.Blacklist.BlacklistType;
public class RobotsTxt {
@@ -327,4 +331,52 @@ public class RobotsTxt {
return sb.toString();
}
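+ /**
+ * CheckEntry bundles the result of a single crawl check: the checked URL,
+ * the robots.txt entry found for it (null if none exists), the response of a
+ * test load (null if the URL was not loaded) and an error message (null if no error occurred).
+ */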
+ public static class CheckEntry {
+ public final DigestURL digestURL;
+ public final RobotsTxtEntry robotsTxtEntry;
+ public final Response response;
+ public final String error;
+ public CheckEntry(DigestURL digestURL, RobotsTxtEntry robotsTxtEntry, Response response, String error) {
+ this.digestURL = digestURL;
+ this.robotsTxtEntry = robotsTxtEntry;
+ this.response = response;
+ this.error = error;
+ }
+ }
+
+ public Collection<CheckEntry> massCrawlCheck(final Collection<DigestURL> rootURLs, final ClientIdentification.Agent userAgent, final int concurrency) {
+ // put the rootURLs into a blocking queue as input for concurrent computation
+ final BlockingQueue<DigestURL> in = new LinkedBlockingQueue<DigestURL>();
+ try {
+ for (DigestURL u: rootURLs) in.put(u);
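+ // add one poison pill per worker thread so every worker terminates once the queue is drained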
+ for (int i = 0; i < concurrency; i++) in.put(DigestURL.POISON);
+ } catch (InterruptedException e) {}
+ final BlockingQueue<CheckEntry> out = new LinkedBlockingQueue<CheckEntry>();
+ final Thread[] threads = new Thread[concurrency];
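+ // start the worker threads; each one consumes URLs from 'in' until it takes a poison pill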
+ for (int i = 0; i < concurrency; i++) {
+ threads[i] = new Thread() {
+ public void run() {
+ DigestURL u;
+ try {
+ while ((u = in.take()) != DigestURL.POISON) {
+ // try to load the robots
+ RobotsTxtEntry robotsEntry = getEntry(u, userAgent);
+ boolean robotsAllowed = robotsEntry == null ? true : !robotsEntry.isDisallowed(u);
+ if (robotsAllowed) {
+ try {
+ Request request = loader.request(u, true, false);
+ Response response = loader.load(request, CacheStrategy.NOCACHE, BlacklistType.CRAWLER, userAgent);
+ out.put(new CheckEntry(u, robotsEntry, response, null));
+ } catch (final IOException e) {
+ out.put(new CheckEntry(u, robotsEntry, null, "error response: " + e.getMessage()));
+ }
+ } else {
+ // a disallowed URL still yields a CheckEntry (without a response) so the caller can report that robots.txt prevented the load
+ out.put(new CheckEntry(u, robotsEntry, null, null));
+ }
+ }
+ } catch (InterruptedException e) {}
+ }
+ };
+ threads[i].start();
+ }
+ // wait for termination of all worker threads
+ try {for (Thread t: threads) t.join();} catch (InterruptedException e1) {}
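+ // note: the entries are collected in completion order, not in the order of the given rootURLs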
+ return out;
+ }
}