added concurrency for mass crawl check

pull/1/head
Michael Peter Christen 12 years ago
parent 1b4fa2947d
commit 4948c39e48

@@ -18,21 +18,17 @@
* If not, see <http://www.gnu.org/licenses/>.
*/
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.HashSet;
import java.util.Collection;
import java.util.LinkedHashSet;
import java.util.Set;
import java.util.regex.Pattern;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.retrieval.Request;
import net.yacy.crawler.retrieval.Response;
import net.yacy.crawler.robots.RobotsTxtEntry;
import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.crawler.robots.RobotsTxt.CheckEntry;
import net.yacy.search.Switchboard;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
@@ -49,7 +45,7 @@ public class CrawlCheck_p {
if (post.containsKey("crawlcheck")) {
// get the list of rootURls for this crawl start
Set<DigestURL> rootURLs = new HashSet<DigestURL>();
Set<DigestURL> rootURLs = new LinkedHashSet<DigestURL>();
String crawlingStart0 = post.get("crawlingURLs","").trim();
String[] rootURLs0 = crawlingStart0.indexOf('\n') > 0 || crawlingStart0.indexOf('\r') > 0 ? crawlingStart0.split("[\\r\\n]+") : crawlingStart0.split(Pattern.quote("|"));
for (String crawlingStart: rootURLs0) {
@@ -72,46 +68,46 @@ public class CrawlCheck_p {
prop.put("table", 0);
} else {
prop.put("table", 1);
ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
// mass check
final ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
final int concurrency = Math.min(rootURLs.size(), 20);
Collection<CheckEntry> out = sb.robots.massCrawlCheck(rootURLs, agent, concurrency);
// evaluate the result from the concurrent computation
// make a string that is used to fill the starturls field again
// and analyze the urls to make the table rows
StringBuilder s = new StringBuilder(300);
int row = 0;
for (DigestURL u: rootURLs) {
s.append(u.toNormalform(true)).append('\n');
prop.put("table_list_" + row + "_url", u.toNormalform(true));
for (CheckEntry entry: out) {
String u = entry.digestURL.toNormalform(true);
s.append(u).append('\n');
prop.put("table_list_" + row + "_url", u);
// try to load the robots
RobotsTxtEntry robotsEntry;
boolean robotsAllowed = true;
robotsEntry = sb.robots.getEntry(u, agent);
if (robotsEntry == null) {
if (entry.robotsTxtEntry == null) {
prop.put("table_list_" + row + "_robots", "no robots");
prop.put("table_list_" + row + "_crawldelay", agent.minimumDelta + " ms");
prop.put("table_list_" + row + "_sitemap", "");
} else {
robotsAllowed = !robotsEntry.isDisallowed(u);
robotsAllowed = !entry.robotsTxtEntry.isDisallowed(entry.digestURL);
prop.put("table_list_" + row + "_robots", "robots exist: " + (robotsAllowed ? "crawl allowed" : "url disallowed"));
prop.put("table_list_" + row + "_crawldelay", Math.max(agent.minimumDelta, robotsEntry.getCrawlDelayMillis()) + " ms");
prop.put("table_list_" + row + "_sitemap", robotsEntry.getSitemap() == null ? "-" : robotsEntry.getSitemap().toNormalform(true));
prop.put("table_list_" + row + "_crawldelay", Math.max(agent.minimumDelta, entry.robotsTxtEntry.getCrawlDelayMillis()) + " ms");
prop.put("table_list_" + row + "_sitemap", entry.robotsTxtEntry.getSitemap() == null ? "-" : entry.robotsTxtEntry.getSitemap().toNormalform(true));
}
// try to load the url
if (robotsAllowed) try {
Request request = sb.loader.request(u, true, false);
final Response response = sb.loader.load(request, CacheStrategy.NOCACHE, BlacklistType.CRAWLER, agent);
if (response == null) {
prop.put("table_list_" + row + "_access", "no response");
if (robotsAllowed) {
if (entry.response == null) {
prop.put("table_list_" + row + "_access", entry.error == null ? "no response" : entry.error);
} else {
if (response.getResponseHeader().getStatusCode() == 200) {
prop.put("table_list_" + row + "_access", "200 ok, last-modified = " + response.lastModified());
if (entry.response.getResponseHeader().getStatusCode() == 200) {
prop.put("table_list_" + row + "_access", "200 ok, last-modified = " + entry.response.lastModified());
} else {
prop.put("table_list_" + row + "_access", response.getResponseHeader().getStatusCode() + " - load failed");
prop.put("table_list_" + row + "_access", entry.response.getResponseHeader().getStatusCode() + " - load failed");
}
}
} catch (final IOException e) {
prop.put("table_list_" + row + "_access", "error response: " + e.getMessage());
} else {
prop.put("table_list_" + row + "_access", "not loaded - prevented by robots.txt");
}

@@ -47,6 +47,8 @@ import net.yacy.cora.util.ConcurrentLog;
*/
public class DigestURL extends MultiProtocolURL implements Serializable {
public static final DigestURL POISON = new DigestURL(); // poison pill for concurrent link generators
private static final long serialVersionUID = -1173233022912141885L;
// class variables
@@ -100,6 +102,13 @@ public class DigestURL extends MultiProtocolURL implements Serializable {
return h;
}
/**
* DigestURL constructor to generate a poison pill
*/
private DigestURL() {
super();
this.hash = null;
}
/**
* DigestURL from File

@@ -29,10 +29,13 @@ package net.yacy.crawler.robots;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.Map;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.regex.Pattern;
import net.yacy.cora.document.id.DigestURL;
@@ -47,6 +50,7 @@ import net.yacy.crawler.retrieval.Response;
import net.yacy.data.WorkTables;
import net.yacy.kelondro.blob.BEncodedHeap;
import net.yacy.repository.LoaderDispatcher;
import net.yacy.repository.Blacklist.BlacklistType;
public class RobotsTxt {
@@ -327,4 +331,52 @@ public class RobotsTxt {
return sb.toString();
}
public static class CheckEntry {
public final DigestURL digestURL;
public final RobotsTxtEntry robotsTxtEntry;
public final Response response;
public final String error;
public CheckEntry(DigestURL digestURL, RobotsTxtEntry robotsTxtEntry, Response response, String error) {
this.digestURL = digestURL;
this.robotsTxtEntry = robotsTxtEntry;
this.response = response;
this.error = error;
}
}
public Collection<CheckEntry> massCrawlCheck(final Collection<DigestURL> rootURLs, final ClientIdentification.Agent userAgent, final int concurrency) {
// put the rootURLs into a blocking queue as input for concurrent computation
final BlockingQueue<DigestURL> in = new LinkedBlockingQueue<DigestURL>();
try {
for (DigestURL u: rootURLs) in.put(u);
for (int i = 0; i < concurrency; i++) in.put(DigestURL.POISON);
} catch (InterruptedException e) {}
final BlockingQueue<CheckEntry> out = new LinkedBlockingQueue<CheckEntry>();
final Thread[] threads = new Thread[concurrency];
for (int i = 0; i < concurrency; i++) {
threads[i] = new Thread() {
public void run() {
DigestURL u;
try {
while ((u = in.take()) != DigestURL.POISON) {
// try to load the robots
RobotsTxtEntry robotsEntry = getEntry(u, userAgent);
boolean robotsAllowed = robotsEntry == null ? true : !robotsEntry.isDisallowed(u);
if (robotsAllowed) try {
Request request = loader.request(u, true, false);
Response response = loader.load(request, CacheStrategy.NOCACHE, BlacklistType.CRAWLER, userAgent);
out.put(new CheckEntry(u, robotsEntry, response, null));
} catch (final IOException e) {
out.put(new CheckEntry(u, robotsEntry, null, "error response: " + e.getMessage()));
} else out.put(new CheckEntry(u, robotsEntry, null, null)); // disallowed by robots.txt: still deliver an entry so the caller can report it
}
} catch (InterruptedException e) {}
}
};
threads[i].start();
}
// wait for termination
try {for (Thread t: threads) t.join();} catch (InterruptedException e1) {}
return out;
}
}
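A note on the shutdown pattern used above: massCrawlCheck terminates its workers with the classic poison-pill idiom, built on the DigestURL.POISON sentinel added in this commit. The producer enqueues one sentinel per worker after the real work items, and each worker leaves its take() loop when it receives that sentinel (compared by reference). A minimal, self-contained sketch of the idiom follows; the class and names in it are hypothetical, not YaCy code:

// Illustrative sketch of the poison-pill idiom (hypothetical names, not YaCy code).
// Workers drain a shared BlockingQueue and stop as soon as they take the sentinel,
// which is recognized by identity, exactly as massCrawlCheck compares against DigestURL.POISON.
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;

public class PoisonPillSketch {
    // a dedicated instance, so "task != POISON" is a safe identity check
    private static final String POISON = new String("POISON");

    public static void main(String[] args) throws InterruptedException {
        final BlockingQueue<String> in = new LinkedBlockingQueue<String>();
        final int concurrency = 2;
        final Thread[] workers = new Thread[concurrency];
        for (int i = 0; i < concurrency; i++) {
            workers[i] = new Thread() {
                public void run() {
                    try {
                        String task;
                        while ((task = in.take()) != POISON) {
                            System.out.println(getName() + " checked " + task);
                        }
                    } catch (InterruptedException e) {
                        Thread.currentThread().interrupt();
                    }
                }
            };
            workers[i].start();
        }
        for (String url : new String[] {"http://a.example", "http://b.example"}) in.put(url);
        for (int i = 0; i < concurrency; i++) in.put(POISON); // one pill per worker
        for (Thread t : workers) t.join();                    // all workers have drained the queue
    }
}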
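For reference, a hedged sketch of how the new entry point is driven, mirroring the call added in CrawlCheck_p.java above; the Switchboard instance sb and the rootURLs collection are assumed to be available in the calling servlet, as they are there:

// Usage sketch only: "sb" is the servlet's net.yacy.search.Switchboard and "rootURLs" the
// Set<DigestURL> collected from the form, as in the CrawlCheck_p.java hunk above.
final ClientIdentification.Agent agent =
        ClientIdentification.getAgent(ClientIdentification.yacyInternetCrawlerAgentName);
final int concurrency = Math.min(rootURLs.size(), 20); // never more workers than URLs, capped at 20
final Collection<RobotsTxt.CheckEntry> checks = sb.robots.massCrawlCheck(rootURLs, agent, concurrency);
for (final RobotsTxt.CheckEntry entry : checks) {
    // robotsTxtEntry == null  -> no robots.txt was found for the host
    // response == null        -> the URL was not loaded; entry.error carries the reason if an IOException occurred
    System.out.println(entry.digestURL.toNormalform(true));
}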
