added 3 new crawling steering options:

- re-crawl by age of page (enter in minutes)
- auto-domain-filter
- maximum number of pages per domain

NOT YET TESTED!

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1949 6c8d7289-2bf4-0310-a012-ef5d649a1542
19 years ago · 63f39ac7b5
parent 1fc3b34be6
commit 63f39ac7b5
9 changed files with 137 additions and 25 deletions
--- a/htroot/IndexCreate_p.html
+++ b/htroot/IndexCreate_p.html
@ -43,6 +43,27 @@ You can define URLs as start points for Web page crawling and start crawling her
    the crawling depth.
    </td>
  </tr>
+  <tr valign="top" class="TableCellDark">
+    <td class=small>Re-Crawl Option:</td>
+    <td class=small><input name="recrawlIfOlder" type="text" size="5" maxlength="8" value="#[crawlingIfOlder]#"></td>
+    <td class=small>
+    Age in minutes. A page that is already known is crawled again if it was loaded longer ago than this.
+    </td>
+  </tr>
+  <tr valign="top" class="TableCellDark">
+    <td class=small>Auto-Dom-Filter Depth:</td>
+    <td class=small><input name="domFilterDepth" type="text" size="2" maxlength="2" value="#[crawlingDomFilterDepth]#"></td>
+    <td class=small>
+    URLs found up to this crawl depth add their domain to the list of accepted domains;
+    URLs from any other domain are rejected. Use -1 to accept all domains.
+    </td>
+  </tr>
+  <tr valign="top" class="TableCellDark">
+    <td class=small>Maximum Pages per Domain:</td>
+    <td class=small><input name="domMaxPages" type="text" size="5" maxlength="6" value="#[crawlingDomMaxPages]#"></td>
+    <td class=small>
+    Maximum number of pages that are crawled from a single domain. Use -1 for no limit.
+    </td>
+  </tr>
  <tr valign="top" class="TableCellDark">
    <td class=small>Accept URLs with '?' / dynamic URLs:</td>
    <td class=small><input type="checkbox" name="crawlingQ" align="top" #(crawlingQChecked)#::checked#(/crawlingQChecked)#></td>
--- a/htroot/IndexCreate_p.java
+++ b/htroot/IndexCreate_p.java
@ -97,8 +97,10 @@ public class IndexCreate_p {
                    env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth));
                    int recrawlIfOlder = Integer.parseInt(post.get("recrawlIfOlder", "-1"));
                    env.setConfig("crawlingIfOlder", recrawlIfOlder);
-                    int autoDomFilterDepth = Integer.parseInt(post.get("autoDomFilterDepth", "-1"));
-                    env.setConfig("crawlingautoDomFilterDepth", Integer.toString(autoDomFilterDepth));
+                    int domFilterDepth = Integer.parseInt(post.get("domFilterDepth", "-1"));
+                    env.setConfig("crawlingDomFilterDepth", Integer.toString(domFilterDepth));
+                    int domMaxPages = Integer.parseInt(post.get("domMaxPages", "-1"));
+                    env.setConfig("crawlingDomMaxPages", Integer.toString(domMaxPages));
                    boolean crawlingQ = post.get("crawlingQ", "").equals("on");
                    env.setConfig("crawlingQ", (crawlingQ) ? "true" : "false");
                    boolean storeHTCache = post.get("storeHTCache", "").equals("on");
@ -149,7 +151,7 @@ public class IndexCreate_p {
                            switchboard.urlPool.errorURL.remove(urlhash);
                            
                            // stack url
-                            plasmaCrawlProfile.entry pe = switchboard.profiles.newEntry(crawlingStartURL.getHost(), crawlingStart, newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, recrawlIfOlder, autoDomFilterDepth, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw);
+                            plasmaCrawlProfile.entry pe = switchboard.profiles.newEntry(crawlingStartURL.getHost(), crawlingStart, newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, recrawlIfOlder, domFilterDepth, domMaxPages, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw);
                            String reasonString = switchboard.sbStackCrawlThread.stackCrawl(crawlingStart, null, yacyCore.seedDB.mySeed.hash, "CRAWLING-ROOT", new Date(), 0, pe);
                            
                            if (reasonString == null) {
@ -210,7 +212,7 @@ public class IndexCreate_p {
                                HashMap hyperlinks = (HashMap) scraper.getAnchors();
                                
                                // creating a crawler profile
-                                plasmaCrawlProfile.entry profile = switchboard.profiles.newEntry(fileName, file.toURL().toString(), newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, recrawlIfOlder, autoDomFilterDepth, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw);                                
+                                plasmaCrawlProfile.entry profile = switchboard.profiles.newEntry(fileName, file.toURL().toString(), newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, recrawlIfOlder, domFilterDepth, domMaxPages, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw);                                
                                
                                // loop through the contained links
                                Iterator interator = hyperlinks.entrySet().iterator();
@ -299,6 +301,9 @@ public class IndexCreate_p {
        prop.put("proxyPrefetchDepth", env.getConfig("proxyPrefetchDepth", "0"));
        prop.put("crawlingDepth", env.getConfig("crawlingDepth", "0"));
        prop.put("crawlingFilter", env.getConfig("crawlingFilter", "0"));
+        prop.put("crawlingIfOlder", env.getConfig("crawlingIfOlder", "-1"));
+        prop.put("crawlingDomFilterDepth", env.getConfig("crawlingDomFilterDepth", "-1"));
+        prop.put("crawlingDomMaxPages", env.getConfig("crawlingDomMaxPages", "-1"));
        prop.put("crawlingQChecked", env.getConfig("crawlingQ", "").equals("true") ? 1 : 0);
        prop.put("storeHTCacheChecked", env.getConfig("storeHTCache", "").equals("true") ? 1 : 0);
        prop.put("localIndexingChecked", env.getConfig("localIndexing", "").equals("true") ? 1 : 0);
--- a/htroot/QuickCrawlLink_p.java
+++ b/htroot/QuickCrawlLink_p.java
@ -165,7 +165,8 @@ public class QuickCrawlLink_p {
                        CrawlingDepth, 
                        CrawlingDepth, 
                        60 * 24 * 30, // recrawlIfOlder (minutes); here: one month
-                        -1, // autoDomFilterDepth, if negative: no auto-filter
+                        -1, // domFilterDepth, if negative: no auto-filter
+                        -1, // domMaxPages, if negative: no count restriction
                        crawlDynamic, 
                        storeHTCache,
                        true, 
--- a/source/de/anomic/plasma/plasmaCrawlProfile.java
+++ b/source/de/anomic/plasma/plasmaCrawlProfile.java
@ -176,7 +176,7 @@ public class plasmaCrawlProfile {
    
    public entry newEntry(String name, String startURL, String generalFilter, String specificFilter,
                           int generalDepth, int specificDepth,
-                           int recrawlIfOlder /*minutes*/, int autoDomFilterDepth, 
+                           int recrawlIfOlder /*minutes*/, int domFilterDepth,  int domMaxPages,
                           boolean crawlingQ,
                           boolean storeHTCache, boolean storeTXCache,
                           boolean localIndexing, boolean remoteIndexing,
@ -184,7 +184,7 @@ public class plasmaCrawlProfile {
        
        entry ne = new entry(name, startURL, generalFilter, specificFilter,
                             generalDepth, specificDepth,
-                             recrawlIfOlder, autoDomFilterDepth,
+                             recrawlIfOlder, domFilterDepth, domMaxPages,
                             crawlingQ, storeHTCache, storeTXCache, localIndexing, remoteIndexing,
                             xsstopw, xdstopw, xpstopw);
        try {
@ -225,9 +225,11 @@ public class plasmaCrawlProfile {
        // this is a simple record structure that hold all properties of a single crawl start

        private Map mem;
+        private Map doms;
+        
        public entry(String name, String startURL, String generalFilter, String specificFilter,
                     int generalDepth, int specificDepth,
-                     int recrawlIfOlder /*minutes*/, int autoDomFilterDepth, 
+                     int recrawlIfOlder /*minutes*/, int domFilterDepth, int domMaxPages,
                     boolean crawlingQ,
                     boolean storeHTCache, boolean storeTXCache,
                     boolean localIndexing, boolean remoteIndexing,
@ -242,7 +244,8 @@ public class plasmaCrawlProfile {
            mem.put("generalDepth", Integer.toString(generalDepth));
            mem.put("specificDepth", Integer.toString(specificDepth));
            mem.put("recrawlIfOlder", Integer.toString(recrawlIfOlder));
-            mem.put("autoDomFilterDepth", Integer.toString(autoDomFilterDepth));
+            mem.put("domFilterDepth", Integer.toString(domFilterDepth));
+            mem.put("domMaxPages", Integer.toString(domMaxPages));
            mem.put("crawlingQ", (crawlingQ) ? "true" : "false"); // crawling of urls with '?'
            mem.put("storeHTCache", (storeHTCache) ? "true" : "false");
            mem.put("storeTXCache", (storeTXCache) ? "true" : "false");
@ -251,6 +254,8 @@ public class plasmaCrawlProfile {
            mem.put("xsstopw", (xsstopw) ? "true" : "false"); // exclude static stop-words
            mem.put("xdstopw", (xdstopw) ? "true" : "false"); // exclude dynamic stop-word
            mem.put("xpstopw", (xpstopw) ? "true" : "false"); // exclude parent stop-words
+
+            doms = new HashMap();
        }
        
        public String toString() {
@ -317,12 +322,27 @@ public class plasmaCrawlProfile {
                return 0;
            }
        }
-        public int autoDomFilterDepth() {
+        public int domFilterDepth() {
            // if the depth is equal or less to this depth,
-            // the the current url feeds with its domain the crawl filter
-            String r = (String) mem.get("autoDomFilterDepth");
+            // then the current url feeds with its domain the crawl filter
+            // if this is -1, all domains are feeded
+            String r = (String) mem.get("domFilterDepth");
            if (r == null) return 0; else try {
-                return Integer.parseInt(r);
+                int i = Integer.parseInt(r);
+                if (i < 0) return Integer.MAX_VALUE;
+                return i;
+            } catch (NumberFormatException e) {
+                return 0;
+            }
+        }
+        public int domMaxPages() {
+            // this is the maximum number of pages that are crawled for a single domain
+            // if -1 (or any other negative value), this means no limit;
+            // this is reported to the caller as Integer.MAX_VALUE.
+            // a missing or unparseable property also means no limit: the crawl stacker
+            // rejects a url as soon as domCount(host) > domMaxPages(), so falling back
+            // to 0 would reject every url of a profile that does not carry this property
+            String r = (String) mem.get("domMaxPages");
+            if (r == null) return Integer.MAX_VALUE; else try {
+                int i = Integer.parseInt(r);
+                if (i < 0) return Integer.MAX_VALUE;
+                return i;
            } catch (NumberFormatException e) {
-                return 0;
+                return Integer.MAX_VALUE;
            }
@ -363,5 +383,32 @@ public class plasmaCrawlProfile {
            mem.put(propName,  newValue);
            profileTable.set(handle(), mem);
        }
+        public void domInc(String domain) {
+            // count one more occurrence of the given domain;
+            // a domain that has no counter yet starts at 1
+            final Integer known = (Integer) doms.get(domain);
+            final int occurrences = (known == null) ? 1 : known.intValue() + 1;
+            doms.put(domain, new Integer(occurrences));
+        }
+        public int domCount(String domain) {
+            // number of occurrences counted for the given domain so far;
+            // 0 if the domain has no counter
+            final Integer known = (Integer) doms.get(domain);
+            return (known == null) ? 0 : known.intValue();
+        }
+        public int domSize() {
+            // number of distinct domains that have a counter in this profile entry
+            final int distinctDomains = doms.size();
+            return distinctDomains;
+        }
+        public boolean domExists(String domain) {
+            // true if a counter was created for the given domain
+            final boolean counted = doms.containsKey(domain);
+            return counted;
+        }
+        public Iterator domNames() {
+            // iterates over the names of all domains that have a counter
+            final Iterator names = doms.keySet().iterator();
+            return names;
+        }
    }
 }
--- a/source/de/anomic/plasma/plasmaCrawlStacker.java
+++ b/source/de/anomic/plasma/plasmaCrawlStacker.java
@ -311,10 +311,36 @@ public final class plasmaCrawlStacker {
            return reason;
        }
        
+        // add domain to profile domain list
+        if (currentdepth <= profile.domFilterDepth()) {
+            profile.domInc(nexturl.getHost());
+        }
+
+        // deny urls that do not match with the profile domain list
+        if (profile.domCount(nexturl.getHost()) == 0) {
+            reason = "denied_(no_match_with_domain_filter)";
+            this.log.logFine("URL '" + nexturlString + "' is not listed in granted domains. " + 
+                             "Stack processing time: " + (System.currentTimeMillis()-startTime));
+            return reason;
+        }
+
+        // deny urls that exceed allowed number of occurrences
+        if (profile.domCount(nexturl.getHost()) > profile.domMaxPages()) {
+            reason = "denied_(domain_count_exceeded)";
+            this.log.logFine("URL '" + nexturlString + "' appeared too often, a maximum of " + profile.domMaxPages() + " is allowed. "+ 
+                             "Stack processing time: " + (System.currentTimeMillis()-startTime));
+            return reason;
+        }
+
        String nexturlhash = plasmaURL.urlHash(nexturl);
-        String dbocc = "";
-        if ((dbocc = this.sb.urlPool.exists(nexturlhash)) != null) {
-            // DISTIGUISH OLD/RE-SEARCH CASES HERE!
+        String dbocc = this.sb.urlPool.exists(nexturlhash);
+        plasmaCrawlLURL.Entry oldEntry = null;
+        if (dbocc != null) try {
+            oldEntry = this.sb.urlPool.loadedURL.getEntry(nexturlhash, null);
+        } catch (IOException e) {}
+        boolean recrawl = (oldEntry != null) &&
+                          (((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000) > profile.recrawlIfOlder());
+        if ((dbocc != null) && (!(recrawl))) {
            reason = "double_(registered_in_" + dbocc + ")";
            /*
             urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash,
@ -323,7 +349,7 @@ public final class plasmaCrawlStacker {
                             "Stack processing time: " + (System.currentTimeMillis()-startTime));
            return reason;
        }
-        
+
        // checking robots.txt
        if (robotsParser.isDisallowed(nexturl)) {
            reason = "denied_(robots.txt)";
@ -334,6 +360,12 @@ public final class plasmaCrawlStacker {
                             "Stack processing time: " + (System.currentTimeMillis()-startTime));
            return reason;            
        }
+
+        // show potential re-crawl
+        if (recrawl) {
+            this.log.logFine("RE-CRAWL of URL '" + nexturlString + "': this url was crawled " +
+                    ((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000 / 60 / 24) + " days ago.");
+        }
        
        // store information
        boolean local = ((initiatorHash.equals(plasmaURL.dummyHash)) || (initiatorHash.equals(yacyCore.seedDB.mySeed.hash)));
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@ -680,7 +680,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
            (getConfig(STR_PROXYPROFILE, "").length() == 0) ||
            (this.profiles.getEntry(getConfig(STR_PROXYPROFILE, "")) == null)) {
            // generate new default entry for proxy crawling
-            this.defaultProxyProfile = this.profiles.newEntry("proxy", "", ".*", ".*", Integer.parseInt(getConfig("proxyPrefetchDepth", "0")), Integer.parseInt(getConfig("proxyPrefetchDepth", "0")), 60 * 24 * 30, -1, false, true, true, true, getConfigBool("proxyCrawlOrder", false), true, true, true);
+            this.defaultProxyProfile = this.profiles.newEntry("proxy", "", ".*", ".*", Integer.parseInt(getConfig("proxyPrefetchDepth", "0")), Integer.parseInt(getConfig("proxyPrefetchDepth", "0")), 60 * 24 * 30, -1, -1, false, true, true, true, getConfigBool("proxyCrawlOrder", false), true, true, true);
            setConfig(STR_PROXYPROFILE, this.defaultProxyProfile.handle());
        } else {
            this.defaultProxyProfile = this.profiles.getEntry(getConfig(STR_PROXYPROFILE, ""));
@ -689,7 +689,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
            (getConfig(STR_REMOTEPROFILE, "").length() == 0) ||
            (profiles.getEntry(getConfig(STR_REMOTEPROFILE, "")) == null)) {
            // generate new default entry for remote crawling
-            defaultRemoteProfile = profiles.newEntry("remote", "", ".*", ".*", 0, 0, 60 * 24 * 30, -1, true, false, true, true, false, true, true, false);
+            defaultRemoteProfile = profiles.newEntry("remote", "", ".*", ".*", 0, 0, 60 * 24 * 30, -1, -1, true, false, true, true, false, true, true, false);
            setConfig(STR_REMOTEPROFILE, defaultRemoteProfile.handle());
        } else {
            defaultRemoteProfile = profiles.getEntry(getConfig(STR_REMOTEPROFILE, ""));
--- a/source/de/anomic/urlRedirector/urlRedirectord.java
+++ b/source/de/anomic/urlRedirector/urlRedirectord.java
@ -48,7 +48,9 @@ public class urlRedirectord implements serverHandler {
                            0,
                            // recrawlIfOlder (minutes), if negative: do not re-crawl
                            -1,
-                            // autoDomFilterDepth, if negative: no auto-filter
+                            // domFilterDepth, if negative: no auto-filter
+                            -1,
+                            // domMaxPages, if negative: no count restriction
                            -1,
                            // crawlDynamic
                            false, 
--- a/source/de/anomic/yacy/yacyClient.java
+++ b/source/de/anomic/yacy/yacyClient.java
@ -465,7 +465,7 @@ public final class yacyClient {
            for (int n = 0; n < results; n++) {
                // get one single search result
                urlEntry = urlManager.newEntry((String) result.get("resource" + n), true);
-                if (urlEntry != null && blacklist.isListed(urlEntry.url())) { continue; } // block with backlist
+                if ((urlEntry == null) || (blacklist.isListed(urlEntry.url()))) { continue; } // block with backlist
                urlEntry.store();
                int urlLength = urlEntry.url().toString().length();
                int urlComps = htmlFilterContentScraper.urlComps(urlEntry.url().toString()).length;
--- a/yacy.init
+++ b/yacy.init
@ -329,13 +329,14 @@ browserPopUpApplication=netscape
 yacyOwnSeedFile=DATA/YACYDB/mySeed.txt
 yacyDB=DATA/YACYDB

-# index sharing attributes
-# by default, sharing is on. If you want to use the proxy only for
-# local indexing, you may switch this off
+# index sharing attributes: by default, sharing is on.
+# If you want to use YaCy only for local indexing (robinson mode),
+# you may switch this off
 allowDistributeIndex=true
 allowDistributeIndexWhileCrawling=false
 allowReceiveIndex=true
-indexReceiveBlockBlacklist=false
+allowUnlimitedReceiveIndexFrom=
+indexReceiveBlockBlacklist=true

 # the frequency is the number of links per minute that the peer allows
 # _every_ other peer to send to this peer
@ -362,6 +363,9 @@ proxyCrawlOrder=false
 # Be careful with this number. Consider a branching factor of average 20;
 # A prefetch-depth of 8 would index 25.600.000.000 pages, maybe the whole WWW.
 crawlingDepth=2
+# re-crawl option: age of a page in minutes; the crawl stacker re-crawls a known
+# page if it was loaded longer ago than this (525600 minutes = 365 days)
+crawlingIfOlder=525600
+# auto-domain-filter: urls up to this crawl depth add their domain to the list of
+# accepted domains; -1 means that all domains are accepted
+crawlingDomFilterDepth=-1
+# maximum number of pages that are crawled for a single domain; -1 means no limit
+crawlingDomMaxPages=-1
 localIndexing=true

 # Filter for crawling; may be used to restrict a crawl to a specific domain