diff --git a/htroot/IndexCreate_p.html b/htroot/IndexCreate_p.html
index 46bb97e54..c065df3bf 100644
--- a/htroot/IndexCreate_p.html
+++ b/htroot/IndexCreate_p.html
@@ -43,6 +43,27 @@ You can define URLs as start points for Web page crawling and start crawling her
 the crawling depth.
+
+Re-Crawl Option:
+
+
+
+
+
+
+Auto-Dom-Filter Depth:
+
+
+
+
+
+
+Maximum Pages per Domain:
+
+
+
+
+
 Accept URLs with '?' / dynamic URLs:
diff --git a/htroot/IndexCreate_p.java b/htroot/IndexCreate_p.java
index e2b5ecec1..e53914190 100644
--- a/htroot/IndexCreate_p.java
+++ b/htroot/IndexCreate_p.java
@@ -97,8 +97,10 @@ public class IndexCreate_p {
         env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth));
         int recrawlIfOlder = Integer.parseInt(post.get("recrawlIfOlder", "-1"));
         env.setConfig("crawlingIfOlder", recrawlIfOlder);
-        int autoDomFilterDepth = Integer.parseInt(post.get("autoDomFilterDepth", "-1"));
-        env.setConfig("crawlingautoDomFilterDepth", Integer.toString(autoDomFilterDepth));
+        int domFilterDepth = Integer.parseInt(post.get("domFilterDepth", "-1"));
+        env.setConfig("crawlingDomFilterDepth", Integer.toString(domFilterDepth));
+        int domMaxPages = Integer.parseInt(post.get("domMaxPages", "-1"));
+        env.setConfig("crawlingDomMaxPages", Integer.toString(domMaxPages));
         boolean crawlingQ = post.get("crawlingQ", "").equals("on");
         env.setConfig("crawlingQ", (crawlingQ) ? "true" : "false");
         boolean storeHTCache = post.get("storeHTCache", "").equals("on");
@@ -149,7 +151,7 @@ public class IndexCreate_p {
             switchboard.urlPool.errorURL.remove(urlhash);
 
             // stack url
-            plasmaCrawlProfile.entry pe = switchboard.profiles.newEntry(crawlingStartURL.getHost(), crawlingStart, newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, recrawlIfOlder, autoDomFilterDepth, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw);
+            plasmaCrawlProfile.entry pe = switchboard.profiles.newEntry(crawlingStartURL.getHost(), crawlingStart, newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, recrawlIfOlder, domFilterDepth, domMaxPages, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw);
             String reasonString = switchboard.sbStackCrawlThread.stackCrawl(crawlingStart, null, yacyCore.seedDB.mySeed.hash, "CRAWLING-ROOT", new Date(), 0, pe);
 
             if (reasonString == null) {
@@ -210,7 +212,7 @@ public class IndexCreate_p {
             HashMap hyperlinks = (HashMap) scraper.getAnchors();
 
             // creating a crawler profile
-            plasmaCrawlProfile.entry profile = switchboard.profiles.newEntry(fileName, file.toURL().toString(), newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, recrawlIfOlder, autoDomFilterDepth, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw);
+            plasmaCrawlProfile.entry profile = switchboard.profiles.newEntry(fileName, file.toURL().toString(), newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, recrawlIfOlder, domFilterDepth, domMaxPages, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw);
 
             // loop through the contained links
             Iterator interator = hyperlinks.entrySet().iterator();
@@ -299,6 +301,9 @@ public class IndexCreate_p {
         prop.put("proxyPrefetchDepth", env.getConfig("proxyPrefetchDepth", "0"));
         prop.put("crawlingDepth", env.getConfig("crawlingDepth", "0"));
         prop.put("crawlingFilter", env.getConfig("crawlingFilter", "0"));
+        prop.put("crawlingIfOlder", env.getConfig("crawlingIfOlder", "-1"));
+        prop.put("crawlingDomFilterDepth", env.getConfig("crawlingDomFilterDepth", "-1"));
+        prop.put("crawlingDomMaxPages", env.getConfig("crawlingDomMaxPages", "-1"));
         prop.put("crawlingQChecked", env.getConfig("crawlingQ", "").equals("true") ? 1 : 0);
         prop.put("storeHTCacheChecked", env.getConfig("storeHTCache", "").equals("true") ? 1 : 0);
         prop.put("localIndexingChecked", env.getConfig("localIndexing", "").equals("true") ? 1 : 0);
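
Note on the IndexCreate_p.java hunk above: Integer.parseInt(post.get("domFilterDepth", "-1")) only falls back to "-1" when the form field is absent; a non-numeric value typed into the form would still throw a NumberFormatException out of the servlet. A defensive variant could look like the following sketch (the parseIntOr helper is hypothetical, not part of this patch):

    // hypothetical guard, not in the patch: fall back to -1 ("off") on bad input
    private static int parseIntOr(String value, int fallback) {
        try {
            return Integer.parseInt(value);
        } catch (NumberFormatException e) {
            return fallback;
        }
    }

    // usage, replacing the bare parseInt calls in IndexCreate_p:
    int domFilterDepth = parseIntOr(post.get("domFilterDepth", "-1"), -1);
    int domMaxPages    = parseIntOr(post.get("domMaxPages", "-1"), -1);
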
prop.put("crawlingDomFilterDepth", env.getConfig("crawlingDomFilterDepth", "-1")); + prop.put("crawlingDomMaxPages", env.getConfig("crawlingDomMaxPages", "-1")); prop.put("crawlingQChecked", env.getConfig("crawlingQ", "").equals("true") ? 1 : 0); prop.put("storeHTCacheChecked", env.getConfig("storeHTCache", "").equals("true") ? 1 : 0); prop.put("localIndexingChecked", env.getConfig("localIndexing", "").equals("true") ? 1 : 0); diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java index 56dd73040..caaad4e85 100644 --- a/htroot/QuickCrawlLink_p.java +++ b/htroot/QuickCrawlLink_p.java @@ -165,7 +165,8 @@ public class QuickCrawlLink_p { CrawlingDepth, CrawlingDepth, 60 * 24 * 30, // recrawlIfOlder (minutes); here: one month - -1, // autoDomFilterDepth, if negative: no auto-filter + -1, // domFilterDepth, if negative: no auto-filter + -1, // domMaxPages, if negative: no count restriction crawlDynamic, storeHTCache, true, diff --git a/source/de/anomic/plasma/plasmaCrawlProfile.java b/source/de/anomic/plasma/plasmaCrawlProfile.java index 808f16590..2818bdf37 100644 --- a/source/de/anomic/plasma/plasmaCrawlProfile.java +++ b/source/de/anomic/plasma/plasmaCrawlProfile.java @@ -176,7 +176,7 @@ public class plasmaCrawlProfile { public entry newEntry(String name, String startURL, String generalFilter, String specificFilter, int generalDepth, int specificDepth, - int recrawlIfOlder /*minutes*/, int autoDomFilterDepth, + int recrawlIfOlder /*minutes*/, int domFilterDepth, int domMaxPages, boolean crawlingQ, boolean storeHTCache, boolean storeTXCache, boolean localIndexing, boolean remoteIndexing, @@ -184,7 +184,7 @@ public class plasmaCrawlProfile { entry ne = new entry(name, startURL, generalFilter, specificFilter, generalDepth, specificDepth, - recrawlIfOlder, autoDomFilterDepth, + recrawlIfOlder, domFilterDepth, domMaxPages, crawlingQ, storeHTCache, storeTXCache, localIndexing, remoteIndexing, xsstopw, xdstopw, xpstopw); try { @@ -225,9 +225,11 @@ public class plasmaCrawlProfile { // this is a simple record structure that hold all properties of a single crawl start private Map mem; + private Map doms; + public entry(String name, String startURL, String generalFilter, String specificFilter, int generalDepth, int specificDepth, - int recrawlIfOlder /*minutes*/, int autoDomFilterDepth, + int recrawlIfOlder /*minutes*/, int domFilterDepth, int domMaxPages, boolean crawlingQ, boolean storeHTCache, boolean storeTXCache, boolean localIndexing, boolean remoteIndexing, @@ -242,7 +244,8 @@ public class plasmaCrawlProfile { mem.put("generalDepth", Integer.toString(generalDepth)); mem.put("specificDepth", Integer.toString(specificDepth)); mem.put("recrawlIfOlder", Integer.toString(recrawlIfOlder)); - mem.put("autoDomFilterDepth", Integer.toString(autoDomFilterDepth)); + mem.put("domFilterDepth", Integer.toString(domFilterDepth)); + mem.put("domMaxPages", Integer.toString(domMaxPages)); mem.put("crawlingQ", (crawlingQ) ? "true" : "false"); // crawling of urls with '?' mem.put("storeHTCache", (storeHTCache) ? "true" : "false"); mem.put("storeTXCache", (storeTXCache) ? "true" : "false"); @@ -251,6 +254,8 @@ public class plasmaCrawlProfile { mem.put("xsstopw", (xsstopw) ? "true" : "false"); // exclude static stop-words mem.put("xdstopw", (xdstopw) ? "true" : "false"); // exclude dynamic stop-word mem.put("xpstopw", (xpstopw) ? 
"true" : "false"); // exclude parent stop-words + + doms = new HashMap(); } public String toString() { @@ -317,12 +322,27 @@ public class plasmaCrawlProfile { return 0; } } - public int autoDomFilterDepth() { + public int domFilterDepth() { // if the depth is equal or less to this depth, - // the the current url feeds with its domain the crawl filter - String r = (String) mem.get("autoDomFilterDepth"); + // then the current url feeds with its domain the crawl filter + // if this is -1, all domains are feeded + String r = (String) mem.get("domFilterDepth"); if (r == null) return 0; else try { - return Integer.parseInt(r); + int i = Integer.parseInt(r); + if (i < 0) return Integer.MAX_VALUE; + return i; + } catch (NumberFormatException e) { + return 0; + } + } + public int domMaxPages() { + // this is the maximum number of pages that are crawled for a single domain + // if -1, this means no limit + String r = (String) mem.get("domMaxPages"); + if (r == null) return 0; else try { + int i = Integer.parseInt(r); + if (i < 0) return Integer.MAX_VALUE; + return i; } catch (NumberFormatException e) { return 0; } @@ -363,5 +383,32 @@ public class plasmaCrawlProfile { mem.put(propName, newValue); profileTable.set(handle(), mem); } + public void domInc(String domain) { + Integer c = (Integer) doms.get(domain); + if (c == null) { + // new domain + doms.put(domain, new Integer(1)); + } else { + // increase counter + doms.put(domain, new Integer(c.intValue() + 1)); + } + } + public int domCount(String domain) { + Integer c = (Integer) doms.get(domain); + if (c == null) { + return 0; + } else { + return c.intValue(); + } + } + public int domSize() { + return doms.size(); + } + public boolean domExists(String domain) { + return doms.containsKey(domain); + } + public Iterator domNames() { + return doms.keySet().iterator(); + } } } diff --git a/source/de/anomic/plasma/plasmaCrawlStacker.java b/source/de/anomic/plasma/plasmaCrawlStacker.java index 9b182a731..8dd191141 100644 --- a/source/de/anomic/plasma/plasmaCrawlStacker.java +++ b/source/de/anomic/plasma/plasmaCrawlStacker.java @@ -311,10 +311,36 @@ public final class plasmaCrawlStacker { return reason; } + // add domain to profile domain list + if (currentdepth <= profile.domFilterDepth()) { + profile.domInc(nexturl.getHost()); + } + + // deny urls that do not match with the profile domain list + if (profile.domCount(nexturl.getHost()) == 0) { + reason = "denied_(no_match_with_domain_filter)"; + this.log.logFine("URL '" + nexturlString + "' is not listed in granted domains. " + + "Stack processing time: " + (System.currentTimeMillis()-startTime)); + return reason; + } + + // deny urls that exceed allowed number of occurrences + if (profile.domCount(nexturl.getHost()) > profile.domMaxPages()) { + reason = "denied_(domain_count_exceeded)"; + this.log.logFine("URL '" + nexturlString + "' appeared too often, a maximum of " + profile.domMaxPages() + " is allowed. "+ + "Stack processing time: " + (System.currentTimeMillis()-startTime)); + return reason; + } + String nexturlhash = plasmaURL.urlHash(nexturl); - String dbocc = ""; - if ((dbocc = this.sb.urlPool.exists(nexturlhash)) != null) { - // DISTIGUISH OLD/RE-SEARCH CASES HERE! 
diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java
index 38998c57f..1c032fb54 100644
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -680,7 +680,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
             (getConfig(STR_PROXYPROFILE, "").length() == 0) ||
             (this.profiles.getEntry(getConfig(STR_PROXYPROFILE, "")) == null)) {
             // generate new default entry for proxy crawling
-            this.defaultProxyProfile = this.profiles.newEntry("proxy", "", ".*", ".*", Integer.parseInt(getConfig("proxyPrefetchDepth", "0")), Integer.parseInt(getConfig("proxyPrefetchDepth", "0")), 60 * 24 * 30, -1, false, true, true, true, getConfigBool("proxyCrawlOrder", false), true, true, true);
+            this.defaultProxyProfile = this.profiles.newEntry("proxy", "", ".*", ".*", Integer.parseInt(getConfig("proxyPrefetchDepth", "0")), Integer.parseInt(getConfig("proxyPrefetchDepth", "0")), 60 * 24 * 30, -1, -1, false, true, true, true, getConfigBool("proxyCrawlOrder", false), true, true, true);
             setConfig(STR_PROXYPROFILE, this.defaultProxyProfile.handle());
         } else {
             this.defaultProxyProfile = this.profiles.getEntry(getConfig(STR_PROXYPROFILE, ""));
@@ -689,7 +689,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
             (getConfig(STR_REMOTEPROFILE, "").length() == 0) ||
             (profiles.getEntry(getConfig(STR_REMOTEPROFILE, "")) == null)) {
             // generate new default entry for remote crawling
-            defaultRemoteProfile = profiles.newEntry("remote", "", ".*", ".*", 0, 0, 60 * 24 * 30, -1, true, false, true, true, false, true, true, false);
+            defaultRemoteProfile = profiles.newEntry("remote", "", ".*", ".*", 0, 0, 60 * 24 * 30, -1, -1, true, false, true, true, false, true, true, false);
             setConfig(STR_REMOTEPROFILE, defaultRemoteProfile.handle());
         } else {
             defaultRemoteProfile = profiles.getEntry(getConfig(STR_REMOTEPROFILE, ""));
diff --git a/source/de/anomic/urlRedirector/urlRedirectord.java b/source/de/anomic/urlRedirector/urlRedirectord.java
index ea51a6326..e9c4679b4 100644
--- a/source/de/anomic/urlRedirector/urlRedirectord.java
+++ b/source/de/anomic/urlRedirector/urlRedirectord.java
@@ -48,7 +48,9 @@ public class urlRedirectord implements serverHandler {
             0,
             // recrawlIfOlder (minutes), if negative: do not re-crawl
             -1,
-            // autoDomFilterDepth, if negative: no auto-filter
+            // domFilterDepth, if negative: no auto-filter
+            -1,
+            // domMaxPages, if negative: no count restriction
             -1,
             // crawlDynamic
             false,
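
All three call sites above simply pass -1 for both new parameters, so the proxy, remote-crawl and URL-redirector profiles keep their old, unrestricted behaviour. For comparison, a hypothetical call site that actually uses the new knobs might look like the following sketch; every value here is invented for illustration, only the parameter order follows the new newEntry signature:

    // Hypothetical profile: crawl depth 3, let only the depth-0 host feed the
    // domain filter, and fetch at most 500 pages from any single domain.
    plasmaCrawlProfile.entry pe = profiles.newEntry(
            "example",             // name
            "http://example.org",  // startURL
            ".*", ".*",            // generalFilter, specificFilter
            3, 3,                  // generalDepth, specificDepth
            60 * 24 * 30,          // recrawlIfOlder: one month, in minutes
            0,                     // domFilterDepth: only the start page feeds the filter
            500,                   // domMaxPages: per-domain page cap
            false,                 // crawlingQ: no dynamic URLs
            true, true,            // storeHTCache, storeTXCache
            true, false,           // localIndexing, remoteIndexing
            true, true, true);     // xsstopw, xdstopw, xpstopw
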
diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java
index 21768a45a..65da62f4c 100644
--- a/source/de/anomic/yacy/yacyClient.java
+++ b/source/de/anomic/yacy/yacyClient.java
@@ -465,7 +465,7 @@ public final class yacyClient {
         for (int n = 0; n < results; n++) {
             // get one single search result
             urlEntry = urlManager.newEntry((String) result.get("resource" + n), true);
-            if (urlEntry != null && blacklist.isListed(urlEntry.url())) { continue; } // block with backlist
+            if ((urlEntry == null) || (blacklist.isListed(urlEntry.url()))) { continue; } // block with blacklist
             urlEntry.store();
             int urlLength = urlEntry.url().toString().length();
             int urlComps = htmlFilterContentScraper.urlComps(urlEntry.url().toString()).length;
diff --git a/yacy.init b/yacy.init
index 0294312cd..b2d1af16e 100644
--- a/yacy.init
+++ b/yacy.init
@@ -329,13 +329,14 @@ browserPopUpApplication=netscape
 yacyOwnSeedFile=DATA/YACYDB/mySeed.txt
 yacyDB=DATA/YACYDB
 
-# index sharing attributes
-# by default, sharing is on. If you want to use the proxy only for
-# local indexing, you may switch this off
+# index sharing attributes: by default, sharing is on.
+# If you want to use YaCy only for local indexing (robinson mode),
+# you may switch this off
 allowDistributeIndex=true
 allowDistributeIndexWhileCrawling=false
 allowReceiveIndex=true
-indexReceiveBlockBlacklist=false
+allowUnlimitedReceiveIndexFrom=
+indexReceiveBlockBlacklist=true
 
 # the frequency is the number of links per minute, that the peer allowes
 # _every_ other peer to send to this peer
@@ -362,6 +363,9 @@ proxyCrawlOrder=false
 # Be careful with this number. Consider a branching factor of average 20;
 # A prefect-depth of 8 would index 25.600.000.000 pages, maybe the whole WWW.
 crawlingDepth=2
+crawlingIfOlder=525600
+crawlingDomFilterDepth=-1
+crawlingDomMaxPages=-1
 localIndexing=true
 
 # Filter for crawlinig; may be used to restrict a crawl to a specific domain
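
On the new yacy.init defaults above: crawlingIfOlder=525600 is exactly one year expressed in minutes, matching the minute-based recrawlIfOlder arithmetic in plasmaCrawlStacker, and the two -1 defaults leave the domain auto-filter and the per-domain page cap switched off. The arithmetic, spelled out:

    // The shipped default, spelled out: 525600 minutes is exactly one year.
    int crawlingIfOlder = 60 * 24 * 365;  // 60 min/h * 24 h/day * 365 days = 525600
    int crawlingDomFilterDepth = -1;      // negative: domain auto-filter disabled
    int crawlingDomMaxPages = -1;         // negative: no per-domain page cap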