diff --git a/htroot/IndexCreate_p.java b/htroot/IndexCreate_p.java
index 48293d705..e2b5ecec1 100644
--- a/htroot/IndexCreate_p.java
+++ b/htroot/IndexCreate_p.java
@@ -95,6 +95,10 @@ public class IndexCreate_p {
         env.setConfig("crawlingFilter", newcrawlingfilter);
         int newcrawlingdepth = Integer.parseInt(post.get("crawlingDepth", "0"));
         env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth));
+        int recrawlIfOlder = Integer.parseInt(post.get("recrawlIfOlder", "-1"));
+        env.setConfig("crawlingIfOlder", Integer.toString(recrawlIfOlder));
+        int autoDomFilterDepth = Integer.parseInt(post.get("autoDomFilterDepth", "-1"));
+        env.setConfig("crawlingautoDomFilterDepth", Integer.toString(autoDomFilterDepth));
         boolean crawlingQ = post.get("crawlingQ", "").equals("on");
         env.setConfig("crawlingQ", (crawlingQ) ? "true" : "false");
         boolean storeHTCache = post.get("storeHTCache", "").equals("on");
@@ -145,7 +149,7 @@ public class IndexCreate_p {
                 switchboard.urlPool.errorURL.remove(urlhash);

                 // stack url
-                plasmaCrawlProfile.entry pe = switchboard.profiles.newEntry(crawlingStartURL.getHost(), crawlingStart, newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw);
+                plasmaCrawlProfile.entry pe = switchboard.profiles.newEntry(crawlingStartURL.getHost(), crawlingStart, newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, recrawlIfOlder, autoDomFilterDepth, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw);
                 String reasonString = switchboard.sbStackCrawlThread.stackCrawl(crawlingStart, null, yacyCore.seedDB.mySeed.hash, "CRAWLING-ROOT", new Date(), 0, pe);

                 if (reasonString == null) {
@@ -206,7 +210,7 @@ public class IndexCreate_p {
                 HashMap hyperlinks = (HashMap) scraper.getAnchors();

                 // creating a crawler profile
-                plasmaCrawlProfile.entry profile = switchboard.profiles.newEntry(fileName, file.toURL().toString(), newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw);
+                plasmaCrawlProfile.entry profile = switchboard.profiles.newEntry(fileName, file.toURL().toString(), newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, recrawlIfOlder, autoDomFilterDepth, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw);

                 // loop through the contained links
                 Iterator interator = hyperlinks.entrySet().iterator();
diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java
index 3a176c9f2..56dd73040 100644
--- a/htroot/QuickCrawlLink_p.java
+++ b/htroot/QuickCrawlLink_p.java
@@ -164,8 +164,10 @@ public class QuickCrawlLink_p {
                 crawlingFilter,
                 CrawlingDepth,
                 CrawlingDepth,
+                60 * 24 * 30, // recrawlIfOlder (minutes); here: one month
+                -1,           // autoDomFilterDepth, if negative: no auto-filter
                 crawlDynamic,
                 storeHTCache,
                 true,
                 localIndexing,
                 remoteIndexing,
diff --git a/source/de/anomic/plasma/plasmaCrawlProfile.java b/source/de/anomic/plasma/plasmaCrawlProfile.java
index 3e03e01d3..808f16590 100644
--- a/source/de/anomic/plasma/plasmaCrawlProfile.java
+++ b/source/de/anomic/plasma/plasmaCrawlProfile.java
@@ -176,6 +176,7 @@ public class plasmaCrawlProfile {
     public entry newEntry(String name, String startURL, String generalFilter, String specificFilter,
                           int generalDepth, int specificDepth,
+                          int recrawlIfOlder /*minutes*/, int autoDomFilterDepth,
                           boolean crawlingQ, boolean storeHTCache, boolean storeTXCache,
                           boolean localIndexing, boolean remoteIndexing,
@@ -183,6 +184,7 @@ public class plasmaCrawlProfile {
         entry ne = new entry(name, startURL, generalFilter, specificFilter,
                              generalDepth, specificDepth,
+                             recrawlIfOlder, autoDomFilterDepth,
                              crawlingQ, storeHTCache, storeTXCache, localIndexing, remoteIndexing,
                              xsstopw, xdstopw, xpstopw);
         try {
@@ -225,6 +227,7 @@ public class plasmaCrawlProfile {
         private Map mem;
         public entry(String name, String startURL, String generalFilter, String specificFilter,
                      int generalDepth, int specificDepth,
+                     int recrawlIfOlder /*minutes*/, int autoDomFilterDepth,
                      boolean crawlingQ, boolean storeHTCache, boolean storeTXCache,
                      boolean localIndexing, boolean remoteIndexing,
@@ -238,6 +241,8 @@ public class plasmaCrawlProfile {
             mem.put("specificFilter", specificFilter);
             mem.put("generalDepth", Integer.toString(generalDepth));
             mem.put("specificDepth", Integer.toString(specificDepth));
+            mem.put("recrawlIfOlder", Integer.toString(recrawlIfOlder));
+            mem.put("autoDomFilterDepth", Integer.toString(autoDomFilterDepth));
             mem.put("crawlingQ", (crawlingQ) ? "true" : "false"); // crawling of urls with '?'
             mem.put("storeHTCache", (storeHTCache) ? "true" : "false");
             mem.put("storeTXCache", (storeTXCache) ? "true" : "false");
@@ -301,6 +306,27 @@ public class plasmaCrawlProfile {
                 return 0;
             }
         }
+        public long recrawlIfOlder() {
+            // returns a long (millis) that is the minimum age that
+            // an entry must have to be re-crawled
+            String r = (String) mem.get("recrawlIfOlder");
+            if (r == null) return Long.MAX_VALUE; else try {
+                long l = Long.parseLong(r) * ((long) 60000);
+                if (l < 0) return Long.MAX_VALUE; else return l;
+            } catch (NumberFormatException e) {
+                return 0;
+            }
+        }
+        public int autoDomFilterDepth() {
+            // if the crawl depth is equal to or less than this depth,
+            // the current url feeds the crawl filter with its domain
+            String r = (String) mem.get("autoDomFilterDepth");
+            if (r == null) return 0; else try {
+                return Integer.parseInt(r);
+            } catch (NumberFormatException e) {
+                return 0;
+            }
+        }
         public boolean crawlingQ() {
             String r = (String) mem.get("crawlingQ");
             if (r == null) return false; else return (r.equals("true"));
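
Note: the two accessors above are the heart of this patch. recrawlIfOlder() converts the stored minutes value to milliseconds and maps a missing or negative value to Long.MAX_VALUE, i.e. "never re-crawl". A minimal standalone sketch of these semantics, plus the staleness check a consumer might build on them; RecrawlAgeDemo, recrawlIfOlderMillis, and lastModified are illustrative names, not part of the patch:

// Standalone illustration of the recrawlIfOlder() semantics added above:
// stored minutes -> millis, with null or negative mapping to Long.MAX_VALUE ("never").
public class RecrawlAgeDemo {

    // mirrors the logic of plasmaCrawlProfile.entry.recrawlIfOlder()
    static long recrawlIfOlderMillis(String storedMinutes) {
        if (storedMinutes == null) return Long.MAX_VALUE;
        try {
            long l = Long.parseLong(storedMinutes) * 60000L;
            return (l < 0) ? Long.MAX_VALUE : l;
        } catch (NumberFormatException e) {
            return 0;
        }
    }

    public static void main(String[] args) {
        System.out.println(recrawlIfOlderMillis("43200")); // 2592000000 = 30 days in millis
        System.out.println(recrawlIfOlderMillis("-1"));    // Long.MAX_VALUE: never re-crawl

        // staleness check as a caller might write it; lastModified is a
        // hypothetical cache timestamp (epoch millis), assumed for the demo
        long lastModified = System.currentTimeMillis() - 31L * 24 * 60 * 60 * 1000; // 31 days ago
        boolean stale = System.currentTimeMillis() - lastModified > recrawlIfOlderMillis("43200");
        System.out.println(stale); // true: entry is older than the 30-day threshold
    }
}

Because Long.MAX_VALUE exceeds any real age, the "never re-crawl" case falls out of the same comparison with no special-casing.
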
diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java
index 79f1bbad5..38998c57f 100644
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -680,7 +680,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
             (getConfig(STR_PROXYPROFILE, "").length() == 0) ||
             (this.profiles.getEntry(getConfig(STR_PROXYPROFILE, "")) == null)) {
             // generate new default entry for proxy crawling
-            this.defaultProxyProfile = this.profiles.newEntry("proxy", "", ".*", ".*", Integer.parseInt(getConfig("proxyPrefetchDepth", "0")), Integer.parseInt(getConfig("proxyPrefetchDepth", "0")), false, true, true, true, getConfigBool("proxyCrawlOrder", false), true, true, true);
+            this.defaultProxyProfile = this.profiles.newEntry("proxy", "", ".*", ".*", Integer.parseInt(getConfig("proxyPrefetchDepth", "0")), Integer.parseInt(getConfig("proxyPrefetchDepth", "0")), 60 * 24 * 30, -1, false, true, true, true, getConfigBool("proxyCrawlOrder", false), true, true, true);
             setConfig(STR_PROXYPROFILE, this.defaultProxyProfile.handle());
         } else {
             this.defaultProxyProfile = this.profiles.getEntry(getConfig(STR_PROXYPROFILE, ""));
@@ -689,8 +689,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
             (getConfig(STR_REMOTEPROFILE, "").length() == 0) ||
             (profiles.getEntry(getConfig(STR_REMOTEPROFILE, "")) == null)) {
             // generate new default entry for remote crawling
-            defaultRemoteProfile = profiles.newEntry("remote", "", ".*", ".*", 0, 0, true, false, true, true, false, true, true, false);
-//          defaultRemoteProfile = profiles.newEntry("remote", "", ".*", ".*", 0, 0, true, true, true, true, true, true, true, false);
+            defaultRemoteProfile = profiles.newEntry("remote", "", ".*", ".*", 0, 0, 60 * 24 * 30, -1, true, false, true, true, false, true, true, false);
             setConfig(STR_REMOTEPROFILE, defaultRemoteProfile.handle());
         } else {
             defaultRemoteProfile = profiles.getEntry(getConfig(STR_REMOTEPROFILE, ""));
diff --git a/source/de/anomic/urlRedirector/urlRedirectord.java b/source/de/anomic/urlRedirector/urlRedirectord.java
index 30d8b1108..ea51a6326 100644
--- a/source/de/anomic/urlRedirector/urlRedirectord.java
+++ b/source/de/anomic/urlRedirector/urlRedirectord.java
@@ -45,7 +45,11 @@ public class urlRedirectord implements serverHandler {
                     ".*",
                     // depth
                     0,
                     0,
+                    // recrawlIfOlder (minutes), if negative: do not re-crawl
+                    -1,
+                    // autoDomFilterDepth, if negative: no auto-filter
+                    -1,
                     // crawlDynamic
                     false,
                     // storeHTCache
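
Note: the defaults wired into the proxy and remote profiles above are 60 * 24 * 30 = 43,200 minutes (one month) before a page becomes eligible for re-crawling, and -1 to disable the automatic domain filter. How the crawler consumes autoDomFilterDepth() is not shown in this patch; the following is only a sketch of the idea the accessor's comment describes, with AutoDomFilterDemo and all of its variables invented for illustration:

import java.util.HashSet;
import java.util.Set;

// Illustrative only: while the crawl is still at or above autoDomFilterDepth,
// each visited URL's domain is added to the crawl filter; -1 disables this.
public class AutoDomFilterDemo {
    public static void main(String[] args) {
        int autoDomFilterDepth = 2; // would come from profile.autoDomFilterDepth()
        Set<String> domainFilter = new HashSet<String>();
        // simulated (depth, host) pairs as the crawler descends
        Object[][] crawlSteps = {
            {0, "example.org"},
            {1, "mirror.example.net"},
            {3, "deep.example.com"}
        };
        for (Object[] step : crawlSteps) {
            int depth = (Integer) step[0];
            String host = (String) step[1];
            if (autoDomFilterDepth >= 0 && depth <= autoDomFilterDepth) {
                domainFilter.add(host); // shallow URLs feed the filter with their domain
            }
        }
        // contains example.org and mirror.example.net; deep.example.com stays out
        System.out.println(domainFilter);
    }
}
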