diff --git a/htroot/IndexCreate_p.html b/htroot/IndexCreate_p.html
index 46bb97e54..c065df3bf 100644
--- a/htroot/IndexCreate_p.html
+++ b/htroot/IndexCreate_p.html
@@ -43,6 +43,27 @@ You can define URLs as start points for Web page crawling and start crawling her
the crawling depth.
+<tr valign="top" class="TableCellDark">
+<td class=small>Re-Crawl Option:</td>
+<td class=small><input name="recrawlIfOlder" type="text" size="7" maxlength="9" value="#[crawlingIfOlder]#"> minutes</td>
+<td class=small>
+A page is crawled again only if it is older than the given number of minutes;
+a negative value means: do not re-crawl.
+</td>
+</tr>
+<tr valign="top" class="TableCellDark">
+<td class=small>Auto-Dom-Filter Depth:</td>
+<td class=small><input name="domFilterDepth" type="text" size="2" maxlength="2" value="#[crawlingDomFilterDepth]#"></td>
+<td class=small>
+Domains of pages found up to this crawl depth are fed into the domain filter;
+a negative value switches the auto-filter off.
+</td>
+</tr>
+<tr valign="top" class="TableCellDark">
+<td class=small>Maximum Pages per Domain:</td>
+<td class=small><input name="domMaxPages" type="text" size="6" maxlength="6" value="#[crawlingDomMaxPages]#"></td>
+<td class=small>
+No more than this number of pages is fetched from a single domain;
+a negative value means: no restriction.
+</td>
+</tr>
Accept URLs with '?' / dynamic URLs:
diff --git a/htroot/IndexCreate_p.java b/htroot/IndexCreate_p.java
index e2b5ecec1..e53914190 100644
--- a/htroot/IndexCreate_p.java
+++ b/htroot/IndexCreate_p.java
@@ -97,8 +97,10 @@ public class IndexCreate_p {
env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth));
int recrawlIfOlder = Integer.parseInt(post.get("recrawlIfOlder", "-1"));
env.setConfig("crawlingIfOlder", recrawlIfOlder);
- int autoDomFilterDepth = Integer.parseInt(post.get("autoDomFilterDepth", "-1"));
- env.setConfig("crawlingautoDomFilterDepth", Integer.toString(autoDomFilterDepth));
+ int domFilterDepth = Integer.parseInt(post.get("domFilterDepth", "-1"));
+ env.setConfig("crawlingDomFilterDepth", Integer.toString(domFilterDepth));
+ int domMaxPages = Integer.parseInt(post.get("domMaxPages", "-1"));
+ env.setConfig("crawlingDomMaxPages", Integer.toString(domMaxPages));
boolean crawlingQ = post.get("crawlingQ", "").equals("on");
env.setConfig("crawlingQ", (crawlingQ) ? "true" : "false");
boolean storeHTCache = post.get("storeHTCache", "").equals("on");
@@ -149,7 +151,7 @@ public class IndexCreate_p {
switchboard.urlPool.errorURL.remove(urlhash);
// stack url
- plasmaCrawlProfile.entry pe = switchboard.profiles.newEntry(crawlingStartURL.getHost(), crawlingStart, newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, recrawlIfOlder, autoDomFilterDepth, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw);
+ plasmaCrawlProfile.entry pe = switchboard.profiles.newEntry(crawlingStartURL.getHost(), crawlingStart, newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, recrawlIfOlder, domFilterDepth, domMaxPages, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw);
String reasonString = switchboard.sbStackCrawlThread.stackCrawl(crawlingStart, null, yacyCore.seedDB.mySeed.hash, "CRAWLING-ROOT", new Date(), 0, pe);
if (reasonString == null) {
@@ -210,7 +212,7 @@ public class IndexCreate_p {
HashMap hyperlinks = (HashMap) scraper.getAnchors();
// creating a crawler profile
- plasmaCrawlProfile.entry profile = switchboard.profiles.newEntry(fileName, file.toURL().toString(), newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, recrawlIfOlder, autoDomFilterDepth, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw);
+ plasmaCrawlProfile.entry profile = switchboard.profiles.newEntry(fileName, file.toURL().toString(), newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, recrawlIfOlder, domFilterDepth, domMaxPages, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw);
// loop through the contained links
Iterator interator = hyperlinks.entrySet().iterator();
@@ -299,6 +301,9 @@ public class IndexCreate_p {
prop.put("proxyPrefetchDepth", env.getConfig("proxyPrefetchDepth", "0"));
prop.put("crawlingDepth", env.getConfig("crawlingDepth", "0"));
prop.put("crawlingFilter", env.getConfig("crawlingFilter", "0"));
+ prop.put("crawlingIfOlder", env.getConfig("crawlingIfOlder", "-1"));
+ prop.put("crawlingDomFilterDepth", env.getConfig("crawlingDomFilterDepth", "-1"));
+ prop.put("crawlingDomMaxPages", env.getConfig("crawlingDomMaxPages", "-1"));
prop.put("crawlingQChecked", env.getConfig("crawlingQ", "").equals("true") ? 1 : 0);
prop.put("storeHTCacheChecked", env.getConfig("storeHTCache", "").equals("true") ? 1 : 0);
prop.put("localIndexingChecked", env.getConfig("localIndexing", "").equals("true") ? 1 : 0);
diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java
index 56dd73040..caaad4e85 100644
--- a/htroot/QuickCrawlLink_p.java
+++ b/htroot/QuickCrawlLink_p.java
@@ -165,7 +165,8 @@ public class QuickCrawlLink_p {
CrawlingDepth,
CrawlingDepth,
60 * 24 * 30, // recrawlIfOlder (minutes); here: one month
- -1, // autoDomFilterDepth, if negative: no auto-filter
+ -1, // domFilterDepth, if negative: no auto-filter
+ -1, // domMaxPages, if negative: no count restriction
crawlDynamic,
storeHTCache,
true,
diff --git a/source/de/anomic/plasma/plasmaCrawlProfile.java b/source/de/anomic/plasma/plasmaCrawlProfile.java
index 808f16590..2818bdf37 100644
--- a/source/de/anomic/plasma/plasmaCrawlProfile.java
+++ b/source/de/anomic/plasma/plasmaCrawlProfile.java
@@ -176,7 +176,7 @@ public class plasmaCrawlProfile {
public entry newEntry(String name, String startURL, String generalFilter, String specificFilter,
int generalDepth, int specificDepth,
- int recrawlIfOlder /*minutes*/, int autoDomFilterDepth,
+ int recrawlIfOlder /*minutes*/, int domFilterDepth, int domMaxPages,
boolean crawlingQ,
boolean storeHTCache, boolean storeTXCache,
boolean localIndexing, boolean remoteIndexing,
@@ -184,7 +184,7 @@ public class plasmaCrawlProfile {
entry ne = new entry(name, startURL, generalFilter, specificFilter,
generalDepth, specificDepth,
- recrawlIfOlder, autoDomFilterDepth,
+ recrawlIfOlder, domFilterDepth, domMaxPages,
crawlingQ, storeHTCache, storeTXCache, localIndexing, remoteIndexing,
xsstopw, xdstopw, xpstopw);
try {
@@ -225,9 +225,11 @@ public class plasmaCrawlProfile {
// this is a simple record structure that hold all properties of a single crawl start
private Map mem;
+ private Map doms;
+
public entry(String name, String startURL, String generalFilter, String specificFilter,
int generalDepth, int specificDepth,
- int recrawlIfOlder /*minutes*/, int autoDomFilterDepth,
+ int recrawlIfOlder /*minutes*/, int domFilterDepth, int domMaxPages,
boolean crawlingQ,
boolean storeHTCache, boolean storeTXCache,
boolean localIndexing, boolean remoteIndexing,
@@ -242,7 +244,8 @@ public class plasmaCrawlProfile {
mem.put("generalDepth", Integer.toString(generalDepth));
mem.put("specificDepth", Integer.toString(specificDepth));
mem.put("recrawlIfOlder", Integer.toString(recrawlIfOlder));
- mem.put("autoDomFilterDepth", Integer.toString(autoDomFilterDepth));
+ mem.put("domFilterDepth", Integer.toString(domFilterDepth));
+ mem.put("domMaxPages", Integer.toString(domMaxPages));
mem.put("crawlingQ", (crawlingQ) ? "true" : "false"); // crawling of urls with '?'
mem.put("storeHTCache", (storeHTCache) ? "true" : "false");
mem.put("storeTXCache", (storeTXCache) ? "true" : "false");
@@ -251,6 +254,8 @@ public class plasmaCrawlProfile {
mem.put("xsstopw", (xsstopw) ? "true" : "false"); // exclude static stop-words
mem.put("xdstopw", (xdstopw) ? "true" : "false"); // exclude dynamic stop-word
mem.put("xpstopw", (xpstopw) ? "true" : "false"); // exclude parent stop-words
+
+ doms = new HashMap();
}
public String toString() {
@@ -317,12 +322,27 @@ public class plasmaCrawlProfile {
return 0;
}
}
- public int autoDomFilterDepth() {
+ public int domFilterDepth() {
// if the depth is equal or less to this depth,
- // the the current url feeds with its domain the crawl filter
- String r = (String) mem.get("autoDomFilterDepth");
+ // then the current url's domain is fed into the crawl filter
+ // if this is -1, all domains are fed
+ String r = (String) mem.get("domFilterDepth");
if (r == null) return 0; else try {
- return Integer.parseInt(r);
+ int i = Integer.parseInt(r);
+ if (i < 0) return Integer.MAX_VALUE;
+ return i;
+ } catch (NumberFormatException e) {
+ return 0;
+ }
+ }
+ public int domMaxPages() {
+ // this is the maximum number of pages that are crawled for a single domain
+ // if -1, this means no limit
+ String r = (String) mem.get("domMaxPages");
+ if (r == null) return 0; else try {
+ int i = Integer.parseInt(r);
+ if (i < 0) return Integer.MAX_VALUE;
+ return i;
} catch (NumberFormatException e) {
return 0;
}
@@ -363,5 +383,32 @@ public class plasmaCrawlProfile {
mem.put(propName, newValue);
profileTable.set(handle(), mem);
}
+ public void domInc(String domain) {
+ Integer c = (Integer) doms.get(domain);
+ if (c == null) {
+ // new domain
+ doms.put(domain, new Integer(1));
+ } else {
+ // increase counter
+ doms.put(domain, new Integer(c.intValue() + 1));
+ }
+ }
+ public int domCount(String domain) {
+ Integer c = (Integer) doms.get(domain);
+ if (c == null) {
+ return 0;
+ } else {
+ return c.intValue();
+ }
+ }
+ public int domSize() {
+ return doms.size();
+ }
+ public boolean domExists(String domain) {
+ return doms.containsKey(domain);
+ }
+ public Iterator domNames() {
+ return doms.keySet().iterator();
+ }
}
}
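The new per-domain bookkeeping in plasmaCrawlProfile.entry is compact enough to restate in isolation. The sketch below mimics domInc()/domCount() and the negative-means-unlimited convention of domMaxPages(); the class and constructor wrapper are illustrative, not part of the patch:

    import java.util.HashMap;
    import java.util.Map;

    // Standalone illustration of the per-domain counter added to plasmaCrawlProfile.entry.
    class DomainBudget {
        private final Map doms = new HashMap();   // domain name -> Integer page count
        private final int domMaxPages;            // negative input means "no limit"

        DomainBudget(int domMaxPages) {
            this.domMaxPages = (domMaxPages < 0) ? Integer.MAX_VALUE : domMaxPages;
        }

        void domInc(String domain) {
            Integer c = (Integer) doms.get(domain);
            doms.put(domain, new Integer((c == null) ? 1 : c.intValue() + 1));
        }

        int domCount(String domain) {
            Integer c = (Integer) doms.get(domain);
            return (c == null) ? 0 : c.intValue();
        }

        // mirrors the stacker check: a URL is rejected once its domain is over budget
        boolean exceeded(String domain) {
            return domCount(domain) > domMaxPages;
        }
    }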
diff --git a/source/de/anomic/plasma/plasmaCrawlStacker.java b/source/de/anomic/plasma/plasmaCrawlStacker.java
index 9b182a731..8dd191141 100644
--- a/source/de/anomic/plasma/plasmaCrawlStacker.java
+++ b/source/de/anomic/plasma/plasmaCrawlStacker.java
@@ -311,10 +311,36 @@ public final class plasmaCrawlStacker {
return reason;
}
+ // add domain to profile domain list
+ if (currentdepth <= profile.domFilterDepth()) {
+ profile.domInc(nexturl.getHost());
+ }
+
+ // deny urls that do not match with the profile domain list
+ if (profile.domCount(nexturl.getHost()) == 0) {
+ reason = "denied_(no_match_with_domain_filter)";
+ this.log.logFine("URL '" + nexturlString + "' is not listed in granted domains. " +
+ "Stack processing time: " + (System.currentTimeMillis()-startTime));
+ return reason;
+ }
+
+ // deny urls that exceed allowed number of occurrences
+ if (profile.domCount(nexturl.getHost()) > profile.domMaxPages()) {
+ reason = "denied_(domain_count_exceeded)";
+ this.log.logFine("URL '" + nexturlString + "' appeared too often, a maximum of " + profile.domMaxPages() + " is allowed. "+
+ "Stack processing time: " + (System.currentTimeMillis()-startTime));
+ return reason;
+ }
+
String nexturlhash = plasmaURL.urlHash(nexturl);
- String dbocc = "";
- if ((dbocc = this.sb.urlPool.exists(nexturlhash)) != null) {
- // DISTIGUISH OLD/RE-SEARCH CASES HERE!
+ String dbocc = this.sb.urlPool.exists(nexturlhash);
+ plasmaCrawlLURL.Entry oldEntry = null;
+ if (dbocc != null) try {
+ oldEntry = this.sb.urlPool.loadedURL.getEntry(nexturlhash, null);
+ } catch (IOException e) {}
+ boolean recrawl = (oldEntry != null) &&
+ (((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000) > profile.recrawlIfOlder());
+ if ((dbocc != null) && (!(recrawl))) {
reason = "double_(registered_in_" + dbocc + ")";
/*
urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash,
@@ -323,7 +349,7 @@ public final class plasmaCrawlStacker {
"Stack processing time: " + (System.currentTimeMillis()-startTime));
return reason;
}
-
+
// checking robots.txt
if (robotsParser.isDisallowed(nexturl)) {
reason = "denied_(robots.txt)";
@@ -334,6 +360,12 @@ public final class plasmaCrawlStacker {
"Stack processing time: " + (System.currentTimeMillis()-startTime));
return reason;
}
+
+ // show potential re-crawl
+ if (recrawl) {
+ this.log.logFine("RE-CRAWL of URL '" + nexturlString + "': this url was crawled " +
+ ((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000 / 60 / 24) + " days ago.");
+ }
// store information
boolean local = ((initiatorHash.equals(plasmaURL.dummyHash)) || (initiatorHash.equals(yacyCore.seedDB.mySeed.hash)));
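The re-crawl decision inserted above reduces to an age comparison in minutes: (now - loaddate) / 60000 against recrawlIfOlder(). A minimal, self-contained restatement of that arithmetic (only the names loaddate and recrawlIfOlder come from the patch):

    import java.util.Date;

    // Sketch of the re-crawl test used in plasmaCrawlStacker above: an already
    // known URL is crawled again only if its last load is older than the
    // profile's recrawlIfOlder threshold (given in minutes).
    class RecrawlCheck {
        static boolean shouldRecrawl(Date loaddate, int recrawlIfOlderMinutes) {
            long ageMinutes = (System.currentTimeMillis() - loaddate.getTime()) / 60000L;
            return ageMinutes > recrawlIfOlderMinutes;
        }

        public static void main(String[] args) {
            // a page loaded 45 days ago against the one-month default (60 * 24 * 30 minutes)
            Date loaded = new Date(System.currentTimeMillis() - 45L * 24 * 60 * 60 * 1000);
            System.out.println(shouldRecrawl(loaded, 60 * 24 * 30)); // prints true
        }
    }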
diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java
index 38998c57f..1c032fb54 100644
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -680,7 +680,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
(getConfig(STR_PROXYPROFILE, "").length() == 0) ||
(this.profiles.getEntry(getConfig(STR_PROXYPROFILE, "")) == null)) {
// generate new default entry for proxy crawling
- this.defaultProxyProfile = this.profiles.newEntry("proxy", "", ".*", ".*", Integer.parseInt(getConfig("proxyPrefetchDepth", "0")), Integer.parseInt(getConfig("proxyPrefetchDepth", "0")), 60 * 24 * 30, -1, false, true, true, true, getConfigBool("proxyCrawlOrder", false), true, true, true);
+ this.defaultProxyProfile = this.profiles.newEntry("proxy", "", ".*", ".*", Integer.parseInt(getConfig("proxyPrefetchDepth", "0")), Integer.parseInt(getConfig("proxyPrefetchDepth", "0")), 60 * 24 * 30, -1, -1, false, true, true, true, getConfigBool("proxyCrawlOrder", false), true, true, true);
setConfig(STR_PROXYPROFILE, this.defaultProxyProfile.handle());
} else {
this.defaultProxyProfile = this.profiles.getEntry(getConfig(STR_PROXYPROFILE, ""));
@@ -689,7 +689,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
(getConfig(STR_REMOTEPROFILE, "").length() == 0) ||
(profiles.getEntry(getConfig(STR_REMOTEPROFILE, "")) == null)) {
// generate new default entry for remote crawling
- defaultRemoteProfile = profiles.newEntry("remote", "", ".*", ".*", 0, 0, 60 * 24 * 30, -1, true, false, true, true, false, true, true, false);
+ defaultRemoteProfile = profiles.newEntry("remote", "", ".*", ".*", 0, 0, 60 * 24 * 30, -1, -1, true, false, true, true, false, true, true, false);
setConfig(STR_REMOTEPROFILE, defaultRemoteProfile.handle());
} else {
defaultRemoteProfile = profiles.getEntry(getConfig(STR_REMOTEPROFILE, ""));
diff --git a/source/de/anomic/urlRedirector/urlRedirectord.java b/source/de/anomic/urlRedirector/urlRedirectord.java
index ea51a6326..e9c4679b4 100644
--- a/source/de/anomic/urlRedirector/urlRedirectord.java
+++ b/source/de/anomic/urlRedirector/urlRedirectord.java
@@ -48,7 +48,9 @@ public class urlRedirectord implements serverHandler {
0,
// recrawlIfOlder (minutes), if negative: do not re-crawl
-1,
- // autoDomFilterDepth, if negative: no auto-filter
+ // domFilterDepth, if negative: no auto-filter
+ -1,
+ // domMaxPages, if negative: no count restriction
-1,
// crawlDynamic
false,
diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java
index 21768a45a..65da62f4c 100644
--- a/source/de/anomic/yacy/yacyClient.java
+++ b/source/de/anomic/yacy/yacyClient.java
@@ -465,7 +465,7 @@ public final class yacyClient {
for (int n = 0; n < results; n++) {
// get one single search result
urlEntry = urlManager.newEntry((String) result.get("resource" + n), true);
- if (urlEntry != null && blacklist.isListed(urlEntry.url())) { continue; } // block with backlist
+ if ((urlEntry == null) || (blacklist.isListed(urlEntry.url()))) { continue; } // block with blacklist
urlEntry.store();
int urlLength = urlEntry.url().toString().length();
int urlComps = htmlFilterContentScraper.urlComps(urlEntry.url().toString()).length;
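The yacyClient change is a bug fix rather than part of the renaming: with the old condition a null urlEntry was not skipped, so the following urlEntry.store() threw a NullPointerException. A tiny demonstration of why the corrected operand order is safe (the variable names here are stand-ins):

    // Old form: (urlEntry != null && blacklisted) is false for a null entry, so the
    // entry is NOT skipped and a later dereference throws a NullPointerException.
    // New form: the null check fires first; short-circuit evaluation keeps url()
    // from ever being called on a null entry.
    public class GuardDemo {
        public static void main(String[] args) {
            String urlEntry = null;        // stands in for a failed newEntry(...)
            boolean blacklisted = false;
            if ((urlEntry == null) || (blacklisted)) {
                System.out.println("entry skipped");   // prints "entry skipped"
            }
        }
    }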
diff --git a/yacy.init b/yacy.init
index 0294312cd..b2d1af16e 100644
--- a/yacy.init
+++ b/yacy.init
@@ -329,13 +329,14 @@ browserPopUpApplication=netscape
yacyOwnSeedFile=DATA/YACYDB/mySeed.txt
yacyDB=DATA/YACYDB
-# index sharing attributes
-# by default, sharing is on. If you want to use the proxy only for
-# local indexing, you may switch this off
+# index sharing attributes: by default, sharing is on.
+# If you want to use YaCy only for local indexing (robinson mode),
+# you may switch this off
allowDistributeIndex=true
allowDistributeIndexWhileCrawling=false
allowReceiveIndex=true
-indexReceiveBlockBlacklist=false
+allowUnlimitedReceiveIndexFrom=
+indexReceiveBlockBlacklist=true
# the frequency is the number of links per minute, that the peer allowes
# _every_ other peer to send to this peer
@@ -362,6 +363,9 @@ proxyCrawlOrder=false
# Be careful with this number. Consider a branching factor of average 20;
# A prefect-depth of 8 would index 25.600.000.000 pages, maybe the whole WWW.
crawlingDepth=2
+crawlingIfOlder=525600
+crawlingDomFilterDepth=-1
+crawlingDomMaxPages=-1
localIndexing=true
# Filter for crawlinig; may be used to restrict a crawl to a specific domain
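The new yacy.init defaults keep the new features effectively switched off: crawlingIfOlder=525600 is one year expressed in minutes, and the two -1 values are mapped to Integer.MAX_VALUE by domFilterDepth()/domMaxPages(), which disables the domain filter and the per-domain page cap. For reference, the conversion (illustrative only):

    // 525600 minutes = 60 * 24 * 365, i.e. pages younger than one year are not re-crawled
    public class InitDefaults {
        public static void main(String[] args) {
            System.out.println(60 * 24 * 365);   // prints 525600
        }
    }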