Some pre-work (not yet functional) to implement:

- re-crawl (triggered by the age of the last crawl)
- auto-crawl-filter by crawl depth (see the usage sketch below)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1948 6c8d7289-2bf4-0310-a012-ef5d649a1542
Author: orbiter
Parent: 0a4c2e89ed
Commit: 1fc3b34be6

@@ -95,6 +95,10 @@ public class IndexCreate_p {
 env.setConfig("crawlingFilter", newcrawlingfilter);
 int newcrawlingdepth = Integer.parseInt(post.get("crawlingDepth", "0"));
 env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth));
+int recrawlIfOlder = Integer.parseInt(post.get("recrawlIfOlder", "-1"));
+env.setConfig("crawlingIfOlder", Integer.toString(recrawlIfOlder));
+int autoDomFilterDepth = Integer.parseInt(post.get("autoDomFilterDepth", "-1"));
+env.setConfig("crawlingautoDomFilterDepth", Integer.toString(autoDomFilterDepth));
 boolean crawlingQ = post.get("crawlingQ", "").equals("on");
 env.setConfig("crawlingQ", (crawlingQ) ? "true" : "false");
 boolean storeHTCache = post.get("storeHTCache", "").equals("on");
@@ -145,7 +149,7 @@ public class IndexCreate_p {
 switchboard.urlPool.errorURL.remove(urlhash);
 // stack url
-plasmaCrawlProfile.entry pe = switchboard.profiles.newEntry(crawlingStartURL.getHost(), crawlingStart, newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw);
+plasmaCrawlProfile.entry pe = switchboard.profiles.newEntry(crawlingStartURL.getHost(), crawlingStart, newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, recrawlIfOlder, autoDomFilterDepth, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw);
 String reasonString = switchboard.sbStackCrawlThread.stackCrawl(crawlingStart, null, yacyCore.seedDB.mySeed.hash, "CRAWLING-ROOT", new Date(), 0, pe);
 if (reasonString == null) {
@@ -206,7 +210,7 @@ public class IndexCreate_p {
 HashMap hyperlinks = (HashMap) scraper.getAnchors();
 // creating a crawler profile
-plasmaCrawlProfile.entry profile = switchboard.profiles.newEntry(fileName, file.toURL().toString(), newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw);
+plasmaCrawlProfile.entry profile = switchboard.profiles.newEntry(fileName, file.toURL().toString(), newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, recrawlIfOlder, autoDomFilterDepth, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw);
 // loop through the contained links
 Iterator interator = hyperlinks.entrySet().iterator();

@@ -164,8 +164,10 @@ public class QuickCrawlLink_p {
 crawlingFilter,
 CrawlingDepth,
 CrawlingDepth,
+60 * 24 * 30, // recrawlIfOlder (minutes); here: one month
+-1,           // autoDomFilterDepth; if negative: no auto-filter
 crawlDynamic,
 storeHTCache,
 storeHTCache,
 true,
 localIndexing,
 remoteIndexing,

@@ -176,6 +176,7 @@ public class plasmaCrawlProfile {
 public entry newEntry(String name, String startURL, String generalFilter, String specificFilter,
 int generalDepth, int specificDepth,
+int recrawlIfOlder /*minutes*/, int autoDomFilterDepth,
 boolean crawlingQ,
 boolean storeHTCache, boolean storeTXCache,
 boolean localIndexing, boolean remoteIndexing,
@@ -183,6 +184,7 @@ public class plasmaCrawlProfile {
 entry ne = new entry(name, startURL, generalFilter, specificFilter,
 generalDepth, specificDepth,
+recrawlIfOlder, autoDomFilterDepth,
 crawlingQ, storeHTCache, storeTXCache, localIndexing, remoteIndexing,
 xsstopw, xdstopw, xpstopw);
 try {
@@ -225,6 +227,7 @@ public class plasmaCrawlProfile {
 private Map mem;
 public entry(String name, String startURL, String generalFilter, String specificFilter,
 int generalDepth, int specificDepth,
+int recrawlIfOlder /*minutes*/, int autoDomFilterDepth,
 boolean crawlingQ,
 boolean storeHTCache, boolean storeTXCache,
 boolean localIndexing, boolean remoteIndexing,
@@ -238,6 +241,8 @@ public class plasmaCrawlProfile {
 mem.put("specificFilter", specificFilter);
 mem.put("generalDepth", Integer.toString(generalDepth));
 mem.put("specificDepth", Integer.toString(specificDepth));
+mem.put("recrawlIfOlder", Integer.toString(recrawlIfOlder));
+mem.put("autoDomFilterDepth", Integer.toString(autoDomFilterDepth));
 mem.put("crawlingQ", (crawlingQ) ? "true" : "false"); // crawling of urls with '?'
 mem.put("storeHTCache", (storeHTCache) ? "true" : "false");
 mem.put("storeTXCache", (storeTXCache) ? "true" : "false");
@@ -301,6 +306,27 @@ public class plasmaCrawlProfile {
 return 0;
 }
 }
+public long recrawlIfOlder() {
+    // returns a long (millis) that is the minimum age that
+    // an entry must have to be re-crawled
+    String r = (String) mem.get("recrawlIfOlder");
+    if (r == null) return Long.MAX_VALUE; else try {
+        long l = Long.parseLong(r) * ((long) 60000);
+        if (l < 0) return Long.MAX_VALUE; else return l;
+    } catch (NumberFormatException e) {
+        return 0;
+    }
+}
+public int autoDomFilterDepth() {
+    // if the crawl depth is equal to or less than this value,
+    // the current url feeds the crawl filter with its domain
+    String r = (String) mem.get("autoDomFilterDepth");
+    if (r == null) return 0; else try {
+        return Integer.parseInt(r);
+    } catch (NumberFormatException e) {
+        return 0;
+    }
+}
 public boolean crawlingQ() {
 String r = (String) mem.get("crawlingQ");
 if (r == null) return false; else return (r.equals("true"));
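Since the commit message marks both features as pre-work without function yet, here is a minimal usage sketch of the two accessors above. It is an assumption, not code from this commit: shouldRecrawl, widenFilter, lastCrawl, and the host/depth parameters are hypothetical placeholder names.

import java.util.Date;

// Sketch only, not part of this commit: how the new profile fields could
// drive crawl decisions once the crawler logic is wired up.
public class CrawlProfileSketch {

    // re-crawl: an entry is due again once its age exceeds the minimum
    // age from profile.recrawlIfOlder() (millis; Long.MAX_VALUE disables)
    static boolean shouldRecrawl(long recrawlIfOlderMillis, Date lastCrawl) {
        long age = System.currentTimeMillis() - lastCrawl.getTime();
        return age > recrawlIfOlderMillis;
    }

    // auto-dom-filter: while the link depth is still within
    // profile.autoDomFilterDepth(), widen the crawl filter with the host
    // of the stacked URL (regex escaping of the host is glossed over)
    static String widenFilter(String filter, int autoDomFilterDepth,
                              int currentDepth, String host) {
        if (currentDepth <= autoDomFilterDepth) return filter + "|.*" + host + ".*";
        return filter;
    }

    public static void main(String[] args) {
        long oneMonth = 60L * 24 * 30 * 60000L; // minutes, times millis per minute
        Date lastCrawl = new Date(System.currentTimeMillis() - 2 * oneMonth);
        System.out.println(shouldRecrawl(oneMonth, lastCrawl));     // true
        System.out.println(widenFilter(".*", 2, 1, "example.org")); // .*|.*example.org.*
    }
}

The 60L * 24 * 30 constant matches the one-month default passed to the profiles below, and the * 60000L mirrors the minutes-to-millis conversion inside recrawlIfOlder().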

@@ -680,7 +680,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
 (getConfig(STR_PROXYPROFILE, "").length() == 0) ||
 (this.profiles.getEntry(getConfig(STR_PROXYPROFILE, "")) == null)) {
 // generate new default entry for proxy crawling
-this.defaultProxyProfile = this.profiles.newEntry("proxy", "", ".*", ".*", Integer.parseInt(getConfig("proxyPrefetchDepth", "0")), Integer.parseInt(getConfig("proxyPrefetchDepth", "0")), false, true, true, true, getConfigBool("proxyCrawlOrder", false), true, true, true);
+this.defaultProxyProfile = this.profiles.newEntry("proxy", "", ".*", ".*", Integer.parseInt(getConfig("proxyPrefetchDepth", "0")), Integer.parseInt(getConfig("proxyPrefetchDepth", "0")), 60 * 24 * 30, -1, false, true, true, true, getConfigBool("proxyCrawlOrder", false), true, true, true);
 setConfig(STR_PROXYPROFILE, this.defaultProxyProfile.handle());
 } else {
 this.defaultProxyProfile = this.profiles.getEntry(getConfig(STR_PROXYPROFILE, ""));
@@ -689,8 +689,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
 (getConfig(STR_REMOTEPROFILE, "").length() == 0) ||
 (profiles.getEntry(getConfig(STR_REMOTEPROFILE, "")) == null)) {
 // generate new default entry for remote crawling
-defaultRemoteProfile = profiles.newEntry("remote", "", ".*", ".*", 0, 0, true, false, true, true, false, true, true, false);
-// defaultRemoteProfile = profiles.newEntry("remote", "", ".*", ".*", 0, 0, true, true, true, true, true, true, true, false);
+defaultRemoteProfile = profiles.newEntry("remote", "", ".*", ".*", 0, 0, 60 * 24 * 30, -1, true, false, true, true, false, true, true, false);
 setConfig(STR_REMOTEPROFILE, defaultRemoteProfile.handle());
 } else {
 defaultRemoteProfile = profiles.getEntry(getConfig(STR_REMOTEPROFILE, ""));

@@ -45,7 +45,11 @@ public class urlRedirectord implements serverHandler {
 ".*",
 // depth
 0,
 0,
 0,
+// recrawlIfOlder (minutes), if negative: do not re-crawl
+-1,
+// autoDomFilterDepth, if negative: no auto-filter
+-1,
 // crawlDynamic
 false,
 // storeHTCache