added 3 new crawl steering options:

- re-crawl by age of page (entered in minutes)
- auto-domain-filter
- maximum number of pages per domain

NOT YET TESTED!

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1949 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 19 years ago
parent 1fc3b34be6
commit 63f39ac7b5
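
In rough terms, the auto-domain-filter and the per-domain page maximum work together as in the following minimal, self-contained Java sketch. The class and method names here are invented for illustration only; the real changes live in plasmaCrawlProfile and plasmaCrawlStacker, shown in the diff below.

    import java.util.HashMap;
    import java.util.Map;

    // Illustrative sketch of the new steering idea: domains seen at a shallow crawl
    // depth are granted, and each granted domain gets a limited page budget.
    public class DomainSteeringSketch {

        private final Map<String, Integer> domainCounts = new HashMap<String, Integer>();
        private final int domFilterDepth; // negative input means: no auto-domain-filter
        private final int domMaxPages;    // negative input means: no per-domain page limit

        public DomainSteeringSketch(int domFilterDepth, int domMaxPages) {
            this.domFilterDepth = (domFilterDepth < 0) ? Integer.MAX_VALUE : domFilterDepth;
            this.domMaxPages = (domMaxPages < 0) ? Integer.MAX_VALUE : domMaxPages;
        }

        // called for every URL that is about to be stacked: a URL found at or above
        // the filter depth adds to (and counts against) its domain's page budget
        public boolean accept(String host, int depth) {
            if (depth <= domFilterDepth) {
                Integer c = domainCounts.get(host);
                domainCounts.put(host, (c == null) ? 1 : (c.intValue() + 1));
            }
            Integer count = domainCounts.get(host);
            if (count == null) return false;        // domain was never granted
            return count.intValue() <= domMaxPages; // page budget for this domain not exceeded
        }

        public static void main(String[] args) {
            DomainSteeringSketch s = new DomainSteeringSketch(1, 2);
            System.out.println(s.accept("example.org", 0)); // true:  grants example.org (1st page)
            System.out.println(s.accept("example.org", 1)); // true:  2nd page, budget of 2 not exceeded
            System.out.println(s.accept("example.org", 1)); // false: 3rd page exceeds domMaxPages = 2
            System.out.println(s.accept("other.net", 3));   // false: other.net was never granted
        }
    }

Passing -1 for either value disables the corresponding restriction, which is what the default proxy and remote profiles created in plasmaSwitchboard do.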

@@ -43,6 +43,27 @@ You can define URLs as start points for Web page crawling and start crawling her
the crawling depth.
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td class=small>Re-Crawl Option:</td>
<td class=small><input name="crawlingIfOlder" type="text" size="5" maxlength="2" value="#[crawlingIfOlder]#"></td>
<td class=small>
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td class=small>Auto-Dom-Filter Depth:</td>
<td class=small><input name="crawlingDomFilterDepth" type="text" size="2" maxlength="2" value="#[crawlingDomFilterDepth]#"></td>
<td class=small>
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td class=small>Maximum Pages per Domain:</td>
<td class=small><input name="crawlingDomMaxPages" type="text" size="5" maxlength="2" value="#[crawlingDomMaxPages]#"></td>
<td class=small>
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td class=small>Accept URLs with '?' / dynamic URLs:</td>
<td class=small><input type="checkbox" name="crawlingQ" align="top" #(crawlingQChecked)#::checked#(/crawlingQChecked)#></td>

@@ -97,8 +97,10 @@ public class IndexCreate_p {
env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth));
int recrawlIfOlder = Integer.parseInt(post.get("recrawlIfOlder", "-1"));
env.setConfig("crawlingIfOlder", recrawlIfOlder);
int autoDomFilterDepth = Integer.parseInt(post.get("autoDomFilterDepth", "-1"));
env.setConfig("crawlingautoDomFilterDepth", Integer.toString(autoDomFilterDepth));
int domFilterDepth = Integer.parseInt(post.get("domFilterDepth", "-1"));
env.setConfig("crawlingDomFilterDepth", Integer.toString(domFilterDepth));
int domMaxPages = Integer.parseInt(post.get("domMaxPages", "-1"));
env.setConfig("crawlingDomMaxPages", Integer.toString(domMaxPages));
boolean crawlingQ = post.get("crawlingQ", "").equals("on");
env.setConfig("crawlingQ", (crawlingQ) ? "true" : "false");
boolean storeHTCache = post.get("storeHTCache", "").equals("on");
@@ -149,7 +151,7 @@ public class IndexCreate_p {
switchboard.urlPool.errorURL.remove(urlhash);
// stack url
plasmaCrawlProfile.entry pe = switchboard.profiles.newEntry(crawlingStartURL.getHost(), crawlingStart, newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, recrawlIfOlder, autoDomFilterDepth, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw);
plasmaCrawlProfile.entry pe = switchboard.profiles.newEntry(crawlingStartURL.getHost(), crawlingStart, newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, recrawlIfOlder, domFilterDepth, domMaxPages, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw);
String reasonString = switchboard.sbStackCrawlThread.stackCrawl(crawlingStart, null, yacyCore.seedDB.mySeed.hash, "CRAWLING-ROOT", new Date(), 0, pe);
if (reasonString == null) {
@@ -210,7 +212,7 @@ public class IndexCreate_p {
HashMap hyperlinks = (HashMap) scraper.getAnchors();
// creating a crawler profile
plasmaCrawlProfile.entry profile = switchboard.profiles.newEntry(fileName, file.toURL().toString(), newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, recrawlIfOlder, autoDomFilterDepth, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw);
plasmaCrawlProfile.entry profile = switchboard.profiles.newEntry(fileName, file.toURL().toString(), newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, recrawlIfOlder, domFilterDepth, domMaxPages, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw);
// loop through the contained links
Iterator interator = hyperlinks.entrySet().iterator();
@@ -299,6 +301,9 @@ public class IndexCreate_p {
prop.put("proxyPrefetchDepth", env.getConfig("proxyPrefetchDepth", "0"));
prop.put("crawlingDepth", env.getConfig("crawlingDepth", "0"));
prop.put("crawlingFilter", env.getConfig("crawlingFilter", "0"));
prop.put("crawlingIfOlder", env.getConfig("crawlingIfOlder", "-1"));
prop.put("crawlingDomFilterDepth", env.getConfig("crawlingDomFilterDepth", "-1"));
prop.put("crawlingDomMaxPages", env.getConfig("crawlingDomMaxPages", "-1"));
prop.put("crawlingQChecked", env.getConfig("crawlingQ", "").equals("true") ? 1 : 0);
prop.put("storeHTCacheChecked", env.getConfig("storeHTCache", "").equals("true") ? 1 : 0);
prop.put("localIndexingChecked", env.getConfig("localIndexing", "").equals("true") ? 1 : 0);

@@ -165,7 +165,8 @@ public class QuickCrawlLink_p {
CrawlingDepth,
CrawlingDepth,
60 * 24 * 30, // recrawlIfOlder (minutes); here: one month
-1, // autoDomFilterDepth, if negative: no auto-filter
-1, // domFilterDepth, if negative: no auto-filter
-1, // domMaxPages, if negative: no count restriction
crawlDynamic,
storeHTCache,
true,

@@ -176,7 +176,7 @@ public class plasmaCrawlProfile {
public entry newEntry(String name, String startURL, String generalFilter, String specificFilter,
int generalDepth, int specificDepth,
int recrawlIfOlder /*minutes*/, int autoDomFilterDepth,
int recrawlIfOlder /*minutes*/, int domFilterDepth, int domMaxPages,
boolean crawlingQ,
boolean storeHTCache, boolean storeTXCache,
boolean localIndexing, boolean remoteIndexing,
@@ -184,7 +184,7 @@ public class plasmaCrawlProfile {
entry ne = new entry(name, startURL, generalFilter, specificFilter,
generalDepth, specificDepth,
recrawlIfOlder, autoDomFilterDepth,
recrawlIfOlder, domFilterDepth, domMaxPages,
crawlingQ, storeHTCache, storeTXCache, localIndexing, remoteIndexing,
xsstopw, xdstopw, xpstopw);
try {
@@ -225,9 +225,11 @@ public class plasmaCrawlProfile {
// this is a simple record structure that holds all properties of a single crawl start
private Map mem;
private Map doms;
public entry(String name, String startURL, String generalFilter, String specificFilter,
int generalDepth, int specificDepth,
int recrawlIfOlder /*minutes*/, int autoDomFilterDepth,
int recrawlIfOlder /*minutes*/, int domFilterDepth, int domMaxPages,
boolean crawlingQ,
boolean storeHTCache, boolean storeTXCache,
boolean localIndexing, boolean remoteIndexing,
@@ -242,7 +244,8 @@ public class plasmaCrawlProfile {
mem.put("generalDepth", Integer.toString(generalDepth));
mem.put("specificDepth", Integer.toString(specificDepth));
mem.put("recrawlIfOlder", Integer.toString(recrawlIfOlder));
mem.put("autoDomFilterDepth", Integer.toString(autoDomFilterDepth));
mem.put("domFilterDepth", Integer.toString(domFilterDepth));
mem.put("domMaxPages", Integer.toString(domMaxPages));
mem.put("crawlingQ", (crawlingQ) ? "true" : "false"); // crawling of urls with '?'
mem.put("storeHTCache", (storeHTCache) ? "true" : "false");
mem.put("storeTXCache", (storeTXCache) ? "true" : "false");
@@ -251,6 +254,8 @@ public class plasmaCrawlProfile {
mem.put("xsstopw", (xsstopw) ? "true" : "false"); // exclude static stop-words
mem.put("xdstopw", (xdstopw) ? "true" : "false"); // exclude dynamic stop-word
mem.put("xpstopw", (xpstopw) ? "true" : "false"); // exclude parent stop-words
doms = new HashMap();
}
public String toString() {
@@ -317,12 +322,27 @@ public class plasmaCrawlProfile {
return 0;
}
}
public int autoDomFilterDepth() {
public int domFilterDepth() {
// if the depth is equal or less to this depth,
// the the current url feeds with its domain the crawl filter
String r = (String) mem.get("autoDomFilterDepth");
// then the current url feeds the crawl filter with its domain
// if this is -1, all domains are fed
String r = (String) mem.get("domFilterDepth");
if (r == null) return 0; else try {
return Integer.parseInt(r);
int i = Integer.parseInt(r);
if (i < 0) return Integer.MAX_VALUE;
return i;
} catch (NumberFormatException e) {
return 0;
}
}
public int domMaxPages() {
// this is the maximum number of pages that are crawled for a single domain
// if -1, this means no limit
String r = (String) mem.get("domMaxPages");
if (r == null) return 0; else try {
int i = Integer.parseInt(r);
if (i < 0) return Integer.MAX_VALUE;
return i;
} catch (NumberFormatException e) {
return 0;
}
@@ -363,5 +383,32 @@ public class plasmaCrawlProfile {
mem.put(propName, newValue);
profileTable.set(handle(), mem);
}
public void domInc(String domain) {
Integer c = (Integer) doms.get(domain);
if (c == null) {
// new domain
doms.put(domain, new Integer(1));
} else {
// increase counter
doms.put(domain, new Integer(c.intValue() + 1));
}
}
public int domCount(String domain) {
Integer c = (Integer) doms.get(domain);
if (c == null) {
return 0;
} else {
return c.intValue();
}
}
public int domSize() {
return doms.size();
}
public boolean domExists(String domain) {
return doms.containsKey(domain);
}
public Iterator domNames() {
return doms.keySet().iterator();
}
}
}

@@ -311,10 +311,36 @@ public final class plasmaCrawlStacker {
return reason;
}
// add domain to profile domain list
if (currentdepth <= profile.domFilterDepth()) {
profile.domInc(nexturl.getHost());
}
// deny urls that do not match with the profile domain list
if (profile.domCount(nexturl.getHost()) == 0) {
reason = "denied_(no_match_with_domain_filter)";
this.log.logFine("URL '" + nexturlString + "' is not listed in granted domains. " +
"Stack processing time: " + (System.currentTimeMillis()-startTime));
return reason;
}
// deny urls that exceed allowed number of occurrences
if (profile.domCount(nexturl.getHost()) > profile.domMaxPages()) {
reason = "denied_(domain_count_exceeded)";
this.log.logFine("URL '" + nexturlString + "' appeared too often, a maximum of " + profile.domMaxPages() + " is allowed. "+
"Stack processing time: " + (System.currentTimeMillis()-startTime));
return reason;
}
String nexturlhash = plasmaURL.urlHash(nexturl);
String dbocc = "";
if ((dbocc = this.sb.urlPool.exists(nexturlhash)) != null) {
// DISTIGUISH OLD/RE-SEARCH CASES HERE!
String dbocc = this.sb.urlPool.exists(nexturlhash);
plasmaCrawlLURL.Entry oldEntry = null;
if (dbocc != null) try {
oldEntry = this.sb.urlPool.loadedURL.getEntry(nexturlhash, null);
} catch (IOException e) {}
boolean recrawl = (oldEntry != null) &&
(((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000) > profile.recrawlIfOlder());
if ((dbocc != null) && (!(recrawl))) {
reason = "double_(registered_in_" + dbocc + ")";
/*
urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash,
@@ -323,7 +349,7 @@ public final class plasmaCrawlStacker {
"Stack processing time: " + (System.currentTimeMillis()-startTime));
return reason;
}
// checking robots.txt
if (robotsParser.isDisallowed(nexturl)) {
reason = "denied_(robots.txt)";
@@ -334,6 +360,12 @@ public final class plasmaCrawlStacker {
"Stack processing time: " + (System.currentTimeMillis()-startTime));
return reason;
}
// show potential re-crawl
if (recrawl) {
this.log.logFine("RE-CRAWL of URL '" + nexturlString + "': this url was crawled " +
((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000 / 60 / 24) + " days ago.");
}
// store information
boolean local = ((initiatorHash.equals(plasmaURL.dummyHash)) || (initiatorHash.equals(yacyCore.seedDB.mySeed.hash)));
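
The re-crawl decision above is nothing more than an age comparison in minutes against the profile's recrawlIfOlder value. A tiny self-contained illustration (the names here are invented for the example and are not YaCy API):

    // Illustrative only: the same minutes-based age comparison as in the stacker above.
    public class RecrawlAgeExample {

        // true if a previously loaded page is old enough to be fetched again
        static boolean shouldRecrawl(long loadDateMillis, long recrawlIfOlderMinutes) {
            long ageMinutes = (System.currentTimeMillis() - loadDateMillis) / 60000L;
            return ageMinutes > recrawlIfOlderMinutes;
        }

        public static void main(String[] args) {
            // with the crawlingIfOlder default of 525600 minutes (one year, see yacy.init below),
            // a page loaded 400 days ago is 576000 minutes old and is therefore re-crawled
            long loadedFourHundredDaysAgo = System.currentTimeMillis() - 400L * 24 * 60 * 60 * 1000;
            System.out.println(shouldRecrawl(loadedFourHundredDaysAgo, 525600)); // true:  576000 > 525600
            // a page loaded 10 days ago (14400 minutes) is not
            long loadedTenDaysAgo = System.currentTimeMillis() - 10L * 24 * 60 * 60 * 1000;
            System.out.println(shouldRecrawl(loadedTenDaysAgo, 525600));         // false: 14400 < 525600
        }
    }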

@@ -680,7 +680,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
(getConfig(STR_PROXYPROFILE, "").length() == 0) ||
(this.profiles.getEntry(getConfig(STR_PROXYPROFILE, "")) == null)) {
// generate new default entry for proxy crawling
this.defaultProxyProfile = this.profiles.newEntry("proxy", "", ".*", ".*", Integer.parseInt(getConfig("proxyPrefetchDepth", "0")), Integer.parseInt(getConfig("proxyPrefetchDepth", "0")), 60 * 24 * 30, -1, false, true, true, true, getConfigBool("proxyCrawlOrder", false), true, true, true);
this.defaultProxyProfile = this.profiles.newEntry("proxy", "", ".*", ".*", Integer.parseInt(getConfig("proxyPrefetchDepth", "0")), Integer.parseInt(getConfig("proxyPrefetchDepth", "0")), 60 * 24 * 30, -1, -1, false, true, true, true, getConfigBool("proxyCrawlOrder", false), true, true, true);
setConfig(STR_PROXYPROFILE, this.defaultProxyProfile.handle());
} else {
this.defaultProxyProfile = this.profiles.getEntry(getConfig(STR_PROXYPROFILE, ""));
@@ -689,7 +689,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
(getConfig(STR_REMOTEPROFILE, "").length() == 0) ||
(profiles.getEntry(getConfig(STR_REMOTEPROFILE, "")) == null)) {
// generate new default entry for remote crawling
defaultRemoteProfile = profiles.newEntry("remote", "", ".*", ".*", 0, 0, 60 * 24 * 30, -1, true, false, true, true, false, true, true, false);
defaultRemoteProfile = profiles.newEntry("remote", "", ".*", ".*", 0, 0, 60 * 24 * 30, -1, -1, true, false, true, true, false, true, true, false);
setConfig(STR_REMOTEPROFILE, defaultRemoteProfile.handle());
} else {
defaultRemoteProfile = profiles.getEntry(getConfig(STR_REMOTEPROFILE, ""));

@@ -48,7 +48,9 @@ public class urlRedirectord implements serverHandler {
0,
// recrawlIfOlder (minutes), if negative: do not re-crawl
-1,
// autoDomFilterDepth, if negative: no auto-filter
// domFilterDepth, if negative: no auto-filter
-1,
// domMaxPages, if negative: no count restriction
-1,
// crawlDynamic
false,

@@ -465,7 +465,7 @@ public final class yacyClient {
for (int n = 0; n < results; n++) {
// get one single search result
urlEntry = urlManager.newEntry((String) result.get("resource" + n), true);
if (urlEntry != null && blacklist.isListed(urlEntry.url())) { continue; } // block with backlist
if ((urlEntry == null) || (blacklist.isListed(urlEntry.url()))) { continue; } // block with blacklist
urlEntry.store();
int urlLength = urlEntry.url().toString().length();
int urlComps = htmlFilterContentScraper.urlComps(urlEntry.url().toString()).length;

@@ -329,13 +329,14 @@ browserPopUpApplication=netscape
yacyOwnSeedFile=DATA/YACYDB/mySeed.txt
yacyDB=DATA/YACYDB
# index sharing attributes
# by default, sharing is on. If you want to use the proxy only for
# local indexing, you may switch this off
# index sharing attributes: by default, sharing is on.
# If you want to use YaCy only for local indexing (robinson mode),
# you may switch this off
allowDistributeIndex=true
allowDistributeIndexWhileCrawling=false
allowReceiveIndex=true
indexReceiveBlockBlacklist=false
allowUnlimitedReceiveIndexFrom=
indexReceiveBlockBlacklist=true
# the frequency is the number of links per minute that the peer allows
# _every_ other peer to send to this peer
@@ -362,6 +363,9 @@ proxyCrawlOrder=false
# Be careful with this number. Consider a branching factor of average 20;
# A prefetch-depth of 8 would index 25.600.000.000 pages, maybe the whole WWW.
crawlingDepth=2
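# crawl steering defaults: crawlingIfOlder is the re-crawl age in minutes
# (525600 minutes = 365 days, i.e. re-crawl pages older than one year);
# a negative crawlingDomFilterDepth disables the auto-domain-filter,
# a negative crawlingDomMaxPages disables the per-domain page limit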
crawlingIfOlder=525600
crawlingDomFilterDepth=-1
crawlingDomMaxPages=-1
localIndexing=true
# Filter for crawling; may be used to restrict a crawl to a specific domain
