extended crawling constraints:

- removed the never-used secondary crawl depth
- added a must-not-match filter that can be used to exclude URLs from a crawl
- added a stub for crawl tags, which will be used to identify search results produced by specific crawls

Please update the yacybar: replace the property name 'crawlFilter' with 'mustmatch'.
Additionally, a new parameter named 'mustnotmatch' can be used; by default it should be the empty string (match-never).

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5342 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 16 years ago
parent 96174b2b56
commit dba7ef5144
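
For illustration only (not part of this commit; class and method names are hypothetical), a minimal sketch of how the two filters are meant to combine, assuming the defaults introduced here: the must-match filter defaults to ".*" (MATCH_ALL) and the must-not-match filter defaults to the empty string (MATCH_NEVER), so a URL is stacked for crawling only if it matches the first pattern and does not match the second.

import java.util.regex.Pattern;

// Hypothetical sketch of the combined must-match / must-not-match check.
public class FilterSketch {

    static final String MATCH_ALL = ".*";  // default must-match: accept every URL
    static final String MATCH_NEVER = "";  // default must-not-match: exclude nothing

    // A URL is accepted only if it matches the must-match pattern
    // and does not match the must-not-match pattern.
    static boolean accept(String url, String mustMatch, String mustNotMatch) {
        Pattern mm = Pattern.compile(mustMatch == null ? MATCH_ALL : mustMatch);
        Pattern mnm = Pattern.compile(mustNotMatch == null ? MATCH_NEVER : mustNotMatch);
        return mm.matcher(url).matches() && !mnm.matcher(url).matches();
    }

    public static void main(String[] args) {
        // with the defaults every URL is accepted
        System.out.println(accept("http://example.org/a.html", MATCH_ALL, MATCH_NEVER)); // true
        // a must-not-match filter can exclude parts of a site, e.g. PDF files
        System.out.println(accept("http://example.org/doc.pdf", MATCH_ALL, ".*\\.pdf")); // false
    }
}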

@@ -30,7 +30,8 @@
<td><strong>Status</strong></td>
<td><strong>Start URL</strong></td>
<td><strong>Depth</strong></td>
<td><strong>Filter</strong></td>
<td><strong>Must Match</strong></td>
<td><strong>Must Not Match</strong></td>
<td><strong>MaxAge</strong></td>
<td><strong>Auto Filter Depth</strong></td>
<td><strong>Auto Filter Content</strong></td>
@@ -48,7 +49,8 @@
<td>#(status)#terminated::active#(/status)#</td>
<td><a href="#[startURL]#">#[startURL]#</a></td>
<td>#[depth]#</td>
<td>#[filter]#</td>
<td>#[mustmatch]#</td>
<td>#[mustnotmatch]#</td>
<td>#[crawlingIfOlder]#</td>
<td>#[crawlingDomFilterDepth]#</td>
<td>#{crawlingDomFilterContent}##[item]#<br />#{/crawlingDomFilterContent}#</td>

@@ -62,10 +62,8 @@ public class CrawlProfileEditor_p {
static {
labels.add(new eentry(entry.NAME, "Name", true, eentry.STRING));
labels.add(new eentry(entry.START_URL, "Start URL", true, eentry.STRING));
labels.add(new eentry(entry.GENERAL_FILTER, "General Filter", false, eentry.STRING));
labels.add(new eentry(entry.SPECIFIC_FILTER, "Specific Filter", false, eentry.STRING));
labels.add(new eentry(entry.GENERAL_DEPTH, "General Depth", false, eentry.INTEGER));
labels.add(new eentry(entry.SPECIFIC_DEPTH, "Specific Depth", false, eentry.INTEGER));
labels.add(new eentry(entry.FILTER_MUSTMATCH, "General Filter", false, eentry.STRING));
labels.add(new eentry(entry.DEPTH, "General Depth", false, eentry.INTEGER));
labels.add(new eentry(entry.RECRAWL_IF_OLDER, "Recrawl If Older", false, eentry.INTEGER));
labels.add(new eentry(entry.DOM_FILTER_DEPTH, "Domain Filter Depth", false, eentry.INTEGER));
labels.add(new eentry(entry.DOM_MAX_PAGES, "Domain Max. Pages", false, eentry.INTEGER));
@@ -214,8 +212,9 @@ public class CrawlProfileEditor_p {
prop.put("crawlProfiles_" + count + "_name", profile.name());
prop.putXML("crawlProfiles_" + count + "_startURL", profile.startURL());
prop.put("crawlProfiles_" + count + "_handle", profile.handle());
prop.put("crawlProfiles_" + count + "_depth", profile.generalDepth());
prop.put("crawlProfiles_" + count + "_filter", profile.generalFilter());
prop.put("crawlProfiles_" + count + "_depth", profile.depth());
prop.put("crawlProfiles_" + count + "_mustmatch", profile.mustMatchPattern().toString());
prop.put("crawlProfiles_" + count + "_mustnotmatch", profile.mustNotMatchPattern().toString());
prop.put("crawlProfiles_" + count + "_crawlingIfOlder", (profile.recrawlIfOlder() == 0L) ? "no re-crawl" : ""+ SimpleDateFormat.getDateTimeInstance().format(profile.recrawlIfOlder()));
prop.put("crawlProfiles_" + count + "_crawlingDomFilterDepth", (profile.domFilterDepth() == Integer.MAX_VALUE) ? "inactive" : Integer.toString(profile.domFilterDepth()));

@@ -6,7 +6,8 @@
<status>#(status)#terminated::active#(/status)#</status>
<starturl>#[startURL]#</starturl>
<depth>#[depth]#</depth>
<filter>#[filter]#</filter>
<mustmatch>#[mustmatch]#</mustmatch>
<mustnotmatch>#[mustnotmatch]#</mustnotmatch>
<crawlingIfOlder>#[crawlingIfOlder]#</crawlingIfOlder>
<crawlingDomFilterDepth>#[crawlingDomFilterDepth]#</crawlingDomFilterDepth>
<crawlingDomFilterContent>

@@ -100,18 +100,30 @@
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td><label for="crawlingFilter">Crawling Filter</label>:</td>
<td><label for="mustmatch">Must-Match Filter</label>:</td>
<td>
<input type="radio" name="range" value="wide" checked="checked" />Use filter&nbsp;&nbsp;
<input name="crawlingFilter" id="crawlingFilter" type="text" size="20" maxlength="100" value="#[crawlingFilter]#" /><br />
<input name="mustmatch" id="mustmatch" type="text" size="60" maxlength="100" value="#[mustmatch]#" /><br />
<input type="radio" name="range" value="domain" />Restrict to start domain<br />
<input type="radio" name="range" value="subpath" />Restrict to sub-path
</td>
<td>
The filter is an emacs-like regular expression that must match with the URLs which are used to be crawled; default is 'catch all'.
The filter is an emacs-like regular expression that must match with the URLs which are used to be crawled;
default is 'catch all'.
You can also use an automatic domain-restriction to fully crawl a single domain.
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td><label for="mustnotmatch">Must-Not-Match Filter</label>:</td>
<td>
<input name="mustnotmatch" id="mustnotmatch" type="text" size="80" maxlength="100" value="#[mustnotmatch]#" />
</td>
<td>
This filter must not match to allow that the page is accepted for crawling.
The empty string is a never-match filter which should do well for most cases.
If you don't know what this means, please leave this field empty.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td>Re-crawl known URLs:</td>
<td>

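As a hypothetical usage example (values not taken from this commit): to restrict a crawl to the host example.org while skipping PDF documents, the two new form fields could be filled in as

    mustmatch    = .*example\.org.*
    mustnotmatch = .*\.pdf

With the defaults ('.*' for must-match, the empty string for must-not-match) the crawl behaves exactly as before this change.
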
@@ -24,6 +24,7 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import de.anomic.crawler.CrawlProfile;
import de.anomic.http.httpRequestHeader;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaSwitchboardConstants;
@@ -44,7 +45,8 @@ public class CrawlStart_p {
prop.put("starturl", (intranet) ? repository : "http://");
prop.put("proxyPrefetchDepth", env.getConfig("proxyPrefetchDepth", "0"));
prop.put("crawlingDepth", env.getConfig("crawlingDepth", "0"));
prop.put("crawlingFilter", (intranet) ? repository + ".*" : ".*");
prop.put("mustmatch", (intranet) ? repository + ".*" : CrawlProfile.MATCH_ALL);
prop.put("mustnotmatch", CrawlProfile.MATCH_NEVER);
prop.put("crawlingIfOlderCheck", "0");
prop.put("crawlingIfOlderUnitYearCheck", "0");

@@ -91,7 +91,8 @@ public class QuickCrawlLink_p {
final String title = post.get("title",null);
// getting other parameters if set
final String crawlingFilter = post.get("crawlingFilter", ".*");
final String crawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL);
final String crawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER);
final int CrawlingDepth = Integer.parseInt(post.get("crawlingDepth", "0"));
final boolean crawlDynamic = post.get("crawlingQ", "").equals("on");
final boolean indexText = post.get("indexText", "on").equals("on");
@@ -129,11 +130,11 @@ public class QuickCrawlLink_p {
try {
pe = sb.webIndex.profilesActiveCrawls.newEntry(
crawlingStartURL.getHost(),
crawlingStartURL,
crawlingFilter,
crawlingFilter,
CrawlingDepth,
CrawlingDepth,
crawlingStartURL,
CrawlProfile.KEYWORDS_USER,
crawlingMustMatch,
crawlingMustNotMatch,
CrawlingDepth,
60 * 24 * 30, // recrawlIfOlder (minutes); here: one month
-1, // domFilterDepth, if negative: no auto-filter
-1, // domMaxPages, if negative: no count restriction

@@ -123,16 +123,16 @@ public class WatchCrawler_p {
crawlingStart = (crawlingStartURL == null) ? null : crawlingStartURL.toNormalform(true, true);
// set the crawling filter
String newcrawlingfilter = post.get("crawlingFilter", ".*");
if (newcrawlingfilter.length() < 2) newcrawlingfilter = ".*"; // avoid that all urls are filtered out if bad value was submitted
String newcrawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL);
String newcrawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER);
if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL; // avoid that all urls are filtered out if bad value was submitted
// special cases:
if (crawlingStartURL!= null && fullDomain) {
newcrawlingfilter = ".*" + crawlingStartURL.getHost() + ".*";
newcrawlingMustMatch = ".*" + crawlingStartURL.getHost() + ".*";
}
if (crawlingStart!= null && subPath && (pos = crawlingStart.lastIndexOf("/")) > 0) {
newcrawlingfilter = crawlingStart.substring(0, pos + 1) + ".*";
newcrawlingMustMatch = crawlingStart.substring(0, pos + 1) + ".*";
}
env.setConfig("crawlingFilter", newcrawlingfilter);
final boolean crawlOrder = post.get("crawlOrder", "off").equals("on");
env.setConfig("crawlOrder", (crawlOrder) ? "true" : "false");
@@ -183,12 +183,12 @@ public class WatchCrawler_p {
if ((crawlingStart == null || crawlingStartURL == null) /* || (!(crawlingStart.matches(newcrawlingfilter))) */) {
// print error message
prop.put("info", "4"); //crawlfilter does not match url
prop.putHTML("info_newcrawlingfilter", newcrawlingfilter);
prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
prop.putHTML("info_crawlingStart", crawlingStart);
} else try {
// check if the crawl filter works correctly
Pattern.compile(newcrawlingfilter);
Pattern.compile(newcrawlingMustMatch);
// stack request
// first delete old entry, if exists
@@ -201,8 +201,12 @@ public class WatchCrawler_p {
// stack url
sb.webIndex.profilesPassiveCrawls.removeEntry(crawlingStartURL.hash()); // if there is an old entry, delete it
final CrawlProfile.entry pe = sb.webIndex.profilesActiveCrawls.newEntry(
crawlingStartURL.getHost(), crawlingStartURL, newcrawlingfilter, newcrawlingfilter,
newcrawlingdepth, newcrawlingdepth,
crawlingStartURL.getHost(),
crawlingStartURL,
CrawlProfile.KEYWORDS_USER,
newcrawlingMustMatch,
newcrawlingMustNotMatch,
newcrawlingdepth,
crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages,
crawlingQ,
indexText, indexMedia,
@@ -270,7 +274,7 @@ public class WatchCrawler_p {
}
} catch (final PatternSyntaxException e) {
prop.put("info", "4"); //crawlfilter does not match url
prop.putHTML("info_newcrawlingfilter", newcrawlingfilter);
prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
prop.putHTML("info_error", e.getMessage());
} catch (final Exception e) {
// mist
@@ -286,7 +290,7 @@ public class WatchCrawler_p {
final String fileName = post.get("crawlingFile");
try {
// check if the crawl filter works correctly
Pattern.compile(newcrawlingfilter);
Pattern.compile(newcrawlingMustMatch);
// loading the file content
final File file = new File(fileName);
@@ -306,7 +310,21 @@
// creating a crawler profile
final yacyURL crawlURL = new yacyURL("file://" + file.toString(), null);
final CrawlProfile.entry profile = sb.webIndex.profilesActiveCrawls.newEntry(fileName, crawlURL, newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages, crawlingQ, indexText, indexMedia, storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw);
final CrawlProfile.entry profile = sb.webIndex.profilesActiveCrawls.newEntry(
fileName, crawlURL, CrawlProfile.KEYWORDS_USER,
newcrawlingMustMatch,
CrawlProfile.MATCH_NEVER,
newcrawlingdepth,
crawlingIfOlder,
crawlingDomFilterDepth,
crawlingDomMaxPages,
crawlingQ,
indexText,
indexMedia,
storeHTCache,
true,
crawlOrder,
xsstopw, xdstopw, xpstopw);
// pause local crawl here
sb.pauseCrawlJob(plasmaSwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
@@ -333,7 +351,7 @@ public class WatchCrawler_p {
} catch (final PatternSyntaxException e) {
// print error message
prop.put("info", "4"); //crawlfilter does not match url
prop.putHTML("info_newcrawlingfilter", newcrawlingfilter);
prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
prop.putHTML("info_error", e.getMessage());
} catch (final Exception e) {
// mist
@@ -353,8 +371,10 @@ public class WatchCrawler_p {
// create a new profile
final CrawlProfile.entry pe = sb.webIndex.profilesActiveCrawls.newEntry(
sitemapURLStr, sitemapURL, newcrawlingfilter, newcrawlingfilter,
newcrawlingdepth, newcrawlingdepth,
sitemapURLStr, sitemapURL, CrawlProfile.KEYWORDS_USER,
newcrawlingMustMatch,
CrawlProfile.MATCH_NEVER,
newcrawlingdepth,
crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages,
crawlingQ,
indexText, indexMedia,

@@ -104,7 +104,6 @@ public class sharedBlacklist_p {
final String Hash = post.get("hash");
// generate the download URL
String downloadURL = null;
String downloadURLOld = null;
if( sb.webIndex.seedDB != null ){ //no nullpointer error..
final yacySeed seed = sb.webIndex.seedDB.getConnected(Hash);
@@ -113,8 +112,6 @@ public class sharedBlacklist_p {
final String Port = seed.get(yacySeed.PORT, "8080");
final String peerName = seed.get(yacySeed.NAME, "<" + IP + ":" + Port + ">");
prop.putHTML("page_source", peerName);
downloadURL = "http://" + IP + ":" + Port + "/xml/blacklists.xml";
downloadURLOld = "http://" + IP + ":" + Port + "/yacy/list.html?col=black";
} else {
prop.put("status", STATUS_PEER_UNKNOWN);//YaCy-Peer not found

@@ -28,6 +28,8 @@ import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import de.anomic.kelondro.kelondroBLOB;
import de.anomic.kelondro.kelondroBLOBHeap;
@@ -43,6 +45,20 @@ import de.anomic.yacy.yacyURL;
public class CrawlProfile {
public static final String MATCH_ALL = ".*";
public static final String MATCH_NEVER = "";
public static final HashSet<String> NO_KEYWORDS = new HashSet<String>(0);
public static final HashSet<String> KEYWORDS_PROXY = word2set("xproxy");
public static final HashSet<String> KEYWORDS_REMOTE = word2set("xremote");
public static final HashSet<String> KEYWORDS_USER = word2set("xuser");
public static final HashSet<String> KEYWORDS_SNIPPET = word2set("xsnippet");
private static final HashSet<String> word2set(String word) {
HashSet<String> s = new HashSet<String>(1);
s.add(word);
return s;
}
static HashMap<String, Map<String, DomProfile>> domsCache = new HashMap<String, Map<String, DomProfile>>();
kelondroMap profileTable;
@@ -145,8 +161,11 @@ public class CrawlProfile {
return ne;
}
public entry newEntry(final String name, final yacyURL startURL, final String generalFilter, final String specificFilter,
final int generalDepth, final int specificDepth,
public entry newEntry( final String name,
final yacyURL startURL,
final Set<String> keywords,
final String mustmatch, final String mustnotmatch,
final int generalDepth,
final long recrawlIfOlder /*date*/, final int domFilterDepth, final int domMaxPages,
final boolean crawlingQ,
final boolean indexText, final boolean indexMedia,
@@ -154,8 +173,11 @@ public class CrawlProfile {
final boolean remoteIndexing,
final boolean xsstopw, final boolean xdstopw, final boolean xpstopw) {
final entry ne = new entry(name, startURL, generalFilter, specificFilter,
generalDepth, specificDepth,
final entry ne = new entry(
name, startURL,
keywords,
mustmatch, mustnotmatch,
generalDepth,
recrawlIfOlder, domFilterDepth, domMaxPages,
crawlingQ,
indexText, indexMedia,
@@ -235,10 +257,9 @@ public class CrawlProfile {
public static final String HANDLE = "handle";
public static final String NAME = "name";
public static final String START_URL = "startURL";
public static final String GENERAL_FILTER = "generalFilter";
public static final String SPECIFIC_FILTER = "specificFilter";
public static final String GENERAL_DEPTH = "generalDepth";
public static final String SPECIFIC_DEPTH = "specificDepth";
public static final String FILTER_MUSTMATCH = "generalFilter";
public static final String FILTER_MUSTNOTMATCH = "nevermatch";
public static final String DEPTH = "generalDepth";
public static final String RECRAWL_IF_OLDER = "recrawlIfOlder";
public static final String DOM_FILTER_DEPTH = "domFilterDepth";
public static final String DOM_MAX_PAGES = "domMaxPages";
@@ -254,10 +275,16 @@ public class CrawlProfile {
Map<String, String> mem;
private Map<String, DomProfile> doms;
private Pattern mustmatch = null, mustnotmatch = null;
public entry(final String name, final yacyURL startURL, final String generalFilter, final String specificFilter,
final int generalDepth, final int specificDepth,
final long recrawlIfOlder /*date*/, final int domFilterDepth, final int domMaxPages,
public entry(final String name, final yacyURL startURL,
final Set<String> keywords,
final String mustmatch,
final String mustnotmatch,
final int depth,
final long recrawlIfOlder /*date*/,
final int domFilterDepth, final int domMaxPages,
final boolean crawlingQ,
final boolean indexText, final boolean indexMedia,
final boolean storeHTCache, final boolean storeTXCache,
@@ -269,10 +296,9 @@ public class CrawlProfile {
mem.put(HANDLE, handle);
mem.put(NAME, name);
mem.put(START_URL, (startURL == null) ? "" : startURL.toNormalform(true, false));
mem.put(GENERAL_FILTER, (generalFilter == null) ? ".*" : generalFilter);
mem.put(SPECIFIC_FILTER, (specificFilter == null) ? ".*" : specificFilter);
mem.put(GENERAL_DEPTH, Integer.toString(generalDepth));
mem.put(SPECIFIC_DEPTH, Integer.toString(specificDepth));
mem.put(FILTER_MUSTMATCH, (mustmatch == null) ? MATCH_ALL : mustmatch);
mem.put(FILTER_MUSTNOTMATCH, (mustnotmatch == null) ? MATCH_NEVER : mustnotmatch);
mem.put(DEPTH, Integer.toString(depth));
mem.put(RECRAWL_IF_OLDER, Long.toString(recrawlIfOlder));
mem.put(DOM_FILTER_DEPTH, Integer.toString(domFilterDepth));
mem.put(DOM_MAX_PAGES, Integer.toString(domMaxPages));
@@ -322,27 +348,24 @@ public class CrawlProfile {
final String r = mem.get(START_URL);
return r;
}
public String generalFilter() {
final String r = mem.get(GENERAL_FILTER);
if (r == null) return ".*";
return r;
}
public String specificFilter() {
final String r = mem.get(SPECIFIC_FILTER);
if (r == null) return ".*";
return r;
public Pattern mustMatchPattern() {
if (this.mustmatch == null) {
String r = mem.get(FILTER_MUSTMATCH);
if (r == null) r = MATCH_ALL;
this.mustmatch = Pattern.compile(r);
}
return this.mustmatch;
}
public int generalDepth() {
final String r = mem.get(GENERAL_DEPTH);
if (r == null) return 0;
try {
return Integer.parseInt(r);
} catch (final NumberFormatException e) {
return 0;
public Pattern mustNotMatchPattern() {
if (this.mustnotmatch == null) {
String r = mem.get(FILTER_MUSTNOTMATCH);
if (r == null) r = MATCH_NEVER;
this.mustnotmatch = Pattern.compile(r);
}
return this.mustnotmatch;
}
public int specificDepth() {
final String r = mem.get(SPECIFIC_DEPTH);
public int depth() {
final String r = mem.get(DEPTH);
if (r == null) return 0;
try {
return Integer.parseInt(r);
@@ -497,4 +520,5 @@ public class CrawlProfile {
return domname;
}
}
}

@@ -232,8 +232,9 @@ public class CrawlQueues {
+ ", initiator=" + urlEntry.initiator()
+ ", crawlOrder=" + ((profile.remoteIndexing()) ? "true" : "false")
+ ", depth=" + urlEntry.depth()
+ ", crawlDepth=" + profile.generalDepth()
+ ", filter=" + profile.generalFilter()
+ ", crawlDepth=" + profile.depth()
+ ", must-match=" + profile.mustMatchPattern().toString()
+ ", must-not-match=" + profile.mustNotMatchPattern().toString()
+ ", permission=" + ((sb.webIndex.seedDB == null) ? "undefined" : (((sb.webIndex.seedDB.mySeed().isSenior()) || (sb.webIndex.seedDB.mySeed().isPrincipal())) ? "true" : "false")));
processLocalCrawling(urlEntry, stats);

@@ -409,14 +409,22 @@ public final class CrawlStacker extends Thread {
return errorMsg;
}
// filter deny
if ((entry.depth() > 0) && (!(entry.url().toString().matches(profile.generalFilter())))) {
reason = "url does not match general filter";
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' does not match crawling filter '" + profile.generalFilter() + "'. " +
// filter with must-match
if ((entry.depth() > 0) && !profile.mustMatchPattern().matcher(entry.url().toString()).matches()) {
reason = "url does not match must-match filter";
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' does not match must-match crawling filter '" + profile.mustMatchPattern().toString() + "'. " +
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason;
}
// filter with must-not-match
if ((entry.depth() > 0) && profile.mustNotMatchPattern().matcher(entry.url().toString()).matches()) {
reason = "url matches must-not-match filter";
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' does matches do-not-match crawling filter '" + profile.mustNotMatchPattern().toString() + "'. " +
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason;
}
// deny cgi
if (entry.url().isCGI()) {
reason = "cgi url not allowed";
@@ -486,7 +494,7 @@ public final class CrawlStacker extends Thread {
final boolean remote = profile.handle().equals(this.sb.webIndex.defaultRemoteProfile.handle());
final boolean global =
(profile.remoteIndexing()) /* granted */ &&
(entry.depth() == profile.generalDepth()) /* leaf node */ &&
(entry.depth() == profile.depth()) /* leaf node */ &&
//(initiatorHash.equals(yacyCore.seedDB.mySeed.hash)) /* not proxy */ &&
(
(sb.webIndex.seedDB.mySeed().isSenior()) ||

@@ -330,26 +330,27 @@ public class SitemapParser extends DefaultHandler {
}
private CrawlProfile.entry createProfile(final String domainName, final yacyURL sitemapURL) {
return this.sb.webIndex.profilesActiveCrawls.newEntry(domainName, sitemapURL,
// crawlingFilter
".*", ".*",
// Depth
0, 0,
// force recrawling
0,
// disable Auto-Dom-Filter
-1, -1,
// allow crawling of dynamic URLs
true,
// index text + media
true, true,
// don't store downloaded pages to Web Cache
false,
// store to TX cache
true,
// remote Indexing disabled
false,
// exclude stop-words
true, true, true);
return this.sb.webIndex.profilesActiveCrawls.newEntry(
domainName, sitemapURL, CrawlProfile.KEYWORDS_USER,
// crawling Filter
CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER,
// Depth
0,
// force recrawling
0,
// disable Auto-Dom-Filter
-1, -1,
// allow crawling of dynamic URLs
true,
// index text + media
true, true,
// don't store downloaded pages to Web Cache
false,
// store to TX cache
true,
// remote Indexing disabled
false,
// exclude stop-words
true, true, true);
}
}

@@ -226,22 +226,22 @@ public class bookmarksDB {
int pos = 0;
// set crawlingStart to BookmarkUrl
String crawlingStart = bm.getUrl();
String newcrawlingfilter = crawlingfilter;
String newcrawlingMustMatch = crawlingfilter;
yacyURL crawlingStartURL = new yacyURL(crawlingStart, null);
// set the crawling filter
if (newcrawlingfilter.length() < 2) newcrawlingfilter = ".*"; // avoid that all urls are filtered out if bad value was submitted
if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = ".*"; // avoid that all urls are filtered out if bad value was submitted
if (crawlingStartURL!= null && newcrawlingfilter.equals("dom")) {
newcrawlingfilter = ".*" + crawlingStartURL.getHost() + ".*";
if (crawlingStartURL!= null && newcrawlingMustMatch.equals("dom")) {
newcrawlingMustMatch = ".*" + crawlingStartURL.getHost() + ".*";
}
if (crawlingStart!= null && newcrawlingfilter.equals("sub") && (pos = crawlingStart.lastIndexOf("/")) > 0) {
newcrawlingfilter = crawlingStart.substring(0, pos + 1) + ".*";
if (crawlingStart!= null && newcrawlingMustMatch.equals("sub") && (pos = crawlingStart.lastIndexOf("/")) > 0) {
newcrawlingMustMatch = crawlingStart.substring(0, pos + 1) + ".*";
}
// check if the crawl filter works correctly
Pattern.compile(newcrawlingfilter);
Pattern.compile(newcrawlingMustMatch);
String urlhash = crawlingStartURL.hash();
sb.webIndex.removeURL(urlhash);
@@ -251,8 +251,10 @@ public class bookmarksDB {
// stack url
sb.webIndex.profilesPassiveCrawls.removeEntry(crawlingStartURL.hash()); // if there is an old entry, delete it
CrawlProfile.entry pe = sb.webIndex.profilesActiveCrawls.newEntry(
folder+"/"+crawlingStartURL, crawlingStartURL, newcrawlingfilter, newcrawlingfilter,
newcrawlingdepth, newcrawlingdepth,
folder+"/"+crawlingStartURL, crawlingStartURL, CrawlProfile.KEYWORDS_USER,
newcrawlingMustMatch,
CrawlProfile.MATCH_NEVER,
newcrawlingdepth,
sb.webIndex.profilesActiveCrawls.getRecrawlDate(crawlingIfOlder), crawlingDomFilterDepth, crawlingDomMaxPages,
crawlingQ,
indexText, indexMedia,

@@ -1558,8 +1558,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
if (this.log.isFine()) log.logFine("processResourceStack processCase=" + processCase +
", depth=" + entry.depth() +
", maxDepth=" + ((entry.profile() == null) ? "null" : Integer.toString(entry.profile().generalDepth())) +
", filter=" + ((entry.profile() == null) ? "null" : entry.profile().generalFilter()) +
", maxDepth=" + ((entry.profile() == null) ? "null" : Integer.toString(entry.profile().depth())) +
", must-match=" + ((entry.profile() == null) ? "null" : entry.profile().mustMatchPattern().toString()) +
", must-not-match=" + ((entry.profile() == null) ? "null" : entry.profile().mustNotMatchPattern().toString()) +
", initiatorHash=" + entry.initiator() +
//", responseHeader=" + ((entry.responseHeader() == null) ? "null" : entry.responseHeader().toString()) +
", url=" + entry.url()); // DEBUG
@@ -1591,7 +1592,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
final long stackStartTime = System.currentTimeMillis();
if (
((processCase == plasmaSwitchboardConstants.PROCESSCASE_4_PROXY_LOAD) || (processCase == plasmaSwitchboardConstants.PROCESSCASE_5_LOCAL_CRAWLING)) &&
((entry.profile() == null) || (entry.depth() < entry.profile().generalDepth()))
((entry.profile() == null) || (entry.depth() < entry.profile().depth()))
) {
final Map<yacyURL, String> hl = document.getHyperlinks();
final Iterator<Map.Entry<yacyURL, String>> i = hl.entrySet().iterator();

@@ -279,8 +279,7 @@ public final class plasmaWordIndex implements indexRI {
if (this.defaultProxyProfile == null) {
// generate new default entry for proxy crawling
this.defaultProxyProfile = this.profilesActiveCrawls.newEntry("proxy", null, ".*", ".*",
0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/,
this.defaultProxyProfile = this.profilesActiveCrawls.newEntry("proxy", null, CrawlProfile.KEYWORDS_PROXY, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER,
0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/,
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, -1, false,
true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/,
@@ -290,27 +289,27 @@ public final class plasmaWordIndex implements indexRI {
}
if (this.defaultRemoteProfile == null) {
// generate new default entry for remote crawling
defaultRemoteProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_REMOTE, null, ".*", ".*", 0, 0,
defaultRemoteProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_REMOTE, null, CrawlProfile.KEYWORDS_REMOTE, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
-1, -1, -1, true, true, true, false, true, false, true, true, false);
}
if (this.defaultTextSnippetLocalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
defaultTextSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, ".*", ".*", 0, 0,
defaultTextSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.KEYWORDS_SNIPPET, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, -1, true, false, false, false, false, false, true, true, false);
}
if (this.defaultTextSnippetGlobalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
defaultTextSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, ".*", ".*", 0, 0,
defaultTextSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.KEYWORDS_SNIPPET, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, -1, true, true, true, true, true, false, true, true, false);
}
if (this.defaultMediaSnippetLocalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
defaultMediaSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, ".*", ".*", 0, 0,
defaultMediaSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.KEYWORDS_SNIPPET, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, false, false, false, false, true, true, false);
}
if (this.defaultMediaSnippetGlobalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
defaultMediaSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, ".*", ".*", 0, 0,
defaultMediaSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.KEYWORDS_SNIPPET, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, true, true, true, false, true, true, false);
}
}

@@ -37,12 +37,12 @@ public class urlRedirectord implements serverHandler, Cloneable {
// name
"URL Redirector",
// start URL
null,
null,
// keywords
CrawlProfile.KEYWORDS_USER,
// crawling filter
".*",
".*",
CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER,
// depth
0,
0,
// recrawlIfOlder (minutes), if negative: do not re-crawl
-1,
