From dba7ef51440ae3992260759f58478334af73e8af Mon Sep 17 00:00:00 2001 From: orbiter Date: Fri, 14 Nov 2008 09:58:56 +0000 Subject: [PATCH] extended crawling constraints: - removed never-used secondary crawl depth - added a must-not-match filter that can be used to exclude urls from a crawl - added stub for crawl tags which will be used to identify search results that had been produced from specific crawls please update the yacybar: replace property name 'crawlFilter' with 'mustmatch'. Additionally, a new parameter named 'mustnotmatch' can be used, which should be by default the empty string (match-never) git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5342 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/CrawlProfileEditor_p.html | 6 +- htroot/CrawlProfileEditor_p.java | 11 ++- htroot/CrawlProfileEditor_p.xml | 3 +- htroot/CrawlStart_p.html | 18 +++- htroot/CrawlStart_p.java | 4 +- htroot/QuickCrawlLink_p.java | 13 +-- htroot/WatchCrawler_p.java | 52 +++++++---- htroot/sharedBlacklist_p.java | 3 - source/de/anomic/crawler/CrawlProfile.java | 90 ++++++++++++------- source/de/anomic/crawler/CrawlQueues.java | 5 +- source/de/anomic/crawler/CrawlStacker.java | 20 +++-- source/de/anomic/data/SitemapParser.java | 43 ++++----- source/de/anomic/data/bookmarksDB.java | 20 +++-- .../de/anomic/plasma/plasmaSwitchboard.java | 7 +- source/de/anomic/plasma/plasmaWordIndex.java | 13 ++- .../anomic/urlRedirector/urlRedirectord.java | 8 +- 16 files changed, 193 insertions(+), 123 deletions(-) diff --git a/htroot/CrawlProfileEditor_p.html b/htroot/CrawlProfileEditor_p.html index f1bc73dab..0c83b5688 100644 --- a/htroot/CrawlProfileEditor_p.html +++ b/htroot/CrawlProfileEditor_p.html @@ -30,7 +30,8 @@ Status Start URL Depth - Filter + Must Match + Must Not Match MaxAge Auto Filter Depth Auto Filter Content @@ -48,7 +49,8 @@ #(status)#terminated::active#(/status)# #[startURL]# #[depth]# - #[filter]# + #[mustmatch]# + #[mustnotmatch]# #[crawlingIfOlder]# 
#[crawlingDomFilterDepth]# #{crawlingDomFilterContent}##[item]#
#{/crawlingDomFilterContent}# diff --git a/htroot/CrawlProfileEditor_p.java b/htroot/CrawlProfileEditor_p.java index be6120a0a..44c9ab5f0 100644 --- a/htroot/CrawlProfileEditor_p.java +++ b/htroot/CrawlProfileEditor_p.java @@ -62,10 +62,8 @@ public class CrawlProfileEditor_p { static { labels.add(new eentry(entry.NAME, "Name", true, eentry.STRING)); labels.add(new eentry(entry.START_URL, "Start URL", true, eentry.STRING)); - labels.add(new eentry(entry.GENERAL_FILTER, "General Filter", false, eentry.STRING)); - labels.add(new eentry(entry.SPECIFIC_FILTER, "Specific Filter", false, eentry.STRING)); - labels.add(new eentry(entry.GENERAL_DEPTH, "General Depth", false, eentry.INTEGER)); - labels.add(new eentry(entry.SPECIFIC_DEPTH, "Specific Depth", false, eentry.INTEGER)); + labels.add(new eentry(entry.FILTER_MUSTMATCH, "General Filter", false, eentry.STRING)); + labels.add(new eentry(entry.DEPTH, "General Depth", false, eentry.INTEGER)); labels.add(new eentry(entry.RECRAWL_IF_OLDER, "Recrawl If Older", false, eentry.INTEGER)); labels.add(new eentry(entry.DOM_FILTER_DEPTH, "Domain Filter Depth", false, eentry.INTEGER)); labels.add(new eentry(entry.DOM_MAX_PAGES, "Domain Max. Pages", false, eentry.INTEGER)); @@ -214,8 +212,9 @@ public class CrawlProfileEditor_p { prop.put("crawlProfiles_" + count + "_name", profile.name()); prop.putXML("crawlProfiles_" + count + "_startURL", profile.startURL()); prop.put("crawlProfiles_" + count + "_handle", profile.handle()); - prop.put("crawlProfiles_" + count + "_depth", profile.generalDepth()); - prop.put("crawlProfiles_" + count + "_filter", profile.generalFilter()); + prop.put("crawlProfiles_" + count + "_depth", profile.depth()); + prop.put("crawlProfiles_" + count + "_mustmatch", profile.mustMatchPattern().toString()); + prop.put("crawlProfiles_" + count + "_mustnotmatch", profile.mustNotMatchPattern().toString()); prop.put("crawlProfiles_" + count + "_crawlingIfOlder", (profile.recrawlIfOlder() == 0L) ? 
"no re-crawl" : ""+ SimpleDateFormat.getDateTimeInstance().format(profile.recrawlIfOlder())); prop.put("crawlProfiles_" + count + "_crawlingDomFilterDepth", (profile.domFilterDepth() == Integer.MAX_VALUE) ? "inactive" : Integer.toString(profile.domFilterDepth())); diff --git a/htroot/CrawlProfileEditor_p.xml b/htroot/CrawlProfileEditor_p.xml index 9194dc61a..5b5f54bee 100644 --- a/htroot/CrawlProfileEditor_p.xml +++ b/htroot/CrawlProfileEditor_p.xml @@ -6,7 +6,8 @@ #(status)#terminated::active#(/status)# #[startURL]# #[depth]# - #[filter]# + #[mustmatch]# + #[mustnotmatch]# #[crawlingIfOlder]# #[crawlingDomFilterDepth]# diff --git a/htroot/CrawlStart_p.html b/htroot/CrawlStart_p.html index eddc9baaf..85bdb6357 100644 --- a/htroot/CrawlStart_p.html +++ b/htroot/CrawlStart_p.html @@ -100,18 +100,30 @@ - : + : Use filter   -
+
Restrict to start domain
Restrict to sub-path - The filter is an emacs-like regular expression that must match with the URLs which are used to be crawled; default is 'catch all'. + The filter is an emacs-like regular expression that must match with the URLs which are used to be crawled; + default is 'catch all'. You can also use an automatic domain-restriction to fully crawl a single domain. + + : + + + + + This filter must not match to allow that the page is accepted for crawling. + The empty string is a never-match filter which should do well for most cases. + If you don't know what this means, please leave this field empty. + + Re-crawl known URLs: diff --git a/htroot/CrawlStart_p.java b/htroot/CrawlStart_p.java index 15077ecde..05a889303 100644 --- a/htroot/CrawlStart_p.java +++ b/htroot/CrawlStart_p.java @@ -24,6 +24,7 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +import de.anomic.crawler.CrawlProfile; import de.anomic.http.httpRequestHeader; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboardConstants; @@ -44,7 +45,8 @@ public class CrawlStart_p { prop.put("starturl", (intranet) ? repository : "http://"); prop.put("proxyPrefetchDepth", env.getConfig("proxyPrefetchDepth", "0")); prop.put("crawlingDepth", env.getConfig("crawlingDepth", "0")); - prop.put("crawlingFilter", (intranet) ? repository + ".*" : ".*"); + prop.put("mustmatch", (intranet) ? 
repository + ".*" : CrawlProfile.MATCH_ALL); + prop.put("mustnotmatch", CrawlProfile.MATCH_NEVER); prop.put("crawlingIfOlderCheck", "0"); prop.put("crawlingIfOlderUnitYearCheck", "0"); diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java index dfa10b9f4..5af704396 100644 --- a/htroot/QuickCrawlLink_p.java +++ b/htroot/QuickCrawlLink_p.java @@ -91,7 +91,8 @@ public class QuickCrawlLink_p { final String title = post.get("title",null); // getting other parameters if set - final String crawlingFilter = post.get("crawlingFilter", ".*"); + final String crawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL); + final String crawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER); final int CrawlingDepth = Integer.parseInt(post.get("crawlingDepth", "0")); final boolean crawlDynamic = post.get("crawlingQ", "").equals("on"); final boolean indexText = post.get("indexText", "on").equals("on"); @@ -129,11 +130,11 @@ public class QuickCrawlLink_p { try { pe = sb.webIndex.profilesActiveCrawls.newEntry( crawlingStartURL.getHost(), - crawlingStartURL, - crawlingFilter, - crawlingFilter, - CrawlingDepth, - CrawlingDepth, + crawlingStartURL, + CrawlProfile.KEYWORDS_USER, + crawlingMustMatch, + crawlingMustNotMatch, + CrawlingDepth, 60 * 24 * 30, // recrawlIfOlder (minutes); here: one month -1, // domFilterDepth, if negative: no auto-filter -1, // domMaxPages, if negative: no count restriction diff --git a/htroot/WatchCrawler_p.java b/htroot/WatchCrawler_p.java index ea3a3651d..831a5a0de 100644 --- a/htroot/WatchCrawler_p.java +++ b/htroot/WatchCrawler_p.java @@ -123,16 +123,16 @@ public class WatchCrawler_p { crawlingStart = (crawlingStartURL == null) ? 
null : crawlingStartURL.toNormalform(true, true); // set the crawling filter - String newcrawlingfilter = post.get("crawlingFilter", ".*"); - if (newcrawlingfilter.length() < 2) newcrawlingfilter = ".*"; // avoid that all urls are filtered out if bad value was submitted - + String newcrawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL); + String newcrawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER); + if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL; // avoid that all urls are filtered out if bad value was submitted + // special cases: if (crawlingStartURL!= null && fullDomain) { - newcrawlingfilter = ".*" + crawlingStartURL.getHost() + ".*"; + newcrawlingMustMatch = ".*" + crawlingStartURL.getHost() + ".*"; } if (crawlingStart!= null && subPath && (pos = crawlingStart.lastIndexOf("/")) > 0) { - newcrawlingfilter = crawlingStart.substring(0, pos + 1) + ".*"; + newcrawlingMustMatch = crawlingStart.substring(0, pos + 1) + ".*"; } - env.setConfig("crawlingFilter", newcrawlingfilter); final boolean crawlOrder = post.get("crawlOrder", "off").equals("on"); env.setConfig("crawlOrder", (crawlOrder) ? 
"true" : "false"); @@ -183,12 +183,12 @@ public class WatchCrawler_p { if ((crawlingStart == null || crawlingStartURL == null) /* || (!(crawlingStart.matches(newcrawlingfilter))) */) { // print error message prop.put("info", "4"); //crawlfilter does not match url - prop.putHTML("info_newcrawlingfilter", newcrawlingfilter); + prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch); prop.putHTML("info_crawlingStart", crawlingStart); } else try { // check if the crawl filter works correctly - Pattern.compile(newcrawlingfilter); + Pattern.compile(newcrawlingMustMatch); // stack request // first delete old entry, if exists @@ -201,8 +201,12 @@ public class WatchCrawler_p { // stack url sb.webIndex.profilesPassiveCrawls.removeEntry(crawlingStartURL.hash()); // if there is an old entry, delete it final CrawlProfile.entry pe = sb.webIndex.profilesActiveCrawls.newEntry( - crawlingStartURL.getHost(), crawlingStartURL, newcrawlingfilter, newcrawlingfilter, - newcrawlingdepth, newcrawlingdepth, + crawlingStartURL.getHost(), + crawlingStartURL, + CrawlProfile.KEYWORDS_USER, + newcrawlingMustMatch, + newcrawlingMustNotMatch, + newcrawlingdepth, crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages, crawlingQ, indexText, indexMedia, @@ -270,7 +274,7 @@ public class WatchCrawler_p { } } catch (final PatternSyntaxException e) { prop.put("info", "4"); //crawlfilter does not match url - prop.putHTML("info_newcrawlingfilter", newcrawlingfilter); + prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch); prop.putHTML("info_error", e.getMessage()); } catch (final Exception e) { // mist @@ -286,7 +290,7 @@ public class WatchCrawler_p { final String fileName = post.get("crawlingFile"); try { // check if the crawl filter works correctly - Pattern.compile(newcrawlingfilter); + Pattern.compile(newcrawlingMustMatch); // loading the file content final File file = new File(fileName); @@ -306,7 +310,21 @@ public class WatchCrawler_p { // creating a crawler profile final yacyURL 
crawlURL = new yacyURL("file://" + file.toString(), null); - final CrawlProfile.entry profile = sb.webIndex.profilesActiveCrawls.newEntry(fileName, crawlURL, newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages, crawlingQ, indexText, indexMedia, storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw); + final CrawlProfile.entry profile = sb.webIndex.profilesActiveCrawls.newEntry( + fileName, crawlURL, CrawlProfile.KEYWORDS_USER, + newcrawlingMustMatch, + CrawlProfile.MATCH_NEVER, + newcrawlingdepth, + crawlingIfOlder, + crawlingDomFilterDepth, + crawlingDomMaxPages, + crawlingQ, + indexText, + indexMedia, + storeHTCache, + true, + crawlOrder, + xsstopw, xdstopw, xpstopw); // pause local crawl here sb.pauseCrawlJob(plasmaSwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); @@ -333,7 +351,7 @@ public class WatchCrawler_p { } catch (final PatternSyntaxException e) { // print error message prop.put("info", "4"); //crawlfilter does not match url - prop.putHTML("info_newcrawlingfilter", newcrawlingfilter); + prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch); prop.putHTML("info_error", e.getMessage()); } catch (final Exception e) { // mist @@ -353,8 +371,10 @@ public class WatchCrawler_p { // create a new profile final CrawlProfile.entry pe = sb.webIndex.profilesActiveCrawls.newEntry( - sitemapURLStr, sitemapURL, newcrawlingfilter, newcrawlingfilter, - newcrawlingdepth, newcrawlingdepth, + sitemapURLStr, sitemapURL, CrawlProfile.KEYWORDS_USER, + newcrawlingMustMatch, + CrawlProfile.MATCH_NEVER, + newcrawlingdepth, crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages, crawlingQ, indexText, indexMedia, diff --git a/htroot/sharedBlacklist_p.java b/htroot/sharedBlacklist_p.java index c750e1228..684a160fd 100644 --- a/htroot/sharedBlacklist_p.java +++ b/htroot/sharedBlacklist_p.java @@ -104,7 +104,6 @@ public class sharedBlacklist_p { final String Hash = post.get("hash"); // generate 
the download URL - String downloadURL = null; String downloadURLOld = null; if( sb.webIndex.seedDB != null ){ //no nullpointer error.. final yacySeed seed = sb.webIndex.seedDB.getConnected(Hash); @@ -113,8 +112,6 @@ public class sharedBlacklist_p { final String Port = seed.get(yacySeed.PORT, "8080"); final String peerName = seed.get(yacySeed.NAME, "<" + IP + ":" + Port + ">"); prop.putHTML("page_source", peerName); - - downloadURL = "http://" + IP + ":" + Port + "/xml/blacklists.xml"; downloadURLOld = "http://" + IP + ":" + Port + "/yacy/list.html?col=black"; } else { prop.put("status", STATUS_PEER_UNKNOWN);//YaCy-Peer not found diff --git a/source/de/anomic/crawler/CrawlProfile.java b/source/de/anomic/crawler/CrawlProfile.java index d2cfbe625..99ae67397 100644 --- a/source/de/anomic/crawler/CrawlProfile.java +++ b/source/de/anomic/crawler/CrawlProfile.java @@ -28,6 +28,8 @@ import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.Map; +import java.util.Set; +import java.util.regex.Pattern; import de.anomic.kelondro.kelondroBLOB; import de.anomic.kelondro.kelondroBLOBHeap; @@ -43,6 +45,20 @@ import de.anomic.yacy.yacyURL; public class CrawlProfile { + public static final String MATCH_ALL = ".*"; + public static final String MATCH_NEVER = ""; + public static final HashSet NO_KEYWORDS = new HashSet(0); + public static final HashSet KEYWORDS_PROXY = word2set("xproxy"); + public static final HashSet KEYWORDS_REMOTE = word2set("xremote"); + public static final HashSet KEYWORDS_USER = word2set("xuser"); + public static final HashSet KEYWORDS_SNIPPET = word2set("xsnippet"); + + private static final HashSet word2set(String word) { + HashSet s = new HashSet(1); + s.add(word); + return s; + } + static HashMap> domsCache = new HashMap>(); kelondroMap profileTable; @@ -145,8 +161,11 @@ public class CrawlProfile { return ne; } - public entry newEntry(final String name, final yacyURL startURL, final String generalFilter, final String 
specificFilter, - final int generalDepth, final int specificDepth, + public entry newEntry( final String name, + final yacyURL startURL, + final Set keywords, + final String mustmatch, final String mustnotmatch, + final int generalDepth, final long recrawlIfOlder /*date*/, final int domFilterDepth, final int domMaxPages, final boolean crawlingQ, final boolean indexText, final boolean indexMedia, @@ -154,8 +173,11 @@ public class CrawlProfile { final boolean remoteIndexing, final boolean xsstopw, final boolean xdstopw, final boolean xpstopw) { - final entry ne = new entry(name, startURL, generalFilter, specificFilter, - generalDepth, specificDepth, + final entry ne = new entry( + name, startURL, + keywords, + mustmatch, mustnotmatch, + generalDepth, recrawlIfOlder, domFilterDepth, domMaxPages, crawlingQ, indexText, indexMedia, @@ -235,10 +257,9 @@ public class CrawlProfile { public static final String HANDLE = "handle"; public static final String NAME = "name"; public static final String START_URL = "startURL"; - public static final String GENERAL_FILTER = "generalFilter"; - public static final String SPECIFIC_FILTER = "specificFilter"; - public static final String GENERAL_DEPTH = "generalDepth"; - public static final String SPECIFIC_DEPTH = "specificDepth"; + public static final String FILTER_MUSTMATCH = "generalFilter"; + public static final String FILTER_MUSTNOTMATCH = "nevermatch"; + public static final String DEPTH = "generalDepth"; public static final String RECRAWL_IF_OLDER = "recrawlIfOlder"; public static final String DOM_FILTER_DEPTH = "domFilterDepth"; public static final String DOM_MAX_PAGES = "domMaxPages"; @@ -254,10 +275,16 @@ public class CrawlProfile { Map mem; private Map doms; + private Pattern mustmatch = null, mustnotmatch = null; + - public entry(final String name, final yacyURL startURL, final String generalFilter, final String specificFilter, - final int generalDepth, final int specificDepth, - final long recrawlIfOlder /*date*/, final int 
domFilterDepth, final int domMaxPages, + public entry(final String name, final yacyURL startURL, + final Set keywords, + final String mustmatch, + final String mustnotmatch, + final int depth, + final long recrawlIfOlder /*date*/, + final int domFilterDepth, final int domMaxPages, final boolean crawlingQ, final boolean indexText, final boolean indexMedia, final boolean storeHTCache, final boolean storeTXCache, @@ -269,10 +296,9 @@ public class CrawlProfile { mem.put(HANDLE, handle); mem.put(NAME, name); mem.put(START_URL, (startURL == null) ? "" : startURL.toNormalform(true, false)); - mem.put(GENERAL_FILTER, (generalFilter == null) ? ".*" : generalFilter); - mem.put(SPECIFIC_FILTER, (specificFilter == null) ? ".*" : specificFilter); - mem.put(GENERAL_DEPTH, Integer.toString(generalDepth)); - mem.put(SPECIFIC_DEPTH, Integer.toString(specificDepth)); + mem.put(FILTER_MUSTMATCH, (mustmatch == null) ? MATCH_ALL : mustmatch); + mem.put(FILTER_MUSTNOTMATCH, (mustnotmatch == null) ? MATCH_NEVER : mustnotmatch); + mem.put(DEPTH, Integer.toString(depth)); mem.put(RECRAWL_IF_OLDER, Long.toString(recrawlIfOlder)); mem.put(DOM_FILTER_DEPTH, Integer.toString(domFilterDepth)); mem.put(DOM_MAX_PAGES, Integer.toString(domMaxPages)); @@ -322,27 +348,24 @@ public class CrawlProfile { final String r = mem.get(START_URL); return r; } - public String generalFilter() { - final String r = mem.get(GENERAL_FILTER); - if (r == null) return ".*"; - return r; - } - public String specificFilter() { - final String r = mem.get(SPECIFIC_FILTER); - if (r == null) return ".*"; - return r; + public Pattern mustMatchPattern() { + if (this.mustmatch == null) { + String r = mem.get(FILTER_MUSTMATCH); + if (r == null) r = MATCH_ALL; + this.mustmatch = Pattern.compile(r); + } + return this.mustmatch; } - public int generalDepth() { - final String r = mem.get(GENERAL_DEPTH); - if (r == null) return 0; - try { - return Integer.parseInt(r); - } catch (final NumberFormatException e) { - return 0; + public 
Pattern mustNotMatchPattern() { + if (this.mustnotmatch == null) { + String r = mem.get(FILTER_MUSTNOTMATCH); + if (r == null) r = MATCH_NEVER; + this.mustnotmatch = Pattern.compile(r); } + return this.mustnotmatch; } - public int specificDepth() { - final String r = mem.get(SPECIFIC_DEPTH); + public int depth() { + final String r = mem.get(DEPTH); if (r == null) return 0; try { return Integer.parseInt(r); @@ -497,4 +520,5 @@ public class CrawlProfile { return domname; } } + } diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java index 36450e566..2be78d90f 100644 --- a/source/de/anomic/crawler/CrawlQueues.java +++ b/source/de/anomic/crawler/CrawlQueues.java @@ -232,8 +232,9 @@ public class CrawlQueues { + ", initiator=" + urlEntry.initiator() + ", crawlOrder=" + ((profile.remoteIndexing()) ? "true" : "false") + ", depth=" + urlEntry.depth() - + ", crawlDepth=" + profile.generalDepth() - + ", filter=" + profile.generalFilter() + + ", crawlDepth=" + profile.depth() + + ", must-match=" + profile.mustMatchPattern().toString() + + ", must-not-match=" + profile.mustNotMatchPattern().toString() + ", permission=" + ((sb.webIndex.seedDB == null) ? "undefined" : (((sb.webIndex.seedDB.mySeed().isSenior()) || (sb.webIndex.seedDB.mySeed().isPrincipal())) ? "true" : "false"))); processLocalCrawling(urlEntry, stats); diff --git a/source/de/anomic/crawler/CrawlStacker.java b/source/de/anomic/crawler/CrawlStacker.java index 8fea71b6e..660ed7fe7 100644 --- a/source/de/anomic/crawler/CrawlStacker.java +++ b/source/de/anomic/crawler/CrawlStacker.java @@ -409,14 +409,22 @@ public final class CrawlStacker extends Thread { return errorMsg; } - // filter deny - if ((entry.depth() > 0) && (!(entry.url().toString().matches(profile.generalFilter())))) { - reason = "url does not match general filter"; - if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' does not match crawling filter '" + profile.generalFilter() + "'. 
" + + // filter with must-match + if ((entry.depth() > 0) && !profile.mustMatchPattern().matcher(entry.url().toString()).matches()) { + reason = "url does not match must-match filter"; + if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' does not match must-match crawling filter '" + profile.mustMatchPattern().toString() + "'. " + "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms"); return reason; } - + + // filter with must-not-match + if ((entry.depth() > 0) && profile.mustNotMatchPattern().matcher(entry.url().toString()).matches()) { + reason = "url matches must-not-match filter"; + if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' does matches do-not-match crawling filter '" + profile.mustNotMatchPattern().toString() + "'. " + + "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms"); + return reason; + } + // deny cgi if (entry.url().isCGI()) { reason = "cgi url not allowed"; @@ -486,7 +494,7 @@ public final class CrawlStacker extends Thread { final boolean remote = profile.handle().equals(this.sb.webIndex.defaultRemoteProfile.handle()); final boolean global = (profile.remoteIndexing()) /* granted */ && - (entry.depth() == profile.generalDepth()) /* leaf node */ && + (entry.depth() == profile.depth()) /* leaf node */ && //(initiatorHash.equals(yacyCore.seedDB.mySeed.hash)) /* not proxy */ && ( (sb.webIndex.seedDB.mySeed().isSenior()) || diff --git a/source/de/anomic/data/SitemapParser.java b/source/de/anomic/data/SitemapParser.java index e67c9d7b7..ab91f2ec7 100644 --- a/source/de/anomic/data/SitemapParser.java +++ b/source/de/anomic/data/SitemapParser.java @@ -330,26 +330,27 @@ public class SitemapParser extends DefaultHandler { } private CrawlProfile.entry createProfile(final String domainName, final yacyURL sitemapURL) { - return this.sb.webIndex.profilesActiveCrawls.newEntry(domainName, sitemapURL, - // crawlingFilter - ".*", ".*", - // Depth - 0, 0, - // force 
recrawling - 0, - // disable Auto-Dom-Filter - -1, -1, - // allow crawling of dynamic URLs - true, - // index text + media - true, true, - // don't store downloaded pages to Web Cache - false, - // store to TX cache - true, - // remote Indexing disabled - false, - // exclude stop-words - true, true, true); + return this.sb.webIndex.profilesActiveCrawls.newEntry( + domainName, sitemapURL, CrawlProfile.KEYWORDS_USER, + // crawling Filter + CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, + // Depth + 0, + // force recrawling + 0, + // disable Auto-Dom-Filter + -1, -1, + // allow crawling of dynamic URLs + true, + // index text + media + true, true, + // don't store downloaded pages to Web Cache + false, + // store to TX cache + true, + // remote Indexing disabled + false, + // exclude stop-words + true, true, true); } } diff --git a/source/de/anomic/data/bookmarksDB.java b/source/de/anomic/data/bookmarksDB.java index 7f89cd545..6cbb6578e 100644 --- a/source/de/anomic/data/bookmarksDB.java +++ b/source/de/anomic/data/bookmarksDB.java @@ -226,22 +226,22 @@ public class bookmarksDB { int pos = 0; // set crawlingStart to BookmarkUrl String crawlingStart = bm.getUrl(); - String newcrawlingfilter = crawlingfilter; + String newcrawlingMustMatch = crawlingfilter; yacyURL crawlingStartURL = new yacyURL(crawlingStart, null); // set the crawling filter - if (newcrawlingfilter.length() < 2) newcrawlingfilter = ".*"; // avoid that all urls are filtered out if bad value was submitted + if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = ".*"; // avoid that all urls are filtered out if bad value was submitted - if (crawlingStartURL!= null && newcrawlingfilter.equals("dom")) { - newcrawlingfilter = ".*" + crawlingStartURL.getHost() + ".*"; + if (crawlingStartURL!= null && newcrawlingMustMatch.equals("dom")) { + newcrawlingMustMatch = ".*" + crawlingStartURL.getHost() + ".*"; } - if (crawlingStart!= null && newcrawlingfilter.equals("sub") && (pos = 
crawlingStart.lastIndexOf("/")) > 0) { - newcrawlingfilter = crawlingStart.substring(0, pos + 1) + ".*"; + if (crawlingStart!= null && newcrawlingMustMatch.equals("sub") && (pos = crawlingStart.lastIndexOf("/")) > 0) { + newcrawlingMustMatch = crawlingStart.substring(0, pos + 1) + ".*"; } // check if the crawl filter works correctly - Pattern.compile(newcrawlingfilter); + Pattern.compile(newcrawlingMustMatch); String urlhash = crawlingStartURL.hash(); sb.webIndex.removeURL(urlhash); @@ -251,8 +251,10 @@ public class bookmarksDB { // stack url sb.webIndex.profilesPassiveCrawls.removeEntry(crawlingStartURL.hash()); // if there is an old entry, delete it CrawlProfile.entry pe = sb.webIndex.profilesActiveCrawls.newEntry( - folder+"/"+crawlingStartURL, crawlingStartURL, newcrawlingfilter, newcrawlingfilter, - newcrawlingdepth, newcrawlingdepth, + folder+"/"+crawlingStartURL, crawlingStartURL, CrawlProfile.KEYWORDS_USER, + newcrawlingMustMatch, + CrawlProfile.MATCH_NEVER, + newcrawlingdepth, sb.webIndex.profilesActiveCrawls.getRecrawlDate(crawlingIfOlder), crawlingDomFilterDepth, crawlingDomMaxPages, crawlingQ, indexText, indexMedia, diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index e5ae12e4d..90c9b0a7a 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -1558,8 +1558,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch