From 60b1e23f0537ddb2e8299189a2dcc180978df223 Mon Sep 17 00:00:00 2001
From: orbiter
Date: Sun, 16 Sep 2012 21:27:55 +0200
Subject: [PATCH] added new crawl options:

- indexUrlMustMatch and indexUrlMustNotMatch, which can be used to select
  loaded pages for indexing. The default patterns are chosen so that all
  loaded pages are also indexed (as before), but when doing an expert crawl
  start the user may select only specific URLs to be indexed.
- crawlerNoDepthLimitMatch, a new pattern that can be used to remove the
  crawl depth limitation. This filter is a never-match by default (so the
  crawl depth limit applies), but the user can select paths that will be
  loaded completely even when the crawl depth is reached.
---
 htroot/CrawlStartExpert_p.html          | 27 ++++++++--
 htroot/CrawlStartExpert_p.java          |  3 ++
 htroot/Crawler_p.java                   | 12 +++--
 source/net/yacy/search/Switchboard.java | 65 +++++++++++--------------
 4 files changed, 64 insertions(+), 43 deletions(-)

diff --git a/htroot/CrawlStartExpert_p.html b/htroot/CrawlStartExpert_p.html
index 8d3514708..5baece971 100644
--- a/htroot/CrawlStartExpert_p.html
+++ b/htroot/CrawlStartExpert_p.html
@@ -92,7 +92,8 @@
       Crawling Depth:
-      also all linked non-parsable documents
+      also all linked non-parsable documents
+      Unlimited crawl depth for URLs matching with:
       This defines how often the Crawler will follow links (of links..) embedded in websites.
@@ -150,7 +151,7 @@
-      :
+      :
       Use filter
@@ -165,7 +166,7 @@
-      :
+      :
@@ -176,6 +177,26 @@
       If you don't know what this means, please leave this field empty.
+
+      :
+
+
+
+
+        The filter is a regular expression
+        that must match with the URLs to allow that the content of the url is indexed.
+
+
+
+      :
+
+
+
+
+        The filter is a regular expression
+        that must not match with the URLs to allow that the content of the url is indexed.
+
+
       :
diff --git a/htroot/CrawlStartExpert_p.java b/htroot/CrawlStartExpert_p.java
index 2ab83f865..8bde5e4e7 100644
--- a/htroot/CrawlStartExpert_p.java
+++ b/htroot/CrawlStartExpert_p.java
@@ -43,9 +43,12 @@ public class CrawlStartExpert_p {
         prop.put("starturl", /*(intranet) ? repository :*/ "");
         prop.put("proxyPrefetchDepth", env.getConfig("proxyPrefetchDepth", "0"));
         prop.put("crawlingDepth", Math.min(3, env.getConfigLong("crawlingDepth", 0)));
+        prop.put("crawlingDepthExtension", CrawlProfile.MATCH_NEVER_STRING);
         prop.put("directDocByURLChecked", sb.getConfigBool("crawlingDirectDocByURL", true) ? "1" : "0");
         prop.put("mustmatch", /*(intranet) ? repository + ".*" :*/ CrawlProfile.MATCH_ALL_STRING);
         prop.put("mustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
+        prop.put("indexmustmatch", CrawlProfile.MATCH_ALL_STRING);
+        prop.put("indexmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
         prop.put("ipMustmatch", sb.getConfig("crawlingIPMustMatch", CrawlProfile.MATCH_ALL_STRING));
         prop.put("ipMustnotmatch", sb.getConfig("crawlingIPMustNotMatch", CrawlProfile.MATCH_NEVER_STRING));
         prop.put("countryMustMatch", sb.getConfig("crawlingCountryMustMatch", ""));
diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java
index 1399e254d..8c486ab07 100644
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -204,9 +204,15 @@ public class Crawler_p {
             sb.setConfig("crawlingIPMustNotMatch", ipMustNotMatch);
             if (countryMustMatch.length() > 0) sb.setConfig("crawlingCountryMustMatch", countryMustMatch);
 
+            String crawlerNoDepthLimitMatch = post.get("crawlingDepthExtension", CrawlProfile.MATCH_NEVER_STRING);
+            final String indexUrlMustMatch = post.get("indexmustmatch", CrawlProfile.MATCH_ALL_STRING);
+            final String indexUrlMustNotMatch = post.get("indexmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
+
             final boolean crawlOrder = post.get("crawlOrder", "off").equals("on");
             env.setConfig("crawlOrder", crawlOrder);
 
+            if (crawlOrder) crawlerNoDepthLimitMatch = CrawlProfile.MATCH_NEVER_STRING; // without limitation the crawl order does not work
+
             int newcrawlingdepth = post.getInt("crawlingDepth", 8);
             env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth));
             if ((crawlOrder) && (newcrawlingdepth > 8)) newcrawlingdepth = 8;
@@ -347,9 +353,9 @@ public class Crawler_p {
                         ipMustMatch,
                         ipMustNotMatch,
                         countryMustMatch,
-                        CrawlProfile.MATCH_NEVER_STRING,
-                        CrawlProfile.MATCH_ALL_STRING,
-                        CrawlProfile.MATCH_NEVER_STRING,
+                        crawlerNoDepthLimitMatch,
+                        indexUrlMustMatch,
+                        indexUrlMustNotMatch,
                         newcrawlingdepth,
                         directDocByURL,
                         crawlingIfOlder,
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index 38a527ed1..7bd1c994f 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -2284,28 +2284,14 @@ public final class Switchboard extends serverSwitch
         final EventOrigin processCase = response.processCase(this.peers.mySeed().hash);
 
         if ( this.log.isFine() ) {
-            this.log.logFine("processResourceStack processCase="
-                + processCase
-                + ", depth="
-                + response.depth()
-                + ", maxDepth="
-                + ((response.profile() == null) ? "null" : Integer.toString(response.profile().depth()))
-                + ", must-match="
-                + ((response.profile() == null) ? "null" : response
-                    .profile()
-                    .urlMustMatchPattern()
-                    .toString())
-                + ", must-not-match="
-                + ((response.profile() == null) ? "null" : response
-                    .profile()
-                    .urlMustNotMatchPattern()
-                    .toString())
-                + ", initiatorHash="
-                + ((response.initiator() == null) ? "null" : ASCII.String(response.initiator()))
-                +
-                //", responseHeader=" + ((entry.responseHeader() == null) ? "null" : entry.responseHeader().toString()) +
-                ", url="
-                + response.url()); // DEBUG
+            this.log.logFine(
+                "processResourceStack processCase=" + processCase
+                + ", depth=" + response.depth()
+                + ", maxDepth=" + ((response.profile() == null) ? "null" : Integer.toString(response.profile().depth()))
+                + ", must-match=" + ((response.profile() == null) ? "null" : response.profile().urlMustMatchPattern().toString())
+                + ", must-not-match=" + ((response.profile() == null) ? "null" : response.profile().urlMustNotMatchPattern().toString())
+                + ", initiatorHash=" + ((response.initiator() == null) ? "null" : ASCII.String(response.initiator()))
+                + ", url=" + response.url()); // DEBUG
         }
 
         // PARSE CONTENT
@@ -2353,8 +2339,13 @@
 
         // put anchors on crawl stack
         final long stackStartTime = System.currentTimeMillis();
-        if ( ((processCase == EventOrigin.PROXY_LOAD) || (processCase == EventOrigin.LOCAL_CRAWLING))
-            && ((response.profile() == null) || (response.depth() < response.profile().depth())) ) {
+        if ((processCase == EventOrigin.PROXY_LOAD || processCase == EventOrigin.LOCAL_CRAWLING) &&
+            (
+                response.profile() == null ||
+                response.depth() < response.profile().depth() ||
+                response.profile().crawlerNoDepthLimitMatchPattern().matcher(response.url().toNormalform(false, false)).matches()
+            )
+            ) {
 
             // get the hyperlinks
             final Map hl = Document.getHyperlinks(documents);
@@ -2415,24 +2406,25 @@
     public IndexingQueueEntry condenseDocument(final IndexingQueueEntry in) {
         in.queueEntry.updateStatus(Response.QUEUE_STATE_CONDENSING);
 
-        if ( !in.queueEntry.profile().indexText() && !in.queueEntry.profile().indexMedia() ) {
-            if ( this.log.isInfo() ) {
-                this.log.logInfo("Not Condensed Resource '"
-                    + in.queueEntry.url().toNormalform(false, true)
-                    + "': indexing not wanted by crawl profile");
-            }
+        CrawlProfile profile = in.queueEntry.profile();
+        String urls = in.queueEntry.url().toNormalform(false, true);
+
+        // check profile attributes which prevent indexing (while crawling is allowed)
+        if (!profile.indexText() && !profile.indexMedia()) {
+            if (this.log.isInfo()) this.log.logInfo("Not Condensed Resource '" + urls + "': indexing of this media type not wanted by crawl profile");
             return new IndexingQueueEntry(in.queueEntry, in.documents, null);
         }
-
+        if (!profile.indexUrlMustMatchPattern().matcher(urls).matches() ||
+             profile.indexUrlMustNotMatchPattern().matcher(urls).matches() ) {
+            if (this.log.isInfo()) this.log.logInfo("Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url");
+            return new IndexingQueueEntry(in.queueEntry, in.documents, null);
+        }
+
         // check which files may take part in the indexing process
         final List doclist = new ArrayList();
         for ( final Document document : in.documents ) {
             if ( document.indexingDenied() ) {
-                if ( this.log.isInfo() ) {
-                    this.log.logInfo("Not Condensed Resource '"
-                        + in.queueEntry.url().toNormalform(false, true)
-                        + "': denied by document-attached noindexing rule");
-                }
+                if ( this.log.isInfo() ) this.log.logInfo("Not Condensed Resource '" + urls + "': denied by document-attached noindexing rule");
                 addURLtoErrorDB(
                     in.queueEntry.url(),
                     in.queueEntry.referrerHash(),
@@ -2459,7 +2451,6 @@
                     // update image result list statistics
                     // its good to do this concurrently here, because it needs a DNS lookup
                     // to compute a URL hash which is necessary for a double-check
-                    final CrawlProfile profile = in.queueEntry.profile();
                     ResultImages.registerImages(in.queueEntry.url(), in.documents[i], (profile == null)
                         ? true
                         : !profile.remoteIndexing());
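
To illustrate how the three new patterns are meant to interact, the following standalone Java sketch mirrors the two checks added above: the indexUrlMustMatch/indexUrlMustNotMatch test in condenseDocument() and the crawlerNoDepthLimitMatch override of the depth test before anchors are stacked. It is not part of the patch; the pattern strings below are illustrative stand-ins for CrawlProfile.MATCH_ALL_STRING and CrawlProfile.MATCH_NEVER_STRING, whose real values are defined in CrawlProfile.

    // Standalone illustration, not part of the patch. The pattern strings are
    // assumed stand-ins for CrawlProfile.MATCH_ALL_STRING / MATCH_NEVER_STRING.
    import java.util.regex.Pattern;

    public class CrawlFilterSketch {

        // assumed defaults: ".*" matches every URL, "(?!x)x" matches none
        static final Pattern indexUrlMustMatch        = Pattern.compile(".*");
        static final Pattern indexUrlMustNotMatch     = Pattern.compile("(?!x)x");
        static final Pattern crawlerNoDepthLimitMatch = Pattern.compile("(?!x)x");

        // mirrors the new check in condenseDocument(): a loaded page is only
        // passed on to indexing if it matches the must-match filter and does
        // not match the must-not-match filter
        static boolean mayIndex(String url) {
            return indexUrlMustMatch.matcher(url).matches()
                && !indexUrlMustNotMatch.matcher(url).matches();
        }

        // mirrors the extended test before anchors are stacked: links are
        // followed while the depth limit is not reached, or when the URL is
        // matched by the no-depth-limit pattern
        static boolean mayStackLinks(String url, int depth, int maxDepth) {
            return depth < maxDepth || crawlerNoDepthLimitMatch.matcher(url).matches();
        }

        public static void main(String[] args) {
            String url = "http://example.net/docs/page.html";
            System.out.println(mayIndex(url));            // true with the defaults
            System.out.println(mayStackLinks(url, 9, 8)); // false with the defaults
        }
    }

With these defaults every loaded page is still indexed and the depth limit stays in force, so behaviour is unchanged from before this commit; only an expert crawl start that supplies other expressions narrows indexing or extends the crawl beyond the configured depth.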