added new crawl options:

- indexUrlMustMatch and indexUrlMustNotMatch, which can be used to select
which loaded pages are indexed. The default patterns are chosen such that all
loaded pages are also indexed (as before), but when doing an expert crawl
start the user may select only specific URLs to be indexed (see the sketch
below).
- crawlerNoDepthLimitMatch, a new pattern that can be used to remove the
crawl depth limitation. This filter is a never-match by default (so the
crawl depth limit applies), but the user can select paths which will be
crawled completely even after the crawl depth limit is reached.
orbiter 13 years ago
parent 4987921d3d
commit 60b1e23f05
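A minimal sketch of the indexing decision these options introduce, using only java.util.regex; the class, field and method names below are illustrative, and the never-match value is an assumption rather than the actual CrawlProfile constant:

import java.util.regex.Pattern;

// Minimal sketch of the must-match / must-not-match indexing decision.
// Names are illustrative; only java.util.regex is assumed.
public class IndexFilterSketch {

    // defaults that reproduce the old behaviour: every loaded page is also indexed
    static final String MATCH_ALL_STRING = ".*";
    static final String MATCH_NEVER_STRING = "(?!.*)"; // never matches (the real constant may differ)

    private final Pattern indexUrlMustMatch;
    private final Pattern indexUrlMustNotMatch;

    public IndexFilterSketch(final String mustMatch, final String mustNotMatch) {
        this.indexUrlMustMatch = Pattern.compile(mustMatch);
        this.indexUrlMustNotMatch = Pattern.compile(mustNotMatch);
    }

    /** true if a page loaded from this URL should also be indexed */
    public boolean shouldIndex(final String url) {
        return this.indexUrlMustMatch.matcher(url).matches()
            && !this.indexUrlMustNotMatch.matcher(url).matches();
    }

    public static void main(final String[] args) {
        final IndexFilterSketch f = new IndexFilterSketch(".*/docs/.*", MATCH_NEVER_STRING);
        System.out.println(f.shouldIndex("http://www.example.org/docs/page.html")); // true
        System.out.println(f.shouldIndex("http://www.example.org/blog/post.html")); // false
    }
}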

@@ -92,7 +92,8 @@
<td>Crawling Depth:</td>
<td>
<input name="crawlingDepth" id="crawlingDepth" type="text" size="2" maxlength="2" value="#[crawlingDepth]#" />&nbsp;&nbsp;&nbsp;
<input type="checkbox" name="directDocByURL" id="directDocByURL" #(directDocByURLChecked)#::checked="checked"#(/directDocByURLChecked)# />also all linked non-parsable documents
<input type="checkbox" name="directDocByURL" id="directDocByURL" #(directDocByURLChecked)#::checked="checked"#(/directDocByURLChecked)# />also all linked non-parsable documents<br/>
Unlimited crawl depth for URLs matching with: <input name="crawlingDepthExtension" id="crawlingDepthExtension" type="text" size="30" maxlength="100" value="#[crawlingDepthExtension]#" />
</td>
<td>
This defines how often the Crawler will follow links (of links..) embedded in websites.
@@ -150,7 +151,7 @@
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td><label for="mustmatch">Must-Match Filter for URLs</label>:</td>
<td><label for="mustmatch">Must-Match Filter for URLs for crawling</label>:</td>
<td>
<input type="radio" name="range" id="rangeWide" value="wide" checked="checked" />Use filter&nbsp;&nbsp;
<input name="mustmatch" id="mustmatch" type="text" size="60" maxlength="100" value="#[mustmatch]#" /><br />
@@ -165,7 +166,7 @@
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td><label for="mustnotmatch">Must-Not-Match Filter for URLs</label>:</td>
<td><label for="mustnotmatch">Must-Not-Match Filter for URLs for crawling</label>:</td>
<td>
<input name="mustnotmatch" id="mustnotmatch" type="text" size="60" maxlength="100" value="#[mustnotmatch]#" />
</td>
@@ -176,6 +177,26 @@
If you don't know what this means, please leave this field empty.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td><label for="indexmustmatch">Must-Match Filter for URLs for indexing</label>:</td>
<td>
<input name="indexmustmatch" id="indexmustmatch" type="text" size="60" maxlength="100" value="#[indexmustmatch]#" /><br />
</td>
<td>
The filter is a <b><a href="http://download.oracle.com/javase/6/docs/api/java/util/regex/Pattern.html">regular expression</a></b>
that <b>must match</b> the URL to allow the content of the URL to be indexed.
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td><label for="indexmustnotmatch">Must-Not-Match Filter for URLs for indexing</label>:</td>
<td>
<input name="indexmustnotmatch" id="indexmustnotmatch" type="text" size="60" maxlength="100" value="#[indexmustnotmatch]#" />
</td>
<td>
The filter is a <b><a href="http://download.oracle.com/javase/6/docs/api/java/util/regex/Pattern.html">regular expression</a></b>
that <b>must not match</b> the URL to allow the content of the URL to be indexed.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td><label for="ipMustmatch">Must-Match Filter for IPs</label>:</td>
<td>

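To make the new form fields concrete: a purely illustrative expert crawl start could crawl a whole site but index only part of it. The field names below match the form inputs above; the regular expression values are only an example, not defaults:

import java.util.Map;

// Illustrative values for the new expert-crawl form fields; only the field
// names come from the form above, the regular expressions are an example.
public class ExampleCrawlStart {
    public static void main(final String[] args) {
        final Map<String, String> fields = Map.of(
            "mustmatch",              "https?://www\\.example\\.org/.*", // crawl the whole site
            "indexmustmatch",         ".*/docs/.*",                      // but index only pages under /docs/
            "indexmustnotmatch",      ".*/docs/drafts/.*",               // except drafts
            "crawlingDepthExtension", ".*/docs/archive/.*"               // no depth limit inside the archive
        );
        fields.forEach((name, value) -> System.out.println(name + " = " + value));
    }
}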
@@ -43,9 +43,12 @@ public class CrawlStartExpert_p {
prop.put("starturl", /*(intranet) ? repository :*/ "");
prop.put("proxyPrefetchDepth", env.getConfig("proxyPrefetchDepth", "0"));
prop.put("crawlingDepth", Math.min(3, env.getConfigLong("crawlingDepth", 0)));
prop.put("crawlingDepthExtension", CrawlProfile.MATCH_NEVER_STRING);
prop.put("directDocByURLChecked", sb.getConfigBool("crawlingDirectDocByURL", true) ? "1" : "0");
prop.put("mustmatch", /*(intranet) ? repository + ".*" :*/ CrawlProfile.MATCH_ALL_STRING);
prop.put("mustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
prop.put("indexmustmatch", CrawlProfile.MATCH_ALL_STRING);
prop.put("indexmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
prop.put("ipMustmatch", sb.getConfig("crawlingIPMustMatch", CrawlProfile.MATCH_ALL_STRING));
prop.put("ipMustnotmatch", sb.getConfig("crawlingIPMustNotMatch", CrawlProfile.MATCH_NEVER_STRING));
prop.put("countryMustMatch", sb.getConfig("crawlingCountryMustMatch", ""));

@@ -204,9 +204,15 @@ public class Crawler_p {
sb.setConfig("crawlingIPMustNotMatch", ipMustNotMatch);
if (countryMustMatch.length() > 0) sb.setConfig("crawlingCountryMustMatch", countryMustMatch);
String crawlerNoDepthLimitMatch = post.get("crawlingDepthExtension", CrawlProfile.MATCH_NEVER_STRING);
final String indexUrlMustMatch = post.get("indexmustmatch", CrawlProfile.MATCH_ALL_STRING);
final String indexUrlMustNotMatch = post.get("indexmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
final boolean crawlOrder = post.get("crawlOrder", "off").equals("on");
env.setConfig("crawlOrder", crawlOrder);
if (crawlOrder) crawlerNoDepthLimitMatch = CrawlProfile.MATCH_NEVER_STRING; // without limitation the crawl order does not work
int newcrawlingdepth = post.getInt("crawlingDepth", 8);
env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth));
if ((crawlOrder) && (newcrawlingdepth > 8)) newcrawlingdepth = 8;
@@ -347,9 +353,9 @@ public class Crawler_p {
ipMustMatch,
ipMustNotMatch,
countryMustMatch,
CrawlProfile.MATCH_NEVER_STRING,
CrawlProfile.MATCH_ALL_STRING,
CrawlProfile.MATCH_NEVER_STRING,
crawlerNoDepthLimitMatch,
indexUrlMustMatch,
indexUrlMustNotMatch,
newcrawlingdepth,
directDocByURL,
crawlingIfOlder,

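The call above hands the three new patterns to the profile as plain strings; further down, Switchboard uses them through crawlerNoDepthLimitMatchPattern(), indexUrlMustMatchPattern() and indexUrlMustNotMatchPattern(). A minimal sketch of such accessors, where only the method names are taken from those calls and the string storage plus lazy compilation are assumptions:

import java.util.regex.Pattern;

// Sketch of a profile exposing the three new patterns. Only the accessor names
// mirror the Switchboard calls below; storage and lazy compilation are assumed.
public class ProfilePatternsSketch {

    private final String crawlerNoDepthLimitMatch, indexUrlMustMatch, indexUrlMustNotMatch;
    private Pattern noDepthLimitPattern, indexMustMatchPattern, indexMustNotMatchPattern;

    public ProfilePatternsSketch(final String crawlerNoDepthLimitMatch,
                                 final String indexUrlMustMatch,
                                 final String indexUrlMustNotMatch) {
        this.crawlerNoDepthLimitMatch = crawlerNoDepthLimitMatch;
        this.indexUrlMustMatch = indexUrlMustMatch;
        this.indexUrlMustNotMatch = indexUrlMustNotMatch;
    }

    // compile lazily and cache, so each regular expression is parsed only once
    public synchronized Pattern crawlerNoDepthLimitMatchPattern() {
        if (this.noDepthLimitPattern == null) this.noDepthLimitPattern = Pattern.compile(this.crawlerNoDepthLimitMatch);
        return this.noDepthLimitPattern;
    }
    public synchronized Pattern indexUrlMustMatchPattern() {
        if (this.indexMustMatchPattern == null) this.indexMustMatchPattern = Pattern.compile(this.indexUrlMustMatch);
        return this.indexMustMatchPattern;
    }
    public synchronized Pattern indexUrlMustNotMatchPattern() {
        if (this.indexMustNotMatchPattern == null) this.indexMustNotMatchPattern = Pattern.compile(this.indexUrlMustNotMatch);
        return this.indexMustNotMatchPattern;
    }
}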
@@ -2284,28 +2284,14 @@ public final class Switchboard extends serverSwitch
final EventOrigin processCase = response.processCase(this.peers.mySeed().hash);
if ( this.log.isFine() ) {
this.log.logFine("processResourceStack processCase="
+ processCase
+ ", depth="
+ response.depth()
+ ", maxDepth="
+ ((response.profile() == null) ? "null" : Integer.toString(response.profile().depth()))
+ ", must-match="
+ ((response.profile() == null) ? "null" : response
.profile()
.urlMustMatchPattern()
.toString())
+ ", must-not-match="
+ ((response.profile() == null) ? "null" : response
.profile()
.urlMustNotMatchPattern()
.toString())
+ ", initiatorHash="
+ ((response.initiator() == null) ? "null" : ASCII.String(response.initiator()))
+
//", responseHeader=" + ((entry.responseHeader() == null) ? "null" : entry.responseHeader().toString()) +
", url="
+ response.url()); // DEBUG
this.log.logFine(
"processResourceStack processCase=" + processCase
+ ", depth=" + response.depth()
+ ", maxDepth=" + ((response.profile() == null) ? "null" : Integer.toString(response.profile().depth()))
+ ", must-match=" + ((response.profile() == null) ? "null" : response.profile().urlMustMatchPattern().toString())
+ ", must-not-match=" + ((response.profile() == null) ? "null" : response.profile().urlMustNotMatchPattern().toString())
+ ", initiatorHash=" + ((response.initiator() == null) ? "null" : ASCII.String(response.initiator()))
+ ", url=" + response.url()); // DEBUG
}
// PARSE CONTENT
@@ -2353,8 +2339,13 @@ public final class Switchboard extends serverSwitch
// put anchors on crawl stack
final long stackStartTime = System.currentTimeMillis();
if ( ((processCase == EventOrigin.PROXY_LOAD) || (processCase == EventOrigin.LOCAL_CRAWLING))
&& ((response.profile() == null) || (response.depth() < response.profile().depth())) ) {
if ((processCase == EventOrigin.PROXY_LOAD || processCase == EventOrigin.LOCAL_CRAWLING) &&
(
response.profile() == null ||
response.depth() < response.profile().depth() ||
response.profile().crawlerNoDepthLimitMatchPattern().matcher(response.url().toNormalform(false, false)).matches()
)
) {
// get the hyperlinks
final Map<MultiProtocolURI, String> hl = Document.getHyperlinks(documents);
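The changed condition above can be read as a small predicate: anchors are still stacked while the crawl depth is below the profile limit, and additionally whenever the normalized URL matches the new no-depth-limit pattern. A sketch with illustrative parameter names standing in for response.depth(), profile.depth() and the profile pattern:

import java.util.regex.Pattern;

// Sketch of the link-following decision changed above; names are illustrative.
public class DepthLimitSketch {

    static boolean followLinks(final int depth, final Integer maxDepth,
                               final Pattern noDepthLimitMatch, final String normalizedUrl) {
        if (maxDepth == null) return true;                          // no profile: nothing to enforce
        if (depth < maxDepth) return true;                          // still below the configured crawl depth
        return noDepthLimitMatch.matcher(normalizedUrl).matches();  // depth reached, but this path is exempted
    }

    public static void main(final String[] args) {
        final Pattern archiveOnly = Pattern.compile(".*/docs/archive/.*");
        System.out.println(followLinks(8, 8, archiveOnly, "http://www.example.org/docs/archive/a.html")); // true
        System.out.println(followLinks(8, 8, archiveOnly, "http://www.example.org/blog/post.html"));      // false
    }
}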
@@ -2415,12 +2406,17 @@ public final class Switchboard extends serverSwitch
public IndexingQueueEntry condenseDocument(final IndexingQueueEntry in) {
in.queueEntry.updateStatus(Response.QUEUE_STATE_CONDENSING);
if ( !in.queueEntry.profile().indexText() && !in.queueEntry.profile().indexMedia() ) {
if ( this.log.isInfo() ) {
this.log.logInfo("Not Condensed Resource '"
+ in.queueEntry.url().toNormalform(false, true)
+ "': indexing not wanted by crawl profile");
CrawlProfile profile = in.queueEntry.profile();
String urls = in.queueEntry.url().toNormalform(false, true);
// check profile attributes which prevent indexing (while crawling is allowed)
if (!profile.indexText() && !profile.indexMedia()) {
if (this.log.isInfo()) this.log.logInfo("Not Condensed Resource '" + urls + "': indexing of this media type not wanted by crawl profile");
return new IndexingQueueEntry(in.queueEntry, in.documents, null);
}
if (!profile.indexUrlMustMatchPattern().matcher(urls).matches() ||
profile.indexUrlMustNotMatchPattern().matcher(urls).matches() ) {
if (this.log.isInfo()) this.log.logInfo("Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url");
return new IndexingQueueEntry(in.queueEntry, in.documents, null);
}
@@ -2428,11 +2424,7 @@ public final class Switchboard extends serverSwitch
final List<Document> doclist = new ArrayList<Document>();
for ( final Document document : in.documents ) {
if ( document.indexingDenied() ) {
if ( this.log.isInfo() ) {
this.log.logInfo("Not Condensed Resource '"
+ in.queueEntry.url().toNormalform(false, true)
+ "': denied by document-attached noindexing rule");
}
if ( this.log.isInfo() ) this.log.logInfo("Not Condensed Resource '" + urls + "': denied by document-attached noindexing rule");
addURLtoErrorDB(
in.queueEntry.url(),
in.queueEntry.referrerHash(),
@@ -2459,7 +2451,6 @@ public final class Switchboard extends serverSwitch
// update image result list statistics
// its good to do this concurrently here, because it needs a DNS lookup
// to compute a URL hash which is necessary for a double-check
final CrawlProfile profile = in.queueEntry.profile();
ResultImages.registerImages(in.queueEntry.url(), in.documents[i], (profile == null)
? true
: !profile.remoteIndexing());
