diff --git a/htroot/CrawlStartExpert_p.html b/htroot/CrawlStartExpert_p.html
index 8d3514708..5baece971 100644
--- a/htroot/CrawlStartExpert_p.html
+++ b/htroot/CrawlStartExpert_p.html
@@ -92,7 +92,8 @@
diff --git a/htroot/CrawlStartExpert_p.java b/htroot/CrawlStartExpert_p.java
index 2ab83f865..8bde5e4e7 100644
--- a/htroot/CrawlStartExpert_p.java
+++ b/htroot/CrawlStartExpert_p.java
@@ -43,9 +43,12 @@ public class CrawlStartExpert_p {
prop.put("starturl", /*(intranet) ? repository :*/ "");
prop.put("proxyPrefetchDepth", env.getConfig("proxyPrefetchDepth", "0"));
prop.put("crawlingDepth", Math.min(3, env.getConfigLong("crawlingDepth", 0)));
+ prop.put("crawlingDepthExtension", CrawlProfile.MATCH_NEVER_STRING);
prop.put("directDocByURLChecked", sb.getConfigBool("crawlingDirectDocByURL", true) ? "1" : "0");
prop.put("mustmatch", /*(intranet) ? repository + ".*" :*/ CrawlProfile.MATCH_ALL_STRING);
prop.put("mustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
+ prop.put("indexmustmatch", CrawlProfile.MATCH_ALL_STRING);
+ prop.put("indexmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
prop.put("ipMustmatch", sb.getConfig("crawlingIPMustMatch", CrawlProfile.MATCH_ALL_STRING));
prop.put("ipMustnotmatch", sb.getConfig("crawlingIPMustNotMatch", CrawlProfile.MATCH_NEVER_STRING));
prop.put("countryMustMatch", sb.getConfig("crawlingCountryMustMatch", ""));
diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java
index 1399e254d..8c486ab07 100644
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -204,9 +204,15 @@ public class Crawler_p {
sb.setConfig("crawlingIPMustNotMatch", ipMustNotMatch);
if (countryMustMatch.length() > 0) sb.setConfig("crawlingCountryMustMatch", countryMustMatch);
+ String crawlerNoDepthLimitMatch = post.get("crawlingDepthExtension", CrawlProfile.MATCH_NEVER_STRING);
+ final String indexUrlMustMatch = post.get("indexmustmatch", CrawlProfile.MATCH_ALL_STRING);
+ final String indexUrlMustNotMatch = post.get("indexmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
+
final boolean crawlOrder = post.get("crawlOrder", "off").equals("on");
env.setConfig("crawlOrder", crawlOrder);
+ if (crawlOrder) crawlerNoDepthLimitMatch = CrawlProfile.MATCH_NEVER_STRING; // the crawl order does not work without a depth limit
+
int newcrawlingdepth = post.getInt("crawlingDepth", 8);
env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth));
if ((crawlOrder) && (newcrawlingdepth > 8)) newcrawlingdepth = 8;
@@ -347,9 +353,9 @@ public class Crawler_p {
ipMustMatch,
ipMustNotMatch,
countryMustMatch,
- CrawlProfile.MATCH_NEVER_STRING,
- CrawlProfile.MATCH_ALL_STRING,
- CrawlProfile.MATCH_NEVER_STRING,
+ crawlerNoDepthLimitMatch,
+ indexUrlMustMatch,
+ indexUrlMustNotMatch,
newcrawlingdepth,
directDocByURL,
crawlingIfOlder,
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index 38a527ed1..7bd1c994f 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -2284,28 +2284,14 @@ public final class Switchboard extends serverSwitch
final EventOrigin processCase = response.processCase(this.peers.mySeed().hash);
if ( this.log.isFine() ) {
- this.log.logFine("processResourceStack processCase="
- + processCase
- + ", depth="
- + response.depth()
- + ", maxDepth="
- + ((response.profile() == null) ? "null" : Integer.toString(response.profile().depth()))
- + ", must-match="
- + ((response.profile() == null) ? "null" : response
- .profile()
- .urlMustMatchPattern()
- .toString())
- + ", must-not-match="
- + ((response.profile() == null) ? "null" : response
- .profile()
- .urlMustNotMatchPattern()
- .toString())
- + ", initiatorHash="
- + ((response.initiator() == null) ? "null" : ASCII.String(response.initiator()))
- +
- //", responseHeader=" + ((entry.responseHeader() == null) ? "null" : entry.responseHeader().toString()) +
- ", url="
- + response.url()); // DEBUG
+ this.log.logFine(
+ "processResourceStack processCase=" + processCase
+ + ", depth=" + response.depth()
+ + ", maxDepth=" + ((response.profile() == null) ? "null" : Integer.toString(response.profile().depth()))
+ + ", must-match=" + ((response.profile() == null) ? "null" : response.profile().urlMustMatchPattern().toString())
+ + ", must-not-match=" + ((response.profile() == null) ? "null" : response.profile().urlMustNotMatchPattern().toString())
+ + ", initiatorHash=" + ((response.initiator() == null) ? "null" : ASCII.String(response.initiator()))
+ + ", url=" + response.url()); // DEBUG
}
// PARSE CONTENT
@@ -2353,8 +2339,13 @@ public final class Switchboard extends serverSwitch
// put anchors on crawl stack
final long stackStartTime = System.currentTimeMillis();
- if ( ((processCase == EventOrigin.PROXY_LOAD) || (processCase == EventOrigin.LOCAL_CRAWLING))
- && ((response.profile() == null) || (response.depth() < response.profile().depth())) ) {
+ if ((processCase == EventOrigin.PROXY_LOAD || processCase == EventOrigin.LOCAL_CRAWLING) &&
+ (
+ response.profile() == null ||
+ response.depth() < response.profile().depth() ||
+ response.profile().crawlerNoDepthLimitMatchPattern().matcher(response.url().toNormalform(false, false)).matches()
+ )
+ ) {
// get the hyperlinks
final Map hl = Document.getHyperlinks(documents);
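
The changed condition above reads: for proxy or local crawls, follow the document's links while the configured crawl depth has not been reached, or, beyond that depth, when the URL matches the profile's new no-depth-limit pattern. A small self-contained sketch of that decision follows; the method name and the example pattern are illustrative, not YaCy API.

import java.util.regex.Pattern;

public class DepthExtensionSketch {
    // follow links while under the depth limit, or when the URL matches the
    // no-depth-limit pattern (mirrors the condition changed above)
    static boolean followLinks(final int depth, final int maxDepth,
                               final Pattern noDepthLimitMatch, final String url) {
        return depth < maxDepth || noDepthLimitMatch.matcher(url).matches();
    }

    public static void main(final String[] args) {
        final Pattern noLimit = Pattern.compile(".*example\\.org/archive/.*"); // hypothetical pattern
        System.out.println(followLinks(2, 3, noLimit, "http://example.org/start.html"));        // true: within depth
        System.out.println(followLinks(9, 3, noLimit, "http://example.org/archive/1999.html")); // true: depth exceeded, pattern matches
        System.out.println(followLinks(9, 3, noLimit, "http://example.org/news.html"));         // false
    }
}
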
@@ -2415,24 +2406,25 @@ public final class Switchboard extends serverSwitch
public IndexingQueueEntry condenseDocument(final IndexingQueueEntry in) {
in.queueEntry.updateStatus(Response.QUEUE_STATE_CONDENSING);
- if ( !in.queueEntry.profile().indexText() && !in.queueEntry.profile().indexMedia() ) {
- if ( this.log.isInfo() ) {
- this.log.logInfo("Not Condensed Resource '"
- + in.queueEntry.url().toNormalform(false, true)
- + "': indexing not wanted by crawl profile");
- }
+ CrawlProfile profile = in.queueEntry.profile();
+ String urls = in.queueEntry.url().toNormalform(false, true);
+
+ // check profile attributes which prevent indexing (while crawling is allowed)
+ if (!profile.indexText() && !profile.indexMedia()) {
+ if (this.log.isInfo()) this.log.logInfo("Not Condensed Resource '" + urls + "': indexing of this media type not wanted by crawl profile");
return new IndexingQueueEntry(in.queueEntry, in.documents, null);
}
-
+ if (!profile.indexUrlMustMatchPattern().matcher(urls).matches() ||
+ profile.indexUrlMustNotMatchPattern().matcher(urls).matches() ) {
+ if (this.log.isInfo()) this.log.logInfo("Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url");
+ return new IndexingQueueEntry(in.queueEntry, in.documents, null);
+ }
+
// check which files may take part in the indexing process
final List<Document> doclist = new ArrayList<Document>();
for ( final Document document : in.documents ) {
if ( document.indexingDenied() ) {
- if ( this.log.isInfo() ) {
- this.log.logInfo("Not Condensed Resource '"
- + in.queueEntry.url().toNormalform(false, true)
- + "': denied by document-attached noindexing rule");
- }
+ if ( this.log.isInfo() ) this.log.logInfo("Not Condensed Resource '" + urls + "': denied by document-attached noindexing rule");
addURLtoErrorDB(
in.queueEntry.url(),
in.queueEntry.referrerHash(),
@@ -2459,7 +2451,6 @@ public final class Switchboard extends serverSwitch
// update image result list statistics
// its good to do this concurrently here, because it needs a DNS lookup
// to compute a URL hash which is necessary for a double-check
- final CrawlProfile profile = in.queueEntry.profile();
ResultImages.registerImages(in.queueEntry.url(), in.documents[i], (profile == null)
? true
: !profile.remoteIndexing());
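
Taken together, the patch separates the crawl filter (mustmatch/mustnotmatch, applied before a URL is loaded) from the new index filter (indexmustmatch/indexmustnotmatch, applied in condenseDocument before a loaded document is indexed), and adds a regex that lifts the depth limit for matching URLs. Below is a minimal sketch of the new index check, mirroring the code added in condenseDocument; the class, method, and example patterns are illustrative, not YaCy API.

import java.util.regex.Pattern;

public class IndexUrlFilterSketch {
    // a document is condensed and indexed only if its URL matches
    // indexmustmatch and does not match indexmustnotmatch
    static boolean indexAllowed(final Pattern indexUrlMustMatch,
                                final Pattern indexUrlMustNotMatch, final String url) {
        return indexUrlMustMatch.matcher(url).matches()
            && !indexUrlMustNotMatch.matcher(url).matches();
    }

    public static void main(final String[] args) {
        // hypothetical setup: crawl everything, but index only HTML pages outside /tmp/
        final Pattern mustMatch    = Pattern.compile(".*\\.html");
        final Pattern mustNotMatch = Pattern.compile(".*/tmp/.*");
        System.out.println(indexAllowed(mustMatch, mustNotMatch, "http://example.org/doc.html"));   // true
        System.out.println(indexAllowed(mustMatch, mustNotMatch, "http://example.org/tmp/x.html")); // false
        System.out.println(indexAllowed(mustMatch, mustNotMatch, "http://example.org/img.png"));    // false
    }
}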