From 60b1e23f0537ddb2e8299189a2dcc180978df223 Mon Sep 17 00:00:00 2001
From: orbiter
Date: Sun, 16 Sep 2012 21:27:55 +0200
Subject: [PATCH] added new crawl options:

- indexUrlMustMatch and indexUrlMustNotMatch, which can be used to select
  loaded pages for indexing. The default patterns are chosen so that all
  loaded pages are also indexed (as before), but when doing an expert crawl
  start the user may select only specific URLs to be indexed.
- crawlerNoDepthLimitMatch, a new pattern that can be used to remove the
  crawl depth limitation. This filter is a never-match by default (so the
  crawl depth limit applies), but the user can select paths that will be
  loaded completely even when the crawl depth is reached.
---
 htroot/CrawlStartExpert_p.html          | 27 ++++++++--
 htroot/CrawlStartExpert_p.java          |  3 ++
 htroot/Crawler_p.java                   | 12 +++--
 source/net/yacy/search/Switchboard.java | 65 +++++++++++--------------
 4 files changed, 64 insertions(+), 43 deletions(-)

diff --git a/htroot/CrawlStartExpert_p.html b/htroot/CrawlStartExpert_p.html
index 8d3514708..5baece971 100644
--- a/htroot/CrawlStartExpert_p.html
+++ b/htroot/CrawlStartExpert_p.html
@@ -92,7 +92,8 @@
       Crawling Depth:
-      also all linked non-parsable documents
+      also all linked non-parsable documents
+      Unlimited crawl depth for URLs matching with:
       This defines how often the Crawler will follow links (of links..) embedded in websites.
@@ -150,7 +151,7 @@
-      :
+      :
       Use filter
@@ -165,7 +166,7 @@
-      :
+      :
@@ -176,6 +177,26 @@
       If you don't know what this means, please leave this field empty.
+
+      :
+
+
+
+
+        The filter is a regular expression
+        that must match with the URLs to allow that the content of the url is indexed.
+
+
+
+      :
+
+
+
+
+        The filter is a regular expression
+        that must not match with the URLs to allow that the content of the url is indexed.
+
+
       :
diff --git a/htroot/CrawlStartExpert_p.java b/htroot/CrawlStartExpert_p.java
index 2ab83f865..8bde5e4e7 100644
--- a/htroot/CrawlStartExpert_p.java
+++ b/htroot/CrawlStartExpert_p.java
@@ -43,9 +43,12 @@ public class CrawlStartExpert_p {
         prop.put("starturl", /*(intranet) ? repository :*/ "");
         prop.put("proxyPrefetchDepth", env.getConfig("proxyPrefetchDepth", "0"));
         prop.put("crawlingDepth", Math.min(3, env.getConfigLong("crawlingDepth", 0)));
+        prop.put("crawlingDepthExtension", CrawlProfile.MATCH_NEVER_STRING);
         prop.put("directDocByURLChecked", sb.getConfigBool("crawlingDirectDocByURL", true) ? "1" : "0");
         prop.put("mustmatch", /*(intranet) ? repository + ".*" :*/ CrawlProfile.MATCH_ALL_STRING);
         prop.put("mustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
+        prop.put("indexmustmatch", CrawlProfile.MATCH_ALL_STRING);
+        prop.put("indexmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
         prop.put("ipMustmatch", sb.getConfig("crawlingIPMustMatch", CrawlProfile.MATCH_ALL_STRING));
         prop.put("ipMustnotmatch", sb.getConfig("crawlingIPMustNotMatch", CrawlProfile.MATCH_NEVER_STRING));
         prop.put("countryMustMatch", sb.getConfig("crawlingCountryMustMatch", ""));
diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java
index 1399e254d..8c486ab07 100644
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -204,9 +204,15 @@ public class Crawler_p {
             sb.setConfig("crawlingIPMustNotMatch", ipMustNotMatch);
             if (countryMustMatch.length() > 0) sb.setConfig("crawlingCountryMustMatch", countryMustMatch);
 
+            String crawlerNoDepthLimitMatch = post.get("crawlingDepthExtension", CrawlProfile.MATCH_NEVER_STRING);
+            final String indexUrlMustMatch = post.get("indexmustmatch", CrawlProfile.MATCH_ALL_STRING);
+            final String indexUrlMustNotMatch = post.get("indexmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
+
             final boolean crawlOrder = post.get("crawlOrder", "off").equals("on");
             env.setConfig("crawlOrder", crawlOrder);
 
+            if (crawlOrder) crawlerNoDepthLimitMatch = CrawlProfile.MATCH_NEVER_STRING; // without limitation the crawl order does not work
+
             int newcrawlingdepth = post.getInt("crawlingDepth", 8);
             env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth));
             if ((crawlOrder) && (newcrawlingdepth > 8)) newcrawlingdepth = 8;
@@ -347,9 +353,9 @@ public class Crawler_p {
                         ipMustMatch,
                         ipMustNotMatch,
                         countryMustMatch,
-                        CrawlProfile.MATCH_NEVER_STRING,
-                        CrawlProfile.MATCH_ALL_STRING,
-                        CrawlProfile.MATCH_NEVER_STRING,
+                        crawlerNoDepthLimitMatch,
+                        indexUrlMustMatch,
+                        indexUrlMustNotMatch,
                         newcrawlingdepth,
                         directDocByURL,
                         crawlingIfOlder,
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index 38a527ed1..7bd1c994f 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -2284,28 +2284,14 @@ public final class Switchboard extends serverSwitch
         final EventOrigin processCase = response.processCase(this.peers.mySeed().hash);
 
         if ( this.log.isFine() ) {
-            this.log.logFine("processResourceStack processCase="
-                + processCase
-                + ", depth="
-                + response.depth()
-                + ", maxDepth="
-                + ((response.profile() == null) ? "null" : Integer.toString(response.profile().depth()))
-                + ", must-match="
-                + ((response.profile() == null) ? "null" : response
-                    .profile()
-                    .urlMustMatchPattern()
-                    .toString())
-                + ", must-not-match="
-                + ((response.profile() == null) ? "null" : response
-                    .profile()
-                    .urlMustNotMatchPattern()
-                    .toString())
-                + ", initiatorHash="
-                + ((response.initiator() == null) ? "null" : ASCII.String(response.initiator()))
-                +
-                //", responseHeader=" + ((entry.responseHeader() == null) ? "null" : entry.responseHeader().toString()) +
-                ", url="
-                + response.url()); // DEBUG
+            this.log.logFine(
+                "processResourceStack processCase=" + processCase
+                + ", depth=" + response.depth()
+                + ", maxDepth=" + ((response.profile() == null) ? "null" : Integer.toString(response.profile().depth()))
+                + ", must-match=" + ((response.profile() == null) ? "null" : response.profile().urlMustMatchPattern().toString())
+                + ", must-not-match=" + ((response.profile() == null) ? "null" : response.profile().urlMustNotMatchPattern().toString())
+                + ", initiatorHash=" + ((response.initiator() == null) ? "null" : ASCII.String(response.initiator()))
+                + ", url=" + response.url()); // DEBUG
         }
 
         // PARSE CONTENT
@@ -2353,8 +2339,13 @@
 
         // put anchors on crawl stack
         final long stackStartTime = System.currentTimeMillis();
-        if ( ((processCase == EventOrigin.PROXY_LOAD) || (processCase == EventOrigin.LOCAL_CRAWLING))
-            && ((response.profile() == null) || (response.depth() < response.profile().depth())) ) {
+        if ((processCase == EventOrigin.PROXY_LOAD || processCase == EventOrigin.LOCAL_CRAWLING) &&
+            (
+                response.profile() == null ||
+                response.depth() < response.profile().depth() ||
+                response.profile().crawlerNoDepthLimitMatchPattern().matcher(response.url().toNormalform(false, false)).matches()
+            )
+            ) {
 
             // get the hyperlinks
             final Map hl = Document.getHyperlinks(documents);
@@ -2415,24 +2406,25 @@
     public IndexingQueueEntry condenseDocument(final IndexingQueueEntry in) {
         in.queueEntry.updateStatus(Response.QUEUE_STATE_CONDENSING);
 
-        if ( !in.queueEntry.profile().indexText() && !in.queueEntry.profile().indexMedia() ) {
-            if ( this.log.isInfo() ) {
-                this.log.logInfo("Not Condensed Resource '"
-                    + in.queueEntry.url().toNormalform(false, true)
-                    + "': indexing not wanted by crawl profile");
-            }
+        CrawlProfile profile = in.queueEntry.profile();
+        String urls = in.queueEntry.url().toNormalform(false, true);
+
+        // check profile attributes which prevent indexing (while crawling is allowed)
+        if (!profile.indexText() && !profile.indexMedia()) {
+            if (this.log.isInfo()) this.log.logInfo("Not Condensed Resource '" + urls + "': indexing of this media type not wanted by crawl profile");
             return new IndexingQueueEntry(in.queueEntry, in.documents, null);
         }
-
+        if (!profile.indexUrlMustMatchPattern().matcher(urls).matches() ||
+             profile.indexUrlMustNotMatchPattern().matcher(urls).matches() ) {
+            if (this.log.isInfo()) this.log.logInfo("Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url");
+            return new IndexingQueueEntry(in.queueEntry, in.documents, null);
+        }
+
         // check which files may take part in the indexing process
         final List doclist = new ArrayList();
         for ( final Document document : in.documents ) {
             if ( document.indexingDenied() ) {
-                if ( this.log.isInfo() ) {
-                    this.log.logInfo("Not Condensed Resource '"
-                        + in.queueEntry.url().toNormalform(false, true)
-                        + "': denied by document-attached noindexing rule");
-                }
+                if ( this.log.isInfo() ) this.log.logInfo("Not Condensed Resource '" + urls + "': denied by document-attached noindexing rule");
                 addURLtoErrorDB(
                     in.queueEntry.url(),
                     in.queueEntry.referrerHash(),
@@ -2459,7 +2451,6 @@
                     // update image result list statistics
                     // its good to do this concurrently here, because it needs a DNS lookup
                     // to compute a URL hash which is necessary for a double-check
-                    final CrawlProfile profile = in.queueEntry.profile();
                     ResultImages.registerImages(in.queueEntry.url(), in.documents[i], (profile == null)
                         ? true
                         : !profile.remoteIndexing());
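
To illustrate how the three new patterns are meant to interact, the following standalone Java sketch mirrors the two checks added above: the indexUrlMustMatch/indexUrlMustNotMatch test in condenseDocument() and the crawlerNoDepthLimitMatch override of the depth test before anchors are stacked. It is not part of the patch; the pattern strings below are illustrative stand-ins for CrawlProfile.MATCH_ALL_STRING and CrawlProfile.MATCH_NEVER_STRING, whose real values are defined in CrawlProfile.

    // Standalone illustration, not part of the patch. The pattern strings are
    // assumed stand-ins for CrawlProfile.MATCH_ALL_STRING / MATCH_NEVER_STRING.
    import java.util.regex.Pattern;

    public class CrawlFilterSketch {

        // assumed defaults: ".*" matches every URL, "(?!x)x" matches none
        static final Pattern indexUrlMustMatch        = Pattern.compile(".*");
        static final Pattern indexUrlMustNotMatch     = Pattern.compile("(?!x)x");
        static final Pattern crawlerNoDepthLimitMatch = Pattern.compile("(?!x)x");

        // mirrors the new check in condenseDocument(): a loaded page is only
        // passed on to indexing if it matches the must-match filter and does
        // not match the must-not-match filter
        static boolean mayIndex(String url) {
            return indexUrlMustMatch.matcher(url).matches()
                && !indexUrlMustNotMatch.matcher(url).matches();
        }

        // mirrors the extended test before anchors are stacked: links are
        // followed while the depth limit is not reached, or when the URL is
        // matched by the no-depth-limit pattern
        static boolean mayStackLinks(String url, int depth, int maxDepth) {
            return depth < maxDepth || crawlerNoDepthLimitMatch.matcher(url).matches();
        }

        public static void main(String[] args) {
            String url = "http://example.net/docs/page.html";
            System.out.println(mayIndex(url));            // true with the defaults
            System.out.println(mayStackLinks(url, 9, 8)); // false with the defaults
        }
    }

With these defaults every loaded page is still indexed and the depth limit stays in force, so behaviour is unchanged from before this commit; only an expert crawl start that supplies other expressions narrows indexing or extends the crawl beyond the configured depth.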