added new crawl options:

- indexUrlMustMatch and indexUrlMustNotMatch, which can be used to select
which loaded pages are indexed. The default patterns are chosen such that all
loaded pages are also indexed (as before), but when doing an expert crawl
start the user may select only specific URLs to be indexed (see the sketch
below).
- crawlerNoDepthLimitMatch, a new pattern that can be used to remove the
crawl depth limitation. This filter is a never-match by default (so the
crawl depth limit applies), but the user can select paths which will be
crawled completely even after the crawl depth limit is reached.
orbiter 13 years ago
parent 4987921d3d
commit 60b1e23f05
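A minimal sketch of the indexing decision these options introduce, using only java.util.regex; the class, field and method names below are illustrative, and the never-match value is an assumption rather than the actual CrawlProfile constant:

import java.util.regex.Pattern;

// Minimal sketch of the must-match / must-not-match indexing decision.
// Names are illustrative; only java.util.regex is assumed.
public class IndexFilterSketch {

    // defaults that reproduce the old behaviour: every loaded page is also indexed
    static final String MATCH_ALL_STRING = ".*";
    static final String MATCH_NEVER_STRING = "(?!.*)"; // never matches (the real constant may differ)

    private final Pattern indexUrlMustMatch;
    private final Pattern indexUrlMustNotMatch;

    public IndexFilterSketch(final String mustMatch, final String mustNotMatch) {
        this.indexUrlMustMatch = Pattern.compile(mustMatch);
        this.indexUrlMustNotMatch = Pattern.compile(mustNotMatch);
    }

    /** true if a page loaded from this URL should also be indexed */
    public boolean shouldIndex(final String url) {
        return this.indexUrlMustMatch.matcher(url).matches()
            && !this.indexUrlMustNotMatch.matcher(url).matches();
    }

    public static void main(final String[] args) {
        final IndexFilterSketch f = new IndexFilterSketch(".*/docs/.*", MATCH_NEVER_STRING);
        System.out.println(f.shouldIndex("http://www.example.org/docs/page.html")); // true
        System.out.println(f.shouldIndex("http://www.example.org/blog/post.html")); // false
    }
}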

@@ -92,7 +92,8 @@
<td>Crawling Depth:</td>
<td>
<input name="crawlingDepth" id="crawlingDepth" type="text" size="2" maxlength="2" value="#[crawlingDepth]#" />&nbsp;&nbsp;&nbsp;
<input type="checkbox" name="directDocByURL" id="directDocByURL" #(directDocByURLChecked)#::checked="checked"#(/directDocByURLChecked)# />also all linked non-parsable documents
<input type="checkbox" name="directDocByURL" id="directDocByURL" #(directDocByURLChecked)#::checked="checked"#(/directDocByURLChecked)# />also all linked non-parsable documents<br/>
Unlimited crawl depth for URLs matching with: <input name="crawlingDepthExtension" id="crawlingDepthExtension" type="text" size="30" maxlength="100" value="#[crawlingDepthExtension]#" />
</td>
<td>
This defines how often the Crawler will follow links (of links..) embedded in websites.
@@ -150,7 +151,7 @@
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td><label for="mustmatch">Must-Match Filter for URLs</label>:</td>
<td><label for="mustmatch">Must-Match Filter for URLs for crawling</label>:</td>
<td>
<input type="radio" name="range" id="rangeWide" value="wide" checked="checked" />Use filter&nbsp;&nbsp;
<input name="mustmatch" id="mustmatch" type="text" size="60" maxlength="100" value="#[mustmatch]#" /><br />
@@ -165,7 +166,7 @@
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td><label for="mustnotmatch">Must-Not-Match Filter for URLs</label>:</td>
<td><label for="mustnotmatch">Must-Not-Match Filter for URLs for crawling</label>:</td>
<td>
<input name="mustnotmatch" id="mustnotmatch" type="text" size="60" maxlength="100" value="#[mustnotmatch]#" />
</td>
@@ -176,6 +177,26 @@
If you don't know what this means, please leave this field empty.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td><label for="indexmustmatch">Must-Match Filter for URLs for indexing</label>:</td>
<td>
<input name="indexmustmatch" id="indexmustmatch" type="text" size="60" maxlength="100" value="#[indexmustmatch]#" /><br />
</td>
<td>
The filter is a <b><a href="http://download.oracle.com/javase/6/docs/api/java/util/regex/Pattern.html">regular expression</a></b>
that <b>must match</b> the URL to allow the content of the URL to be indexed.
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td><label for="indexmustnotmatch">Must-Not-Match Filter for URLs for indexing</label>:</td>
<td>
<input name="indexmustnotmatch" id="indexmustnotmatch" type="text" size="60" maxlength="100" value="#[indexmustnotmatch]#" />
</td>
<td>
The filter is a <b><a href="http://download.oracle.com/javase/6/docs/api/java/util/regex/Pattern.html">regular expression</a></b>
that <b>must not match</b> the URL to allow the content of the URL to be indexed.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td><label for="ipMustmatch">Must-Match Filter for IPs</label>:</td>
<td>

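To make the new form fields concrete: a purely illustrative expert crawl start could crawl a whole site but index only part of it. The field names below match the form inputs above; the regular expression values are only an example, not defaults:

import java.util.Map;

// Illustrative values for the new expert-crawl form fields; only the field
// names come from the form above, the regular expressions are an example.
public class ExampleCrawlStart {
    public static void main(final String[] args) {
        final Map<String, String> fields = Map.of(
            "mustmatch",              "https?://www\\.example\\.org/.*", // crawl the whole site
            "indexmustmatch",         ".*/docs/.*",                      // but index only pages under /docs/
            "indexmustnotmatch",      ".*/docs/drafts/.*",               // except drafts
            "crawlingDepthExtension", ".*/docs/archive/.*"               // no depth limit inside the archive
        );
        fields.forEach((name, value) -> System.out.println(name + " = " + value));
    }
}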
@@ -43,9 +43,12 @@ public class CrawlStartExpert_p {
prop.put("starturl", /*(intranet) ? repository :*/ "");
prop.put("proxyPrefetchDepth", env.getConfig("proxyPrefetchDepth", "0"));
prop.put("crawlingDepth", Math.min(3, env.getConfigLong("crawlingDepth", 0)));
prop.put("crawlingDepthExtension", CrawlProfile.MATCH_NEVER_STRING);
prop.put("directDocByURLChecked", sb.getConfigBool("crawlingDirectDocByURL", true) ? "1" : "0");
prop.put("mustmatch", /*(intranet) ? repository + ".*" :*/ CrawlProfile.MATCH_ALL_STRING);
prop.put("mustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
prop.put("indexmustmatch", CrawlProfile.MATCH_ALL_STRING);
prop.put("indexmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
prop.put("ipMustmatch", sb.getConfig("crawlingIPMustMatch", CrawlProfile.MATCH_ALL_STRING));
prop.put("ipMustnotmatch", sb.getConfig("crawlingIPMustNotMatch", CrawlProfile.MATCH_NEVER_STRING));
prop.put("countryMustMatch", sb.getConfig("crawlingCountryMustMatch", ""));

@@ -204,9 +204,15 @@ public class Crawler_p {
sb.setConfig("crawlingIPMustNotMatch", ipMustNotMatch);
if (countryMustMatch.length() > 0) sb.setConfig("crawlingCountryMustMatch", countryMustMatch);
String crawlerNoDepthLimitMatch = post.get("crawlingDepthExtension", CrawlProfile.MATCH_NEVER_STRING);
final String indexUrlMustMatch = post.get("indexmustmatch", CrawlProfile.MATCH_ALL_STRING);
final String indexUrlMustNotMatch = post.get("indexmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
final boolean crawlOrder = post.get("crawlOrder", "off").equals("on");
env.setConfig("crawlOrder", crawlOrder);
if (crawlOrder) crawlerNoDepthLimitMatch = CrawlProfile.MATCH_NEVER_STRING; // without limitation the crawl order does not work
int newcrawlingdepth = post.getInt("crawlingDepth", 8);
env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth));
if ((crawlOrder) && (newcrawlingdepth > 8)) newcrawlingdepth = 8;
@@ -347,9 +353,9 @@ public class Crawler_p {
ipMustMatch,
ipMustNotMatch,
countryMustMatch,
CrawlProfile.MATCH_NEVER_STRING,
CrawlProfile.MATCH_ALL_STRING,
CrawlProfile.MATCH_NEVER_STRING,
crawlerNoDepthLimitMatch,
indexUrlMustMatch,
indexUrlMustNotMatch,
newcrawlingdepth,
directDocByURL,
crawlingIfOlder,

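The call above hands the three new patterns to the profile as plain strings; further down, Switchboard uses them through crawlerNoDepthLimitMatchPattern(), indexUrlMustMatchPattern() and indexUrlMustNotMatchPattern(). A minimal sketch of such accessors, where only the method names are taken from those calls and the string storage plus lazy compilation are assumptions:

import java.util.regex.Pattern;

// Sketch of a profile exposing the three new patterns. Only the accessor names
// mirror the Switchboard calls below; storage and lazy compilation are assumed.
public class ProfilePatternsSketch {

    private final String crawlerNoDepthLimitMatch, indexUrlMustMatch, indexUrlMustNotMatch;
    private Pattern noDepthLimitPattern, indexMustMatchPattern, indexMustNotMatchPattern;

    public ProfilePatternsSketch(final String crawlerNoDepthLimitMatch,
                                 final String indexUrlMustMatch,
                                 final String indexUrlMustNotMatch) {
        this.crawlerNoDepthLimitMatch = crawlerNoDepthLimitMatch;
        this.indexUrlMustMatch = indexUrlMustMatch;
        this.indexUrlMustNotMatch = indexUrlMustNotMatch;
    }

    // compile lazily and cache, so each regular expression is parsed only once
    public synchronized Pattern crawlerNoDepthLimitMatchPattern() {
        if (this.noDepthLimitPattern == null) this.noDepthLimitPattern = Pattern.compile(this.crawlerNoDepthLimitMatch);
        return this.noDepthLimitPattern;
    }
    public synchronized Pattern indexUrlMustMatchPattern() {
        if (this.indexMustMatchPattern == null) this.indexMustMatchPattern = Pattern.compile(this.indexUrlMustMatch);
        return this.indexMustMatchPattern;
    }
    public synchronized Pattern indexUrlMustNotMatchPattern() {
        if (this.indexMustNotMatchPattern == null) this.indexMustNotMatchPattern = Pattern.compile(this.indexUrlMustNotMatch);
        return this.indexMustNotMatchPattern;
    }
}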
@@ -2284,28 +2284,14 @@ public final class Switchboard extends serverSwitch
final EventOrigin processCase = response.processCase(this.peers.mySeed().hash);
if ( this.log.isFine() ) {
this.log.logFine("processResourceStack processCase="
+ processCase
+ ", depth="
+ response.depth()
+ ", maxDepth="
+ ((response.profile() == null) ? "null" : Integer.toString(response.profile().depth()))
+ ", must-match="
+ ((response.profile() == null) ? "null" : response
.profile()
.urlMustMatchPattern()
.toString())
+ ", must-not-match="
+ ((response.profile() == null) ? "null" : response
.profile()
.urlMustNotMatchPattern()
.toString())
+ ", initiatorHash="
+ ((response.initiator() == null) ? "null" : ASCII.String(response.initiator()))
+
//", responseHeader=" + ((entry.responseHeader() == null) ? "null" : entry.responseHeader().toString()) +
", url="
+ response.url()); // DEBUG
this.log.logFine(
"processResourceStack processCase=" + processCase
+ ", depth=" + response.depth()
+ ", maxDepth=" + ((response.profile() == null) ? "null" : Integer.toString(response.profile().depth()))
+ ", must-match=" + ((response.profile() == null) ? "null" : response.profile().urlMustMatchPattern().toString())
+ ", must-not-match=" + ((response.profile() == null) ? "null" : response.profile().urlMustNotMatchPattern().toString())
+ ", initiatorHash=" + ((response.initiator() == null) ? "null" : ASCII.String(response.initiator()))
+ ", url=" + response.url()); // DEBUG
}
// PARSE CONTENT
@@ -2353,8 +2339,13 @@ public final class Switchboard extends serverSwitch
// put anchors on crawl stack
final long stackStartTime = System.currentTimeMillis();
if ( ((processCase == EventOrigin.PROXY_LOAD) || (processCase == EventOrigin.LOCAL_CRAWLING))
&& ((response.profile() == null) || (response.depth() < response.profile().depth())) ) {
if ((processCase == EventOrigin.PROXY_LOAD || processCase == EventOrigin.LOCAL_CRAWLING) &&
(
response.profile() == null ||
response.depth() < response.profile().depth() ||
response.profile().crawlerNoDepthLimitMatchPattern().matcher(response.url().toNormalform(false, false)).matches()
)
) {
// get the hyperlinks
final Map<MultiProtocolURI, String> hl = Document.getHyperlinks(documents);
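The changed condition above can be read as a small predicate: anchors are still stacked while the crawl depth is below the profile limit, and additionally whenever the normalized URL matches the new no-depth-limit pattern. A sketch with illustrative parameter names standing in for response.depth(), profile.depth() and the profile pattern:

import java.util.regex.Pattern;

// Sketch of the link-following decision changed above; names are illustrative.
public class DepthLimitSketch {

    static boolean followLinks(final int depth, final Integer maxDepth,
                               final Pattern noDepthLimitMatch, final String normalizedUrl) {
        if (maxDepth == null) return true;                          // no profile: nothing to enforce
        if (depth < maxDepth) return true;                          // still below the configured crawl depth
        return noDepthLimitMatch.matcher(normalizedUrl).matches();  // depth reached, but this path is exempted
    }

    public static void main(final String[] args) {
        final Pattern archiveOnly = Pattern.compile(".*/docs/archive/.*");
        System.out.println(followLinks(8, 8, archiveOnly, "http://www.example.org/docs/archive/a.html")); // true
        System.out.println(followLinks(8, 8, archiveOnly, "http://www.example.org/blog/post.html"));      // false
    }
}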
@@ -2415,12 +2406,17 @@ public final class Switchboard extends serverSwitch
public IndexingQueueEntry condenseDocument(final IndexingQueueEntry in) {
in.queueEntry.updateStatus(Response.QUEUE_STATE_CONDENSING);
if ( !in.queueEntry.profile().indexText() && !in.queueEntry.profile().indexMedia() ) {
if ( this.log.isInfo() ) {
this.log.logInfo("Not Condensed Resource '"
+ in.queueEntry.url().toNormalform(false, true)
+ "': indexing not wanted by crawl profile");
CrawlProfile profile = in.queueEntry.profile();
String urls = in.queueEntry.url().toNormalform(false, true);
// check profile attributes which prevent indexing (while crawling is allowed)
if (!profile.indexText() && !profile.indexMedia()) {
if (this.log.isInfo()) this.log.logInfo("Not Condensed Resource '" + urls + "': indexing of this media type not wanted by crawl profile");
return new IndexingQueueEntry(in.queueEntry, in.documents, null);
}
if (!profile.indexUrlMustMatchPattern().matcher(urls).matches() ||
profile.indexUrlMustNotMatchPattern().matcher(urls).matches() ) {
if (this.log.isInfo()) this.log.logInfo("Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url");
return new IndexingQueueEntry(in.queueEntry, in.documents, null);
}
@@ -2428,11 +2424,7 @@ public final class Switchboard extends serverSwitch
final List<Document> doclist = new ArrayList<Document>();
for ( final Document document : in.documents ) {
if ( document.indexingDenied() ) {
if ( this.log.isInfo() ) {
this.log.logInfo("Not Condensed Resource '"
+ in.queueEntry.url().toNormalform(false, true)
+ "': denied by document-attached noindexing rule");
}
if ( this.log.isInfo() ) this.log.logInfo("Not Condensed Resource '" + urls + "': denied by document-attached noindexing rule");
addURLtoErrorDB(
in.queueEntry.url(),
in.queueEntry.referrerHash(),
@@ -2459,7 +2451,6 @@ public final class Switchboard extends serverSwitch
// update image result list statistics
// its good to do this concurrently here, because it needs a DNS lookup
// to compute a URL hash which is necessary for a double-check
final CrawlProfile profile = in.queueEntry.profile();
ResultImages.registerImages(in.queueEntry.url(), in.documents[i], (profile == null)
? true
: !profile.remoteIndexing());
