New optional crawl filter on the URL a doc must match to crawl its links

For finer control over which parsed documents can trigger an addition of their links to the crawl stack, complementary to the existing crawl depth parameter.
6 years ago · 6b45cd5799
parent 8d3e029247
commit 6b45cd5799
6 changed files with 207 additions and 86 deletions
--- a/htroot/CrawlProfileEditor_p.xml
+++ b/htroot/CrawlProfileEditor_p.xml
@ -23,6 +23,8 @@
 		<crawlerAlwaysCheckMediaType>#(crawlerAlwaysCheckMediaType)#false::true#(/crawlerAlwaysCheckMediaType)#</crawlerAlwaysCheckMediaType>
 		<crawlerURLMustMatch>#[crawlerURLMustMatch]#</crawlerURLMustMatch>
 		<crawlerURLMustNotMatch>#[crawlerURLMustNotMatch]#</crawlerURLMustNotMatch>
+		<crawlerOriginURLMustMatch>#[crawlerOriginURLMustMatch]#</crawlerOriginURLMustMatch>
+		<crawlerOriginURLMustNotMatch>#[crawlerOriginURLMustNotMatch]#</crawlerOriginURLMustNotMatch>
 		<crawlerIPMustMatch>#[crawlerIPMustMatch]#</crawlerIPMustMatch>
 		<crawlerIPMustNotMatch>#[crawlerIPMustNotMatch]#</crawlerIPMustNotMatch>
 		<crawlerCountryMustMatch>#[crawlerCountryMustMatch]#</crawlerCountryMustMatch>
--- a/htroot/CrawlStartExpert.html
+++ b/htroot/CrawlStartExpert.html
@ -174,6 +174,7 @@

                // remove if MATCH_NEVER_STRING
                disableIf('mustnotmatch', defaultMatchNone);
+                disableIf('crawlerOriginURLMustNotMatch', defaultMatchNone);
                disableIf('ipMustnotmatch', defaultMatchNone);
                disableIf('indexmustnotmatch', defaultMatchNone);
                disableIf('indexcontentmustnotmatch', defaultMatchNone);
@ -183,6 +184,7 @@

                // remove if MATCH_ALL_STRING
                disableIf('mustmatch', defaultMatchAll);
+                disableIf('crawlerOriginURLMustMatch', defaultMatchAll);
                disableIf('ipMustmatch', defaultMatchAll);
                disableIf('indexmustmatch', defaultMatchAll);
                disableIf('indexcontentmustmatch', defaultMatchAll);
@ -354,6 +356,29 @@
            <tr><td><img src="env/grafics/minus.gif" alt=""> must-not-match</td><td><input name="mustnotmatch" id="mustnotmatch" type="text" size="55" maxlength="100000" value="#[mustnotmatch]#" /></td></tr>
 			</table>
 	        </dd>
+	        
+	        <dt>Load Filter on URL origin of links</dt>
+	        <dd>
+	        	<span class="info" style="float:right">
+	        		<img src="env/grafics/i16.gif" width="16" height="16" alt="info"/>
+	        		<span style="right:0px;">
+            			The filter is a <b><a href="https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html" target="_blank">regular expression</a></b>.
+            			Example: to allow loading only links from pages on example.org domain, set the must-match filter to '.*example.org.*'. 
+            			Attention: you can test the functionality of your regular expressions using the <a href="RegexTest.html">Regular Expression Tester</a> within YaCy.
+            		</span>
+            	</span>
+            	<table style="border-width: 0px">
+            		<tr>
+            			<td style="width:110px"><img src="env/grafics/plus.gif" alt=""> must-match</td>
+            			<td><input name="crawlerOriginURLMustMatch" id="crawlerOriginURLMustMatch" type="text" size="55" maxlength="100000" value="#[crawlerOriginURLMustMatch]#" onblur="if (this.value=='') this.value='.*';"/> (must not be empty)</td>
+            		</tr>
+		    		<tr>
+		    			<td><img src="env/grafics/minus.gif" alt=""> must-not-match</td>
+		    			<td><input name="crawlerOriginURLMustNotMatch" id="crawlerOriginURLMustNotMatch" type="text" size="55" maxlength="100000" value="#[crawlerOriginURLMustNotMatch]#" /></td>
+		    		</tr>
+				</table>
+	        </dd>
+	        
 	        <dt>Load Filter on IPs</dt>
 	        <dd>
            <table style="border-width: 0px">
--- a/htroot/CrawlStartExpert.java
+++ b/htroot/CrawlStartExpert.java
@ -248,6 +248,22 @@ public class CrawlStartExpert {
        } else {
            prop.put("mustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
        }
+        
+		// Filter on URL origin of links: must match
+		if (post != null && post.containsKey(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key)) {
+			prop.put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key,
+					post.get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING));
+		} else {
+			prop.put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING);
+		}
+
+		// Filter on URL origin of links: must-not-match
+		if (post != null && post.containsKey(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key)) {
+			prop.put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key,
+					post.get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING));
+		} else {
+			prop.put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING);
+		}

        // Load Filter on IPs: must match
        if (post != null && post.containsKey("ipMustmatch")) {
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@ -626,7 +626,11 @@ public class Crawler_p {
                            ignoreclassname,
                            new VocabularyScraper(vocabulary_scraper),
                            timezoneOffset);
-                    
+
+					profile.put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key,
+							post.get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING));
+					profile.put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key, post
+							.get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING));
 					profile.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key,
 							post.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING));
 					profile.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, post
--- a/source/net/yacy/crawler/data/CrawlProfile.java
+++ b/source/net/yacy/crawler/data/CrawlProfile.java
@ -99,6 +99,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
        CRAWLER_ALWAYS_CHECK_MEDIA_TYPE("crawlerAlwaysCheckMediaType", false, CrawlAttribute.BOOLEAN, "Always cross check file extension against actual Media Type"),
        CRAWLER_URL_MUSTMATCH        ("crawlerURLMustMatch",        false, CrawlAttribute.STRING,  "URL Must-Match Filter"),
        CRAWLER_URL_MUSTNOTMATCH     ("crawlerURLMustNotMatch",     false, CrawlAttribute.STRING,  "URL Must-Not-Match Filter"),
+        CRAWLER_ORIGIN_URL_MUSTMATCH ("crawlerOriginURLMustMatch",  false, CrawlAttribute.STRING,  "Links Origin URL Must-Match Filter"),
+        CRAWLER_ORIGIN_URL_MUSTNOTMATCH ("crawlerOriginURLMustNotMatch", false, CrawlAttribute.STRING, "Links Origin URL Must-Not-Match Filter"),
        CRAWLER_IP_MUSTMATCH         ("crawlerIPMustMatch",         false, CrawlAttribute.STRING,  "IP Must-Match Filter"),
        CRAWLER_IP_MUSTNOTMATCH      ("crawlerIPMustNotMatch",      false, CrawlAttribute.STRING,  "IP Must-Not-Match Filter"),
        CRAWLER_COUNTRY_MUSTMATCH    ("crawlerCountryMustMatch",    false, CrawlAttribute.STRING,  "Country Must-Match Filter"),
@ -148,6 +150,13 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
    
    
    private Pattern crawlerurlmustmatch = null, crawlerurlmustnotmatch = null;
+    
+    /** Pattern on the URL a document must match to allow adding its embedded links to the crawl stack */
+    private Pattern crawlerOriginUrlMustMatch = null;
+    
+    /** Pattern on the URL a document must not match to allow adding its embedded links to the crawl stack */
+    private Pattern crawlerOriginUrlMustNotMatch = null;
+    
    private Pattern crawleripmustmatch = null, crawleripmustnotmatch = null;
    private Pattern crawlernodepthlimitmatch = null;
    private Pattern indexurlmustmatch = null, indexurlmustnotmatch = null;
@ -243,6 +252,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
        put(CrawlAttribute.CRAWLER_ALWAYS_CHECK_MEDIA_TYPE.key, true);
        put(CrawlAttribute.CRAWLER_URL_MUSTMATCH.key,     (crawlerUrlMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : crawlerUrlMustMatch);
        put(CrawlAttribute.CRAWLER_URL_MUSTNOTMATCH.key,  (crawlerUrlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerUrlMustNotMatch);
+        put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING); // neutral default: links of any document may be stacked; must not inherit the target URL must-match filter
+        put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING); // neutral default: no origin document is excluded; callers may override both keys with put()
        put(CrawlAttribute.CRAWLER_IP_MUSTMATCH.key,      (crawlerIpMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : crawlerIpMustMatch);
        put(CrawlAttribute.CRAWLER_IP_MUSTNOTMATCH.key,   (crawlerIpMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerIpMustNotMatch);
        put(CrawlAttribute.CRAWLER_COUNTRY_MUSTMATCH.key, (crawlerCountryMustMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerCountryMustMatch);
@ -501,6 +512,50 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
        }
        return this.crawlerurlmustnotmatch;
    }
+    
+	/**
+	 * Get the pattern on the URL a document must match to allow adding its embedded links to the crawl stack
+	 * 
+	 * @return a {@link Pattern} instance, defaulting to
+	 *         {@link CrawlProfile#MATCH_ALL_PATTERN} when the regular expression
+	 *         string is not set or its syntax is incorrect
+	 */
+    public Pattern getCrawlerOriginUrlMustMatchPattern() {
+		if (this.crawlerOriginUrlMustMatch == null) {
+			/* Cache the compiled pattern for faster next calls */
+			final String patternStr = get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key);
+			try {
+				this.crawlerOriginUrlMustMatch = (patternStr == null
+						|| patternStr.equals(CrawlProfile.MATCH_ALL_STRING)) ? CrawlProfile.MATCH_ALL_PATTERN
+								: Pattern.compile(patternStr, Pattern.CASE_INSENSITIVE);
+			} catch (final PatternSyntaxException e) {
+				this.crawlerOriginUrlMustMatch = CrawlProfile.MATCH_ALL_PATTERN;
+			}
+		}
+        return this.crawlerOriginUrlMustMatch;
+    }
+    
+	/**
+	 * Get the pattern on the URL a document must not match to allow adding its embedded links to the crawl stack
+	 * 
+	 * @return a {@link Pattern} instance, defaulting to
+	 *         {@link CrawlProfile#MATCH_NEVER_PATTERN} when the regular expression
+	 *         string is not set or its syntax is incorrect
+	 */
+    public Pattern getCrawlerOriginUrlMustNotMatchPattern() {
+		if (this.crawlerOriginUrlMustNotMatch == null) {
+			/* Cache the compiled pattern for faster next calls */
+			final String patternStr = get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key);
+			try {
+				this.crawlerOriginUrlMustNotMatch = (patternStr == null
+						|| patternStr.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN
+								: Pattern.compile(patternStr, Pattern.CASE_INSENSITIVE);
+			} catch (final PatternSyntaxException e) {
+				this.crawlerOriginUrlMustNotMatch = CrawlProfile.MATCH_NEVER_PATTERN;
+			}
+		}
+        return this.crawlerOriginUrlMustNotMatch;
+    }

    /**
     * Gets the regex which must be matched by IPs in order to be crawled.
@ -926,6 +981,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
        prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlerAlwaysCheckMediaType", this.isCrawlerAlwaysCheckMediaType());
        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerURLMustMatch", this.get(CrawlAttribute.CRAWLER_URL_MUSTMATCH.key));
        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerURLMustNotMatch", this.get(CrawlAttribute.CRAWLER_URL_MUSTNOTMATCH.key));
+        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerOriginURLMustMatch", this.get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key));
+        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerOriginURLMustNotMatch", this.get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key));
        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerIPMustMatch", this.get(CrawlAttribute.CRAWLER_IP_MUSTMATCH.key));
        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerIPMustNotMatch", this.get(CrawlAttribute.CRAWLER_IP_MUSTNOTMATCH.key));
        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerCountryMustMatch", this.get(CrawlAttribute.CRAWLER_COUNTRY_MUSTMATCH.key));
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@ -3111,95 +3111,112 @@ public final class Switchboard extends serverSwitch {
            )
           ) {
            
-            for (Document d: documents) d.setDepth(response.depth());
-            
-            // get the hyperlinks
-            final Map<AnchorURL, String> hl = Document.getHyperlinks(documents, !response.profile().obeyHtmlRobotsNofollow());
-            
-			final boolean addAllLinksToCrawlStack = response.profile().isIndexNonParseableUrls() /* unsupported resources have to be indexed as pure links if no parser support them */
-					|| response.profile().isCrawlerAlwaysCheckMediaType() /* the crawler must always load resources to double-check the actual Media Type even on unsupported file extensions */;
-			
-			/* Handle media links */
-			
-			for (Map.Entry<DigestURL, String> entry : Document.getImagelinks(documents).entrySet()) {
-				if (addAllLinksToCrawlStack
-						|| (response.profile().indexMedia() && TextParser.supportsExtension(entry.getKey()) == null)) {
-					hl.put(new AnchorURL(entry.getKey()), entry.getValue());
-				}
-			}
-			
-			for (Map.Entry<DigestURL, String> entry : Document.getApplinks(documents).entrySet()) {
-				if (addAllLinksToCrawlStack
-						|| (response.profile().indexMedia() && TextParser.supportsExtension(entry.getKey()) == null)) {
-					hl.put(new AnchorURL(entry.getKey()), entry.getValue());
-				}
-			}
-			
-			for (Map.Entry<DigestURL, String> entry : Document.getVideolinks(documents).entrySet()) {
-				if (addAllLinksToCrawlStack
-						|| (response.profile().indexMedia() && TextParser.supportsExtension(entry.getKey()) == null)) {
-					hl.put(new AnchorURL(entry.getKey()), entry.getValue());
-				}
-			}
-			
-			for (Map.Entry<DigestURL, String> entry : Document.getAudiolinks(documents).entrySet()) {
-				if (addAllLinksToCrawlStack
-						|| (response.profile().indexMedia() && TextParser.supportsExtension(entry.getKey()) == null)) {
-					hl.put(new AnchorURL(entry.getKey()), entry.getValue());
-				}
-			}
-            
-            // insert those hyperlinks to the crawler
-            MultiProtocolURL nextUrl;
-            for ( final Map.Entry<AnchorURL, String> nextEntry : hl.entrySet() ) {
-                // check for interruption
-                checkInterruption();
-
-                // process the next hyperlink
-                nextUrl = nextEntry.getKey();
-                String u = nextUrl.toNormalform(true, true);
-                if ( !(u.startsWith("http://")
-                    || u.startsWith("https://")
-                    || u.startsWith("ftp://")
-                    || u.startsWith("smb://") || u.startsWith("file://")) ) {
-                    continue;
+			final Pattern crawlerOriginUrlMustMatch = response.profile().getCrawlerOriginUrlMustMatchPattern();
+			final Pattern crawlerOriginUrlMustNotMatch = response.profile().getCrawlerOriginUrlMustNotMatchPattern();
+			if (!(crawlerOriginUrlMustMatch == CrawlProfile.MATCH_ALL_PATTERN
+					|| crawlerOriginUrlMustMatch.matcher(response.url().toNormalform(true)).matches())
+					|| (crawlerOriginUrlMustNotMatch != CrawlProfile.MATCH_NEVER_PATTERN
+							&& crawlerOriginUrlMustNotMatch.matcher(response.url().toNormalform(true)).matches())) {
+				if (this.log.isInfo()) {
+					this.log.info("CRAWL: Ignored links from document at " + response.url().toNormalform(true)
+							+ " : prevented by regular expression on URL origin of links, "
+							+ CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH + " = " + crawlerOriginUrlMustMatch.pattern()
+							+ ", " + CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH + " = "
+							+ crawlerOriginUrlMustNotMatch.pattern());
+	            }
+			} else {
+                for (Document d: documents) {
+                	d.setDepth(response.depth());
                }
                
-                // rewrite the url
-                String u0 = LibraryProvider.urlRewriter.apply(u);
-                if (!u.equals(u0)) {
-                    log.info("REWRITE of url = \"" + u + "\" to \"" + u0 + "\"");
-                    u = u0;
-                }
-                //Matcher m = rewritePattern.matcher(u);
-                //if (m.matches()) u = m.replaceAll("");
+                // get the hyperlinks
+                final Map<AnchorURL, String> hl = Document.getHyperlinks(documents, !response.profile().obeyHtmlRobotsNofollow());
                
-                // enqueue the hyperlink into the pre-notice-url db
-                int nextdepth = nextEntry.getValue() != null && nextEntry.getValue().equals(Document.CANONICAL_MARKER) ? response.depth() : response.depth() + 1; // canonical documents are on the same depth
-                try {
-                    this.crawlStacker.enqueueEntry(new Request(
-                        response.initiator(),
-                        new DigestURL(u),
-                        response.url().hash(),
-                        nextEntry.getValue(),
-                        new Date(),
-                        response.profile().handle(),
-                        nextdepth,
-                        response.profile().timezoneOffset()));
-                } catch (final MalformedURLException e ) {
-                    ConcurrentLog.logException(e);
+    			final boolean addAllLinksToCrawlStack = response.profile().isIndexNonParseableUrls() /* unsupported resources have to be indexed as pure links if no parser support them */
+    					|| response.profile().isCrawlerAlwaysCheckMediaType() /* the crawler must always load resources to double-check the actual Media Type even on unsupported file extensions */;
+    			
+    			/* Handle media links */
+    			
+    			for (Map.Entry<DigestURL, String> entry : Document.getImagelinks(documents).entrySet()) {
+    				if (addAllLinksToCrawlStack
+    						|| (response.profile().indexMedia() && TextParser.supportsExtension(entry.getKey()) == null)) {
+    					hl.put(new AnchorURL(entry.getKey()), entry.getValue());
+    				}
+    			}
+    			
+    			for (Map.Entry<DigestURL, String> entry : Document.getApplinks(documents).entrySet()) {
+    				if (addAllLinksToCrawlStack
+    						|| (response.profile().indexMedia() && TextParser.supportsExtension(entry.getKey()) == null)) {
+    					hl.put(new AnchorURL(entry.getKey()), entry.getValue());
+    				}
+    			}
+    			
+    			for (Map.Entry<DigestURL, String> entry : Document.getVideolinks(documents).entrySet()) {
+    				if (addAllLinksToCrawlStack
+    						|| (response.profile().indexMedia() && TextParser.supportsExtension(entry.getKey()) == null)) {
+    					hl.put(new AnchorURL(entry.getKey()), entry.getValue());
+    				}
+    			}
+    			
+    			for (Map.Entry<DigestURL, String> entry : Document.getAudiolinks(documents).entrySet()) {
+    				if (addAllLinksToCrawlStack
+    						|| (response.profile().indexMedia() && TextParser.supportsExtension(entry.getKey()) == null)) {
+    					hl.put(new AnchorURL(entry.getKey()), entry.getValue());
+    				}
+    			}
+                
+                // insert those hyperlinks to the crawler
+                MultiProtocolURL nextUrl;
+                for ( final Map.Entry<AnchorURL, String> nextEntry : hl.entrySet() ) {
+                    // check for interruption
+                    checkInterruption();
+
+                    // process the next hyperlink
+                    nextUrl = nextEntry.getKey();
+                    String u = nextUrl.toNormalform(true, true);
+                    if ( !(u.startsWith("http://")
+                        || u.startsWith("https://")
+                        || u.startsWith("ftp://")
+                        || u.startsWith("smb://") || u.startsWith("file://")) ) {
+                        continue;
+                    }
+                    
+                    // rewrite the url
+                    String u0 = LibraryProvider.urlRewriter.apply(u);
+                    if (!u.equals(u0)) {
+                        log.info("REWRITE of url = \"" + u + "\" to \"" + u0 + "\"");
+                        u = u0;
+                    }
+                    //Matcher m = rewritePattern.matcher(u);
+                    //if (m.matches()) u = m.replaceAll("");
+                    
+                    // enqueue the hyperlink into the pre-notice-url db
+                    int nextdepth = nextEntry.getValue() != null && nextEntry.getValue().equals(Document.CANONICAL_MARKER) ? response.depth() : response.depth() + 1; // canonical documents are on the same depth
+                    try {
+                        this.crawlStacker.enqueueEntry(new Request(
+                            response.initiator(),
+                            new DigestURL(u),
+                            response.url().hash(),
+                            nextEntry.getValue(),
+                            new Date(),
+                            response.profile().handle(),
+                            nextdepth,
+                            response.profile().timezoneOffset()));
+                    } catch (final MalformedURLException e ) {
+                        ConcurrentLog.logException(e);
+                    }
                }
-            }
-            final long stackEndTime = System.currentTimeMillis();
-            if ( this.log.isInfo() ) {
-                this.log.info("CRAWL: ADDED "
-                    + hl.size()
-                    + " LINKS FROM "
-                    + response.url().toNormalform(true)
-                    + ", STACKING TIME = "
-                    + (stackEndTime - stackStartTime)
-                    + ", PARSING TIME = "
-                    + (parsingEndTime - parsingStartTime));
+                final long stackEndTime = System.currentTimeMillis();
+                if ( this.log.isInfo() ) {
+                    this.log.info("CRAWL: ADDED "
+                        + hl.size()
+                        + " LINKS FROM "
+                        + response.url().toNormalform(true)
+                        + ", STACKING TIME = "
+                        + (stackEndTime - stackStartTime)
+                        + ", PARSING TIME = "
+                        + (parsingEndTime - parsingStartTime));
+                }            	
            }
        }
        return documents;