From 6b45cd579922574059e5385153b84be3ca07533b Mon Sep 17 00:00:00 2001
From: luccioman
Date: Wed, 1 May 2019 08:54:19 +0200
Subject: [PATCH] New optional crawl filter on the URL a doc must match to
 crawl its links

This gives finer control over which parsed documents may add their links
to the crawl stack, complementing the existing crawl depth parameter.
---
 htroot/CrawlProfileEditor_p.xml             |   2 +
 htroot/CrawlStartExpert.html                |  25 +++
 htroot/CrawlStartExpert.java                |  16 ++
 htroot/Crawler_p.java                       |   6 +-
 .../net/yacy/crawler/data/CrawlProfile.java |  57 ++++++
 source/net/yacy/search/Switchboard.java     | 187 ++++++++++--------
 6 files changed, 207 insertions(+), 86 deletions(-)

diff --git a/htroot/CrawlProfileEditor_p.xml b/htroot/CrawlProfileEditor_p.xml
index 4ab2b2534..0b880ac3f 100644
--- a/htroot/CrawlProfileEditor_p.xml
+++ b/htroot/CrawlProfileEditor_p.xml
@@ -23,6 +23,8 @@
   <crawlerAlwaysCheckMediaType>#(crawlerAlwaysCheckMediaType)#false::true#(/crawlerAlwaysCheckMediaType)#</crawlerAlwaysCheckMediaType>
   <crawlerURLMustMatch>#[crawlerURLMustMatch]#</crawlerURLMustMatch>
   <crawlerURLMustNotMatch>#[crawlerURLMustNotMatch]#</crawlerURLMustNotMatch>
+  <crawlerOriginURLMustMatch>#[crawlerOriginURLMustMatch]#</crawlerOriginURLMustMatch>
+  <crawlerOriginURLMustNotMatch>#[crawlerOriginURLMustNotMatch]#</crawlerOriginURLMustNotMatch>
   <crawlerIPMustMatch>#[crawlerIPMustMatch]#</crawlerIPMustMatch>
   <crawlerIPMustNotMatch>#[crawlerIPMustNotMatch]#</crawlerIPMustNotMatch>
   <crawlerCountryMustMatch>#[crawlerCountryMustMatch]#</crawlerCountryMustMatch>
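Note (illustration, not part of the patch): the #[...]# markers above are
YaCy servlet template placeholders. They are substituted with the profile
attribute values written via prop.putXML() in CrawlProfile.putProfileEntry()
further down in this patch, roughly like this self-contained sketch:

    // Minimal sketch of placeholder substitution (hypothetical class, not
    // YaCy's actual template engine)
    public class TemplateDemo {
        public static void main(String[] args) {
            String template = "<crawlerOriginURLMustMatch>#[crawlerOriginURLMustMatch]#</crawlerOriginURLMustMatch>";
            String value = ".*example\\.org.*"; // value set via prop.putXML(...)
            System.out.println(template.replace("#[crawlerOriginURLMustMatch]#", value));
        }
    }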
diff --git a/htroot/CrawlStartExpert.html b/htroot/CrawlStartExpert.html
index 909b45680..7c644ac11 100644
--- a/htroot/CrawlStartExpert.html
+++ b/htroot/CrawlStartExpert.html
@@ -174,6 +174,7 @@
 
 			// remove if MATCH_NEVER_STRING
 			disableIf('mustnotmatch', defaultMatchNone);
+			disableIf('crawlerOriginURLMustNotMatch', defaultMatchNone);
 			disableIf('ipMustnotmatch', defaultMatchNone);
 			disableIf('indexmustnotmatch', defaultMatchNone);
 			disableIf('indexcontentmustnotmatch', defaultMatchNone);
@@ -183,6 +184,7 @@
 
 			// remove if MATCH_ALL_STRING
 			disableIf('mustmatch', defaultMatchAll);
+			disableIf('crawlerOriginURLMustMatch', defaultMatchAll);
 			disableIf('ipMustmatch', defaultMatchAll);
 			disableIf('indexmustmatch', defaultMatchAll);
 			disableIf('indexcontentmustmatch', defaultMatchAll);
@@ -354,6 +356,29 @@
 					must-not-match
 				</tr>
 			</table>
 		</fieldset>
+		<fieldset>
+			<legend>Load Filter on URL origin of links</legend>
+			<table>
+				<tr>
+					<td colspan="2">
+						<img src="env/grafik/i16.gif" width="16" height="16" alt="info"/>
+						The filter is a regular expression.
+						Example: to allow loading only links from pages on the example.org domain, set the must-match filter to '.*example.org.*'.
+						Attention: you can test the functionality of your regular expressions using the Regular Expression Tester within YaCy.
+					</td>
+				</tr>
+				<tr>
+					<td>must-match</td>
+					<td><input name="crawlerOriginURLMustMatch" id="crawlerOriginURLMustMatch" type="text" value="#[crawlerOriginURLMustMatch]#"/> (must not be empty)</td>
+				</tr>
+				<tr>
+					<td>must-not-match</td>
+					<td><input name="crawlerOriginURLMustNotMatch" id="crawlerOriginURLMustNotMatch" type="text" value="#[crawlerOriginURLMustNotMatch]#"/></td>
+				</tr>
+			</table>
+		</fieldset>
 		<fieldset>
 			<legend>Load Filter on IPs</legend>
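Note (illustration, not part of the patch): as the CrawlProfile changes below
show, these filters are compiled with Pattern.CASE_INSENSITIVE and applied
with Matcher.matches(), which must cover the whole normalized URL. That is why
the help text's example filter is wrapped in '.*'. A self-contained demo:

    import java.util.regex.Pattern;

    public class OriginFilterDemo {
        public static void main(String[] args) {
            // Same flag as the patch: filters are compiled case-insensitively.
            Pattern mustMatch = Pattern.compile(".*example.org.*", Pattern.CASE_INSENSITIVE);

            // matches() must cover the WHOLE normalized URL, hence the
            // leading and trailing '.*' in the example above.
            System.out.println(mustMatch.matcher("https://www.EXAMPLE.org/page.html").matches()); // true
            System.out.println(mustMatch.matcher("https://other.net/").matches());                // false

            // An unescaped '.' also matches e.g. 'exampleXorg'; use
            // '.*example\\.org.*' for a stricter filter.
        }
    }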
diff --git a/htroot/CrawlStartExpert.java b/htroot/CrawlStartExpert.java
index 54cc6b234..d73b70c43 100644
--- a/htroot/CrawlStartExpert.java
+++ b/htroot/CrawlStartExpert.java
@@ -248,6 +248,22 @@ public class CrawlStartExpert {
         } else {
             prop.put("mustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
         }
+
+        // Filter on URL origin of links: must-match
+        if (post != null && post.containsKey(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key)) {
+            prop.put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key,
+                    post.get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING));
+        } else {
+            prop.put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING);
+        }
+
+        // Filter on URL origin of links: must-not-match
+        if (post != null && post.containsKey(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key)) {
+            prop.put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key,
+                    post.get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING));
+        } else {
+            prop.put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING);
+        }
 
         // Load Filter on IPs: must match
         if (post != null && post.containsKey("ipMustmatch")) {
diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java
index 343ecfed5..0e5bd8e74 100644
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -626,7 +626,11 @@ public class Crawler_p {
                     ignoreclassname,
                     new VocabularyScraper(vocabulary_scraper),
                     timezoneOffset);
-            
+
+            profile.put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key,
+                    post.get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING));
+            profile.put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key, post
+                    .get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING));
             profile.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key,
                     post.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING));
             profile.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, post
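Note (illustration, not part of the patch): in both servlets the filter
defaults to match-all / match-never when the form does not submit the field,
so the new filter is inactive unless explicitly set. A minimal sketch of that
"value or default" lookup, using a hypothetical helper in place of YaCy's
serverObjects (and assuming MATCH_ALL_STRING is the match-all regex ".*"):

    import java.util.HashMap;
    import java.util.Map;

    class FormDefaults {
        // Mirrors the post.get(key, dflt) fallback the servlet code relies on.
        static String getOrDefault(Map<String, String> post, String key, String dflt) {
            String v = (post == null) ? null : post.get(key);
            return (v == null || v.isEmpty()) ? dflt : v;
        }

        public static void main(String[] args) {
            Map<String, String> post = new HashMap<>(); // form did not submit the field
            // Falls back to match-all, leaving the new filter inactive.
            System.out.println(getOrDefault(post, "crawlerOriginURLMustMatch", ".*"));
        }
    }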
diff --git a/source/net/yacy/crawler/data/CrawlProfile.java b/source/net/yacy/crawler/data/CrawlProfile.java
index ded1b764b..ae264b6ed 100644
--- a/source/net/yacy/crawler/data/CrawlProfile.java
+++ b/source/net/yacy/crawler/data/CrawlProfile.java
@@ -99,6 +99,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String>
         CRAWLER_ALWAYS_CHECK_MEDIA_TYPE("crawlerAlwaysCheckMediaType", false, CrawlAttribute.BOOLEAN, "Always cross check file extension against actual Media Type"),
         CRAWLER_URL_MUSTMATCH           ("crawlerURLMustMatch", false, CrawlAttribute.STRING, "URL Must-Match Filter"),
         CRAWLER_URL_MUSTNOTMATCH        ("crawlerURLMustNotMatch", false, CrawlAttribute.STRING, "URL Must-Not-Match Filter"),
+        CRAWLER_ORIGIN_URL_MUSTMATCH    ("crawlerOriginURLMustMatch", false, CrawlAttribute.STRING, "Links Origin URL Must-Match Filter"),
+        CRAWLER_ORIGIN_URL_MUSTNOTMATCH ("crawlerOriginURLMustNotMatch", false, CrawlAttribute.STRING, "Links Origin URL Must-Not-Match Filter"),
         CRAWLER_IP_MUSTMATCH            ("crawlerIPMustMatch", false, CrawlAttribute.STRING, "IP Must-Match Filter"),
         CRAWLER_IP_MUSTNOTMATCH         ("crawlerIPMustNotMatch", false, CrawlAttribute.STRING, "IP Must-Not-Match Filter"),
         CRAWLER_COUNTRY_MUSTMATCH       ("crawlerCountryMustMatch", false, CrawlAttribute.STRING, "Country Must-Match Filter"),
@@ -148,6 +150,13 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String>
 
     private Pattern crawlerurlmustmatch = null, crawlerurlmustnotmatch = null;
 
+
+    /** Pattern on the URL a document must match to allow adding its embedded links to the crawl stack */
+    private Pattern crawlerOriginUrlMustMatch = null;
+
+    /** Pattern on the URL a document must not match to allow adding its embedded links to the crawl stack */
+    private Pattern crawlerOriginUrlMustNotMatch = null;
+
     private Pattern crawleripmustmatch = null, crawleripmustnotmatch = null;
     private Pattern crawlernodepthlimitmatch = null;
     private Pattern indexurlmustmatch = null, indexurlmustnotmatch = null;
@@ -243,6 +252,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String>
         put(CrawlAttribute.CRAWLER_ALWAYS_CHECK_MEDIA_TYPE.key, true);
         put(CrawlAttribute.CRAWLER_URL_MUSTMATCH.key, (crawlerUrlMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : crawlerUrlMustMatch);
         put(CrawlAttribute.CRAWLER_URL_MUSTNOTMATCH.key, (crawlerUrlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerUrlMustNotMatch);
+        put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING);
+        put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING);
         put(CrawlAttribute.CRAWLER_IP_MUSTMATCH.key, (crawlerIpMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : crawlerIpMustMatch);
         put(CrawlAttribute.CRAWLER_IP_MUSTNOTMATCH.key, (crawlerIpMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerIpMustNotMatch);
         put(CrawlAttribute.CRAWLER_COUNTRY_MUSTMATCH.key, (crawlerCountryMustMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerCountryMustMatch);
@@ -501,6 +512,50 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String>
         }
         return this.crawlerurlmustnotmatch;
     }
+
+    /**
+     * Get the pattern on the URL a document must match to allow adding its embedded links to the crawl stack
+     *
+     * @return a {@link Pattern} instance, defaulting to
+     *         {@link CrawlProfile#MATCH_ALL_PATTERN} when the regular expression
+     *         string is not set or its syntax is incorrect
+     */
+    public Pattern getCrawlerOriginUrlMustMatchPattern() {
+        if (this.crawlerOriginUrlMustMatch == null) {
+            /* Cache the compiled pattern for faster next calls */
+            final String patternStr = get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key);
+            try {
+                this.crawlerOriginUrlMustMatch = (patternStr == null
+                        || patternStr.equals(CrawlProfile.MATCH_ALL_STRING)) ? CrawlProfile.MATCH_ALL_PATTERN
+                                : Pattern.compile(patternStr, Pattern.CASE_INSENSITIVE);
+            } catch (final PatternSyntaxException e) {
+                this.crawlerOriginUrlMustMatch = CrawlProfile.MATCH_ALL_PATTERN;
+            }
+        }
+        return this.crawlerOriginUrlMustMatch;
+    }
+
+    /**
+     * Get the pattern on the URL a document must not match to allow adding its embedded links to the crawl stack
+     *
+     * @return a {@link Pattern} instance, defaulting to
+     *         {@link CrawlProfile#MATCH_NEVER_PATTERN} when the regular expression
+     *         string is not set or its syntax is incorrect
+     */
+    public Pattern getCrawlerOriginUrlMustNotMatchPattern() {
+        if (this.crawlerOriginUrlMustNotMatch == null) {
+            /* Cache the compiled pattern for faster next calls */
+            final String patternStr = get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key);
+            try {
+                this.crawlerOriginUrlMustNotMatch = (patternStr == null
+                        || patternStr.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN
+                                : Pattern.compile(patternStr, Pattern.CASE_INSENSITIVE);
+            } catch (final PatternSyntaxException e) {
+                this.crawlerOriginUrlMustNotMatch = CrawlProfile.MATCH_NEVER_PATTERN;
+            }
+        }
+        return this.crawlerOriginUrlMustNotMatch;
+    }
 
     /**
      * Gets the regex which must be matched by IPs in order to be crawled.
@@ -926,6 +981,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String>
         prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlerAlwaysCheckMediaType", this.isCrawlerAlwaysCheckMediaType());
         prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerURLMustMatch", this.get(CrawlAttribute.CRAWLER_URL_MUSTMATCH.key));
         prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerURLMustNotMatch", this.get(CrawlAttribute.CRAWLER_URL_MUSTNOTMATCH.key));
+        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerOriginURLMustMatch", this.get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key));
+        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerOriginURLMustNotMatch", this.get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key));
         prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerIPMustMatch", this.get(CrawlAttribute.CRAWLER_IP_MUSTMATCH.key));
         prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerIPMustNotMatch", this.get(CrawlAttribute.CRAWLER_IP_MUSTNOTMATCH.key));
         prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerCountryMustMatch", this.get(CrawlAttribute.CRAWLER_COUNTRY_MUSTMATCH.key));
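Note (illustration, not part of the patch): the two getters above follow a
lazy compile-and-cache idiom with safe fallbacks: the must-match getter fails
open (match-all) and the must-not-match getter fails closed (match-never), so
a syntactically broken filter never silently blocks a crawl. A generic,
self-contained sketch of the same idiom (hypothetical class):

    import java.util.regex.Pattern;
    import java.util.regex.PatternSyntaxException;

    final class LazyFilter {
        private final String regex;
        private final Pattern fallback;
        private volatile Pattern compiled;

        LazyFilter(String regex, Pattern fallback) {
            this.regex = regex;
            this.fallback = fallback;
        }

        Pattern pattern() {
            Pattern p = this.compiled;
            if (p == null) {
                try {
                    // Compile once, case-insensitively, then cache.
                    p = (this.regex == null) ? this.fallback
                            : Pattern.compile(this.regex, Pattern.CASE_INSENSITIVE);
                } catch (final PatternSyntaxException e) {
                    p = this.fallback; // keep crawling instead of failing
                }
                this.compiled = p;
            }
            return p;
        }
    }

The sketch uses a volatile field; the patch itself relies on a benign race,
which is harmless here because recompilation is idempotent.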
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index f436b6e66..218f7b8f5 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -3111,95 +3111,112 @@ public final class Switchboard extends serverSwitch {
                 )
             ) {
 
-            for (Document d: documents) d.setDepth(response.depth());
-
-            // get the hyperlinks
-            final Map<AnchorURL, String> hl = Document.getHyperlinks(documents, !response.profile().obeyHtmlRobotsNofollow());
-
-            final boolean addAllLinksToCrawlStack = response.profile().isIndexNonParseableUrls() /* unsupported resources have to be indexed as pure links if no parser support them */
-                    || response.profile().isCrawlerAlwaysCheckMediaType() /* the crawler must always load resources to double-check the actual Media Type even on unsupported file extensions */;
-
-            /* Handle media links */
-
-            for (Map.Entry<DigestURL, String> entry : Document.getImagelinks(documents).entrySet()) {
-                if (addAllLinksToCrawlStack
-                        || (response.profile().indexMedia() && TextParser.supportsExtension(entry.getKey()) == null)) {
-                    hl.put(new AnchorURL(entry.getKey()), entry.getValue());
-                }
-            }
-
-            for (Map.Entry<DigestURL, String> entry : Document.getApplinks(documents).entrySet()) {
-                if (addAllLinksToCrawlStack
-                        || (response.profile().indexMedia() && TextParser.supportsExtension(entry.getKey()) == null)) {
-                    hl.put(new AnchorURL(entry.getKey()), entry.getValue());
-                }
-            }
-
-            for (Map.Entry<DigestURL, String> entry : Document.getVideolinks(documents).entrySet()) {
-                if (addAllLinksToCrawlStack
-                        || (response.profile().indexMedia() && TextParser.supportsExtension(entry.getKey()) == null)) {
-                    hl.put(new AnchorURL(entry.getKey()), entry.getValue());
-                }
-            }
-
-            for (Map.Entry<DigestURL, String> entry : Document.getAudiolinks(documents).entrySet()) {
-                if (addAllLinksToCrawlStack
-                        || (response.profile().indexMedia() && TextParser.supportsExtension(entry.getKey()) == null)) {
-                    hl.put(new AnchorURL(entry.getKey()), entry.getValue());
-                }
-            }
-
-            // insert those hyperlinks to the crawler
-            MultiProtocolURL nextUrl;
-            for ( final Map.Entry<AnchorURL, String> nextEntry : hl.entrySet() ) {
-                // check for interruption
-                checkInterruption();
-
-                // process the next hyperlink
-                nextUrl = nextEntry.getKey();
-                String u = nextUrl.toNormalform(true, true);
-                if ( !(u.startsWith("http://")
-                        || u.startsWith("https://")
-                        || u.startsWith("ftp://")
-                        || u.startsWith("smb://") || u.startsWith("file://")) ) {
-                    continue;
-                }
-
-                // rewrite the url
-                String u0 = LibraryProvider.urlRewriter.apply(u);
-                if (!u.equals(u0)) {
-                    log.info("REWRITE of url = \"" + u + "\" to \"" + u0 + "\"");
-                    u = u0;
-                }
-                //Matcher m = rewritePattern.matcher(u);
-                //if (m.matches()) u = m.replaceAll("");
-
-                // enqueue the hyperlink into the pre-notice-url db
-                int nextdepth = nextEntry.getValue() != null && nextEntry.getValue().equals(Document.CANONICAL_MARKER) ? response.depth() : response.depth() + 1; // canonical documents are on the same depth
-                try {
-                    this.crawlStacker.enqueueEntry(new Request(
-                            response.initiator(),
-                            new DigestURL(u),
-                            response.url().hash(),
-                            nextEntry.getValue(),
-                            new Date(),
-                            response.profile().handle(),
-                            nextdepth,
-                            response.profile().timezoneOffset()));
-                } catch (final MalformedURLException e ) {
-                    ConcurrentLog.logException(e);
-                }
-            }
-            final long stackEndTime = System.currentTimeMillis();
-            if ( this.log.isInfo() ) {
-                this.log.info("CRAWL: ADDED "
-                        + hl.size()
-                        + " LINKS FROM "
-                        + response.url().toNormalform(true)
-                        + ", STACKING TIME = "
-                        + (stackEndTime - stackStartTime)
-                        + ", PARSING TIME = "
-                        + (parsingEndTime - parsingStartTime));
-            }
+            final Pattern crawlerOriginUrlMustMatch = response.profile().getCrawlerOriginUrlMustMatchPattern();
+            final Pattern crawlerOriginUrlMustNotMatch = response.profile().getCrawlerOriginUrlMustNotMatchPattern();
+            if (!(crawlerOriginUrlMustMatch == CrawlProfile.MATCH_ALL_PATTERN
+                    || crawlerOriginUrlMustMatch.matcher(response.url().toNormalform(true)).matches())
+                    || (crawlerOriginUrlMustNotMatch != CrawlProfile.MATCH_NEVER_PATTERN
+                            && crawlerOriginUrlMustNotMatch.matcher(response.url().toNormalform(true)).matches())) {
+                if (this.log.isInfo()) {
+                    this.log.info("CRAWL: Ignored links from document at " + response.url().toNormalform(true)
+                            + " : prevented by regular expression on URL origin of links, "
+                            + CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH + " = " + crawlerOriginUrlMustMatch.pattern()
+                            + ", " + CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH + " = "
+                            + crawlerOriginUrlMustNotMatch.pattern());
+                }
+            } else {
+                for (Document d: documents) {
+                    d.setDepth(response.depth());
+                }
+
+                // get the hyperlinks
+                final Map<AnchorURL, String> hl = Document.getHyperlinks(documents, !response.profile().obeyHtmlRobotsNofollow());
+
+                final boolean addAllLinksToCrawlStack = response.profile().isIndexNonParseableUrls() /* unsupported resources have to be indexed as pure links if no parser support them */
+                        || response.profile().isCrawlerAlwaysCheckMediaType() /* the crawler must always load resources to double-check the actual Media Type even on unsupported file extensions */;
+
+                /* Handle media links */
+
+                for (Map.Entry<DigestURL, String> entry : Document.getImagelinks(documents).entrySet()) {
+                    if (addAllLinksToCrawlStack
+                            || (response.profile().indexMedia() && TextParser.supportsExtension(entry.getKey()) == null)) {
+                        hl.put(new AnchorURL(entry.getKey()), entry.getValue());
+                    }
+                }
+
+                for (Map.Entry<DigestURL, String> entry : Document.getApplinks(documents).entrySet()) {
+                    if (addAllLinksToCrawlStack
+                            || (response.profile().indexMedia() && TextParser.supportsExtension(entry.getKey()) == null)) {
+                        hl.put(new AnchorURL(entry.getKey()), entry.getValue());
+                    }
+                }
+
+                for (Map.Entry<DigestURL, String> entry : Document.getVideolinks(documents).entrySet()) {
+                    if (addAllLinksToCrawlStack
+                            || (response.profile().indexMedia() && TextParser.supportsExtension(entry.getKey()) == null)) {
+                        hl.put(new AnchorURL(entry.getKey()), entry.getValue());
+                    }
+                }
+
+                for (Map.Entry<DigestURL, String> entry : Document.getAudiolinks(documents).entrySet()) {
+                    if (addAllLinksToCrawlStack
+                            || (response.profile().indexMedia() && TextParser.supportsExtension(entry.getKey()) == null)) {
+                        hl.put(new AnchorURL(entry.getKey()), entry.getValue());
+                    }
+                }
+
+                // insert those hyperlinks to the crawler
+                MultiProtocolURL nextUrl;
+                for ( final Map.Entry<AnchorURL, String> nextEntry : hl.entrySet() ) {
+                    // check for interruption
+                    checkInterruption();
+
+                    // process the next hyperlink
+                    nextUrl = nextEntry.getKey();
+                    String u = nextUrl.toNormalform(true, true);
+                    if ( !(u.startsWith("http://")
+                            || u.startsWith("https://")
+                            || u.startsWith("ftp://")
+                            || u.startsWith("smb://") || u.startsWith("file://")) ) {
+                        continue;
+                    }
+
+                    // rewrite the url
+                    String u0 = LibraryProvider.urlRewriter.apply(u);
+                    if (!u.equals(u0)) {
+                        log.info("REWRITE of url = \"" + u + "\" to \"" + u0 + "\"");
+                        u = u0;
+                    }
+                    //Matcher m = rewritePattern.matcher(u);
+                    //if (m.matches()) u = m.replaceAll("");
+
+                    // enqueue the hyperlink into the pre-notice-url db
+                    int nextdepth = nextEntry.getValue() != null && nextEntry.getValue().equals(Document.CANONICAL_MARKER) ? response.depth() : response.depth() + 1; // canonical documents are on the same depth
+                    try {
+                        this.crawlStacker.enqueueEntry(new Request(
+                                response.initiator(),
+                                new DigestURL(u),
+                                response.url().hash(),
+                                nextEntry.getValue(),
+                                new Date(),
+                                response.profile().handle(),
+                                nextdepth,
+                                response.profile().timezoneOffset()));
+                    } catch (final MalformedURLException e ) {
+                        ConcurrentLog.logException(e);
+                    }
+                }
+
+                final long stackEndTime = System.currentTimeMillis();
+                if ( this.log.isInfo() ) {
+                    this.log.info("CRAWL: ADDED "
+                            + hl.size()
+                            + " LINKS FROM "
+                            + response.url().toNormalform(true)
+                            + ", STACKING TIME = "
+                            + (stackEndTime - stackStartTime)
+                            + ", PARSING TIME = "
+                            + (parsingEndTime - parsingStartTime));
+                }
+            }
         }
         return documents;
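Note (illustration, not part of the patch): the gating condition added to
Switchboard compares against the shared default patterns by identity, so no
regex is evaluated at all when the filters are left at their defaults. A
standalone sketch of the same logic (MATCH_NEVER here is a stand-in for
YaCy's constant):

    import java.util.regex.Pattern;

    public class OriginGateDemo {
        static final Pattern MATCH_ALL = Pattern.compile(".*");
        static final Pattern MATCH_NEVER = Pattern.compile(""); // stand-in constant

        // Links of a parsed document are stacked only when the document's own
        // URL passes BOTH origin filters; identity checks skip regex work.
        static boolean ignoreLinks(String docUrl, Pattern mustMatch, Pattern mustNotMatch) {
            return !(mustMatch == MATCH_ALL || mustMatch.matcher(docUrl).matches())
                    || (mustNotMatch != MATCH_NEVER && mustNotMatch.matcher(docUrl).matches());
        }

        public static void main(String[] args) {
            Pattern mustMatch = Pattern.compile(".*example\\.org.*", Pattern.CASE_INSENSITIVE);
            System.out.println(ignoreLinks("https://example.org/a", mustMatch, MATCH_NEVER)); // false: links are stacked
            System.out.println(ignoreLinks("https://other.net/b", mustMatch, MATCH_NEVER));   // true: links are ignored
        }
    }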