From 2de159719b4612dde3e86c96bf66ccd52093b6e6 Mon Sep 17 00:00:00 2001
From: Michael Peter Christen
Date: Fri, 18 Jul 2014 12:43:01 +0200
Subject: [PATCH] added an option to set 'obey nofollow' for links with
 rel="nofollow" attribute in the <a> tag for each crawl. This introduces a
 lot of changes because it extends the usage of the AnchorURL object type,
 which now also has a different toString method than the underlying
 DigestURL.toString. It is therefore not advised to use .toString on URLs
 at all; just use toNormalform(false) instead.
---
 defaults/yacy.init                             |  1 +
 htroot/BlacklistTest_p.java                    |  4 +-
 htroot/CrawlStartExpert.html                   |  7 ++--
 htroot/CrawlStartExpert.java                   | 13 +++----
 htroot/Crawler_p.java                          |  6 ++-
 htroot/QuickCrawlLink_p.java                   |  4 +-
 htroot/ViewFile.java                           |  2 +-
 htroot/api/getpageinfo.java                    |  6 +--
 htroot/api/getpageinfo_p.java                  |  2 +-
 .../net/yacy/cora/document/id/AnchorURL.java   | 12 ++++++
 .../cora/document/id/MultiProtocolURL.java     | 39 ++++++++++---------
 .../responsewriter/HTMLResponseWriter.java     |  2 +-
 source/net/yacy/crawler/CrawlStacker.java      |  6 +--
 source/net/yacy/crawler/CrawlSwitchboard.java  | 18 ++++-----
 source/net/yacy/crawler/data/Cache.java        |  6 +--
 .../net/yacy/crawler/data/CrawlProfile.java    | 11 +++++-
 source/net/yacy/crawler/data/CrawlQueues.java  |  4 +-
 .../yacy/crawler/retrieval/HTTPLoader.java     |  2 +-
 .../net/yacy/crawler/retrieval/Request.java    |  2 +-
 .../crawler/retrieval/SitemapImporter.java     |  2 +-
 source/net/yacy/data/BookmarkHelper.java       |  2 +-
 .../net/yacy/data/ymark/YMarkCrawlStart.java   |  3 +-
 source/net/yacy/document/Document.java         | 18 ++++++---
 .../document/parser/html/ContentScraper.java   |  4 +-
 .../yacy/document/parser/html/ImageEntry.java  |  2 +-
 .../document/parser/rdfa/impl/RDFaParser.java  |  2 +-
 .../yacy/document/parser/sitemapParser.java    |  2 +-
 .../net/yacy/document/parser/vcfParser.java    |  2 +-
 .../net/yacy/repository/LoaderDispatcher.java  |  4 +-
 source/net/yacy/search/Switchboard.java        | 39 ++++++++++---------
 .../net/yacy/search/index/DocumentIndex.java   |  2 +-
 source/net/yacy/search/index/Segment.java      |  2 +-
 source/net/yacy/search/query/QueryParams.java  | 14 ++++---
 source/net/yacy/search/query/SearchEvent.java  |  2 +-
 .../schema/CollectionConfiguration.java        | 16 +++++---
 .../net/yacy/search/snippet/MediaSnippet.java  |  4 +-
 .../net/yacy/search/snippet/ResultEntry.java   |  1 -
 .../yacy/server/http/HTTPDProxyHandler.java    |  6 +--
 .../yacy/search/snippet/TextSnippetTest.java   |  2 +-
 39 files changed, 158 insertions(+), 118 deletions(-)

diff --git a/defaults/yacy.init b/defaults/yacy.init
index d79b2684c..f79fcfbc7 100644
--- a/defaults/yacy.init
+++ b/defaults/yacy.init
@@ -550,6 +550,7 @@ crawlingFilter=.*
 crawlingQ=true
 followFrames=true
 obeyHtmlRobotsNoindex=true
+obeyHtmlRobotsNofollow=false
 storeHTCache=true
 storeTXCache=true

diff --git a/htroot/BlacklistTest_p.java b/htroot/BlacklistTest_p.java
index 429a7de40..7eb96d272 100644
--- a/htroot/BlacklistTest_p.java
+++ b/htroot/BlacklistTest_p.java
@@ -62,8 +62,8 @@ public class BlacklistTest_p {
             testurl = null;
         }
         if(testurl != null) {
-            prop.putHTML("url",testurl.toString());
-            prop.putHTML("testlist_url",testurl.toString());
+            prop.putHTML("url",testurl.toNormalform(false));
+            prop.putHTML("testlist_url",testurl.toNormalform(false));
             boolean isblocked = false;

             if (Switchboard.urlBlacklist.isListed(BlacklistType.CRAWLER, testurl)) {
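The BlacklistTest_p change above is the recurring pattern of this patch: anywhere a URL is rendered as text, toString() is replaced by toNormalform(false). A minimal sketch of why, assuming the patched YaCy classes on the classpath (the URL and printed output are illustrative):

    import net.yacy.cora.document.id.AnchorURL;

    public class ToStringVsNormalform {
        public static void main(final String[] args) throws Exception {
            final AnchorURL link = new AnchorURL("http://example.com/page.html");

            // toNormalform(false) yields the plain, normalized URL string --
            // safe for logs, map keys and stored records
            System.out.println(link.toNormalform(false));

            // after this patch, AnchorURL.toString() renders a whole <a> tag
            // (see the AnchorURL.java hunk below), so it must not be used
            // where a bare URL string is expected
            System.out.println(link);
        }
    }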
diff --git a/htroot/CrawlStartExpert.html b/htroot/CrawlStartExpert.html
index f38e18b78..227e978ad 100644
--- a/htroot/CrawlStartExpert.html
+++ b/htroot/CrawlStartExpert.html
@@ -298,9 +298,10 @@
             is accessed with URLs containing question marks. If you are unsure, do not check this to avoid crawl loops.
             Following frames is NOT done by Gxxg1e, but we do by default to have a richer content.
             'nofollow' in robots metadata can be overridden; this does not affect obeying of the robots.txt which is never ignored.
-        Accept URLs with query-part ('?'):
-        Obey html-robots-noindex:
+        Accept URLs with query-part ('?'):
+        Obey html-robots-noindex:
+        Obey html-robots-nofollow:
         Load Filter on URLs
         info
diff --git a/htroot/CrawlStartExpert.java b/htroot/CrawlStartExpert.java
index 587d1b810..b3f2a6640 100644
--- a/htroot/CrawlStartExpert.java
+++ b/htroot/CrawlStartExpert.java
@@ -192,16 +192,15 @@ public class CrawlStartExpert {
         }

         // Accept URLs with query-part?
-        // Obey html-robots-noindex?
+        // Obey html-robots-noindex, nofollow?
         if (post == null) {
-            prop.put("crawlingQChecked",
-                    env.getConfigBool("crawlingQ", true) ? 1 : 0);
-            prop.put("obeyHtmlRobotsNoindexChecked",
-                    env.getConfigBool("obeyHtmlRobotsNoindex", true) ? 1 : 0);
+            prop.put("crawlingQChecked", env.getConfigBool("crawlingQ", true) ? 1 : 0);
+            prop.put("obeyHtmlRobotsNoindexChecked", env.getConfigBool("obeyHtmlRobotsNoindex", true) ? 1 : 0);
+            prop.put("obeyHtmlRobotsNofollowChecked", env.getConfigBool("obeyHtmlRobotsNofollow", true) ? 1 : 0);
         } else {
             prop.put("crawlingQChecked", post.getBoolean("crawlingQ") ? 1 : 0);
-            prop.put("obeyHtmlRobotsNoindexChecked",
-                    post.getBoolean("obeyHtmlRobotsNoindex") ? 1 : 0);
+            prop.put("obeyHtmlRobotsNoindexChecked", post.getBoolean("obeyHtmlRobotsNoindex") ? 1 : 0);
+            prop.put("obeyHtmlRobotsNofollowChecked", post.getBoolean("obeyHtmlRobotsNofollow") ? 1 : 0);
         }

         // Load Filter on URLs (range)
diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java
index 7e1ac3828..c83c4aba0 100644
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -310,6 +310,9 @@ public class Crawler_p {

         boolean obeyHtmlRobotsNoindex = "on".equals(post.get("obeyHtmlRobotsNoindex", "false"));
         env.setConfig("obeyHtmlRobotsNoindex", obeyHtmlRobotsNoindex);
+
+        boolean obeyHtmlRobotsNofollow = "on".equals(post.get("obeyHtmlRobotsNofollow", "false"));
+        env.setConfig("obeyHtmlRobotsNofollow", obeyHtmlRobotsNofollow);

         final boolean indexText = "on".equals(post.get("indexText", "false"));
         env.setConfig("indexText", indexText);
@@ -444,7 +447,8 @@ public class Crawler_p {
                             directDocByURL,
                             crawlingIfOlder,
                             crawlingDomMaxPages,
-                            crawlingQ, followFrames, obeyHtmlRobotsNoindex,
+                            crawlingQ, followFrames,
+                            obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
                             indexText,
                             indexMedia,
                             storeHTCache,
diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java
index d79613767..46fef42fc 100644
--- a/htroot/QuickCrawlLink_p.java
+++ b/htroot/QuickCrawlLink_p.java
@@ -101,6 +101,7 @@ public class QuickCrawlLink_p {
         final boolean crawlingQ = post.get("crawlingQ", "").equals("on");
         final boolean followFrames = post.get("followFrames", "").equals("on");
         final boolean obeyHtmlRobotsNoindex = post.get("obeyHtmlRobotsNoindex", "").equals("on");
+        final boolean obeyHtmlRobotsNofollow = post.get("obeyHtmlRobotsNofollow", "").equals("on");
         final boolean indexText = post.get("indexText", "off").equals("on");
         final boolean indexMedia = post.get("indexMedia", "off").equals("on");
         final boolean storeHTCache = post.get("storeHTCache", "").equals("on");
@@ -147,7 +148,8 @@ public class QuickCrawlLink_p {
                 true,
                 60 * 24 * 30, // recrawlIfOlder (minutes); here: one month
                 -1, // domMaxPages, if negative: no count restriction
-                crawlingQ, followFrames, obeyHtmlRobotsNoindex,
+                crawlingQ, followFrames,
+                obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
                 indexText, indexMedia,
                 storeHTCache, remoteIndexing,
                 CacheStrategy.IFFRESH,
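All three servlets read the new flag the same way; the idiom rests on HTML checkbox semantics (a checked box posts the value "on", an unchecked box posts nothing, so the fallback default decides). A condensed restatement with a plain map standing in for YaCy's serverObjects:

    import java.util.HashMap;
    import java.util.Map;

    public class CheckboxSemanticsDemo {
        public static void main(final String[] args) {
            final Map<String, String> post = new HashMap<>(); // stand-in for serverObjects
            // an unchecked box sends no parameter at all; a checked box sends "on"
            post.put("obeyHtmlRobotsNofollow", "on");

            // same pattern as Crawler_p: a missing key falls back to "false"
            final boolean obeyHtmlRobotsNofollow =
                    "on".equals(post.getOrDefault("obeyHtmlRobotsNofollow", "false"));
            System.out.println(obeyHtmlRobotsNofollow); // true
        }
    }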
prop.put("viewMode_identifier", document.dc_identifier()); - prop.put("viewMode_source", url.toString()); + prop.put("viewMode_source", url.toNormalform(false)); prop.put("viewMode_lat", document.lat()); prop.put("viewMode_lon", document.lon()); prop.put("viewMode_parsedText", markup(wordArray, content).replaceAll("\n", "
").replaceAll("\t", "    ")); diff --git a/htroot/api/getpageinfo.java b/htroot/api/getpageinfo.java index 3c1ec6ac5..bd5c9e7e7 100644 --- a/htroot/api/getpageinfo.java +++ b/htroot/api/getpageinfo.java @@ -168,10 +168,8 @@ public class getpageinfo { } if (actions.indexOf("oai",0) >= 0) { try { - final DigestURL theURL = new DigestURL(url - + "?verb=Identify"); - - final String oairesult = checkOAI(theURL.toString()); + final DigestURL theURL = new DigestURL(url + "?verb=Identify"); + final String oairesult = checkOAI(theURL.toNormalform(false)); prop.put("oai", oairesult == "" ? 0 : 1); diff --git a/htroot/api/getpageinfo_p.java b/htroot/api/getpageinfo_p.java index f280f87a0..1b7418ff3 100644 --- a/htroot/api/getpageinfo_p.java +++ b/htroot/api/getpageinfo_p.java @@ -173,7 +173,7 @@ public class getpageinfo_p { final DigestURL theURL = new DigestURL(url + "?verb=Identify"); - final String oairesult = checkOAI(theURL.toString()); + final String oairesult = checkOAI(theURL.toNormalform(false)); prop.put("oai", oairesult == "" ? 0 : 1); diff --git a/source/net/yacy/cora/document/id/AnchorURL.java b/source/net/yacy/cora/document/id/AnchorURL.java index 21fb4dd3d..fc05853b7 100644 --- a/source/net/yacy/cora/document/id/AnchorURL.java +++ b/source/net/yacy/cora/document/id/AnchorURL.java @@ -127,4 +127,16 @@ public class AnchorURL extends DigestURL { return tagopts; } + public boolean attachedNofollow() { + return this.relProperty.indexOf("nofollow") >= 0; + } + + @Override + public String toString() { + return "
0 ? (" name=\"" + this.nameProperty + "\"") : "") + + (this.relProperty.length() > 0 ? (" rel=\"" + this.relProperty + "\"") : "") + + ">" + this.textProperty + ""; + } + } diff --git a/source/net/yacy/cora/document/id/MultiProtocolURL.java b/source/net/yacy/cora/document/id/MultiProtocolURL.java index ed3f45596..f4fe6126b 100644 --- a/source/net/yacy/cora/document/id/MultiProtocolURL.java +++ b/source/net/yacy/cora/document/id/MultiProtocolURL.java @@ -857,6 +857,7 @@ public class MultiProtocolURL implements Serializable, Comparablere-crawl url\n"); writer.write("

" + title + "

\n"); diff --git a/source/net/yacy/crawler/CrawlStacker.java b/source/net/yacy/crawler/CrawlStacker.java index e0e6a30b8..c81a0dd07 100644 --- a/source/net/yacy/crawler/CrawlStacker.java +++ b/source/net/yacy/crawler/CrawlStacker.java @@ -395,7 +395,7 @@ public final class CrawlStacker { return null; // no evidence that we know that url } final boolean recrawl = profile.recrawlIfOlder() > oldDate.longValue(); - final String urlstring = url.toString(); + final String urlstring = url.toNormalform(false); if (recrawl) { if (CrawlStacker.log.isInfo()) CrawlStacker.log.info("RE-CRAWL of URL '" + urlstring + "': this url was crawled " + @@ -409,7 +409,7 @@ public final class CrawlStacker { if (maxAllowedPagesPerDomain < Integer.MAX_VALUE && maxAllowedPagesPerDomain > 0) { final AtomicInteger dp = profile.getCount(url.getHost()); if (dp != null && dp.get() >= maxAllowedPagesPerDomain) { - if (CrawlStacker.log.isFine()) CrawlStacker.log.fine("URL '" + url.toString() + "' appeared too often in crawl stack, a maximum of " + maxAllowedPagesPerDomain + " is allowed."); + if (CrawlStacker.log.isFine()) CrawlStacker.log.fine("URL '" + url.toNormalform(false) + "' appeared too often in crawl stack, a maximum of " + maxAllowedPagesPerDomain + " is allowed."); return "crawl stack domain counter exceeded (test by profile)"; } @@ -435,7 +435,7 @@ public final class CrawlStacker { // check if the protocol is supported final String urlProtocol = url.getProtocol(); - final String urlstring = url.toString(); + final String urlstring = url.toNormalform(true); if (!Switchboard.getSwitchboard().loader.isSupportedProtocol(urlProtocol)) { CrawlStacker.log.severe("Unsupported protocol in URL '" + urlstring + "'."); return "unsupported protocol"; diff --git a/source/net/yacy/crawler/CrawlSwitchboard.java b/source/net/yacy/crawler/CrawlSwitchboard.java index bfcb399e1..69bf338ea 100644 --- a/source/net/yacy/crawler/CrawlSwitchboard.java +++ b/source/net/yacy/crawler/CrawlSwitchboard.java @@ -288,7 +288,7 @@ public final class CrawlSwitchboard { true, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, - false, true, true, + false, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow, sb.getConfigBool(SwitchboardConstants.PROXY_INDEXING_LOCAL_TEXT, true), sb.getConfigBool(SwitchboardConstants.PROXY_INDEXING_LOCAL_MEDIA, true), true, @@ -317,7 +317,7 @@ public final class CrawlSwitchboard { false, -1, -1, - true, true, true, + true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow, true, true, false, @@ -346,7 +346,7 @@ public final class CrawlSwitchboard { false, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, - true, true, true, + true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow, false, false, true, @@ -375,7 +375,7 @@ public final class CrawlSwitchboard { false, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, - true, true, true, + true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow, true, true, true, @@ -405,7 +405,7 @@ public final class CrawlSwitchboard { false, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_GREEDY_LEARNING_TEXT_RECRAWL_CYCLE), -1, - true, true, true, + true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow, false, false, true, @@ -434,7 +434,7 @@ public final class CrawlSwitchboard { false, 
diff --git a/source/net/yacy/crawler/CrawlSwitchboard.java b/source/net/yacy/crawler/CrawlSwitchboard.java
index bfcb399e1..69bf338ea 100644
--- a/source/net/yacy/crawler/CrawlSwitchboard.java
+++ b/source/net/yacy/crawler/CrawlSwitchboard.java
@@ -288,7 +288,7 @@ public final class CrawlSwitchboard {
                 true,
                 CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE),
                 -1,
-                false, true, true,
+                false, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
                 sb.getConfigBool(SwitchboardConstants.PROXY_INDEXING_LOCAL_TEXT, true),
                 sb.getConfigBool(SwitchboardConstants.PROXY_INDEXING_LOCAL_MEDIA, true),
                 true,
@@ -317,7 +317,7 @@
                 false,
                 -1,
                 -1,
-                true, true, true,
+                true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
                 true,
                 true,
                 false,
@@ -346,7 +346,7 @@
                 false,
                 CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE),
                 -1,
-                true, true, true,
+                true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
                 false,
                 false,
                 true,
@@ -375,7 +375,7 @@
                 false,
                 CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE),
                 -1,
-                true, true, true,
+                true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
                 true,
                 true,
                 true,
@@ -405,7 +405,7 @@
                 false,
                 CrawlProfile.getRecrawlDate(CRAWL_PROFILE_GREEDY_LEARNING_TEXT_RECRAWL_CYCLE),
                 -1,
-                true, true, true,
+                true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
                 false,
                 false,
                 true,
@@ -434,7 +434,7 @@
                 false,
                 CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE),
                 -1,
-                true, true, true,
+                true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
                 false,
                 false,
                 true,
@@ -463,7 +463,7 @@
                 false,
                 CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE),
                 -1,
-                true, true, true,
+                true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
                 false,
                 true,
                 true,
@@ -492,7 +492,7 @@
                 false,
                 CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE),
                 -1,
-                true, true, false,
+                true, true, false, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
                 true,
                 false,
                 false,
@@ -524,7 +524,7 @@
                 false,
                 System.currentTimeMillis(),
                 -1,
-                true, true, false,
+                true, true, false, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
                 true,
                 true,
                 false,
diff --git a/source/net/yacy/crawler/data/Cache.java b/source/net/yacy/crawler/data/Cache.java
index 9973f08a0..683b3b17d 100644
--- a/source/net/yacy/crawler/data/Cache.java
+++ b/source/net/yacy/crawler/data/Cache.java
@@ -201,9 +201,9 @@ public final class Cache {
     public static void store(final DigestURL url, final ResponseHeader responseHeader, final byte[] file) throws IOException {
         if (maxCacheSize == 0) return;
-        if (responseHeader == null) throw new IOException("Cache.store of url " + url.toString() + " not possible: responseHeader == null");
-        if (file == null) throw new IOException("Cache.store of url " + url.toString() + " not possible: file == null");
-        log.info("storing content of url " + url.toString() + ", " + file.length + " bytes");
+        if (responseHeader == null) throw new IOException("Cache.store of url " + url.toNormalform(false) + " not possible: responseHeader == null");
+        if (file == null) throw new IOException("Cache.store of url " + url.toNormalform(false) + " not possible: file == null");
+        log.info("storing content of url " + url.toNormalform(false) + ", " + file.length + " bytes");

         // store the file
         try {
diff --git a/source/net/yacy/crawler/data/CrawlProfile.java b/source/net/yacy/crawler/data/CrawlProfile.java
index 8af014fbe..98c979758 100644
--- a/source/net/yacy/crawler/data/CrawlProfile.java
+++ b/source/net/yacy/crawler/data/CrawlProfile.java
@@ -69,6 +69,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String> {
     public static final String CRAWLING_Q = "crawlingQ";
     public static final String FOLLOW_FRAMES = "followFrames";
     public static final String OBEY_HTML_ROBOTS_NOINDEX = "obeyHtmlRobotsNoindex";
+    public static final String OBEY_HTML_ROBOTS_NOFOLLOW = "obeyHtmlRobotsNofollow";
     public static final String INDEX_TEXT = "indexText";
     public static final String INDEX_MEDIA = "indexMedia";
     public static final String STORE_HTCACHE = "storeHTCache";
@@ -135,7 +136,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String> {
                  final boolean directDocByURL,
                  final long recrawlIfOlder /*date*/,
                  final int domMaxPages,
-                 final boolean crawlingQ, final boolean followFrames, final boolean obeyHtmlRobotsNoindex,
+                 final boolean crawlingQ, final boolean followFrames,
+                 final boolean obeyHtmlRobotsNoindex, final boolean obeyHtmlRobotsNofollow,
                  final boolean indexText,
                  final boolean indexMedia,
                  final boolean storeHTCache,
@@ -170,6 +172,7 @@
         put(CRAWLING_Q, crawlingQ); // crawling of urls with '?'
         put(FOLLOW_FRAMES, followFrames); // load pages contained in frames or ifames
         put(OBEY_HTML_ROBOTS_NOINDEX, obeyHtmlRobotsNoindex); // if false, then a meta robots tag containing 'noindex' is ignored
+        put(OBEY_HTML_ROBOTS_NOFOLLOW, obeyHtmlRobotsNofollow);
         put(INDEX_TEXT, indexText);
         put(INDEX_MEDIA, indexMedia);
         put(STORE_HTCACHE, storeHTCache);
@@ -534,6 +537,12 @@
         return (r.equals(Boolean.TRUE.toString()));
     }

+    public boolean obeyHtmlRobotsNofollow() {
+        final String r = get(OBEY_HTML_ROBOTS_NOFOLLOW);
+        if (r == null) return false;
+        return (r.equals(Boolean.TRUE.toString()));
+    }
+
     public boolean indexText() {
         final String r = get(INDEX_TEXT);
         if (r == null) return true;
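Because CrawlProfile extends ConcurrentHashMap<String, String>, the new flag is stored as the string "true"/"false" and parsed back by the getter, with a missing entry defaulting to false; the round trip in isolation (plain map instead of the real profile):

    import java.util.concurrent.ConcurrentHashMap;

    public class ProfileFlagDemo {
        static final String OBEY_HTML_ROBOTS_NOFOLLOW = "obeyHtmlRobotsNofollow";

        public static void main(final String[] args) {
            final ConcurrentHashMap<String, String> profile = new ConcurrentHashMap<>();
            profile.put(OBEY_HTML_ROBOTS_NOFOLLOW, Boolean.toString(true));

            // the getter added in this patch: a missing key defaults to false
            final String r = profile.get(OBEY_HTML_ROBOTS_NOFOLLOW);
            final boolean obey = r != null && r.equals(Boolean.TRUE.toString());
            System.out.println(obey); // true
        }
    }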
diff --git a/source/net/yacy/crawler/data/CrawlQueues.java b/source/net/yacy/crawler/data/CrawlQueues.java
index 55c938502..803959c91 100644
--- a/source/net/yacy/crawler/data/CrawlQueues.java
+++ b/source/net/yacy/crawler/data/CrawlQueues.java
@@ -356,7 +356,7 @@ public class CrawlQueues {
                     }
                 }
             } else {
-                CrawlQueues.log.severe("Unsupported protocol in URL '" + url.toString());
+                CrawlQueues.log.severe("Unsupported protocol in URL '" + url.toNormalform(false));
             }
         } else {
             if (CrawlQueues.log.isFine()) CrawlQueues.log.fine(stats + ": LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
@@ -627,7 +627,7 @@ public class CrawlQueues {
                 while ((request = CrawlQueues.this.workerQueue.poll(10, TimeUnit.SECONDS)) != POISON_REQUEST) {
                     if (request == null) break; // we run this only for a specific time and then let the process die to clear up resources
                     request.setStatus("worker-initialized", WorkflowJob.STATUS_INITIATED);
-                    this.setName("CrawlQueues.Loader(" + request.url() + ")");
+                    this.setName("CrawlQueues.Loader(" + request.url().toNormalform(false) + ")");
                     CrawlProfile profile = CrawlQueues.this.sb.crawler.get(UTF8.getBytes(request.profileHandle()));
                     try {
                         // checking robots.txt for http(s) resources
diff --git a/source/net/yacy/crawler/retrieval/HTTPLoader.java b/source/net/yacy/crawler/retrieval/HTTPLoader.java
index a00b563af..9dbdcfda2 100644
--- a/source/net/yacy/crawler/retrieval/HTTPLoader.java
+++ b/source/net/yacy/crawler/retrieval/HTTPLoader.java
@@ -149,7 +149,7 @@ public final class HTTPLoader {

                 // restart crawling with new url
                 this.log.info("CRAWLER Redirection detected ('" + client.getHttpResponse().getStatusLine() + "') for URL " + requestURLString);
-                this.log.info("CRAWLER ..Redirecting request to: " + redirectionUrl);
+                this.log.info("CRAWLER ..Redirecting request to: " + redirectionUrl.toNormalform(false));

                 this.sb.webStructure.generateCitationReference(url, redirectionUrl);
diff --git a/source/net/yacy/crawler/retrieval/Request.java b/source/net/yacy/crawler/retrieval/Request.java
index bcb5c6b24..1d027e5bc 100644
--- a/source/net/yacy/crawler/retrieval/Request.java
+++ b/source/net/yacy/crawler/retrieval/Request.java
@@ -225,7 +225,7 @@ public class Request extends WorkflowJob
             new byte[][] {
                 this.url.hash(),
                 this.initiator,
-                UTF8.getBytes(this.url.toString()),
+                UTF8.getBytes(this.url.toNormalform(false)),
                 this.refhash,
                 namebytes,
                 appdatestr,
diff --git a/source/net/yacy/crawler/retrieval/SitemapImporter.java b/source/net/yacy/crawler/retrieval/SitemapImporter.java
index a2530a391..b3594b109 100644
--- a/source/net/yacy/crawler/retrieval/SitemapImporter.java
+++ b/source/net/yacy/crawler/retrieval/SitemapImporter.java
@@ -98,7 +98,7 @@ public class SitemapImporter extends Thread {
             this.sb.crawlStacker.enqueueEntry(new Request(
                     ASCII.getBytes(this.sb.peers.mySeed().hash),
                     url,
-                    null, // this.siteMapURL.toString(),
+                    null, // this.siteMapURL.toNormalform(false),
                     entry.url(),
                     entry.lastmod(new Date()),
                     this.crawlingProfile.handle(),
diff --git a/source/net/yacy/data/BookmarkHelper.java b/source/net/yacy/data/BookmarkHelper.java
index abc7e2e51..c93019db1 100644
--- a/source/net/yacy/data/BookmarkHelper.java
+++ b/source/net/yacy/data/BookmarkHelper.java
@@ -149,7 +149,7 @@ public class BookmarkHelper {
             title = url.getNameProperty();
             ConcurrentLog.info("BOOKMARKS", "links.get(url)");
             if ("".equals(title)) {//cannot be displayed
-                title = url.toString();
+                title = url.toNormalform(false);
             }
             bm = db.new Bookmark(url);
             bm.setProperty(Bookmark.BOOKMARK_TITLE, title);
diff --git a/source/net/yacy/data/ymark/YMarkCrawlStart.java b/source/net/yacy/data/ymark/YMarkCrawlStart.java
index bbda8b5b7..7db41a887 100644
--- a/source/net/yacy/data/ymark/YMarkCrawlStart.java
+++ b/source/net/yacy/data/ymark/YMarkCrawlStart.java
@@ -184,7 +184,8 @@ public class YMarkCrawlStart extends HashMap<String, String>{
             CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_PROXY_RECRAWL_CYCLE),
             -1,
             crawlingQ,
-            true, true, true, true, true, false,
+            true, true, true, false,
+            true, true, false,
             CacheStrategy.IFFRESH,
             "robot_" + CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
             ClientIdentification.yacyIntranetCrawlerAgentName); // TODO: make this a default profile in CrawlSwitchboard
diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java
index fe3a1b0c2..7fb72b1a0 100644
--- a/source/net/yacy/document/Document.java
+++ b/source/net/yacy/document/Document.java
@@ -818,7 +818,7 @@ dc_rights
         final List<String> descriptions = new ArrayList<String>();
         final Collection<String> titles = new LinkedHashSet<String>();
         final Collection<String> sectionTitles = new LinkedHashSet<String>();
-        final List<DigestURL> anchors = new ArrayList<DigestURL>();
+        final List<AnchorURL> anchors = new ArrayList<AnchorURL>();
         final LinkedHashMap<DigestURL, String> rss = new LinkedHashMap<DigestURL, String>();
         final LinkedHashMap<DigestURL, ImageEntry> images = new LinkedHashMap<DigestURL, ImageEntry>();
         final Set<String> languages = new HashSet<String>();
@@ -913,16 +913,22 @@ dc_rights

     public final static String CANONICAL_MARKER = "canonical";

-    public static Map<DigestURL, String> getHyperlinks(final Document[] documents) {
-        final Map<DigestURL, String> result = new HashMap<DigestURL, String>();
+    public static Map<AnchorURL, String> getHyperlinks(final Document[] documents, boolean includeNofollow) {
+        final Map<AnchorURL, String> result = new HashMap<>();
         for (final Document d: documents) {
-            result.putAll(d.getHyperlinks());
+            if (includeNofollow) {
+                result.putAll(d.getHyperlinks());
+            } else {
+                for (Map.Entry<AnchorURL, String> entry: d.getHyperlinks().entrySet()) {
+                    if (!entry.getKey().attachedNofollow()) result.put(entry.getKey(), entry.getValue());
+                }
+            }
             final Object parser = d.getParserObject();
             if (parser instanceof ContentScraper) {
                 final ContentScraper html = (ContentScraper) parser;
                 String refresh = html.getRefreshPath();
-                if (refresh != null && refresh.length() > 0) try {result.put(new DigestURL(refresh), "refresh");} catch (final MalformedURLException e) {}
-                DigestURL canonical = html.getCanonical();
+                if (refresh != null && refresh.length() > 0) try {result.put(new AnchorURL(refresh), "refresh");} catch (final MalformedURLException e) {}
+                AnchorURL canonical = html.getCanonical();
                 if (canonical != null) {
                     result.put(canonical, CANONICAL_MARKER);
                 }
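The getHyperlinks() signature change is where the crawl option actually takes effect: callers pass includeNofollow, and Switchboard (further down) passes the negated profile flag. A caller sketch, assuming the patched API (the documents array would come from a real parser run):

    import java.util.Map;

    import net.yacy.cora.document.id.AnchorURL;
    import net.yacy.document.Document;

    public class HyperlinkFilterDemo {
        static void printFollowableLinks(final Document[] documents, final boolean obeyNofollow) {
            // includeNofollow == false drops every link whose rel contains "nofollow"
            final Map<AnchorURL, String> links = Document.getHyperlinks(documents, !obeyNofollow);
            for (final Map.Entry<AnchorURL, String> entry : links.entrySet()) {
                System.out.println(entry.getKey().toNormalform(false) + " -> " + entry.getValue());
            }
        }
    }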
diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java
index a41a9cc4d..e1d447616 100644
--- a/source/net/yacy/document/parser/html/ContentScraper.java
+++ b/source/net/yacy/document/parser/html/ContentScraper.java
@@ -183,7 +183,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
     private final CharBuffer content;
     private final EventListenerList htmlFilterEventListeners;
     private double lon, lat;
-    private DigestURL canonical, publisher;
+    private AnchorURL canonical, publisher;
     private final int maxLinks;
     private int breadcrumbs;
@@ -771,7 +771,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
         return this.script;
     }

-    public DigestURL getCanonical() {
+    public AnchorURL getCanonical() {
         return this.canonical;
     }
diff --git a/source/net/yacy/document/parser/html/ImageEntry.java b/source/net/yacy/document/parser/html/ImageEntry.java
index 60003f23c..f1d6061d7 100644
--- a/source/net/yacy/document/parser/html/ImageEntry.java
+++ b/source/net/yacy/document/parser/html/ImageEntry.java
@@ -133,7 +133,7 @@ public class ImageEntry implements Comparable<ImageEntry>, Comparator<ImageEntry> {
         if (thc > ohc) return 1;
-        return this.imageurl.toString().compareTo((h).imageurl.toString());
+        return this.imageurl.toNormalform(true).compareTo((h).imageurl.toNormalform(true));
     }

     @Override
diff --git a/source/net/yacy/document/parser/rdfa/impl/RDFaParser.java b/source/net/yacy/document/parser/rdfa/impl/RDFaParser.java
index ceeaff2f7..e4cadf152 100644
--- a/source/net/yacy/document/parser/rdfa/impl/RDFaParser.java
+++ b/source/net/yacy/document/parser/rdfa/impl/RDFaParser.java
@@ -55,7 +55,7 @@ public class RDFaParser extends AbstractParser implements Parser {

         // TODO: current hardcoded restriction: apply rdfa parser only on selected sources.

-        if (url.toString().contains(".yacy") || url.toString().contains("experiments")) {
+        if (url.toNormalform(true).contains(".yacy") || url.toNormalform(true).contains("experiments")) {
             // if (true == false) {
             Document rdfaDoc = parseRDFa(url, mimeType, charset, source);
             Document[] retDocs = new Document[htmlDocs.length + 1];
diff --git a/source/net/yacy/document/parser/sitemapParser.java b/source/net/yacy/document/parser/sitemapParser.java
index a2616a6a6..4e14899e1 100644
--- a/source/net/yacy/document/parser/sitemapParser.java
+++ b/source/net/yacy/document/parser/sitemapParser.java
@@ -116,7 +116,7 @@ public class sitemapParser extends AbstractParser implements Parser {
         final HTTPClient client = new HTTPClient(agent);
         client.setHeader(requestHeader.entrySet());
         try {
-            client.GET(sitemapURL.toString(), false);
+            client.GET(sitemapURL.toNormalform(false), false);
             if (client.getStatusCode() != 200) {
                 throw new IOException("Unable to download the sitemap file " + sitemapURL +
                         "\nServer returned status: " + client.getHttpResponse().getStatusLine());
diff --git a/source/net/yacy/document/parser/vcfParser.java b/source/net/yacy/document/parser/vcfParser.java
index 1c78b213f..a60269b1d 100644
--- a/source/net/yacy/document/parser/vcfParser.java
+++ b/source/net/yacy/document/parser/vcfParser.java
@@ -179,7 +179,7 @@ public class vcfParser extends AbstractParser implements Parser {
                     } else if (key.toUpperCase().startsWith("URL")) {
                         try {
                             final AnchorURL newURL = new AnchorURL(value);
-                            newURL.setNameProperty(newURL.toString());
+                            newURL.setNameProperty(newURL.toNormalform(false));
                             anchors.add(newURL);
                             //parsedData.put(key,value);
                         } catch (final MalformedURLException ex) {/* ignore this */}
diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java
index c25cde6d6..004907dcc 100644
--- a/source/net/yacy/repository/LoaderDispatcher.java
+++ b/source/net/yacy/repository/LoaderDispatcher.java
@@ -396,7 +396,7 @@ public final class LoaderDispatcher {
      * @return a map from URLs to the anchor texts of the urls
      * @throws IOException
      */
-    public final Map<DigestURL, String> loadLinks(final AnchorURL url, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
+    public final Map<AnchorURL, String> loadLinks(final AnchorURL url, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
         final Response response = load(request(url, true, false), cacheStrategy, Integer.MAX_VALUE, blacklistType, agent);
         if (response == null) throw new IOException("response == null");
         final ResponseHeader responseHeader = response.getResponseHeader();
@@ -413,7 +413,7 @@ public final class LoaderDispatcher {
             throw new IOException("parser error: " + e.getMessage());
         }

-        return Document.getHyperlinks(documents);
+        return Document.getHyperlinks(documents, true);
     }

     public synchronized static void cleanupAccessTimeTable(final long timeout) {
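loadLinks() deliberately passes includeNofollow = true, so the search heuristics built on it keep seeing every link; only the crawler path filters. A call-site sketch mirroring the Switchboard usage below (the import paths are my reading of the YaCy tree and the URL is made up — treat both as assumptions):

    import java.util.Map;

    import net.yacy.cora.document.id.AnchorURL;
    import net.yacy.cora.federate.yacy.CacheStrategy;
    import net.yacy.cora.protocol.ClientIdentification;
    import net.yacy.repository.Blacklist.BlacklistType;
    import net.yacy.repository.LoaderDispatcher;

    public class LoadLinksDemo {
        static void printOutlinks(final LoaderDispatcher loader) throws Exception {
            final AnchorURL start = new AnchorURL("http://example.com/");
            // nofollow links are included here; callers filter later if they care
            final Map<AnchorURL, String> links = loader.loadLinks(
                    start, CacheStrategy.NOCACHE, BlacklistType.SEARCH,
                    ClientIdentification.yacyIntranetCrawlerAgent);
            for (final Map.Entry<AnchorURL, String> e : links.entrySet()) {
                System.out.println(e.getKey().toNormalform(false) + " | " + e.getValue());
            }
        }
    }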
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index 7d55deaaf..ea48acef2 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -2584,26 +2584,27 @@ public final class Switchboard extends serverSwitch {
         for (Document d: documents) d.setDepth(response.depth());

         // get the hyperlinks
-        final Map<DigestURL, String> hl = Document.getHyperlinks(documents);
+        final Map<AnchorURL, String> hl = Document.getHyperlinks(documents, !response.profile().obeyHtmlRobotsNofollow());
+
         if (response.profile().indexMedia()) {
             for (Map.Entry<DigestURL, String> entry: Document.getImagelinks(documents).entrySet()) {
-                if (TextParser.supportsExtension(entry.getKey()) == null) hl.put(entry.getKey(), entry.getValue());
+                if (TextParser.supportsExtension(entry.getKey()) == null) hl.put(new AnchorURL(entry.getKey()), entry.getValue());
             }
         }

         // add all media links also to the crawl stack. They will be re-sorted to the NOLOAD queue and indexed afterwards as pure links
         if (response.profile().directDocByURL()) {
             for (Map.Entry<DigestURL, String> entry: Document.getImagelinks(documents).entrySet()) {
-                if (TextParser.supportsExtension(entry.getKey()) != null) hl.put(entry.getKey(), entry.getValue());
+                if (TextParser.supportsExtension(entry.getKey()) != null) hl.put(new AnchorURL(entry.getKey()), entry.getValue());
             }
-            hl.putAll(Document.getApplinks(documents));
-            hl.putAll(Document.getVideolinks(documents));
-            hl.putAll(Document.getAudiolinks(documents));
+            for (Map.Entry<DigestURL, String> d: Document.getApplinks(documents).entrySet()) hl.put(new AnchorURL(d.getKey()), d.getValue());
+            for (Map.Entry<DigestURL, String> d: Document.getVideolinks(documents).entrySet()) hl.put(new AnchorURL(d.getKey()), d.getValue());
+            for (Map.Entry<DigestURL, String> d: Document.getAudiolinks(documents).entrySet()) hl.put(new AnchorURL(d.getKey()), d.getValue());
         }

         // insert those hyperlinks to the crawler
         MultiProtocolURL nextUrl;
-        for ( final Map.Entry<DigestURL, String> nextEntry : hl.entrySet() ) {
+        for ( final Map.Entry<AnchorURL, String> nextEntry : hl.entrySet() ) {
             // check for interruption
             checkInterruption();
@@ -2880,7 +2881,7 @@ public final class Switchboard extends serverSwitch {

     public final void addAllToIndex(
         final DigestURL url,
-        final Map<DigestURL, String> links,
+        final Map<AnchorURL, String> links,
         final SearchEvent searchEvent,
         final String heuristicName,
         final Map<String, Pattern> collections,
@@ -2893,15 +2894,15 @@ public final class Switchboard extends serverSwitch {
         }

         // check if some of the links match with the query
-        final Map<DigestURL, String> matcher = searchEvent.query.separateMatches(links);
+        final Map<AnchorURL, String> matcher = searchEvent.query.separateMatches(links);

         // take the matcher and load them all
-        for (final Map.Entry<DigestURL, String> entry : matcher.entrySet()) {
+        for (final Map.Entry<AnchorURL, String> entry : matcher.entrySet()) {
             urls.add(new DigestURL(entry.getKey(), (byte[]) null));
         }

         // take then the no-matcher and load them also
-        for (final Map.Entry<DigestURL, String> entry : links.entrySet()) {
+        for (final Map.Entry<AnchorURL, String> entry : links.entrySet()) {
             urls.add(new DigestURL(entry.getKey(), (byte[]) null));
         }
         addToIndex(urls, searchEvent, heuristicName, collections, doublecheck);
@@ -3479,12 +3480,12 @@ public final class Switchboard extends serverSwitch {
             return;
         }

-        final Map<DigestURL, String> links;
+        final Map<AnchorURL, String> links;
         searchEvent.oneFeederStarted();
         try {
             links = Switchboard.this.loader.loadLinks(url, CacheStrategy.NOCACHE, BlacklistType.SEARCH, ClientIdentification.yacyIntranetCrawlerAgent);
             if ( links != null ) {
-                final Iterator<DigestURL> i = links.keySet().iterator();
+                final Iterator<AnchorURL> i = links.keySet().iterator();
                 while ( i.hasNext() ) {
                     if ( !i.next().getHost().endsWith(host) ) {
                         i.remove();
@@ -3518,13 +3519,13 @@ public final class Switchboard extends serverSwitch {
             return;
         }

-        final Map<DigestURL, String> links;
+        final Map<AnchorURL, String> links;
         DigestURL url;
         try {
             links = Switchboard.this.loader.loadLinks(startUrl, CacheStrategy.IFFRESH, BlacklistType.SEARCH, ClientIdentification.yacyIntranetCrawlerAgent);
             if (links != null) {
                 if (links.size() < 1000) { // limit to 1000 to skip large index pages
-                    final Iterator<DigestURL> i = links.keySet().iterator();
+                    final Iterator<AnchorURL> i = links.keySet().iterator();
                     final boolean globalcrawljob = Switchboard.this.getConfigBool(SwitchboardConstants.HEURISTIC_SEARCHRESULTS_CRAWLGLOBAL,false);
                     Collection<DigestURL> urls = new ArrayList<DigestURL>();
                     while (i.hasNext()) {
@@ -3590,11 +3591,11 @@ public final class Switchboard extends serverSwitch {
                     //System.out.println("BLEKKO: " + UTF8.String(resource));
                     rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);
                     if ( rss != null ) {
-                        final Map<DigestURL, String> links = new TreeMap<DigestURL, String>();
-                        DigestURL uri;
+                        final Map<AnchorURL, String> links = new TreeMap<>();
+                        AnchorURL uri;
                         for ( final RSSMessage message : rss.getFeed() ) {
                             try {
-                                uri = new DigestURL(message.getLink());
+                                uri = new AnchorURL(message.getLink());
                                 links.put(uri, message.getTitle());
                             } catch (final MalformedURLException e ) {
                             }
@@ -3720,7 +3721,7 @@ public final class Switchboard extends serverSwitch {
                 final HTTPClient client = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent, timeout);
                 client.setHeader(reqHeader.entrySet());
-                client.HEADResponse(url.toString(), false);
+                client.HEADResponse(url.toNormalform(false), false);
                 int statusCode = client.getHttpResponse().getStatusLine().getStatusCode();
                 ResponseHeader header = new ResponseHeader(statusCode, client.getHttpResponse().getAllHeaders());
                 if (checkAge) {
diff --git a/source/net/yacy/search/index/DocumentIndex.java b/source/net/yacy/search/index/DocumentIndex.java
index 03161e768..77eb7120e 100644
--- a/source/net/yacy/search/index/DocumentIndex.java
+++ b/source/net/yacy/search/index/DocumentIndex.java
@@ -151,7 +151,7 @@ public class DocumentIndex extends Segment {
         try {
             documents = TextParser.parseSource(url, null, null, 0, length, url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent, null, null));
         } catch (final Exception e ) {
-            throw new IOException("cannot parse " + url.toString() + ": " + e.getMessage());
+            throw new IOException("cannot parse " + url.toNormalform(false) + ": " + e.getMessage());
         }
         //Document document = Document.mergeDocuments(url, null, documents);
         final SolrInputDocument[] rows = new SolrInputDocument[documents.length];
diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java
index 0b54dcee8..dff407505 100644
--- a/source/net/yacy/search/index/Segment.java
+++ b/source/net/yacy/search/index/Segment.java
@@ -592,7 +592,7 @@ public class Segment {
         int outlinksSame = document.inboundLinks().size();
         int outlinksOther = document.outboundLinks().size();
         final int urlLength = urlNormalform.length();
-        final int urlComps = MultiProtocolURL.urlComps(url.toString()).length;
+        final int urlComps = MultiProtocolURL.urlComps(url.toNormalform(false)).length;

         // create a word prototype which is re-used for all entries
         if ((this.termIndex != null && storeToRWI) || searchEvent != null) {
diff --git a/source/net/yacy/search/query/QueryParams.java b/source/net/yacy/search/query/QueryParams.java
index c5dd35a5e..60599232b 100644
--- a/source/net/yacy/search/query/QueryParams.java
+++ b/source/net/yacy/search/query/QueryParams.java
@@ -35,10 +35,11 @@ import java.util.Set;
 import java.util.SortedSet;
 import java.util.regex.Pattern;
 import java.util.regex.PatternSyntaxException;
+
 import net.yacy.cora.document.analysis.Classification;
 import net.yacy.cora.document.analysis.Classification.ContentDomain;
 import net.yacy.cora.document.encoding.ASCII;
-import net.yacy.cora.document.id.DigestURL;
+import net.yacy.cora.document.id.AnchorURL;
 import net.yacy.cora.federate.solr.Ranking;
 import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.geo.GeoLocation;
@@ -60,6 +61,7 @@ import net.yacy.search.index.Segment;
 import net.yacy.search.ranking.RankingProfile;
 import net.yacy.search.schema.CollectionConfiguration;
 import net.yacy.search.schema.CollectionSchema;
+
 import org.apache.solr.client.solrj.SolrQuery;
 import org.apache.solr.client.solrj.SolrQuery.SortClause;
 import org.apache.solr.common.params.CommonParams;
@@ -522,11 +524,11 @@ public final class QueryParams {
         return this.queryGoal;
     }

-    public final Map<DigestURL, String> separateMatches(final Map<DigestURL, String> links) {
-        final Map<DigestURL, String> matcher = new HashMap<DigestURL, String>();
-        final Iterator<Map.Entry<DigestURL, String>> i = links.entrySet().iterator();
-        Map.Entry<DigestURL, String> entry;
-        DigestURL url;
+    public final Map<AnchorURL, String> separateMatches(final Map<AnchorURL, String> links) {
+        final Map<AnchorURL, String> matcher = new HashMap<>();
+        final Iterator<Map.Entry<AnchorURL, String>> i = links.entrySet().iterator();
+        Map.Entry<AnchorURL, String> entry;
+        AnchorURL url;
         String anchorText;
         while (i.hasNext()) {
             entry = i.next();
diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java
index 0be3a2e7a..2349849b6 100644
--- a/source/net/yacy/search/query/SearchEvent.java
+++ b/source/net/yacy/search/query/SearchEvent.java
@@ -1531,7 +1531,7 @@ public final class SearchEvent {
         }

         @Override
         public String toString() {
-            return this.imageUrl.toString();
+            return this.imageUrl.toNormalform(false);
         }
     }
diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java
index e694cc3d9..5c070785d 100644
--- a/source/net/yacy/search/schema/CollectionConfiguration.java
+++ b/source/net/yacy/search/schema/CollectionConfiguration.java
@@ -1199,10 +1199,12 @@ public class CollectionConfiguration extends SchemaConfiguration implements Serializable {
                     }
                     proccount.incrementAndGet();
                     allcount.incrementAndGet();
-                    if (proccount.get() % 1000 == 0) ConcurrentLog.info(
-                            "CollectionConfiguration", "webgraph - postprocessed " + proccount + " from " + count + " documents; " +
+                    if (proccount.get() % 1000 == 0) {
+                        postprocessingActivity = "writing cr values to webgraph for host " + hostfinal + "postprocessed " + proccount + " from " + count + " documents; " +
                             (proccount.get() * 1000 / (System.currentTimeMillis() - start)) + " docs/second; " +
-                            ((System.currentTimeMillis() - start) * (count - proccount.get()) / proccount.get() / 60000) + " minutes remaining for host " + hostfinal);
+                            ((System.currentTimeMillis() - start) * (count - proccount.get()) / proccount.get() / 60000) + " minutes remaining";
+                        ConcurrentLog.info("CollectionConfiguration", postprocessingActivity);
+                    }
                 }
             } catch (InterruptedException e) {
                 ConcurrentLog.warn("CollectionConfiguration", e.getMessage(), e);
@@ -1301,10 +1303,12 @@ public class CollectionConfiguration extends SchemaConfiguration implements Serializable {
                 collectionConnector.add(sid);
                 proccount++;
                 allcount.incrementAndGet();
-                if (proccount % 100 == 0) ConcurrentLog.info(
-                        "CollectionConfiguration", "collection - postprocessed " + proccount + " from " + count + " documents; " +
+                if (proccount % 100 == 0) {
+                    postprocessingActivity = "postprocessed " + proccount + " from " + count + " collection documents; " +
                         (proccount * 1000 / (System.currentTimeMillis() - start)) + " docs/second; " +
-                        ((System.currentTimeMillis() - start) * (count - proccount) / proccount / 60000) + " minutes remaining");
+                        ((System.currentTimeMillis() - start) * (count - proccount) / proccount / 60000) + " minutes remaining";
+                    ConcurrentLog.info("CollectionConfiguration", postprocessingActivity);
+                }
             } catch (final Throwable e1) {
                 ConcurrentLog.logException(e1);
                 failids.add(i);
diff --git a/source/net/yacy/search/snippet/MediaSnippet.java b/source/net/yacy/search/snippet/MediaSnippet.java
index b2d2fc2c2..c88890007 100644
--- a/source/net/yacy/search/snippet/MediaSnippet.java
+++ b/source/net/yacy/search/snippet/MediaSnippet.java
@@ -202,7 +202,7 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaSnippet> {
-            final String u = ientry.url().toString();
+            final String u = ientry.url().toNormalform(false);
             if (u.indexOf(".ico",0) >= 0 || u.indexOf("favicon",0) >= 0) continue;
             if (ientry.height() > 0 && ientry.height() < 32) continue;
@@ -262,7 +262,7 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaSnippet> {
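Taken together, the pieces visible in this patch compose as follows: the per-crawl profile flag gates which hyperlinks ever reach the crawl stack. A summary sketch (not code from the patch; it only combines the two methods the patch adds):

    import java.util.Map;

    import net.yacy.cora.document.id.AnchorURL;
    import net.yacy.crawler.data.CrawlProfile;
    import net.yacy.document.Document;

    public class NofollowPolicyDemo {
        // if the profile says to obey rel="nofollow", links carrying it are
        // filtered out before the crawler ever sees them
        static Map<AnchorURL, String> followableLinks(final Document[] docs, final CrawlProfile profile) {
            return Document.getHyperlinks(docs, !profile.obeyHtmlRobotsNofollow());
        }
    }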