From f3a6b6e21e1e4acb57dec36dc5c9d44db2026696 Mon Sep 17 00:00:00 2001
From: Michael Peter Christen <mc@yacy.net>
Date: Thu, 10 Jul 2014 01:59:29 +0200
Subject: [PATCH 1/9] fix for bad URL decoding

---
 .../cora/document/id/MultiProtocolURL.java    | 61 ++++++++++++-------
 .../document/parser/html/CharacterCoding.java | 10 ++-
 2 files changed, 48 insertions(+), 23 deletions(-)

diff --git a/source/net/yacy/cora/document/id/MultiProtocolURL.java b/source/net/yacy/cora/document/id/MultiProtocolURL.java
index 7752f4f30..ed3f45596 100644
--- a/source/net/yacy/cora/document/id/MultiProtocolURL.java
+++ b/source/net/yacy/cora/document/id/MultiProtocolURL.java
@@ -59,7 +59,6 @@ import net.yacy.cora.protocol.http.HTTPClient;
 import net.yacy.cora.util.CommonPattern;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.crawler.retrieval.Response;
-import net.yacy.document.parser.html.CharacterCoding;
 
 /**
  * MultiProtocolURI provides a URL object for multiple protocols like http, https, ftp, smb and file
@@ -225,13 +224,15 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
                 if (h.startsWith("///")) { //absolute local file path
                     // no host given
                     this.path = h.substring(2); // "/path"  or "/c:/path"
-                } else { // "//host/path" or "//host/c:/path"
+                } else if (h.startsWith("//")) { // "//host/path" or "//host/c:/path"
                     int q = url.indexOf('/', p + 3);
                     if (q < 0) {
                         this.path = "/";
                     } else {
                         this.path = url.substring(q);
                     }
+                } else if (h.startsWith("/")) { // "/host/path" or "/host/c:/path"
+                    this.path = h;
                 }
                 this.userInfo = null;
                 this.port = -1;
@@ -418,7 +419,7 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
     private void escape() {
         if (this.path != null && this.path.indexOf('%') == -1) escapePath();
         if (this.searchpart != null && this.searchpart.indexOf('%') == -1) escapeSearchpart();
-        if (this.anchor != null && this.anchor.indexOf('%') == -1) escapeAnchor();
+        if (this.anchor != null) this.anchor = escape(this.anchor).toString();
     }
 
     private void escapePath() {
@@ -431,10 +432,6 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
         this.path = ptmp.substring((ptmp.length() > 0) ? 1 : 0);
     }
 
-    private void escapeAnchor() {
-        this.anchor = escape(this.anchor).toString();
-    }
-
     private void escapeSearchpart() {
         final String[] questp = CommonPattern.AMP.split(this.searchpart, -1);
         final StringBuilder qtmp = new StringBuilder(this.searchpart.length() + 10);
@@ -517,24 +514,39 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
         final StringBuilder sbuf = new StringBuilder(len + 10);
         for (int i = 0; i < len; i++) {
             final int ch = s.charAt(i);
-            if ('A' <= ch && ch <= 'Z') {           // 'A'..'Z'
-                sbuf.append((char)ch);
-            } else if ('a' <= ch && ch <= 'z') {    // 'a'..'z'
+            if (ch == ' ') {                 // space
+                sbuf.append("%20");
+            } else if (ch == '%') {          // keep an existing %XX escape (XX = two hex digits), encode a bare '%'
+                if (i < len - 2 && "0123456789ABCDEFabcdef".indexOf(s.charAt(i + 1)) >= 0 && "0123456789ABCDEFabcdef".indexOf(s.charAt(i + 2)) >= 0) {
+                    sbuf.append((char)ch);   // lets consider this is used for encoding, leave it that way
+                } else {
+                    sbuf.append("%25");      // RFC 1738 2.2 unsafe char shall be encoded; '%' is 0x25 ("%23" would be '#')
+                }
+            } else if (ch == '&') {
+                // regionMatches is bounds-safe and case-insensitive, so an "&amp;" at the very end of s is recognized as well
+                if (s.regionMatches(true, i + 1, "amp;", 0, 4)) {
+                    sbuf.append((char)ch);   // leave it that way, it is used the right way
+                } else {
+                    sbuf.append("%26");      // this must be urlencoded
+                }
+            } else if (ch == '#') {          // RFC 1738 2.2 unsafe char is _not_ encoded because it may already be used for encoding
                 sbuf.append((char)ch);
-            } else if ('0' <= ch && ch <= '9') {    // '0'..'9'
+            } else if ('0' <= ch && ch <= '9') {    // '0'..'9'
                 sbuf.append((char)ch);
-            } else if (ch == ' ') {                 // space
-                sbuf.append("%20");
-            } else if (ch == '&' || ch == ':'       // unreserved
+            } else if (ch == '!' || ch == ':'   // unreserved
                     || ch == '-' || ch == '_'
-                    || ch == '.' || ch == '!'
-                    || ch == '~' || ch == '*'
-                    || ch == '\'' || ch == '('
-                    || ch == ')' || ch == ';' 
-                    || ch == ',' || ch == '=') { // RFC 1738 2.2 special char (may be used unencoded)
+                    || ch == '.' || ch == '~'
+                    || ch == '*' || ch == '\''
+                    || ch == '(' || ch == ')'
+                    || ch == '{' || ch == '}'
+                    || ch == ';' || ch == ',' || ch == '=') {    // RFC 1738 2.2 unsafe char (may be used unencoded)
                 sbuf.append((char)ch);
             } else if (ch == '/') {                 // reserved, but may appear in post part where it should not be replaced
                 sbuf.append((char)ch);
+            } else if ('A' <= ch && ch <= 'Z') {    // 'A'..'Z'
+                sbuf.append((char)ch);
+            } else if ('a' <= ch && ch <= 'z') {    // 'a'..'z'
+                sbuf.append((char)ch);
             } else if (ch <= 0x007f) {              // other ASCII
                 sbuf.append(hex[ch]);
             } else if (ch <= 0x07FF) {              // non-ASCII <= 0x7FF
@@ -647,11 +659,15 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
         } else {
             this.searchpart = this.path.substring(r + 1);
             // strip &amp;
+            /*
             Matcher matcher = CharacterCoding.ampPattern.matcher(this.searchpart);
-            while (matcher.find()) {
+            int from = 0;
+            while (matcher.find(from)) {
+                from = matcher.start() + 1;
                 this.searchpart = matcher.replaceAll("&");
                 matcher.reset(this.searchpart);
             }
+            */
             this.path = this.path.substring(0, r);
         }
     }
@@ -934,7 +950,7 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
         }
         String urlPath = this.getFile(excludeAnchor, removeSessionID);
         String h = getHost();
-        final StringBuilder u = new StringBuilder(20 + urlPath.length() + ((h == null) ? 0 : h.length()));
+        final StringBuilder u = new StringBuilder(20 + (urlPath == null ? 0 : urlPath.length()) + ((h == null) ? 0 : h.length()));
         u.append(this.protocol);
         u.append("://");
         if (h != null) {
@@ -2179,10 +2195,11 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
 */
     public static void main(final String[] args) {
         final String[][] test = new String[][]{
+          new String[]{null, "https://www.example.com/shoe/?p=2&ps=75#t={%22san_NaviPaging%22:2}"}, // ugly strange pagination link
           new String[]{null, "C:WINDOWS\\CMD0.EXE"},
           new String[]{null, "file://C:WINDOWS\\CMD0.EXE"},
-          new String[]{null, "file:/bin/yacy1"}, // file://<host>/<path> may have many '/' if the host is omitted and the path starts with '/'
           new String[]{null, "file:///bin/yacy2"}, // file://<host>/<path> may have many '/' if the host is omitted and the path starts with '/'
+          new String[]{null, "file:/bin/yacy1"}, // file://<host>/<path> may have many '/' if the host is omitted and the path starts with '/'
           new String[]{null, "file:C:WINDOWS\\CMD.EXE"},
           new String[]{null, "file:///C:WINDOWS\\CMD1.EXE"},
           new String[]{null, "file:///C|WINDOWS\\CMD2.EXE"},
diff --git a/source/net/yacy/document/parser/html/CharacterCoding.java b/source/net/yacy/document/parser/html/CharacterCoding.java
index f93300cbd..7541e22e1 100644
--- a/source/net/yacy/document/parser/html/CharacterCoding.java
+++ b/source/net/yacy/document/parser/html/CharacterCoding.java
@@ -312,6 +312,13 @@ public final class CharacterCoding {
             }
             s = text.substring(p, q + 1);
             p = q + 1;
+            // check if another ampersand is in between
+            int pp;
+            while ((pp = s.indexOf('&', 1)) >= 0) {
+                // we skip the first ampersand
+                sb.append(s.substring(0, pp));
+                s = s.substring(pp);
+            }
             if (s.equals(AMP_HTML)) {
                 sb.append(AMP_UNICODE);
                 continue;
@@ -340,7 +347,8 @@ public final class CharacterCoding {
                 } catch (final NumberFormatException e) { }
                 continue;
             }
-            // the entity is unknown, skip it
+            // the entity is unknown, copy it
+            sb.append(s);
         }
         return sb.toString();
     }

From 32bd2a61c1e741c099ba0b788631bdd51c49f691 Mon Sep 17 00:00:00 2001
From: reger <reger18@arcor.de>
Date: Thu, 10 Jul 2014 02:09:26 +0200
Subject: [PATCH 2/9] add local ip to AbstractRemoteHandler local hostname
 cache

---
 source/net/yacy/http/AbstractRemoteHandler.java | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/source/net/yacy/http/AbstractRemoteHandler.java b/source/net/yacy/http/AbstractRemoteHandler.java
index 776e8362c..4e779ec4f 100644
--- a/source/net/yacy/http/AbstractRemoteHandler.java
+++ b/source/net/yacy/http/AbstractRemoteHandler.java
@@ -49,7 +49,7 @@ import org.eclipse.jetty.server.Request;
  */
 abstract public class AbstractRemoteHandler extends ConnectHandler implements Handler {
 	
-	protected Switchboard sb = null;
+    protected Switchboard sb = null;
     private List<String> localVirtualHostNames; // list for quick check for req to local peer
     
     @Override
@@ -66,6 +66,7 @@ abstract public class AbstractRemoteHandler extends ConnectHandler implements Ha
         if (localInetAddress != null) {
             if (!localVirtualHostNames.contains(localInetAddress.getHostName())) {
                 localVirtualHostNames.add(localInetAddress.getHostName());
+                localVirtualHostNames.add(localInetAddress.getHostAddress());  // same as getServer().getURI().getHost()
             }
 
             if (!localVirtualHostNames.contains(localInetAddress.getCanonicalHostName())) {

From 336425912acd2ae46bf0ece136e8a172d247cb3c Mon Sep 17 00:00:00 2001
From: reger <reger18@arcor.de>
Date: Thu, 10 Jul 2014 02:14:03 +0200
Subject: [PATCH 3/9] remove unused localSearchThread from SearchEvent

---
 source/net/yacy/search/query/SearchEvent.java | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java
index 137f7050c..0be3a2e7a 100644
--- a/source/net/yacy/search/query/SearchEvent.java
+++ b/source/net/yacy/search/query/SearchEvent.java
@@ -130,7 +130,6 @@ public final class SearchEvent {
     public final List<Thread> nodeSearchThreads;
     public Thread[] secondarySearchThreads;
     public final SortedMap<byte[], String> preselectedPeerHashes;
-    private final Thread localSearchThread;
     private final SortedMap<byte[], Integer> IACount;
     private final SortedMap<byte[], String> IAResults;
     private final SortedMap<byte[], HeuristicResult> heuristics;
@@ -249,7 +248,6 @@ public final class SearchEvent {
         this.heuristics = new TreeMap<byte[], HeuristicResult>(Base64Order.enhancedCoder);
         this.IAmaxcounthash = null;
         this.IAneardhthash = null;
-        this.localSearchThread = null;
         this.remote = (peers != null && peers.sizeConnected() > 0) && (this.query.domType == QueryParams.Searchdom.CLUSTER || (this.query.domType == QueryParams.Searchdom.GLOBAL && Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.INDEX_RECEIVE_ALLOW_SEARCH, false)));
         this.local_rwi_available  = new AtomicInteger(0); // the number of results in the local peer after filtering
         this.local_rwi_stored     = new AtomicInteger(0);
@@ -650,7 +648,6 @@ public final class SearchEvent {
 
         // clear all data structures
         if (this.preselectedPeerHashes != null) this.preselectedPeerHashes.clear();
-        if (this.localSearchThread != null && this.localSearchThread.isAlive()) this.localSearchThread.interrupt();
         if (this.IACount != null) this.IACount.clear();
         if (this.IAResults != null) this.IAResults.clear();
         if (this.heuristics != null) this.heuristics.clear();

From 665e12f88e8f2edbae1d825dc79d29b578954f9d Mon Sep 17 00:00:00 2001
From: reger <reger18@arcor.de>
Date: Thu, 10 Jul 2014 02:17:56 +0200
Subject: [PATCH 4/9] move startup time from old serverCore to switchboard
 (most used here) to make servercore eventually obsolete.

---
 .../net/yacy/http/servlets/YaCyDefaultServlet.java   |  2 +-
 source/net/yacy/peers/Network.java                   |  5 -----
 source/net/yacy/search/Switchboard.java              | 12 ++++++------
 source/net/yacy/server/serverCore.java               |  1 -
 4 files changed, 7 insertions(+), 13 deletions(-)

diff --git a/source/net/yacy/http/servlets/YaCyDefaultServlet.java b/source/net/yacy/http/servlets/YaCyDefaultServlet.java
index 9b9703fbc..7eb2eb16d 100644
--- a/source/net/yacy/http/servlets/YaCyDefaultServlet.java
+++ b/source/net/yacy/http/servlets/YaCyDefaultServlet.java
@@ -895,7 +895,7 @@ public class YaCyDefaultServlet extends HttpServlet  {
 
                 // add the application version, the uptime and the client name to every rewrite table
                 templatePatterns.put(servletProperties.PEER_STAT_VERSION, yacyBuildProperties.getVersion());
-                templatePatterns.put(servletProperties.PEER_STAT_UPTIME, ((System.currentTimeMillis() - serverCore.startupTime) / 1000) / 60); // uptime in minutes
+                templatePatterns.put(servletProperties.PEER_STAT_UPTIME, ((System.currentTimeMillis() - sb.startupTime) / 1000) / 60); // uptime in minutes
                 templatePatterns.putHTML(servletProperties.PEER_STAT_CLIENTNAME, sb.peers.mySeed().getName());
                 templatePatterns.putHTML(servletProperties.PEER_STAT_CLIENTID, sb.peers.myID());
                 templatePatterns.put(servletProperties.PEER_STAT_MYTIME, GenericFormatter.SHORT_SECOND_FORMATTER.format());
diff --git a/source/net/yacy/peers/Network.java b/source/net/yacy/peers/Network.java
index 305e63bdc..00e90f95a 100644
--- a/source/net/yacy/peers/Network.java
+++ b/source/net/yacy/peers/Network.java
@@ -89,11 +89,6 @@ public class Network
     // class variables
     Switchboard sb;
 
-    public static int yacyTime() {
-        // the time since startup of yacy in seconds
-        return Math.max(0, (int) ((System.currentTimeMillis() - serverCore.startupTime) / 1000));
-    }
-
     public Network(final Switchboard sb) {
         final long time = System.currentTimeMillis();
 
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index 773afccf2..f21a90c5a 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -194,7 +194,6 @@ import net.yacy.search.ranking.RankingProfile;
 import net.yacy.search.schema.CollectionConfiguration;
 import net.yacy.search.schema.CollectionSchema;
 import net.yacy.search.schema.WebgraphConfiguration;
-import net.yacy.server.serverCore;
 import net.yacy.server.serverSwitch;
 import net.yacy.server.http.RobotsTxtConfig;
 import net.yacy.utils.CryptoLib;
@@ -281,6 +280,7 @@ public final class Switchboard extends serverSwitch {
     public boolean useTailCache;
     public boolean exceed134217727;
 
+    public final long startupTime = System.currentTimeMillis();
     private final Semaphore shutdownSync = new Semaphore(0);
     private boolean terminate = false;
     private boolean startupAction = true; // this is set to false after the first event
@@ -3607,22 +3607,22 @@ public final class Switchboard extends serverSwitch {
     }
 
     public float averageQPM() {
-        final long uptime = (System.currentTimeMillis() - serverCore.startupTime) / 1000;
+        final long uptime = (System.currentTimeMillis() - this.startupTime) / 1000;
         return (this.searchQueriesRobinsonFromRemote + this.searchQueriesGlobal) * 60f / Math.max(uptime, 1f);
     }
 
     public float averageQPMGlobal() {
-        final long uptime = (System.currentTimeMillis() - serverCore.startupTime) / 1000;
+        final long uptime = (System.currentTimeMillis() - this.startupTime) / 1000;
         return (this.searchQueriesGlobal) * 60f / Math.max(uptime, 1f);
     }
 
     public float averageQPMPrivateLocal() {
-        final long uptime = (System.currentTimeMillis() - serverCore.startupTime) / 1000;
+        final long uptime = (System.currentTimeMillis() - this.startupTime) / 1000;
         return (this.searchQueriesRobinsonFromLocal) * 60f / Math.max(uptime, 1f);
     }
 
     public float averageQPMPublicLocal() {
-        final long uptime = (System.currentTimeMillis() - serverCore.startupTime) / 1000;
+        final long uptime = (System.currentTimeMillis() - this.startupTime) / 1000;
         return (this.searchQueriesRobinsonFromRemote) * 60f / Math.max(uptime, 1f);
     }
 
@@ -3632,7 +3632,7 @@ public final class Switchboard extends serverSwitch {
         this.peers.mySeed().put(Seed.PORT, getConfig("port", "8090"));
 
         //the speed of indexing (pages/minute) of the peer
-        final long uptime = (System.currentTimeMillis() - serverCore.startupTime) / 1000;
+        final long uptime = (System.currentTimeMillis() - this.startupTime) / 1000;
         Seed mySeed = this.peers.mySeed();
         
         mySeed.put(Seed.ISPEED, Integer.toString(currentPPM()));
diff --git a/source/net/yacy/server/serverCore.java b/source/net/yacy/server/serverCore.java
index 5da152543..f02bd4934 100644
--- a/source/net/yacy/server/serverCore.java
+++ b/source/net/yacy/server/serverCore.java
@@ -37,7 +37,6 @@ public final class serverCore {
     public static final byte[] CRLF = {CR, LF};
     public static final String CRLF_STRING = UTF8.String(CRLF);
     public static final String LF_STRING = UTF8.String(new byte[]{LF});
-    public static final long startupTime = System.currentTimeMillis();
 
     public static boolean useStaticIP = false;
 

From d9472d043a42e08fdc420884c461345797d58ba4 Mon Sep 17 00:00:00 2001
From: reger <reger18@arcor.de>
Date: Thu, 10 Jul 2014 02:20:01 +0200
Subject: [PATCH 5/9] cleanup older unused classes

---
 source/net/yacy/search/IndexAbstracts.java    | 69 -------------------
 source/net/yacy/search/StorageQueueEntry.java | 35 ----------
 .../server/serverSwitchAbstractAction.java    | 52 --------------
 3 files changed, 156 deletions(-)
 delete mode 100644 source/net/yacy/search/IndexAbstracts.java
 delete mode 100644 source/net/yacy/search/StorageQueueEntry.java
 delete mode 100644 source/net/yacy/server/serverSwitchAbstractAction.java

diff --git a/source/net/yacy/search/IndexAbstracts.java b/source/net/yacy/search/IndexAbstracts.java
deleted file mode 100644
index 165d2958c..000000000
--- a/source/net/yacy/search/IndexAbstracts.java
+++ /dev/null
@@ -1,69 +0,0 @@
-// IndexAbstracts.java
-// (C) 2005 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
-// first published 10.10.2005 on http://yacy.net
-//
-// This is a part of YaCy, a peer-to-peer based web search engine
-//
-// $LastChangedDate$
-// $LastChangedRevision$
-// $LastChangedBy$
-//
-// LICENSE
-// 
-// This program is free software; you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation; either version 2 of the License, or
-// (at your option) any later version.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with this program; if not, write to the Free Software
-// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-
-package net.yacy.search;
-
-import java.util.Iterator;
-import java.util.Map;
-import java.util.TreeMap;
-
-public class IndexAbstracts extends TreeMap<String, TreeMap<String, String>> {
-
-	private static final long serialVersionUID = 3037740969349726216L;
-
-	public IndexAbstracts() {
-		super();
-	}
-	
-	public String wordsFromPeer(final String peerhash, final String urls) {
-        Map.Entry<String, TreeMap<String, String>> entry;
-        String word, peerlist, url, wordlist = "";
-        TreeMap<String, String> urlPeerlist;
-        int p;
-        boolean hasURL;
-        synchronized (this) {
-            final Iterator<Map.Entry <String, TreeMap<String, String>>> i = this.entrySet().iterator();
-            while (i.hasNext()) {
-                entry = i.next();
-                word = entry.getKey();
-                urlPeerlist = entry.getValue();
-                hasURL = true;
-                for (int j = 0; j < urls.length(); j = j + 12) {
-                    url = urls.substring(j, j + 12);
-                    peerlist = urlPeerlist.get(url);
-                    p = (peerlist == null) ? -1 : peerlist.indexOf(peerhash);
-                    if ((p < 0) || (p % 12 != 0)) {
-                        hasURL = false;
-                        break;
-                    }
-                }
-                if (hasURL) wordlist += word;
-            }
-        }
-        return wordlist;
-    }
-	
-}
\ No newline at end of file
diff --git a/source/net/yacy/search/StorageQueueEntry.java b/source/net/yacy/search/StorageQueueEntry.java
deleted file mode 100644
index d623a45c4..000000000
--- a/source/net/yacy/search/StorageQueueEntry.java
+++ /dev/null
@@ -1,35 +0,0 @@
-/**
- *  StorageQueueEntry
- *  Copyright 2011 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
- *  First released 30.05.2013 at http://yacy.net
- *
- *  This library is free software; you can redistribute it and/or
- *  modify it under the terms of the GNU Lesser General Public
- *  License as published by the Free Software Foundation; either
- *  version 2.1 of the License, or (at your option) any later version.
- *
- *  This library is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- *  Lesser General Public License for more details.
- *
- *  You should have received a copy of the GNU Lesser General Public License
- *  along with this program in the file lgpl21.txt
- *  If not, see <http://www.gnu.org/licenses/>.
- */
-
-package net.yacy.search;
-
-import org.apache.solr.common.SolrInputDocument;
-
-import net.yacy.kelondro.workflow.WorkflowJob;
-
-public class StorageQueueEntry extends WorkflowJob {
-
-    public SolrInputDocument queueEntry;
-
-    public StorageQueueEntry(final SolrInputDocument queueEntry) {
-        super();
-        this.queueEntry = queueEntry;
-    }
-}
diff --git a/source/net/yacy/server/serverSwitchAbstractAction.java b/source/net/yacy/server/serverSwitchAbstractAction.java
deleted file mode 100644
index 06881f5ff..000000000
--- a/source/net/yacy/server/serverSwitchAbstractAction.java
+++ /dev/null
@@ -1,52 +0,0 @@
-// serverSwitchAbstractAction.java 
-// -------------------------------------
-// (C) by Michael Peter Christen; mc@yacy.net
-// first published on http://www.anomic.de
-// Frankfurt, Germany, 2005
-// last major change: 11.05.2005
-//
-// This program is free software; you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation; either version 2 of the License, or
-// (at your option) any later version.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with this program; if not, write to the Free Software
-// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-
-package net.yacy.server;
-
-import net.yacy.cora.util.ConcurrentLog;
-
-public abstract class serverSwitchAbstractAction {
-
-    protected ConcurrentLog log = null;
-    private String shortDescr = "", longDescr = "";
-    
-    public void setDescription(final String shortText, final String longText) {
-        // sets a visible description string
-        this.shortDescr = shortText;
-        this.longDescr  = longText;
-    }
-    
-    public String getShortDescription() {
-	// returns short description string for online display
-        return this.shortDescr;
-    }
-    
-    public String getLongDescription() {
-	// returns long description string for online display
-	return this.longDescr;
-    }
-
-    public void setLog(final ConcurrentLog log) {
-        // defines a log where process states can be written to
-        this.log = log;
-    }
-    
-}

From b0d941626fb15caadbf20abc70fe46d31ee5d5a1 Mon Sep 17 00:00:00 2001
From: Michael Peter Christen <mc@yacy.net>
Date: Thu, 10 Jul 2014 15:40:38 +0200
Subject: [PATCH 6/9] fixed bugs in canonical, robots and title/description
 unique calculation

---
 .../yacy/cora/federate/solr/SchemaConfiguration.java | 12 +++---------
 .../yacy/search/schema/CollectionConfiguration.java  | 10 +++++-----
 2 files changed, 8 insertions(+), 14 deletions(-)

diff --git a/source/net/yacy/cora/federate/solr/SchemaConfiguration.java b/source/net/yacy/cora/federate/solr/SchemaConfiguration.java
index 5537b767b..c61955004 100644
--- a/source/net/yacy/cora/federate/solr/SchemaConfiguration.java
+++ b/source/net/yacy/cora/federate/solr/SchemaConfiguration.java
@@ -173,7 +173,7 @@ public class SchemaConfiguration extends Configuration implements Serializable {
         String canonical_s = this.contains(CollectionSchema.canonical_s) ? (String) sid.getFieldValue(CollectionSchema.canonical_s.getSolrFieldName()) : null;
         Boolean canonical_equal_sku_b = this.contains(CollectionSchema.canonical_equal_sku_b) ? (Boolean) sid.getFieldValue(CollectionSchema.canonical_equal_sku_b.getSolrFieldName()) : null;
         if (segment.fulltext().getDefaultConfiguration().contains(CollectionSchema.host_id_s) &&
-            (robots_i == null || (robots_i.intValue() & (1 << 9)) == 0) &&
+            (robots_i == null || (robots_i.intValue() & (1 << 9)) == 0 /*noindex in http X-ROBOTS*/ && (robots_i.intValue() & (1 << 3)) == 0 /*noindex in html metas*/ ) &&
             (canonical_s == null || canonical_s.length() == 0 || (canonical_equal_sku_b != null && canonical_equal_sku_b.booleanValue()) || url.toNormalform(true).equals(canonical_s)) &&
             (httpstatus_i == null || httpstatus_i.intValue() == 200)) {
             uniquecheck: for (CollectionSchema[] checkfields: new CollectionSchema[][] {
@@ -190,14 +190,8 @@ public class SchemaConfiguration extends Configuration implements Serializable {
                         continue uniquecheck;
                     }
                     try {
-                        SolrDocumentList docs = segment.fulltext().getDefaultConnector().getDocumentListByQuery("-" + CollectionSchema.id.getSolrFieldName() + ":\"" + urlhash + "\" AND " + CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + hostid + "\" AND " + signaturefield.getSolrFieldName() + ":\"" + signature.toString() + "\"", null, 0, 100, CollectionSchema.id.getSolrFieldName());
-                        if (docs.getNumFound() == 0) {
-                            sid.setField(uniquefield.getSolrFieldName(), true);
-                        } else {
-                            boolean firstappearance = true;
-                            for (SolrDocument d: docs) {if (uniqueURLs.contains(d.getFieldValue(CollectionSchema.id.getSolrFieldName()))) firstappearance = false; break;}
-                            sid.setField(uniquefield.getSolrFieldName(), firstappearance);
-                        }
+                        long doccount = segment.fulltext().getDefaultConnector().getCountByQuery("-" + CollectionSchema.id.getSolrFieldName() + ":\"" + urlhash + "\" AND " + CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + hostid + "\" AND " + signaturefield.getSolrFieldName() + ":\"" + signature.toString() + "\"");
+                        sid.setField(uniquefield.getSolrFieldName(), doccount == 0);
                     } catch (final IOException e) {}
                 }
             }
diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java
index a81320e30..5dba2e9d6 100644
--- a/source/net/yacy/search/schema/CollectionConfiguration.java
+++ b/source/net/yacy/search/schema/CollectionConfiguration.java
@@ -397,13 +397,11 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
         // we use the SolrCell design as index schema
         SolrVector doc = new SolrVector();
         final DigestURL digestURL = document.dc_source();
-        final String id = ASCII.String(digestURL.hash());
         boolean allAttr = this.isEmpty();
         String url = addURIAttributes(doc, allAttr, digestURL, Response.docType(digestURL));
         
         Set<ProcessType> processTypes = new LinkedHashSet<ProcessType>();
         String host = digestURL.getHost();
-        String us = digestURL.toNormalform(true);
         
         int crawldepth = document.getDepth();
         if ((allAttr || contains(CollectionSchema.crawldepth_i))) {
@@ -562,9 +560,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
             // bit 15: "noimageindex" contained in http header X-Robots-Tag
             // bit 16: "unavailable_after" contained in http header X-Robots-Tag
             int b = 0;
-            final String robots_meta = html.getMetas().get("robots");
+            String robots_meta = html.getMetas().get("robots");
             // this tag may have values: all, index, noindex, nofollow; see http://www.robotstxt.org/meta.html
             if (robots_meta != null) {
+                robots_meta = robots_meta.toLowerCase(java.util.Locale.ROOT); // locale-independent: a tr/az default locale maps 'I' to dotless 'ı' and would break the "index"/"noindex" matches
                 if (robots_meta.indexOf("all",0) >= 0) b += 1;      // set bit 0
                 if (robots_meta.indexOf("index",0) == 0 || robots_meta.indexOf(" index",0) >= 0 || robots_meta.indexOf(",index",0) >= 0 ) b += 2; // set bit 1
                 if (robots_meta.indexOf("follow",0) == 0 || robots_meta.indexOf(" follow",0) >= 0 || robots_meta.indexOf(",follow",0) >= 0 ) b += 4; // set bit 2
@@ -579,6 +578,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
                 }
             }
             if (!x_robots_tag.isEmpty()) {
+                x_robots_tag = x_robots_tag.toLowerCase(java.util.Locale.ROOT); // locale-independent: a tr/az default locale maps 'I' to dotless 'ı' and would break the "noindex"/"noimageindex" matches
                 // this tag may have values: all, noindex, nofollow, noarchive, nosnippet, noodp, notranslate, noimageindex, unavailable_after, none; see https://developers.google.com/webmasters/control-crawl-index/docs/robots_meta_tag?hl=de
                 if (x_robots_tag.indexOf("all",0) >= 0) b += 1<<8;                // set bit 8
                 if (x_robots_tag.indexOf("noindex",0) >= 0||x_robots_tag.indexOf("none",0) >= 0) b += 1<<9;   // set bit 9
@@ -754,14 +754,14 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
                         }
                     }
                 }
-                if (canonical != null && !ASCII.String(canonical.hash()).equals(id)) {
+                if (canonical != null) { // a canonical whose hash equals id is no longer skipped
                     containsCanonical = true;
                     inboundLinks.remove(canonical);
                     outboundLinks.remove(canonical);
                     add(doc, CollectionSchema.canonical_s, canonical.toNormalform(false));
                     // set a flag if this is equal to sku
                     if (contains(CollectionSchema.canonical_equal_sku_b)) {
-                        add(doc, CollectionSchema.canonical_equal_sku_b, canonical.equals(us));
+                        add(doc, CollectionSchema.canonical_equal_sku_b, canonical.equals(digestURL)); // compare against the URL object instead of its normal-form string
                     }
                 }
             }

From fb3dd56b02125f9293fa67eb47e04f64e82d4903 Mon Sep 17 00:00:00 2001
From: Michael Peter Christen <mc@yacy.net>
Date: Thu, 10 Jul 2014 17:13:35 +0200
Subject: [PATCH 7/9] fix for processing of noindex flag in http header

---
 .../net/yacy/cora/protocol/ResponseHeader.java   |  3 ++-
 source/net/yacy/document/Document.java           | 11 +++++++++--
 source/net/yacy/repository/LoaderDispatcher.java | 16 ++++++++++++++--
 .../search/schema/CollectionConfiguration.java   |  9 +--------
 4 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/source/net/yacy/cora/protocol/ResponseHeader.java b/source/net/yacy/cora/protocol/ResponseHeader.java
index 92f4fd84b..7c1b3924c 100644
--- a/source/net/yacy/cora/protocol/ResponseHeader.java
+++ b/source/net/yacy/cora/protocol/ResponseHeader.java
@@ -108,6 +108,7 @@ public class ResponseHeader extends HeaderFramework {
         if (x_robots_tag.isEmpty()) {
             x_robots_tag = this.get(HeaderFramework.X_ROBOTS, "");
         }
-        return x_robots_tag;
+        return x_robots_tag.toLowerCase(java.util.Locale.ROOT); // locale-independent lower-casing so that callers can match tokens such as "noindex" under any default locale
     }
+
 }
diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java
index 040227428..fe3a1b0c2 100644
--- a/source/net/yacy/document/Document.java
+++ b/source/net/yacy/document/Document.java
@@ -90,7 +90,7 @@ public class Document {
     private MultiProtocolURL favicon;
     private boolean resorted;
     private final Set<String> languages;
-    private final boolean indexingDenied;
+    private boolean indexingDenied; // not final: can be changed after construction via setIndexingDenied(boolean)
     private final double lon, lat;
     private final Object parserObject; // the source object that was used to create the Document
     private final Map<String, Set<String>> generic_facets; // a map from vocabulary names to the set of tags for that vocabulary which apply for this document
@@ -733,6 +733,10 @@ dc_rights
         return this.indexingDenied;
     }
 
+    public void setIndexingDenied(boolean indexingDenied) { // overwrites the indexing-denied flag of this document
+        this.indexingDenied = indexingDenied;
+    }
+
     public void setDepth(int depth) {
         this.crawldepth = depth;
     }
@@ -819,6 +823,7 @@ dc_rights
         final LinkedHashMap<AnchorURL, ImageEntry> images = new LinkedHashMap<AnchorURL, ImageEntry>();
         final Set<String> languages = new HashSet<String>();
         double lon = 0.0d, lat = 0.0d;
+        boolean indexingDenied = false; // starts as "allowed"
         Date date = new Date();
         String charset = null;
 
@@ -867,6 +872,8 @@ dc_rights
             
             if (doc.getDepth() < mindepth) mindepth = doc.getDepth();
             if (doc.dc_language() != null) languages.add(doc.dc_language());
+            
+            indexingDenied |= doc.indexingDenied; // one denied document is enough to set the accumulated flag
         }
 
         // clean up parser data
@@ -898,7 +905,7 @@ dc_rights
                 anchors,
                 rss,
                 images,
-                false,
+                indexingDenied, // pass the collected flag instead of a constant false
                 date);
         newDoc.setDepth(mindepth);
         return newDoc;
diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java
index ca15d13fb..c25cde6d6 100644
--- a/source/net/yacy/repository/LoaderDispatcher.java
+++ b/source/net/yacy/repository/LoaderDispatcher.java
@@ -355,7 +355,14 @@ public final class LoaderDispatcher {
         if (response.getContent() == null || response.getResponseHeader() == null) throw new IOException("no Content available for url " + url);
 
         // parse resource
-        return response.parse();
+        Document[] documents = response.parse();
+
+        String x_robots_tag = response.getResponseHeader().getXRobotsTag(); // response header was checked for null above
+        if (x_robots_tag.indexOf("noindex",0) >= 0) { // "noindex" in the X-Robots-Tag header applies to every parsed document
+            for (Document d: documents) d.setIndexingDenied(true);
+        }
+        
+        return documents;
     }
 
     public Document loadDocument(final DigestURL location, final CacheStrategy cachePolicy, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
 
     public Document loadDocument(final DigestURL location, final CacheStrategy cachePolicy, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
@@ -371,7 +378,12 @@ public final class LoaderDispatcher {
         // parse resource
         try {
             Document[] documents = response.parse();
-            return Document.mergeDocuments(location, response.getMimeType(), documents);
+            Document merged = Document.mergeDocuments(location, response.getMimeType(), documents);
+            
+            String x_robots_tag = response.getResponseHeader().getXRobotsTag(); // NOTE(review): assumes the response header is non-null here - confirm against the code above this hunk
+            if (x_robots_tag.indexOf("noindex",0) >= 0) merged.setIndexingDenied(true); // "noindex" in the X-Robots-Tag header denies indexing of the merged document
+            
+            return merged;
         } catch(final Parser.Failure e) {
             throw new IOException(e.getMessage());
         }
diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java
index 5dba2e9d6..f000d20da 100644
--- a/source/net/yacy/search/schema/CollectionConfiguration.java
+++ b/source/net/yacy/search/schema/CollectionConfiguration.java
@@ -570,15 +570,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
                 if (robots_meta.indexOf("noindex",0) >= 0) b += 8;  // set bit 3
                 if (robots_meta.indexOf("nofollow",0) >= 0) b += 16; // set bit 4
             }
-            String x_robots_tag = "";
-            if (responseHeader != null) {
-                x_robots_tag = responseHeader.get(HeaderFramework.X_ROBOTS_TAG, "");
-                if (x_robots_tag.isEmpty()) {
-                    x_robots_tag = responseHeader.get(HeaderFramework.X_ROBOTS, "");
-                }
-            }
+            String x_robots_tag = responseHeader == null ? "" : responseHeader.getXRobotsTag(); // keep the null guard of the removed code: without a response header there is no X-Robots-Tag
             if (!x_robots_tag.isEmpty()) {
-                x_robots_tag = x_robots_tag.toLowerCase();
                 // this tag may have values: all, noindex, nofollow, noarchive, nosnippet, noodp, notranslate, noimageindex, unavailable_after, none; see https://developers.google.com/webmasters/control-crawl-index/docs/robots_meta_tag?hl=de
                 if (x_robots_tag.indexOf("all",0) >= 0) b += 1<<8;                // set bit 8
                 if (x_robots_tag.indexOf("noindex",0) >= 0||x_robots_tag.indexOf("none",0) >= 0) b += 1<<9;   // set bit 9

From a694b6a8fcb83962fb19ffeea7bef8830d7f1ad6 Mon Sep 17 00:00:00 2001
From: Michael Peter Christen <mc@yacy.net>
Date: Thu, 10 Jul 2014 17:25:33 +0200
Subject: [PATCH 8/9] another fix for unique field computation

---
 .../yacy/cora/federate/solr/SchemaConfiguration.java  | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/source/net/yacy/cora/federate/solr/SchemaConfiguration.java b/source/net/yacy/cora/federate/solr/SchemaConfiguration.java
index c61955004..7e2696088 100644
--- a/source/net/yacy/cora/federate/solr/SchemaConfiguration.java
+++ b/source/net/yacy/cora/federate/solr/SchemaConfiguration.java
@@ -190,7 +190,16 @@ public class SchemaConfiguration extends Configuration implements Serializable {
                         continue uniquecheck;
                     }
                     try {
-                        long doccount = segment.fulltext().getDefaultConnector().getCountByQuery("-" + CollectionSchema.id.getSolrFieldName() + ":\"" + urlhash + "\" AND " + CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + hostid + "\" AND " + signaturefield.getSolrFieldName() + ":\"" + signature.toString() + "\"");
+                        long doccount = segment.fulltext().getDefaultConnector().getCountByQuery(
+                                CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + hostid + "\" AND " + // restrict to the given host id
+                                "-" + CollectionSchema.robots_i.getSolrFieldName() + ":8 AND " + // bit 3
+                                "-" + CollectionSchema.robots_i.getSolrFieldName() + ":24 AND " + // bit 3 + 4
+                                "-" + CollectionSchema.robots_i.getSolrFieldName() + ":512 AND " + // bit 9
+                                "-" + CollectionSchema.robots_i.getSolrFieldName() + ":1536 AND " + // bit 9 + 10
+                                "(-" + CollectionSchema.canonical_equal_sku_b.getSolrFieldName() + ":[* TO *] OR " + CollectionSchema.canonical_equal_sku_b.getSolrFieldName() + ":true ) AND " + // flag either absent or true
+                                CollectionSchema.httpstatus_i.getSolrFieldName() + ":200 AND " + // only entries with http status 200
+                                "-" + CollectionSchema.id.getSolrFieldName() + ":\"" + urlhash + "\" AND " + // exclude the entry with this url hash
+                                signaturefield.getSolrFieldName() + ":\"" + signature.toString() + "\""); // same signature value
                         sid.setField(uniquefield.getSolrFieldName(), doccount == 0);
                     } catch (final IOException e) {}
                 }

From 7057e0b3e2cf8d49b5a2f9fea6cd5388b7a862e6 Mon Sep 17 00:00:00 2001
From: reger <reger18@arcor.de>
Date: Thu, 10 Jul 2014 23:58:47 +0200
Subject: [PATCH 9/9] catch input file not found in Mediawiki import

---
 htroot/IndexImportMediawiki_p.java | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/htroot/IndexImportMediawiki_p.java b/htroot/IndexImportMediawiki_p.java
index dece70730..f36fd4b57 100644
--- a/htroot/IndexImportMediawiki_p.java
+++ b/htroot/IndexImportMediawiki_p.java
@@ -54,11 +54,16 @@ public class IndexImportMediawiki_p {
             } else {
                 if (post.containsKey("file")) {
                     final File sourcefile = new File(post.get("file"));
-                    MediawikiImporter.job = new MediawikiImporter(sourcefile, sb.surrogatesInPath);
-                    MediawikiImporter.job.start();
+                    if (sourcefile.exists()) { // start the import job only when the given path exists
+                        MediawikiImporter.job = new MediawikiImporter(sourcefile, sb.surrogatesInPath);
+                        MediawikiImporter.job.start();
+                        prop.put("import_dump", MediawikiImporter.job.source());
+                        prop.put("import_thread", "started");
+                    } else { // no job is created; report the missing path in the thread status field
+                        prop.put("import_dump", "");
+                        prop.put("import_thread", "Error: file not found ["+sourcefile+"]");
+                    }
                     prop.put("import", 1);
-                    prop.put("import_thread", "started");
-                    prop.put("import_dump", MediawikiImporter.job.source());
                     prop.put("import_count", 0);
                     prop.put("import_speed", 0);
                     prop.put("import_runningHours", 0);
@@ -66,7 +71,6 @@ public class IndexImportMediawiki_p {
                     prop.put("import_remainingHours", 0);
                     prop.put("import_remainingMinutes", 0);
                 }
-                return prop;
             }
         }
         return prop;