From b250e6466d541e0ac15d915b56055300b0eb84e1 Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 29 Sep 2011 15:17:39 +0000 Subject: [PATCH] implemented crawl restrictions for IP pattern and country lists git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7980 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/CrawlProfileEditor_p.java | 41 ++--- htroot/CrawlStartExpert_p.html | 2 +- htroot/CrawlStartExpert_p.java | 10 +- htroot/Crawler_p.java | 20 +- htroot/QuickCrawlLink_p.java | 8 +- source/de/anomic/crawler/CrawlProfile.java | 91 +++++++-- source/de/anomic/crawler/CrawlQueues.java | 4 +- source/de/anomic/crawler/CrawlStacker.java | 67 +++++-- .../de/anomic/crawler/CrawlSwitchboard.java | 16 +- .../yacy/cora/document/MultiProtocolURI.java | 18 +- source/net/yacy/cora/protocol/Domains.java | 7 +- source/net/yacy/cora/protocol/Scanner.java | 172 +++++++++--------- .../services/federated/solr/SolrScheme.java | 11 +- .../federated/solr/SolrSingleConnector.java | 2 +- .../yacy/kelondro/rwi/ReferenceContainer.java | 1 + source/net/yacy/search/Switchboard.java | 4 +- 16 files changed, 291 insertions(+), 183 deletions(-) diff --git a/htroot/CrawlProfileEditor_p.java b/htroot/CrawlProfileEditor_p.java index d1b0e7e00..660970d8b 100644 --- a/htroot/CrawlProfileEditor_p.java +++ b/htroot/CrawlProfileEditor_p.java @@ -9,7 +9,7 @@ // $LastChangedBy$ // // LICENSE -// +// // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or @@ -38,10 +38,9 @@ import net.yacy.cora.protocol.RequestHeader; import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.logging.Log; import net.yacy.search.Switchboard; - +import de.anomic.crawler.CrawlProfile; import de.anomic.crawler.CrawlStacker; import de.anomic.crawler.CrawlSwitchboard; -import de.anomic.crawler.CrawlProfile; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; import de.anomic.server.servletProperties; @@ -63,7 +62,7 @@ public class CrawlProfileEditor_p { ignoreNames.add(CrawlSwitchboard.DBFILE_ACTIVE_CRAWL_PROFILES); ignoreNames.add(CrawlSwitchboard.DBFILE_PASSIVE_CRAWL_PROFILES); } - + public static class eentry { public static final int BOOLEAN = 0; public static final int INTEGER = 1; @@ -73,7 +72,7 @@ public class CrawlProfileEditor_p { public final String label; public final boolean readonly; public final int type; - + public eentry(final String name, final String label, final boolean readonly, final int type) { this.name = name; this.label = label; @@ -81,7 +80,7 @@ public class CrawlProfileEditor_p { this.type = type; } } - + private static final List labels = new ArrayList(); static { labels.add(new eentry(CrawlProfile.NAME, "Name", true, eentry.STRING)); @@ -100,14 +99,14 @@ public class CrawlProfileEditor_p { labels.add(new eentry(CrawlProfile.XDSTOPW, "Dynamic stop-words", false, eentry.BOOLEAN)); labels.add(new eentry(CrawlProfile.XPSTOPW, "Parent stop-words", false, eentry.BOOLEAN)); } - + public static serverObjects respond( - final RequestHeader header, + final RequestHeader header, final serverObjects post, final serverSwitch env) { final servletProperties prop = new servletProperties(); final Switchboard sb = (Switchboard)env; - + // read post for handle final String handle = (post == null) ? 
"" : post.get("handle", ""); if (post != null) { @@ -117,8 +116,8 @@ public class CrawlProfileEditor_p { if (p != null) sb.crawler.putPassive(handle.getBytes(), p); // delete all entries from the crawl queue that are deleted here sb.crawler.removeActive(handle.getBytes()); - sb.crawlQueues.noticeURL.removeByProfileHandle(handle, 10000); - } catch (RowSpaceExceededException e) { + sb.crawlQueues.noticeURL.removeByProfileHandle(handle, 10000); + } catch (final RowSpaceExceededException e) { Log.logException(e); } if (post.containsKey("delete")) { @@ -131,7 +130,7 @@ public class CrawlProfileEditor_p { } } } - + // generate handle list: first sort by handle name CrawlProfile selentry; final Map orderdHandles = new TreeMap(); @@ -141,7 +140,7 @@ public class CrawlProfileEditor_p { orderdHandles.put(selentry.name(), selentry.handle()); } } - + // then write into pop-up menu list int count = 0; for (final Map.Entry NameHandle: orderdHandles.entrySet()) { @@ -159,8 +158,8 @@ public class CrawlProfileEditor_p { if ((post != null) && (selentry != null)) { if (post.containsKey("submit")) { try { - Pattern.compile(post.get(CrawlProfile.FILTER_URL_MUSTMATCH, CrawlProfile.MATCH_ALL)); - Pattern.compile(post.get(CrawlProfile.FILTER_URL_MUSTNOTMATCH, CrawlProfile.MATCH_NEVER)); + Pattern.compile(post.get(CrawlProfile.FILTER_URL_MUSTMATCH, CrawlProfile.MATCH_ALL_STRING)); + Pattern.compile(post.get(CrawlProfile.FILTER_URL_MUSTNOTMATCH, CrawlProfile.MATCH_NEVER_STRING)); final Iterator lit = labels.iterator(); eentry tee; while (lit.hasNext()) { @@ -179,7 +178,7 @@ public class CrawlProfileEditor_p { } } } - + // generate crawl profile table count = 0; boolean dark = true; @@ -231,10 +230,10 @@ public class CrawlProfileEditor_p { } prop.put("edit_entries", count); } - + return prop; } - + private static void putProfileEntry( final servletProperties prop, final CrawlStacker crawlStacker, @@ -253,8 +252,8 @@ public class CrawlProfileEditor_p { prop.putXML(CRAWL_PROFILE_PREFIX + count + "_startURL", profile.startURL()); prop.put(CRAWL_PROFILE_PREFIX + count + "_handle", profile.handle()); prop.put(CRAWL_PROFILE_PREFIX + count + "_depth", profile.depth()); - prop.put(CRAWL_PROFILE_PREFIX + count + "_mustmatch", profile.mustMatchPattern().toString()); - prop.put(CRAWL_PROFILE_PREFIX + count + "_mustnotmatch", profile.mustNotMatchPattern().toString()); + prop.put(CRAWL_PROFILE_PREFIX + count + "_mustmatch", profile.urlMustMatchPattern().toString()); + prop.put(CRAWL_PROFILE_PREFIX + count + "_mustnotmatch", profile.urlMustNotMatchPattern().toString()); prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingIfOlder", (profile.recrawlIfOlder() == 0L) ? "no re-crawl" : DateFormat.getDateTimeInstance().format(profile.recrawlIfOlder())); prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingDomFilterDepth", "inactive"); @@ -270,7 +269,7 @@ public class CrawlProfileEditor_p { i++; } } - + prop.put(CRAWL_PROFILE_PREFIX+count+"_crawlingDomFilterContent", i); prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingDomMaxPages", (profile.domMaxPages() == Integer.MAX_VALUE) ? "unlimited" : Integer.toString(profile.domMaxPages())); diff --git a/htroot/CrawlStartExpert_p.html b/htroot/CrawlStartExpert_p.html index 91ffbb23f..b91ea7b52 100644 --- a/htroot/CrawlStartExpert_p.html +++ b/htroot/CrawlStartExpert_p.html @@ -185,7 +185,7 @@ : Use filter   - +
no country code restriction diff --git a/htroot/CrawlStartExpert_p.java b/htroot/CrawlStartExpert_p.java index 812f17f59..4b1793e68 100644 --- a/htroot/CrawlStartExpert_p.java +++ b/htroot/CrawlStartExpert_p.java @@ -42,11 +42,11 @@ public class CrawlStartExpert_p { prop.put("starturl", /*(intranet) ? repository :*/ "http://"); prop.put("proxyPrefetchDepth", env.getConfig("proxyPrefetchDepth", "0")); prop.put("crawlingDepth", Math.min(3, env.getConfigLong("crawlingDepth", 0))); - prop.put("mustmatch", /*(intranet) ? repository + ".*" :*/ CrawlProfile.MATCH_ALL); - prop.put("mustnotmatch", CrawlProfile.MATCH_NEVER); - prop.put("ipMustmatch", sb.getConfig("crawlingIPMustMatch", CrawlProfile.MATCH_ALL)); - prop.put("ipMustnotmatch", sb.getConfig("crawlingIPMustNotMatch", CrawlProfile.MATCH_NEVER)); - prop.put("crawlingCountryMustMatch", sb.getConfig("crawlingCountryMustMatch", "")); + prop.put("mustmatch", /*(intranet) ? repository + ".*" :*/ CrawlProfile.MATCH_ALL_STRING); + prop.put("mustnotmatch", CrawlProfile.MATCH_NEVER_STRING); + prop.put("ipMustmatch", sb.getConfig("crawlingIPMustMatch", CrawlProfile.MATCH_ALL_STRING)); + prop.put("ipMustnotmatch", sb.getConfig("crawlingIPMustNotMatch", CrawlProfile.MATCH_NEVER_STRING)); + prop.put("countryMustMatch", sb.getConfig("crawlingCountryMustMatch", "")); prop.put("crawlingIfOlderCheck", "0"); prop.put("crawlingIfOlderUnitYearCheck", "0"); diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index 0dadc3ce4..fbef760bf 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -153,12 +153,12 @@ public class Crawler_p { final boolean subPath = "subpath".equals(post.get("range", "wide")); // special property in simple crawl start // set the crawl filter - String newcrawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL); - final String newcrawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER); - if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL; // avoid that all urls are filtered out if bad value was submitted - String ipMustMatch = post.get("ipMustmatch", CrawlProfile.MATCH_ALL); - final String ipMustNotMatch = post.get("ipMustnotmatch", CrawlProfile.MATCH_NEVER); - if (ipMustMatch.length() < 2) ipMustMatch = CrawlProfile.MATCH_ALL; + String newcrawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL_STRING); + final String newcrawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER_STRING); + if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL_STRING; // avoid that all urls are filtered out if bad value was submitted + String ipMustMatch = post.get("ipMustmatch", CrawlProfile.MATCH_ALL_STRING); + final String ipMustNotMatch = post.get("ipMustnotmatch", CrawlProfile.MATCH_NEVER_STRING); + if (ipMustMatch.length() < 2) ipMustMatch = CrawlProfile.MATCH_ALL_STRING; final String countryMustMatch = post.getBoolean("countryMustMatchSwitch", false) ? 
post.get("countryMustMatchList", "") : ""; sb.setConfig("crawlingIPMustMatch", ipMustMatch); sb.setConfig("crawlingIPMustNotMatch", ipMustNotMatch); @@ -439,7 +439,7 @@ public class Crawler_p { crawlingFileName, crawlURL, newcrawlingMustMatch, - CrawlProfile.MATCH_NEVER, + CrawlProfile.MATCH_NEVER_STRING, ipMustMatch, ipMustNotMatch, countryMustMatch, @@ -478,8 +478,8 @@ public class Crawler_p { final CrawlProfile pe = new CrawlProfile( sitemapURLStr, sitemapURL, - CrawlProfile.MATCH_ALL, - CrawlProfile.MATCH_NEVER, + CrawlProfile.MATCH_ALL_STRING, + CrawlProfile.MATCH_NEVER_STRING, ipMustMatch, ipMustNotMatch, countryMustMatch, @@ -523,7 +523,7 @@ public class Crawler_p { sitelistURL.getHost(), sitelistURL, newcrawlingMustMatch, - CrawlProfile.MATCH_NEVER, + CrawlProfile.MATCH_NEVER_STRING, ipMustMatch, ipMustNotMatch, countryMustMatch, diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java index a61d07de2..7de24d99a 100644 --- a/htroot/QuickCrawlLink_p.java +++ b/htroot/QuickCrawlLink_p.java @@ -108,8 +108,8 @@ public class QuickCrawlLink_p { final String title = post.get("title",null); // get other parameters if set - final String crawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL); - final String crawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER); + final String crawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL_STRING); + final String crawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER_STRING); final int CrawlingDepth = post.getInt("crawlingDepth", 0); final boolean crawlDynamic = post.get("crawlingQ", "").equals("on"); final boolean indexText = post.get("indexText", "on").equals("on"); @@ -149,8 +149,8 @@ public class QuickCrawlLink_p { crawlingStartURL.getHost(), crawlingStartURL, crawlingMustMatch, - CrawlProfile.MATCH_ALL, - CrawlProfile.MATCH_NEVER, + CrawlProfile.MATCH_ALL_STRING, + CrawlProfile.MATCH_NEVER_STRING, "", crawlingMustNotMatch, CrawlingDepth, diff --git a/source/de/anomic/crawler/CrawlProfile.java b/source/de/anomic/crawler/CrawlProfile.java index 4705fa7c2..cce5e2688 100644 --- a/source/de/anomic/crawler/CrawlProfile.java +++ b/source/de/anomic/crawler/CrawlProfile.java @@ -41,8 +41,10 @@ public class CrawlProfile extends ConcurrentHashMap implements M private static final long serialVersionUID = 5527325718810703504L; - public static final String MATCH_ALL = ".*"; - public static final String MATCH_NEVER = ""; + public static final String MATCH_ALL_STRING = ".*"; + public static final String MATCH_NEVER_STRING = ""; + public static final Pattern MATCH_ALL_PATTERN = Pattern.compile(MATCH_ALL_STRING); + public static final Pattern MATCH_NEVER_PATTERN = Pattern.compile(MATCH_NEVER_STRING); // this is a simple record structure that hold all properties of a single crawl start public static final String HANDLE = "handle"; @@ -67,7 +69,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M public static final String FILTER_IP_MUSTNOTMATCH = "crawlingIPMustNotMatch"; public static final String FILTER_COUNTRY_MUSTMATCH = "crawlingCountryMustMatch"; - private Pattern mustmatch = null, mustnotmatch = null; + private Pattern urlmustmatch = null, urlmustnotmatch = null, ipmustmatch = null, ipmustnotmatch = null; /** * Constructor which creates CrawlPofile from parameters. @@ -119,10 +121,10 @@ public class CrawlProfile extends ConcurrentHashMap implements M put(HANDLE, handle); put(NAME, name); put(START_URL, (startURL == null) ? 
"" : startURL.toNormalform(true, false)); - put(FILTER_URL_MUSTMATCH, (urlMustMatch == null) ? CrawlProfile.MATCH_ALL : urlMustMatch); - put(FILTER_URL_MUSTNOTMATCH, (urlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER : urlMustNotMatch); - put(FILTER_IP_MUSTMATCH, (ipMustMatch == null) ? CrawlProfile.MATCH_ALL : ipMustMatch); - put(FILTER_IP_MUSTNOTMATCH, (ipMustNotMatch == null) ? CrawlProfile.MATCH_NEVER : ipMustNotMatch); + put(FILTER_URL_MUSTMATCH, (urlMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : urlMustMatch); + put(FILTER_URL_MUSTNOTMATCH, (urlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : urlMustNotMatch); + put(FILTER_IP_MUSTMATCH, (ipMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : ipMustMatch); + put(FILTER_IP_MUSTNOTMATCH, (ipMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : ipMustNotMatch); put(FILTER_COUNTRY_MUSTMATCH, (countryMustMatch == null) ? "" : countryMustMatch); put(DEPTH, depth); put(RECRAWL_IF_OLDER, recrawlIfOlder); @@ -207,26 +209,77 @@ public class CrawlProfile extends ConcurrentHashMap implements M * Gets the regex which must be matched by URLs in order to be crawled. * @return regex which must be matched */ - public Pattern mustMatchPattern() { - if (this.mustmatch == null) { - String r = get(FILTER_URL_MUSTMATCH); - if (r == null) r = CrawlProfile.MATCH_ALL; - this.mustmatch = Pattern.compile(r); + public Pattern urlMustMatchPattern() { + if (this.urlmustmatch == null) { + final String r = get(FILTER_URL_MUSTMATCH); + if (r == null || r.equals(CrawlProfile.MATCH_ALL_STRING)) { + this.urlmustmatch = CrawlProfile.MATCH_ALL_PATTERN; + } else { + this.urlmustmatch = Pattern.compile(r); + } } - return this.mustmatch; + return this.urlmustmatch; } /** * Gets the regex which must not be matched by URLs in order to be crawled. * @return regex which must not be matched */ - public Pattern mustNotMatchPattern() { - if (this.mustnotmatch == null) { - String r = get(FILTER_URL_MUSTNOTMATCH); - if (r == null) r = CrawlProfile.MATCH_NEVER; - this.mustnotmatch = Pattern.compile(r); + public Pattern urlMustNotMatchPattern() { + if (this.urlmustnotmatch == null) { + final String r = get(FILTER_URL_MUSTNOTMATCH); + if (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) { + this.urlmustnotmatch = CrawlProfile.MATCH_NEVER_PATTERN; + } else { + this.urlmustnotmatch = Pattern.compile(r); + } } - return this.mustnotmatch; + return this.urlmustnotmatch; + } + + /** + * Gets the regex which must be matched by IPs in order to be crawled. + * @return regex which must be matched + */ + public Pattern ipMustMatchPattern() { + if (this.ipmustmatch == null) { + final String r = get(FILTER_IP_MUSTMATCH); + if (r == null || r.equals(CrawlProfile.MATCH_ALL_STRING)) { + this.ipmustmatch = CrawlProfile.MATCH_ALL_PATTERN; + } else { + this.ipmustmatch = Pattern.compile(r); + } + } + return this.ipmustmatch; + } + + /** + * Gets the regex which must not be matched by IPs in order to be crawled. 
+ * @return regex which must not be matched */ + public Pattern ipMustNotMatchPattern() { + if (this.ipmustnotmatch == null) { + final String r = get(FILTER_IP_MUSTNOTMATCH); + if (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) { + this.ipmustnotmatch = CrawlProfile.MATCH_NEVER_PATTERN; + } else { + this.ipmustnotmatch = Pattern.compile(r); + } + } + return this.ipmustnotmatch; + } + + /** + * get the list of countries that must match for the locations of the URLs' IPs + * @return a list of country codes + */ + public String[] countryMustMatchList() { + String countryMustMatch = get(FILTER_COUNTRY_MUSTMATCH); + if (countryMustMatch == null) countryMustMatch = ""; + if (countryMustMatch.length() == 0) return new String[0]; + String[] list = countryMustMatch.split(","); + if (list.length == 1 && list[0].length() == 0) list = new String[0]; + return list; } /** diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java index 6413ea782..2bdce5c45 100644 --- a/source/de/anomic/crawler/CrawlQueues.java +++ b/source/de/anomic/crawler/CrawlQueues.java @@ -299,8 +299,8 @@ public class CrawlQueues { + ", crawlOrder=" + ((profile.remoteIndexing()) ? "true" : "false") + ", depth=" + urlEntry.depth() + ", crawlDepth=" + profile.depth() - + ", must-match=" + profile.mustMatchPattern().toString() - + ", must-not-match=" + profile.mustNotMatchPattern().toString() + + ", must-match=" + profile.urlMustMatchPattern().toString() + + ", must-not-match=" + profile.urlMustNotMatchPattern().toString() + ", permission=" + ((this.sb.peers == null) ? "undefined" : (((this.sb.peers.mySeed().isSenior()) || (this.sb.peers.mySeed().isPrincipal())) ? "true" : "false"))); // work off one Crawl stack entry diff --git a/source/de/anomic/crawler/CrawlStacker.java b/source/de/anomic/crawler/CrawlStacker.java index fc6f2a283..149e01632 100644 --- a/source/de/anomic/crawler/CrawlStacker.java +++ b/source/de/anomic/crawler/CrawlStacker.java @@ -34,6 +34,7 @@ import java.net.MalformedURLException; import java.net.UnknownHostException; import java.util.Date; import java.util.Iterator; +import java.util.Locale; import java.util.Map; import java.util.Properties; import java.util.concurrent.BlockingQueue; @@ -438,8 +439,9 @@ public final class CrawlStacker { // check if the protocol is supported final String urlProtocol = url.getProtocol(); + final String urlstring = url.toString(); if (!Switchboard.getSwitchboard().loader.isSupportedProtocol(urlProtocol)) { - this.log.logSevere("Unsupported protocol in URL '" + url.toString() + "'."); + this.log.logSevere("Unsupported protocol in URL '" + urlstring + "'."); return "unsupported protocol"; } @@ -452,31 +454,31 @@ public final class CrawlStacker { // check blacklist if (Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, url)) { - if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is in blacklist."); + if (this.log.isFine()) this.log.logFine("URL '" + urlstring + "' is in blacklist."); return "url in blacklist"; } - // filter with must-match - if ((depth > 0) && !profile.mustMatchPattern().matcher(url.toString()).matches()) { - if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' does not match must-match crawling filter '" + profile.mustMatchPattern().toString() + "'."); + // filter with must-match for URLs + if ((depth > 0) && !profile.urlMustMatchPattern().matcher(urlstring).matches()) { + if (this.log.isFine()) this.log.logFine("URL '" + urlstring + "' does not match must-match crawling
filter '" + profile.urlMustMatchPattern().toString() + "'."); return "url does not match must-match filter"; } - // filter with must-not-match - if ((depth > 0) && profile.mustNotMatchPattern().matcher(url.toString()).matches()) { - if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' does matches do-not-match crawling filter '" + profile.mustNotMatchPattern().toString() + "'."); + // filter with must-not-match for URLs + if ((depth > 0) && profile.urlMustNotMatchPattern().matcher(urlstring).matches()) { + if (this.log.isFine()) this.log.logFine("URL '" + urlstring + "' matches must-not-match crawling filter '" + profile.urlMustNotMatchPattern().toString() + "'."); return "url matches must-not-match filter"; } // deny cgi if (url.isIndividual() && !(profile.crawlingQ())) { // TODO: make special property for crawlingIndividual - if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is CGI URL."); + if (this.log.isFine()) this.log.logFine("URL '" + urlstring + "' is CGI URL."); return "individual url (sessionid etc) not wanted"; } // deny post properties if (url.isPOST() && !(profile.crawlingQ())) { - if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is post URL."); + if (this.log.isFine()) this.log.logFine("URL '" + urlstring + "' is post URL."); return "post url not allowed"; } @@ -486,7 +488,7 @@ public final class CrawlStacker { if (oldEntry == null) { if (dbocc != null) { // do double-check - if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is double registered in '" + dbocc + "'."); + if (this.log.isFine()) this.log.logFine("URL '" + urlstring + "' is double registered in '" + dbocc + "'."); if (dbocc.equals("errors")) { final ZURL.Entry errorEntry = this.nextQueue.errorURL.get(url.hash()); return "double in: errors (" + errorEntry.anycause() + ")"; @@ -498,13 +500,13 @@ public final class CrawlStacker { final boolean recrawl = profile.recrawlIfOlder() > oldEntry.loaddate().getTime(); if (recrawl) { if (this.log.isInfo()) - this.log.logInfo("RE-CRAWL of URL '" + url.toString() + "': this url was crawled " + + this.log.logInfo("RE-CRAWL of URL '" + urlstring + "': this url was crawled " + ((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000 / 60 / 24) + " days ago."); } else { if (dbocc == null) { return "double in: LURL-DB"; } else { - if (this.log.isInfo()) this.log.logInfo("URL '" + url.toString() + "' is double registered in '" + dbocc + "'. " + "Stack processing time:"); + if (this.log.isInfo()) this.log.logInfo("URL '" + urlstring + "' is double registered in '" + dbocc + "'. 
" + "Stack processing time:"); if (dbocc.equals("errors")) { final ZURL.Entry errorEntry = this.nextQueue.errorURL.get(url.hash()); return "double in: errors (" + errorEntry.anycause() + ")"; @@ -520,16 +522,51 @@ public final class CrawlStacker { if (maxAllowedPagesPerDomain < Integer.MAX_VALUE) { final DomProfile dp = this.doms.get(url.getHost()); if (dp != null && dp.count >= maxAllowedPagesPerDomain) { - if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' appeared too often in crawl stack, a maximum of " + profile.domMaxPages() + " is allowed."); + if (this.log.isFine()) this.log.logFine("URL '" + urlstring + "' appeared too often in crawl stack, a maximum of " + profile.domMaxPages() + " is allowed."); return "crawl stack domain counter exceeded"; } if (ResultURLs.domainCount(EventOrigin.LOCAL_CRAWLING, url.getHost()) >= profile.domMaxPages()) { - if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' appeared too often in result stack, a maximum of " + profile.domMaxPages() + " is allowed."); + if (this.log.isFine()) this.log.logFine("URL '" + urlstring + "' appeared too often in result stack, a maximum of " + profile.domMaxPages() + " is allowed."); return "result stack domain counter exceeded"; } } + // the following filters use a DNS lookup to check if the url matches with IP filter + // this is expensive and those filters are check at the end of all other tests + + // filter with must-match for IPs + if ((depth > 0) && profile.ipMustMatchPattern() != CrawlProfile.MATCH_ALL_PATTERN && !profile.ipMustMatchPattern().matcher(url.getInetAddress().getHostAddress()).matches()) { + if (this.log.isFine()) this.log.logFine("IP " + url.getInetAddress().getHostAddress() + " of URL '" + urlstring + "' does not match must-match crawling filter '" + profile.ipMustMatchPattern().toString() + "'."); + return "ip " + url.getInetAddress().getHostAddress() + " of url does not match must-match filter"; + } + + // filter with must-not-match for IPs + if ((depth > 0) && profile.ipMustMatchPattern() != CrawlProfile.MATCH_NEVER_PATTERN && profile.ipMustNotMatchPattern().matcher(url.getInetAddress().getHostAddress()).matches()) { + if (this.log.isFine()) this.log.logFine("IP " + url.getInetAddress().getHostAddress() + " of URL '" + urlstring + "' matches must-not-match crawling filter '" + profile.ipMustMatchPattern().toString() + "'."); + return "ip " + url.getInetAddress().getHostAddress() + " of url matches must-not-match filter"; + } + + // filter with must-match for IPs + final String[] countryMatchList = profile.countryMustMatchList(); + if (depth > 0 && countryMatchList != null && countryMatchList.length > 0) { + final Locale locale = url.getLocale(); + if (locale != null) { + final String c0 = locale.getCountry(); + boolean granted = false; + matchloop: for (final String c: countryMatchList) { + if (c0.equals(c)) { + granted = true; + break matchloop; + } + } + if (!granted) { + if (this.log.isFine()) this.log.logFine("IP " + url.getInetAddress().getHostAddress() + " of URL '" + urlstring + "' does not match must-match crawling filter '" + profile.ipMustMatchPattern().toString() + "'."); + return "country " + c0 + " of url does not match must-match filter for countries"; + } + } + } + return null; } diff --git a/source/de/anomic/crawler/CrawlSwitchboard.java b/source/de/anomic/crawler/CrawlSwitchboard.java index fe2013edf..ea2fb73d4 100644 --- a/source/de/anomic/crawler/CrawlSwitchboard.java +++ b/source/de/anomic/crawler/CrawlSwitchboard.java @@ -229,8 +229,8 @@ 
public final class CrawlSwitchboard { // generate new default entry for proxy crawling this.defaultProxyProfile = new CrawlProfile( "proxy", null, - CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, - CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, + CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, + CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, false, @@ -243,38 +243,38 @@ public final class CrawlSwitchboard { } if (this.defaultRemoteProfile == null) { // generate new default entry for remote crawling - this.defaultRemoteProfile = new CrawlProfile(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", CrawlProfile.MATCH_NEVER, 0, + this.defaultRemoteProfile = new CrawlProfile(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0, -1, -1, true, true, true, false, false, true, true, false, CacheStrategy.IFFRESH); this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultRemoteProfile.handle()), this.defaultRemoteProfile); } if (this.defaultTextSnippetLocalProfile == null) { // generate new default entry for snippet fetch and optional crawling - this.defaultTextSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", 0, + this.defaultTextSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, true, false, false, true, false, true, true, false, CacheStrategy.IFEXIST); this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultTextSnippetLocalProfile.handle()), this.defaultTextSnippetLocalProfile); } if (this.defaultTextSnippetGlobalProfile == null) { // generate new default entry for snippet fetch and optional crawling - this.defaultTextSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", 0, + this.defaultTextSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, true, true, true, true, false, true, true, false, CacheStrategy.IFEXIST); this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()), this.defaultTextSnippetGlobalProfile); } this.defaultTextSnippetGlobalProfile.setCacheStrategy(CacheStrategy.IFEXIST); if (this.defaultMediaSnippetLocalProfile == null) { // generate new default entry for snippet fetch and optional crawling - this.defaultMediaSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", 0, + this.defaultMediaSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING,
CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, true, false, false, true, false, true, true, false, CacheStrategy.IFEXIST); this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultMediaSnippetLocalProfile.handle()), this.defaultMediaSnippetLocalProfile); } if (this.defaultMediaSnippetGlobalProfile == null) { // generate new default entry for snippet fetch and optional crawling - this.defaultMediaSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", 0, + this.defaultMediaSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, true, false, true, true, false, true, true, false, CacheStrategy.IFEXIST); this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultMediaSnippetGlobalProfile.handle()), this.defaultMediaSnippetGlobalProfile); } if (this.defaultSurrogateProfile == null) { // generate new default entry for surrogate parsing - this.defaultSurrogateProfile = new CrawlProfile(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", 0, + this.defaultSurrogateProfile = new CrawlProfile(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, true, true, false, false, false, true, true, false, CacheStrategy.NOCACHE); this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultSurrogateProfile.handle()), this.defaultSurrogateProfile); } diff --git a/source/net/yacy/cora/document/MultiProtocolURI.java b/source/net/yacy/cora/document/MultiProtocolURI.java index 9eab711b5..1847de5f4 100644 --- a/source/net/yacy/cora/document/MultiProtocolURI.java +++ b/source/net/yacy/cora/document/MultiProtocolURI.java @@ -88,6 +88,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolURI> { [most of this file's hunks were lost in extraction; they add a cached getInetAddress() lookup and extend getLocale(), whose surviving tail follows] + if (locale != null && locale.getCountry() != null && locale.getCountry().length() > 0) return locale; + } return Domains.getLocale(this.host); } diff --git a/source/net/yacy/cora/protocol/Domains.java b/source/net/yacy/cora/protocol/Domains.java index 193b263e3..cddd090e5 100644 --- a/source/net/yacy/cora/protocol/Domains.java +++ b/source/net/yacy/cora/protocol/Domains.java @@ -550,6 +550,11 @@ public class Domains { cacheHit_Insert++; } + /** + * resolve a host address using a local DNS cache and a DNS lookup if necessary + * @param host + * @return the host's InetAddress or null if the address cannot be resolved + */ public static InetAddress dnsResolve(String host) { if ((host == null) || (host.length() == 0)) return null; host = host.toLowerCase().trim(); @@ -921,7 +926,7 @@ public class Domains { public static Locale getLocale(final String host) { if (host == null) return null; final Locale locale = getLocale(dnsResolve(host)); - if (locale != null) return locale; + if (locale != null && locale.getCountry() != null && locale.getCountry().length() > 0) return locale; final int p = host.lastIndexOf('.'); if (p < 0) return null; String tld = host.substring(p + 1).toUpperCase(); diff --git a/source/net/yacy/cora/protocol/Scanner.java
b/source/net/yacy/cora/protocol/Scanner.java index 7e6954d37..8c508e1a0 100644 --- a/source/net/yacy/cora/protocol/Scanner.java +++ b/source/net/yacy/cora/protocol/Scanner.java @@ -11,12 +11,12 @@ * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. - * + * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program in the file lgpl21.txt * If not, see . @@ -53,22 +53,22 @@ public class Scanner extends Thread { private static final Service POISONSERVICE = new Service(Protocol.http, null); private static final Object PRESENT = new Object(); - + public static enum Access {unknown, empty, granted, denied;} public static enum Protocol {http(80), https(443), ftp(21), smb(445); public int port; - private Protocol(int port) {this.port = port;} + private Protocol(final int port) {this.port = port;} } public static class Service { public Protocol protocol; public InetAddress inetAddress; private String hostname; - public Service(Protocol protocol, InetAddress inetAddress) { + public Service(final Protocol protocol, final InetAddress inetAddress) { this.protocol = protocol; this.inetAddress = inetAddress; this.hostname = null; } - public Service(String protocol, InetAddress inetAddress) { + public Service(final String protocol, final InetAddress inetAddress) { this.protocol = protocol.equals("http") ? Protocol.http : protocol.equals("https") ? Protocol.https : protocol.equals("ftp") ? 
Protocol.ftp : Protocol.smb; this.inetAddress = inetAddress; this.hostname = null; @@ -92,7 +92,7 @@ public class Scanner extends Thread { try { this.hostname = TimeoutRequest.getHostName(this.inetAddress, 100); Domains.setHostName(this.inetAddress, this.hostname); - } catch (ExecutionException e) { + } catch (final ExecutionException e) { this.hostname = this.inetAddress.getHostAddress(); } //this.hostname = Domains.getHostName(this.inetAddress); @@ -105,7 +105,7 @@ public String toString() { try { return new MultiProtocolURI(this.protocol.name() + "://" + this.inetAddress.getHostAddress() + "/").toNormalform(true, false); - } catch (MalformedURLException e) { + } catch (final MalformedURLException e) { return ""; } } @@ -114,11 +114,11 @@ return this.inetAddress.hashCode(); } @Override - public boolean equals(Object o) { + public boolean equals(final Object o) { return (o instanceof Service) && ((Service) o).protocol == this.protocol && ((Service) o).inetAddress.equals(this.inetAddress); } } - + private final static Map<Service, Access> scancache = new ConcurrentHashMap<Service, Access>(); //private static long scancacheUpdateTime = 0; //private static long scancacheValidUntilTime = Long.MAX_VALUE; @@ -127,17 +127,17 @@ public static int scancacheSize() { return scancache.size(); } - - public static void scancacheReplace(Scanner newScanner, long validTime) { + + public static void scancacheReplace(final Scanner newScanner, final long validTime) { scancache.clear(); scancache.putAll(newScanner.services()); //scancacheUpdateTime = System.currentTimeMillis(); //scancacheValidUntilTime = validTime == Long.MAX_VALUE ? Long.MAX_VALUE : scancacheUpdateTime + validTime; scancacheScanrange = newScanner.scanrange; } - - public static void scancacheExtend(Scanner newScanner, long validTime) { - Iterator<Map.Entry<Service, Access>> i = Scanner.scancache.entrySet().iterator(); + + public static void scancacheExtend(final Scanner newScanner, final long validTime) { + final Iterator<Map.Entry<Service, Access>> i = Scanner.scancache.entrySet().iterator(); Map.Entry<Service, Access> entry; while (i.hasNext()) { entry = i.next(); @@ -148,11 +148,11 @@ //scancacheValidUntilTime = validTime == Long.MAX_VALUE ? Long.MAX_VALUE : scancacheUpdateTime + validTime; scancacheScanrange = newScanner.scanrange; } - + public static Iterator<Map.Entry<Service, Access>> scancacheEntries() { return scancache.entrySet().iterator(); } - + /** * check if the url can be accepted by the scanner. the scanner accepts the url if: * - the host of the url is not supervised (it is not in the scan range), or * - the host is in the scan range and access to its service has been granted * @param url * @return true if the url shall be part of a search result */ - public static boolean acceptURL(MultiProtocolURI url) { + public static boolean acceptURL(final MultiProtocolURI url) { // if the scan range is empty, then all urls are accepted if (scancacheScanrange == null || scancacheScanrange.isEmpty()) return true; - + //if (System.currentTimeMillis() > scancacheValidUntilTime) return true; - InetAddress a = Domains.dnsResolve(url.getHost()); // try to avoid that! + final InetAddress a = url.getInetAddress(); // try to avoid that!
if (a == null) return true; - InetAddress n = normalize(a); + final InetAddress n = normalize(a); if (!scancacheScanrange.contains(n)) return true; - Access access = scancache.get(new Service(url.getProtocol(), a)); + final Access access = scancache.get(new Service(url.getProtocol(), a)); if (access == null) return false; return access == Access.granted; } - private static InetAddress normalize(InetAddress a) { + private static InetAddress normalize(final InetAddress a) { if (a == null) return null; - byte[] b = a.getAddress(); + final byte[] b = a.getAddress(); if (b[3] == 1) return a; b[3] = 1; try { return InetAddress.getByAddress(b); - } catch (UnknownHostException e) { + } catch (final UnknownHostException e) { return a; } } - - private int runnerCount; - private Set scanrange; - private BlockingQueue scanqueue; - private Map services; - private Map runner; - private int timeout; - public Scanner(Set scanrange, int concurrentRunner, int timeout) { + private final int runnerCount; + private final Set scanrange; + private final BlockingQueue scanqueue; + private final Map services; + private final Map runner; + private final int timeout; + + public Scanner(final Set scanrange, final int concurrentRunner, final int timeout) { this.runnerCount = concurrentRunner; this.scanrange = new HashSet(); - for (InetAddress a: scanrange) this.scanrange.add(normalize(a)); + for (final InetAddress a: scanrange) this.scanrange.add(normalize(a)); this.scanqueue = new LinkedBlockingQueue(); this.services = Collections.synchronizedMap(new HashMap()); this.runner = new ConcurrentHashMap(); this.timeout = timeout; } - - public Scanner(int concurrentRunner, int timeout) { + + public Scanner(final int concurrentRunner, final int timeout) { this(Domains.myIntranetIPs(), concurrentRunner, timeout); } - + @Override public void run() { Service uri; try { - while ((uri = scanqueue.take()) != POISONSERVICE) { - while (runner.size() >= this.runnerCount) { + while ((uri = this.scanqueue.take()) != POISONSERVICE) { + while (this.runner.size() >= this.runnerCount) { /*for (Runner r: runner.keySet()) { if (r.age() > 3000) synchronized(r) { r.interrupt(); } }*/ - if (runner.size() >= this.runnerCount) Thread.sleep(20); + if (this.runner.size() >= this.runnerCount) Thread.sleep(20); } - Runner runner = new Runner(uri); + final Runner runner = new Runner(uri); this.runner.put(runner, PRESENT); runner.start(); } - } catch (InterruptedException e) { + } catch (final InterruptedException e) { } } public int pending() { return this.scanqueue.size(); } - + public void terminate() { - for (int i = 0; i < runnerCount; i++) try { + for (int i = 0; i < this.runnerCount; i++) try { this.scanqueue.put(POISONSERVICE); - } catch (InterruptedException e) { + } catch (final InterruptedException e) { } try { this.join(); - } catch (InterruptedException e) { + } catch (final InterruptedException e) { } } - + public class Runner extends Thread { - private Service service; - private long starttime; - public Runner(Service service) { + private final Service service; + private final long starttime; + public Runner(final Service service) { this.service = service; this.starttime = System.currentTimeMillis(); } @Override public void run() { try { - if (TimeoutRequest.ping(this.service.getInetAddress().getHostAddress(), this.service.getProtocol().port, timeout)) { + if (TimeoutRequest.ping(this.service.getInetAddress().getHostAddress(), this.service.getProtocol().port, Scanner.this.timeout)) { Access access = this.service.getProtocol() == 
Protocol.http || this.service.getProtocol() == Protocol.https ? Access.granted : Access.unknown; - services.put(service, access); + Scanner.this.services.put(this.service, access); if (access == Access.unknown) { // ask the service if it lets us in if (this.service.getProtocol() == Protocol.ftp) { @@ -261,35 +261,35 @@ public class Scanner extends Thread { try { ftpClient.open(this.service.getInetAddress().getHostAddress(), this.service.getProtocol().port); ftpClient.login("anonymous", "anomic@"); - List list = ftpClient.list("/", false); + final List list = ftpClient.list("/", false); ftpClient.CLOSE(); access = list == null || list.isEmpty() ? Access.empty : Access.granted; - } catch (IOException e) { + } catch (final IOException e) { access = Access.denied; } } if (this.service.getProtocol() == Protocol.smb) { try { - MultiProtocolURI uri = new MultiProtocolURI(this.service.toString()); - String[] list = uri.list(); + final MultiProtocolURI uri = new MultiProtocolURI(this.service.toString()); + final String[] list = uri.list(); access = list == null || list.length == 0 ? Access.empty : Access.granted; - } catch (IOException e) { + } catch (final IOException e) { access = Access.denied; } } } - if (access != Access.unknown) services.put(this.service, access); + if (access != Access.unknown) Scanner.this.services.put(this.service, access); } - } catch (ExecutionException e) { + } catch (final ExecutionException e) { } - Object r = runner.remove(this); + final Object r = Scanner.this.runner.remove(this); assert r != null; } public long age() { return System.currentTimeMillis() - this.starttime; } @Override - public boolean equals(Object o) { + public boolean equals(final Object o) { return (o instanceof Runner) && this.service.equals(((Runner) o).service); } @Override @@ -297,76 +297,76 @@ public class Scanner extends Thread { return this.service.hashCode(); } } - - public void addHTTP(boolean bigrange) { + + public void addHTTP(final boolean bigrange) { addProtocol(Protocol.http, bigrange); } - public void addHTTPS(boolean bigrange) { + public void addHTTPS(final boolean bigrange) { addProtocol(Protocol.https, bigrange); } - public void addSMB(boolean bigrange) { + public void addSMB(final boolean bigrange) { addProtocol(Protocol.smb, bigrange); } - - public void addFTP(boolean bigrange) { + + public void addFTP(final boolean bigrange) { addProtocol(Protocol.ftp, bigrange); } - - private void addProtocol(Protocol protocol, boolean bigrange) { - for (InetAddress i: genlist(bigrange)) { + + private void addProtocol(final Protocol protocol, final boolean bigrange) { + for (final InetAddress i: genlist(bigrange)) { try { this.scanqueue.put(new Service(protocol, i)); - } catch (InterruptedException e) { + } catch (final InterruptedException e) { } } } - - private final List genlist(boolean bigrange) { - ArrayList c = new ArrayList(10); - for (InetAddress i: scanrange) { + + private final List genlist(final boolean bigrange) { + final ArrayList c = new ArrayList(10); + for (final InetAddress i: this.scanrange) { for (int br = bigrange ? 1 : i.getAddress()[2]; br < (bigrange ? 
255 : i.getAddress()[2] + 1); br++) { for (int j = 1; j < 255; j++) { - byte[] address = i.getAddress(); + final byte[] address = i.getAddress(); address[2] = (byte) br; address[3] = (byte) j; try { c.add(InetAddress.getByAddress(address)); - } catch (UnknownHostException e) { + } catch (final UnknownHostException e) { } } } } return c; } - + public Map services() { return this.services; } - - public static byte[] inIndex(Map commentCache, String url) { - for (Map.Entry comment: commentCache.entrySet()) { + + public static byte[] inIndex(final Map commentCache, final String url) { + for (final Map.Entry comment: commentCache.entrySet()) { if (comment.getValue().contains(url)) return comment.getKey(); } return null; } - - public static void main(String[] args) { + + public static void main(final String[] args) { //try {System.out.println("192.168.1.91: " + ping(new MultiProtocolURI("smb://192.168.1.91/"), 1000));} catch (MalformedURLException e) {} - Scanner scanner = new Scanner(100, 10); + final Scanner scanner = new Scanner(100, 10); scanner.addFTP(false); scanner.addHTTP(false); scanner.addHTTPS(false); scanner.addSMB(false); scanner.start(); scanner.terminate(); - for (Service service: scanner.services().keySet()) { + for (final Service service: scanner.services().keySet()) { System.out.println(service.toString()); } try { HTTPClient.closeConnectionManager(); - } catch (InterruptedException e) { + } catch (final InterruptedException e) { } } } diff --git a/source/net/yacy/cora/services/federated/solr/SolrScheme.java b/source/net/yacy/cora/services/federated/solr/SolrScheme.java index 6b2a95ff2..28d43f778 100644 --- a/source/net/yacy/cora/services/federated/solr/SolrScheme.java +++ b/source/net/yacy/cora/services/federated/solr/SolrScheme.java @@ -37,7 +37,6 @@ import java.util.Set; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.UTF8; -import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.storage.ConfigurationSet; @@ -103,7 +102,7 @@ public class SolrScheme extends ConfigurationSet { addSolr(solrdoc, "failreason_t", ""); // overwrite a possible fail reason (in case that there was a fail reason before) addSolr(solrdoc, "id", id); addSolr(solrdoc, "sku", digestURI.toNormalform(true, false), 3.0f); - final InetAddress address = Domains.dnsResolve(digestURI.getHost()); + final InetAddress address = digestURI.getInetAddress(); if (address != null) addSolr(solrdoc, "ip_s", address.getHostAddress()); if (digestURI.getHost() != null) addSolr(solrdoc, "host_s", digestURI.getHost()); addSolr(solrdoc, "title", yacydoc.dc_title()); @@ -354,16 +353,16 @@ public class SolrScheme extends ConfigurationSet { return solrdoc; } - private int relEval(String[] rel) { + private int relEval(final String[] rel) { int i = 0; - for (String s: rel) { - String s0 = s.toLowerCase().trim(); + for (final String s: rel) { + final String s0 = s.toLowerCase().trim(); if ("me".equals(s0)) i += 1; if ("nofollow".equals(s0)) i += 2; } return i; } - + public String solrGetID(final SolrDocument solr) { return (String) solr.getFieldValue("id"); } diff --git a/source/net/yacy/cora/services/federated/solr/SolrSingleConnector.java b/source/net/yacy/cora/services/federated/solr/SolrSingleConnector.java index e9c9dadf5..847da1bbc 100644 --- a/source/net/yacy/cora/services/federated/solr/SolrSingleConnector.java +++ b/source/net/yacy/cora/services/federated/solr/SolrSingleConnector.java @@ -279,7 +279,7 
@@ public class SolrSingleConnector implements SolrConnector { final SolrInputDocument solrdoc = new SolrInputDocument(); solrdoc.addField("id", ASCII.String(digestURI.hash())); solrdoc.addField("sku", digestURI.toNormalform(true, false), 3.0f); - final InetAddress address = Domains.dnsResolve(digestURI.getHost()); + final InetAddress address = digestURI.getInetAddress(); if (address != null) solrdoc.addField("ip_s", address.getHostAddress()); if (digestURI.getHost() != null) solrdoc.addField("host_s", digestURI.getHost()); diff --git a/source/net/yacy/kelondro/rwi/ReferenceContainer.java b/source/net/yacy/kelondro/rwi/ReferenceContainer.java index 5227a0288..1af1fd270 100644 --- a/source/net/yacy/kelondro/rwi/ReferenceContainer.java +++ b/source/net/yacy/kelondro/rwi/ReferenceContainer.java @@ -232,6 +232,7 @@ public class ReferenceContainer extends RowSet int pos = 0; while (i.hasNext()) { r = i.next(); + if (r == null) continue; mod = r.lastModified(); positions = tm.get(mod); if (positions == null) positions = new ArrayList(); diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 267cfd419..b0c6ac375 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -1909,8 +1909,8 @@ public final class Switchboard extends serverSwitch { this.log.logFine("processResourceStack processCase=" + processCase + ", depth=" + response.depth() + ", maxDepth=" + ((response.profile() == null) ? "null" : Integer.toString(response.profile().depth())) + - ", must-match=" + ((response.profile() == null) ? "null" : response.profile().mustMatchPattern().toString()) + - ", must-not-match=" + ((response.profile() == null) ? "null" : response.profile().mustNotMatchPattern().toString()) + + ", must-match=" + ((response.profile() == null) ? "null" : response.profile().urlMustMatchPattern().toString()) + + ", must-not-match=" + ((response.profile() == null) ? "null" : response.profile().urlMustNotMatchPattern().toString()) + ", initiatorHash=" + ((response.initiator() == null) ? "null" : ASCII.String(response.initiator())) + //", responseHeader=" + ((entry.responseHeader() == null) ? "null" : entry.responseHeader().toString()) + ", url=" + response.url()); // DEBUG
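
The ordering of the new checks in CrawlStacker is deliberate: the URL regex filters operate on strings that are already in memory, while the IP and country filters require a DNS resolution, so the expensive tests run only after every cheap test has passed. The shared MATCH_ALL_PATTERN and MATCH_NEVER_PATTERN singletons support this: a profile whose filter is still the default can be detected with a plain == identity comparison and the lookup skipped entirely. The sketch below illustrates that order in standalone Java; it is a minimal illustration, not YaCy code — the pattern constants and the checkAcceptance helper are hypothetical stand-ins for CrawlProfile and CrawlStacker, and the country lookup is stubbed with a fixed Locale where YaCy derives it from the resolved IP via Domains.getLocale.

import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.Locale;
import java.util.regex.Pattern;

// Minimal stand-in for the filter order introduced by this patch:
// cheap URL regex checks first, DNS-dependent IP/country checks last.
public class CrawlFilterSketch {

    // shared default pattern, mirroring CrawlProfile.MATCH_ALL_PATTERN;
    // an identity check against it lets callers skip the DNS lookup
    static final Pattern MATCH_ALL_PATTERN = Pattern.compile(".*");

    // hypothetical profile values; in YaCy these come from the CrawlProfile map
    static final Pattern URL_MUST_MATCH = Pattern.compile(".*example\\.org.*");
    static final Pattern IP_MUST_MATCH = Pattern.compile("192\\.168\\..*");
    static final String[] COUNTRY_MUST_MATCH = {"DE", "AT", "CH"};

    /** @return null if the url is accepted, otherwise the rejection reason */
    static String checkAcceptance(final String url, final String host) {
        // 1. URL pattern: cheap, no network access needed
        if (!URL_MUST_MATCH.matcher(url).matches()) {
            return "url does not match must-match filter";
        }

        // 2. IP pattern: needs DNS, so it runs after all string checks;
        //    the identity comparison skips resolution for default profiles
        if (IP_MUST_MATCH != MATCH_ALL_PATTERN) {
            final InetAddress address;
            try {
                address = InetAddress.getByName(host); // stands in for YaCy's cached lookup
            } catch (final UnknownHostException e) {
                return "dns lookup failed for host " + host;
            }
            if (!IP_MUST_MATCH.matcher(address.getHostAddress()).matches()) {
                return "ip " + address.getHostAddress() + " does not match must-match filter";
            }
        }

        // 3. country list: only applies when a restriction is configured;
        //    YaCy derives the Locale from the resolved IP (Domains.getLocale)
        if (COUNTRY_MUST_MATCH.length == 0) return null; // no country restriction
        final Locale locale = Locale.GERMANY; // placeholder for the IP-based lookup
        final String c0 = locale.getCountry();
        for (final String c : COUNTRY_MUST_MATCH) {
            if (c0.equals(c)) return null; // accepted
        }
        return "country " + c0 + " does not match must-match filter";
    }

    public static void main(final String[] args) {
        System.out.println(checkAcceptance("http://www.example.org/index.html", "localhost"));
    }
}

Unlike the sketch, the committed CrawlStacker code dereferences url.getInetAddress() without a null check, even though Domains.dnsResolve (which backs it) is documented in this very patch to return null when resolution fails; that case is worth keeping in mind when the IP or country filters are enabled.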