From f6eebb6f99ef8fd0874be7bc29065c37fec90621 Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 30 Sep 2010 12:50:34 +0000 Subject: [PATCH] replaced auto-dom filter with easy-to-understand Site Link-List crawler option - nobody understands the auto-dom filter without a lengthy introduction about the function of a crawler - nobody ever used the auto-dom filter other than with a crawl depth of 1 - the auto-dom filter was buggy since the filter did not survive a restart, and the search index then contained waste - the function of the auto-dom filter was in fact just to load a link list from the given start url and then start separate crawls for all these urls, restricted to their domains - the new Site Link-List option shows the target urls in real time while the start url is entered (like the robots check) and gives transparent feedback on what it does before it is used - the new option also fits into the easy site-crawl start menu git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7213 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/CrawlProfileEditor_p.html | 4 +- htroot/CrawlProfileEditor_p.java | 3 +- htroot/CrawlProfileEditor_p.xml | 1 - htroot/CrawlStartExpert_p.html | 23 +- htroot/CrawlStartSite_p.html | 13 +- htroot/Crawler_p.java | 709 +++++++++--------- htroot/QuickCrawlLink_p.java | 1 - htroot/api/util/getpageinfo_p.java | 14 + htroot/api/util/getpageinfo_p.xml | 7 + htroot/js/IndexCreate.js | 28 +- source/de/anomic/crawler/CrawlProfile.java | 33 +- source/de/anomic/crawler/CrawlStacker.java | 8 +- .../de/anomic/crawler/CrawlSwitchboard.java | 17 +- 13 files changed, 437 insertions(+), 424 deletions(-) diff --git a/htroot/CrawlProfileEditor_p.html b/htroot/CrawlProfileEditor_p.html index f00c8d814..0a027828e 100644 --- a/htroot/CrawlProfileEditor_p.html +++ b/htroot/CrawlProfileEditor_p.html @@ -38,8 +38,7 @@ Must Match Must Not Match MaxAge - Auto Filter Depth - Auto Filter Content + Domain Counter Content Max Page Per Domain Accept '?' URLs Fill Proxy Cache @@ -70,7 +69,6 @@ #[mustmatch]# #[mustnotmatch]# #[crawlingIfOlder]# - #[crawlingDomFilterDepth]# #{crawlingDomFilterContent}##[item]#

#{/crawlingDomFilterContent}# #[crawlingDomMaxPages]# #(withQuery)#no::yes#(/withQuery)# diff --git a/htroot/CrawlProfileEditor_p.java b/htroot/CrawlProfileEditor_p.java index dd88cdb07..50f0cf8e2 100644 --- a/htroot/CrawlProfileEditor_p.java +++ b/htroot/CrawlProfileEditor_p.java @@ -87,7 +87,6 @@ public class CrawlProfileEditor_p { labels.add(new eentry(CrawlProfile.FILTER_MUSTNOTMATCH, "Must-Not-Match Filter", false, eentry.STRING)); labels.add(new eentry(CrawlProfile.DEPTH, "Crawl Depth", false, eentry.INTEGER)); labels.add(new eentry(CrawlProfile.RECRAWL_IF_OLDER, "Recrawl If Older", false, eentry.INTEGER)); - labels.add(new eentry(CrawlProfile.DOM_FILTER_DEPTH, "Domain Filter Depth", false, eentry.INTEGER)); labels.add(new eentry(CrawlProfile.DOM_MAX_PAGES, "Domain Max. Pages", false, eentry.INTEGER)); labels.add(new eentry(CrawlProfile.CRAWLING_Q, "CrawlingQ / '?'-URLs", false, eentry.BOOLEAN)); labels.add(new eentry(CrawlProfile.INDEX_TEXT, "Index Text", false, eentry.BOOLEAN)); @@ -245,7 +244,7 @@ public class CrawlProfileEditor_p { prop.put(CRAWL_PROFILE_PREFIX + count + "_mustmatch", profile.mustMatchPattern().toString()); prop.put(CRAWL_PROFILE_PREFIX + count + "_mustnotmatch", profile.mustNotMatchPattern().toString()); prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingIfOlder", (profile.recrawlIfOlder() == 0L) ? "no re-crawl" : DateFormat.getDateTimeInstance().format(profile.recrawlIfOlder())); - prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingDomFilterDepth", (profile.domFilterDepth() == Integer.MAX_VALUE) ? "inactive" : Integer.toString(profile.domFilterDepth())); + prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingDomFilterDepth", "inactive"); // start contrib [MN] int i = 0; diff --git a/htroot/CrawlProfileEditor_p.xml b/htroot/CrawlProfileEditor_p.xml index 5b5f54bee..67a254261 100644 --- a/htroot/CrawlProfileEditor_p.xml +++ b/htroot/CrawlProfileEditor_p.xml @@ -9,7 +9,6 @@ #[mustmatch]# #[mustnotmatch]# #[crawlingIfOlder]# - #[crawlingDomFilterDepth]# #{crawlingDomFilterContent}# #[item]# diff --git a/htroot/CrawlStartExpert_p.html b/htroot/CrawlStartExpert_p.html index c4f9d0dd4..a05c05abe 100644 --- a/htroot/CrawlStartExpert_p.html +++ b/htroot/CrawlStartExpert_p.html @@ -44,6 +44,13 @@ + + : + + +
+ + : @@ -154,22 +161,6 @@ If you don't know what this means, please leave this field empty. - - Auto-Dom-Filter: - - : -    - : - - - - This option will automatically create a domain-filter which limits the crawl on domains the crawler - will find on the given depth. You can use this option i.e. to crawl a page with bookmarks while - restricting the crawl on only those domains that appear on the bookmark-page. The adequate depth - for this example would be 1.
- The default value 0 gives no restrictions. - - Maximum Pages per Domain: diff --git a/htroot/CrawlStartSite_p.html b/htroot/CrawlStartSite_p.html index a679b4f4d..153f752e8 100644 --- a/htroot/CrawlStartSite_p.html +++ b/htroot/CrawlStartSite_p.html @@ -42,13 +42,18 @@ - - empty - + empty + + + Link-List of URL +
+ Sitemap URL -
+ +
diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index 79c0abf76..a9a25d0db 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -60,10 +60,6 @@ import de.anomic.server.serverSwitch; import de.anomic.yacy.yacyNewsPool; public class Crawler_p { - public static final String CRAWLING_MODE_URL = "url"; - public static final String CRAWLING_MODE_FILE = "file"; - public static final String CRAWLING_MODE_SITEMAP = "sitemap"; - // this servlet does NOT create the Crawler servlet page content! // this servlet starts a web crawl. The interface for entering the web crawl parameters is in IndexCreate_p.html @@ -102,372 +98,405 @@ public class Crawler_p { } prop.put("info", "0"); - if (post != null) { - // a crawl start - if (post.containsKey("continue")) { - // continue queue - final String queue = post.get("continue", ""); - if (queue.equals("localcrawler")) { - sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); - } else if (queue.equals("remotecrawler")) { - sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL); - } + if (post != null && post.containsKey("continue")) { + // continue queue + final String queue = post.get("continue", ""); + if (queue.equals("localcrawler")) { + sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); + } else if (queue.equals("remotecrawler")) { + sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL); } + } - if (post.containsKey("pause")) { - // pause queue - final String queue = post.get("pause", ""); - if (queue.equals("localcrawler")) { - sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); - } else if (queue.equals("remotecrawler")) { - sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL); - } + if (post != null && post.containsKey("pause")) { + // pause queue + final String queue = post.get("pause", ""); + if (queue.equals("localcrawler")) { + sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); + } else if (queue.equals("remotecrawler")) { + sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL); } - - if (post.containsKey("crawlingstart")) { - // init crawl - if (sb.peers == null) { - prop.put("info", "3"); - } else { - String crawlingStart = post.get("crawlingURL","").trim(); // the crawljob start url - // add the prefix http:// if necessary - int pos = crawlingStart.indexOf("://"); - if (pos == -1) crawlingStart = "http://" + crawlingStart; + } + + if (post != null && post.containsKey("crawlingstart")) { + // init crawl + if (sb.peers == null) { + prop.put("info", "3"); + } else { + String crawlingStart = post.get("crawlingURL","").trim(); // the crawljob start url + // add the prefix http:// if necessary + int pos = crawlingStart.indexOf("://"); + if (pos == -1) crawlingStart = "http://" + crawlingStart; - // normalizing URL - DigestURI crawlingStartURL = null; - try {crawlingStartURL = new DigestURI(crawlingStart, null);} catch (final MalformedURLException e1) {} - crawlingStart = (crawlingStartURL == null) ? 
null : crawlingStartURL.toNormalform(true, true); - - // set new properties - final boolean fullDomain = post.get("range", "wide").equals("domain"); // special property in simple crawl start - final boolean subPath = post.get("range", "wide").equals("subpath"); // special property in simple crawl start - - - // set the crawling filter - String newcrawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL); - String newcrawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER); - if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL; // avoid that all urls are filtered out if bad value was submitted - // special cases: - if (crawlingStartURL!= null && fullDomain) { - newcrawlingMustMatch = ".*" + crawlingStartURL.getHost() + ".*"; - } - if (crawlingStart!= null && subPath && (pos = crawlingStart.lastIndexOf('/')) > 0) { - newcrawlingMustMatch = crawlingStart.substring(0, pos + 1) + ".*"; - } - - final boolean crawlOrder = post.get("crawlOrder", "off").equals("on"); - env.setConfig("crawlOrder", (crawlOrder) ? "true" : "false"); - - int newcrawlingdepth = Integer.parseInt(post.get("crawlingDepth", "8")); - env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth)); - if ((crawlOrder) && (newcrawlingdepth > 8)) newcrawlingdepth = 8; - - // recrawl - final String recrawl = post.get("recrawl", "nodoubles"); // nodoubles, reload, scheduler - boolean crawlingIfOlderCheck = post.get("crawlingIfOlderCheck", "off").equals("on"); - int crawlingIfOlderNumber = Integer.parseInt(post.get("crawlingIfOlderNumber", "-1")); - String crawlingIfOlderUnit = post.get("crawlingIfOlderUnit","year"); // year, month, day, hour - int repeat_time = Integer.parseInt(post.get("repeat_time", "-1")); - final String repeat_unit = post.get("repeat_unit", "seldays"); // selminutes, selhours, seldays - - if (recrawl.equals("scheduler") && repeat_time > 0) { - // set crawlingIfOlder attributes that are appropriate for scheduled crawling - crawlingIfOlderCheck = true; - crawlingIfOlderNumber = repeat_unit.equals("selminutes") ? 1 : repeat_unit.equals("selhours") ? repeat_time / 2 : repeat_time * 12; - crawlingIfOlderUnit = "hour"; - } else if (recrawl.equals("reload")) { - repeat_time = -1; - crawlingIfOlderCheck = true; - } else if (recrawl.equals("nodoubles")) { - repeat_time = -1; - crawlingIfOlderCheck = false; - } - long crawlingIfOlder = recrawlIfOlderC(crawlingIfOlderCheck, crawlingIfOlderNumber, crawlingIfOlderUnit); - env.setConfig("crawlingIfOlder", crawlingIfOlder); + // normalize URL + DigestURI crawlingStartURL = null; + try {crawlingStartURL = new DigestURI(crawlingStart, null);} catch (final MalformedURLException e1) {} + crawlingStart = (crawlingStartURL == null) ? 
null : crawlingStartURL.toNormalform(true, true); + + // set new properties + final boolean fullDomain = post.get("range", "wide").equals("domain"); // special property in simple crawl start + final boolean subPath = post.get("range", "wide").equals("subpath"); // special property in simple crawl start + + + // set the crawl filter + String newcrawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL); + String newcrawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER); + if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL; // avoid that all urls are filtered out if bad value was submitted + // special cases: + if (crawlingStartURL!= null && fullDomain) { + newcrawlingMustMatch = ".*" + crawlingStartURL.getHost() + ".*"; + } + if (crawlingStart!= null && subPath && (pos = crawlingStart.lastIndexOf('/')) > 0) { + newcrawlingMustMatch = crawlingStart.substring(0, pos + 1) + ".*"; + } + + final boolean crawlOrder = post.get("crawlOrder", "off").equals("on"); + env.setConfig("crawlOrder", (crawlOrder) ? "true" : "false"); + + int newcrawlingdepth = Integer.parseInt(post.get("crawlingDepth", "8")); + env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth)); + if ((crawlOrder) && (newcrawlingdepth > 8)) newcrawlingdepth = 8; + + // recrawl + final String recrawl = post.get("recrawl", "nodoubles"); // nodoubles, reload, scheduler + boolean crawlingIfOlderCheck = post.get("crawlingIfOlderCheck", "off").equals("on"); + int crawlingIfOlderNumber = Integer.parseInt(post.get("crawlingIfOlderNumber", "-1")); + String crawlingIfOlderUnit = post.get("crawlingIfOlderUnit","year"); // year, month, day, hour + int repeat_time = Integer.parseInt(post.get("repeat_time", "-1")); + final String repeat_unit = post.get("repeat_unit", "seldays"); // selminutes, selhours, seldays + + if (recrawl.equals("scheduler") && repeat_time > 0) { + // set crawlingIfOlder attributes that are appropriate for scheduled crawling + crawlingIfOlderCheck = true; + crawlingIfOlderNumber = repeat_unit.equals("selminutes") ? 1 : repeat_unit.equals("selhours") ? repeat_time / 2 : repeat_time * 12; + crawlingIfOlderUnit = "hour"; + } else if (recrawl.equals("reload")) { + repeat_time = -1; + crawlingIfOlderCheck = true; + } else if (recrawl.equals("nodoubles")) { + repeat_time = -1; + crawlingIfOlderCheck = false; + } + long crawlingIfOlder = recrawlIfOlderC(crawlingIfOlderCheck, crawlingIfOlderNumber, crawlingIfOlderUnit); + env.setConfig("crawlingIfOlder", crawlingIfOlder); - // store this call as api call - if (repeat_time > 0) { - // store as scheduled api call - sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + crawlingStart, repeat_time, repeat_unit.substring(3)); - } else { - // store just a protocol - sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + crawlingStart); - } - final boolean crawlingDomFilterCheck = post.get("crawlingDomFilterCheck", "off").equals("on"); - final int crawlingDomFilterDepth = (crawlingDomFilterCheck) ? Integer.parseInt(post.get("crawlingDomFilterDepth", "-1")) : -1; - env.setConfig("crawlingDomFilterDepth", Integer.toString(crawlingDomFilterDepth)); - - final boolean crawlingDomMaxCheck = post.get("crawlingDomMaxCheck", "off").equals("on"); - final int crawlingDomMaxPages = (crawlingDomMaxCheck) ? 
Integer.parseInt(post.get("crawlingDomMaxPages", "-1")) : -1; - env.setConfig("crawlingDomMaxPages", Integer.toString(crawlingDomMaxPages)); - - final boolean crawlingQ = post.get("crawlingQ", "off").equals("on"); - env.setConfig("crawlingQ", (crawlingQ) ? "true" : "false"); - - final boolean indexText = post.get("indexText", "off").equals("on"); - env.setConfig("indexText", (indexText) ? "true" : "false"); - - final boolean indexMedia = post.get("indexMedia", "off").equals("on"); - env.setConfig("indexMedia", (indexMedia) ? "true" : "false"); - - final boolean storeHTCache = post.get("storeHTCache", "off").equals("on"); - env.setConfig("storeHTCache", (storeHTCache) ? "true" : "false"); - - final String cachePolicyString = post.get("cachePolicy", "iffresh"); - CrawlProfile.CacheStrategy cachePolicy = CrawlProfile.CacheStrategy.IFFRESH; - if (cachePolicyString.equals("nocache")) cachePolicy = CrawlProfile.CacheStrategy.NOCACHE; - if (cachePolicyString.equals("iffresh")) cachePolicy = CrawlProfile.CacheStrategy.IFFRESH; - if (cachePolicyString.equals("ifexist")) cachePolicy = CrawlProfile.CacheStrategy.IFEXIST; - if (cachePolicyString.equals("cacheonly")) cachePolicy = CrawlProfile.CacheStrategy.CACHEONLY; - - final boolean xsstopw = post.get("xsstopw", "off").equals("on"); - env.setConfig("xsstopw", (xsstopw) ? "true" : "false"); - - final boolean xdstopw = post.get("xdstopw", "off").equals("on"); - env.setConfig("xdstopw", (xdstopw) ? "true" : "false"); - - final boolean xpstopw = post.get("xpstopw", "off").equals("on"); - env.setConfig("xpstopw", (xpstopw) ? "true" : "false"); + // store this call as api call + if (repeat_time > 0) { + // store as scheduled api call + sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + crawlingStart, repeat_time, repeat_unit.substring(3)); + } else { + // store just a protocol + sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + crawlingStart); + } + + final boolean crawlingDomMaxCheck = post.get("crawlingDomMaxCheck", "off").equals("on"); + final int crawlingDomMaxPages = (crawlingDomMaxCheck) ? Integer.parseInt(post.get("crawlingDomMaxPages", "-1")) : -1; + env.setConfig("crawlingDomMaxPages", Integer.toString(crawlingDomMaxPages)); + + final boolean crawlingQ = post.get("crawlingQ", "off").equals("on"); + env.setConfig("crawlingQ", (crawlingQ) ? "true" : "false"); + + final boolean indexText = post.get("indexText", "off").equals("on"); + env.setConfig("indexText", (indexText) ? "true" : "false"); + + final boolean indexMedia = post.get("indexMedia", "off").equals("on"); + env.setConfig("indexMedia", (indexMedia) ? "true" : "false"); + + final boolean storeHTCache = post.get("storeHTCache", "off").equals("on"); + env.setConfig("storeHTCache", (storeHTCache) ? "true" : "false"); + + final String cachePolicyString = post.get("cachePolicy", "iffresh"); + CrawlProfile.CacheStrategy cachePolicy = CrawlProfile.CacheStrategy.IFFRESH; + if (cachePolicyString.equals("nocache")) cachePolicy = CrawlProfile.CacheStrategy.NOCACHE; + if (cachePolicyString.equals("iffresh")) cachePolicy = CrawlProfile.CacheStrategy.IFFRESH; + if (cachePolicyString.equals("ifexist")) cachePolicy = CrawlProfile.CacheStrategy.IFEXIST; + if (cachePolicyString.equals("cacheonly")) cachePolicy = CrawlProfile.CacheStrategy.CACHEONLY; + + final boolean xsstopw = post.get("xsstopw", "off").equals("on"); + env.setConfig("xsstopw", (xsstopw) ? 
"true" : "false"); + + final boolean xdstopw = post.get("xdstopw", "off").equals("on"); + env.setConfig("xdstopw", (xdstopw) ? "true" : "false"); + + final boolean xpstopw = post.get("xpstopw", "off").equals("on"); + env.setConfig("xpstopw", (xpstopw) ? "true" : "false"); + + final String crawlingMode = post.get("crawlingMode","url"); + if (crawlingMode.equals("url")) { - final String crawlingMode = post.get("crawlingMode","url"); - if (crawlingMode.equals(CRAWLING_MODE_URL)) { + // check if pattern matches + if ((crawlingStart == null || crawlingStartURL == null) /* || (!(crawlingStart.matches(newcrawlingfilter))) */) { + // print error message + prop.put("info", "4"); //crawlfilter does not match url + prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch); + prop.putHTML("info_crawlingStart", crawlingStart); + } else try { - // check if pattern matches - if ((crawlingStart == null || crawlingStartURL == null) /* || (!(crawlingStart.matches(newcrawlingfilter))) */) { - // print error message - prop.put("info", "4"); //crawlfilter does not match url - prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch); - prop.putHTML("info_crawlingStart", crawlingStart); - } else try { + // check if the crawl filter works correctly + Pattern.compile(newcrawlingMustMatch); + + // stack request + // first delete old entry, if exists + final DigestURI url = new DigestURI(crawlingStart, null); + final byte[] urlhash = url.hash(); + indexSegment.urlMetadata().remove(urlhash); + sb.crawlQueues.noticeURL.removeByURLHash(urlhash); + sb.crawlQueues.errorURL.remove(urlhash); + + // stack url + sb.crawler.profilesPassiveCrawls.remove(crawlingStartURL.hash()); // if there is an old entry, delete it + final CrawlProfile pe = new CrawlProfile( + (crawlingStartURL.getHost() == null) ? Long.toHexString(System.currentTimeMillis()) : crawlingStartURL.getHost(), + crawlingStartURL, + newcrawlingMustMatch, + newcrawlingMustNotMatch, + newcrawlingdepth, + crawlingIfOlder, crawlingDomMaxPages, + crawlingQ, + indexText, indexMedia, + storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw, cachePolicy); + sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe); + final String reasonString = sb.crawlStacker.stackCrawl(new Request( + sb.peers.mySeed().hash.getBytes(), + url, + null, + "CRAWLING-ROOT", + new Date(), + pe.handle(), + 0, + 0, + 0 + )); + + if (reasonString == null) { + // create a bookmark from crawl start url + Set tags=listManager.string2set(BookmarkHelper.cleanTagsString(post.get("bookmarkFolder","/crawlStart"))); + tags.add("crawlStart"); + if (post.get("createBookmark","off").equals("on")) { + bookmarksDB.Bookmark bookmark = sb.bookmarksDB.createBookmark(crawlingStart, "admin"); + if(bookmark != null){ + bookmark.setProperty(bookmarksDB.Bookmark.BOOKMARK_TITLE, post.get("bookmarkTitle", crawlingStart)); + bookmark.setOwner("admin"); + bookmark.setPublic(false); + bookmark.setTags(tags, true); + sb.bookmarksDB.saveBookmark(bookmark); + } + } + // liftoff! 
+ prop.put("info", "8");//start msg + prop.putHTML("info_crawlingURL", (post.get("crawlingURL"))); + + // generate a YaCyNews if the global flag was set + if (crawlOrder) { + final Map m = new HashMap(pe); // must be cloned + m.remove("specificDepth"); + m.remove("indexText"); + m.remove("indexMedia"); + m.remove("remoteIndexing"); + m.remove("xsstopw"); + m.remove("xpstopw"); + m.remove("xdstopw"); + m.remove("storeTXCache"); + m.remove("storeHTCache"); + m.remove("generalFilter"); + m.remove("specificFilter"); + m.put("intention", post.get("intention", "").replace(',', '/')); + sb.peers.newsPool.publishMyNews(sb.peers.mySeed(), yacyNewsPool.CATEGORY_CRAWL_START, m); + } + } else { + prop.put("info", "5"); //Crawling failed + prop.putHTML("info_crawlingURL", (post.get("crawlingURL"))); + prop.putHTML("info_reasonString", reasonString); + sb.crawlQueues.errorURL.push( + new Request( + sb.peers.mySeed().hash.getBytes(), + crawlingStartURL, + null, + "", + new Date(), + pe.handle(), + 0, + 0, + 0), + sb.peers.mySeed().hash.getBytes(), + new Date(), + 1, + reasonString); + } + } catch (final PatternSyntaxException e) { + prop.put("info", "4"); //crawlfilter does not match url + prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch); + prop.putHTML("info_error", e.getMessage()); + } catch (final Exception e) { + // mist + prop.put("info", "6");//Error with url + prop.putHTML("info_crawlingStart", crawlingStart); + prop.putHTML("info_error", e.getMessage()); + Log.logException(e); + } + + } else if (crawlingMode.equals("file")) { + if (post.containsKey("crawlingFile")) { + final String fileName = post.get("crawlingFile"); + try { // check if the crawl filter works correctly Pattern.compile(newcrawlingMustMatch); - - // stack request - // first delete old entry, if exists - final DigestURI url = new DigestURI(crawlingStart, null); - final byte[] urlhash = url.hash(); - indexSegment.urlMetadata().remove(urlhash); - sb.crawlQueues.noticeURL.removeByURLHash(urlhash); - sb.crawlQueues.errorURL.remove(urlhash); - - // stack url - sb.crawler.profilesPassiveCrawls.remove(crawlingStartURL.hash()); // if there is an old entry, delete it - final CrawlProfile pe = new CrawlProfile( - (crawlingStartURL.getHost() == null) ? 
Long.toHexString(System.currentTimeMillis()) : crawlingStartURL.getHost(), - crawlingStartURL, + final File file = new File(fileName); + final String fileString = post.get("crawlingFile$file"); + final ContentScraper scraper = new ContentScraper(new DigestURI(file)); + final Writer writer = new TransformerWriter(null, null, scraper, null, false); + FileUtils.copy(fileString, writer); + writer.close(); + final Map hyperlinks = scraper.getAnchors(); + final DigestURI crawlURL = new DigestURI("file://" + file.toString(), null); + final CrawlProfile profile = new CrawlProfile( + fileName, crawlURL, newcrawlingMustMatch, - newcrawlingMustNotMatch, + CrawlProfile.MATCH_NEVER, newcrawlingdepth, - crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages, + crawlingIfOlder, + crawlingDomMaxPages, crawlingQ, - indexText, indexMedia, - storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw, cachePolicy); - sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe); - final String reasonString = sb.crawlStacker.stackCrawl(new Request( - sb.peers.mySeed().hash.getBytes(), - url, - null, - "CRAWLING-ROOT", - new Date(), - pe.handle(), - 0, - 0, - 0 - )); - - if (reasonString == null) { - // create a bookmark from crawl start url - Set tags=listManager.string2set(BookmarkHelper.cleanTagsString(post.get("bookmarkFolder","/crawlStart"))); - tags.add("crawlStart"); - if (post.get("createBookmark","off").equals("on")) { - bookmarksDB.Bookmark bookmark = sb.bookmarksDB.createBookmark(crawlingStart, "admin"); - if(bookmark != null){ - bookmark.setProperty(bookmarksDB.Bookmark.BOOKMARK_TITLE, post.get("bookmarkTitle", crawlingStart)); - bookmark.setOwner("admin"); - bookmark.setPublic(false); - bookmark.setTags(tags, true); - sb.bookmarksDB.saveBookmark(bookmark); - } - } - // liftoff! 
- prop.put("info", "8");//start msg - prop.putHTML("info_crawlingURL", (post.get("crawlingURL"))); - - // generate a YaCyNews if the global flag was set - if (crawlOrder) { - final Map m = new HashMap(pe); // must be cloned - m.remove("specificDepth"); - m.remove("indexText"); - m.remove("indexMedia"); - m.remove("remoteIndexing"); - m.remove("xsstopw"); - m.remove("xpstopw"); - m.remove("xdstopw"); - m.remove("storeTXCache"); - m.remove("storeHTCache"); - m.remove("generalFilter"); - m.remove("specificFilter"); - m.put("intention", post.get("intention", "").replace(',', '/')); - sb.peers.newsPool.publishMyNews(sb.peers.mySeed(), yacyNewsPool.CATEGORY_CRAWL_START, m); - } - } else { - prop.put("info", "5"); //Crawling failed - prop.putHTML("info_crawlingURL", (post.get("crawlingURL"))); - prop.putHTML("info_reasonString", reasonString); - - sb.crawlQueues.errorURL.push( - new Request( - sb.peers.mySeed().hash.getBytes(), - crawlingStartURL, - null, - "", - new Date(), - pe.handle(), - 0, - 0, - 0), - sb.peers.mySeed().hash.getBytes(), + indexText, + indexMedia, + storeHTCache, + true, + crawlOrder, + xsstopw, xdstopw, xpstopw, + cachePolicy); + sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile); + sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); + final Iterator> linkiterator = hyperlinks.entrySet().iterator(); + DigestURI nexturl; + while (linkiterator.hasNext()) { + final Map.Entry e = linkiterator.next(); + if (e.getKey() == null) continue; + nexturl = new DigestURI(e.getKey()); + sb.crawlStacker.enqueueEntry(new Request( + sb.peers.mySeed().hash.getBytes(), + nexturl, + null, + e.getValue(), new Date(), - 1, - reasonString); + profile.handle(), + 0, + 0, + 0 + )); } + } catch (final PatternSyntaxException e) { prop.put("info", "4"); //crawlfilter does not match url prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch); prop.putHTML("info_error", e.getMessage()); } catch (final Exception e) { // mist - prop.put("info", "6");//Error with url - prop.putHTML("info_crawlingStart", crawlingStart); + prop.put("info", "7");//Error with file + prop.putHTML("info_crawlingStart", fileName); prop.putHTML("info_error", e.getMessage()); Log.logException(e); } + sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); + } + } else if (crawlingMode.equals("sitemap")) { + String sitemapURLStr = post.get("sitemapURL",""); + try { + final DigestURI sitemapURL = new DigestURI(sitemapURLStr, null); + final CrawlProfile pe = new CrawlProfile( + sitemapURLStr, sitemapURL, + newcrawlingMustMatch, + CrawlProfile.MATCH_NEVER, + newcrawlingdepth, + crawlingIfOlder, crawlingDomMaxPages, + crawlingQ, + indexText, indexMedia, + storeHTCache, true, crawlOrder, + xsstopw, xdstopw, xpstopw, + cachePolicy); + sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe); + final SitemapImporter importer = new SitemapImporter(sb, sitemapURL, pe); + importer.start(); + } catch (final Exception e) { + // mist + prop.put("info", "6");//Error with url + prop.putHTML("info_crawlingStart", sitemapURLStr); + prop.putHTML("info_error", e.getMessage()); + Log.logException(e); + } + } else if (crawlingMode.equals("sitelist")) { + try { + final DigestURI sitelistURL = new DigestURI(crawlingStart, null); + // download document + ContentScraper scraper = null; + scraper = sb.loader.parseResource(sitelistURL, CrawlProfile.CacheStrategy.IFFRESH); + String title = scraper.getTitle(); + // String description = scraper.getDescription(); - } else if 
(crawlingMode.equals(CRAWLING_MODE_FILE)) { - if (post.containsKey("crawlingFile")) { - // getting the name of the uploaded file - final String fileName = post.get("crawlingFile"); - try { - // check if the crawl filter works correctly - Pattern.compile(newcrawlingMustMatch); - - // loading the file content - final File file = new File(fileName); - - // getting the content of the bookmark file - final String fileString = post.get("crawlingFile$file"); - - // parsing the bookmark file and fetching the headline and contained links - final ContentScraper scraper = new ContentScraper(new DigestURI(file)); - //OutputStream os = new htmlFilterOutputStream(null, scraper, null, false); - final Writer writer = new TransformerWriter(null,null,scraper,null,false); - FileUtils.copy(fileString, writer); - writer.close(); - - //String headline = scraper.getHeadline(); - final Map hyperlinks = scraper.getAnchors(); - - // creating a crawler profile - final DigestURI crawlURL = new DigestURI("file://" + file.toString(), null); - final CrawlProfile profile = new CrawlProfile( - fileName, crawlURL, - newcrawlingMustMatch, - CrawlProfile.MATCH_NEVER, - newcrawlingdepth, - crawlingIfOlder, - crawlingDomFilterDepth, - crawlingDomMaxPages, - crawlingQ, - indexText, - indexMedia, - storeHTCache, - true, - crawlOrder, - xsstopw, xdstopw, xpstopw, - cachePolicy); - sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile); - - // pause local crawl here - sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); - - // loop through the contained links - final Iterator> linkiterator = hyperlinks.entrySet().iterator(); - DigestURI nexturl; - while (linkiterator.hasNext()) { - final Map.Entry e = linkiterator.next(); - if (e.getKey() == null) continue; - nexturl = new DigestURI(e.getKey()); - - // enqueuing the url for crawling - sb.crawlStacker.enqueueEntry(new Request( - sb.peers.mySeed().hash.getBytes(), - nexturl, - null, - e.getValue(), - new Date(), - profile.handle(), - 0, - 0, - 0 - )); - } - - } catch (final PatternSyntaxException e) { - // print error message - prop.put("info", "4"); //crawlfilter does not match url - prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch); - prop.putHTML("info_error", e.getMessage()); - } catch (final Exception e) { - // mist - prop.put("info", "7");//Error with file - prop.putHTML("info_crawlingStart", fileName); - prop.putHTML("info_error", e.getMessage()); - Log.logException(e); - } - sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); + // get links and generate filter + StringBuilder filter = new StringBuilder(); + final Map hyperlinks = scraper.getAnchors(); + for (MultiProtocolURI uri: hyperlinks.keySet()) { + filter.append('|').append(uri.getProtocol()).append("://").append(uri.getHost()).append(".*"); } - } else if (crawlingMode.equals(CRAWLING_MODE_SITEMAP)) { - String sitemapURLStr = null; - try { - // getting the sitemap URL - sitemapURLStr = post.get("sitemapURL",""); - final DigestURI sitemapURL = new DigestURI(sitemapURLStr, null); - - // create a new profile - final CrawlProfile pe = new CrawlProfile( - sitemapURLStr, sitemapURL, - newcrawlingMustMatch, - CrawlProfile.MATCH_NEVER, - newcrawlingdepth, - crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages, - crawlingQ, - indexText, indexMedia, - storeHTCache, true, crawlOrder, - xsstopw, xdstopw, xpstopw, - cachePolicy); - sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe); - - // create a new sitemap importer - final SitemapImporter importer = new 
SitemapImporter(sb, new DigestURI(sitemapURLStr, null), pe); - importer.start(); - - } catch (final Exception e) { - // mist - prop.put("info", "6");//Error with url - prop.putHTML("info_crawlingStart", sitemapURLStr); - prop.putHTML("info_error", e.getMessage()); - Log.logException(e); - } + newcrawlingMustMatch = filter.length() > 0 ? filter.substring(1) : ""; + + // put links onto crawl queue + final CrawlProfile profile = new CrawlProfile( + title == null || title.length() == 0 ? sitelistURL.getHost() : title, + sitelistURL, + newcrawlingMustMatch, + CrawlProfile.MATCH_NEVER, + newcrawlingdepth, + crawlingIfOlder, + crawlingDomMaxPages, + crawlingQ, + indexText, + indexMedia, + storeHTCache, + true, + crawlOrder, + xsstopw, xdstopw, xpstopw, + cachePolicy); + sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile); + sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); + final Iterator> linkiterator = hyperlinks.entrySet().iterator(); + DigestURI nexturl; + while (linkiterator.hasNext()) { + final Map.Entry e = linkiterator.next(); + if (e.getKey() == null) continue; + nexturl = new DigestURI(e.getKey()); + // remove the url from the database to be prepared to crawl them again + final byte[] urlhash = nexturl.hash(); + indexSegment.urlMetadata().remove(urlhash); + sb.crawlQueues.noticeURL.removeByURLHash(urlhash); + sb.crawlQueues.errorURL.remove(urlhash); + sb.crawlStacker.enqueueEntry(new Request( + sb.peers.mySeed().hash.getBytes(), + nexturl, + null, + e.getValue(), + new Date(), + profile.handle(), + 0, + 0, + 0 + )); + } + } catch (final Exception e) { + // mist + prop.put("info", "6");//Error with url + prop.putHTML("info_crawlingStart", crawlingStart); + prop.putHTML("info_error", e.getMessage()); + Log.logException(e); } } } - - if (post.containsKey("crawlingPerformance")) { - setPerformance(sb, post); - } + } + + if (post != null && post.containsKey("crawlingPerformance")) { + setPerformance(sb, post); } // performance settings diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java index c951e7d10..c470db791 100644 --- a/htroot/QuickCrawlLink_p.java +++ b/htroot/QuickCrawlLink_p.java @@ -152,7 +152,6 @@ public class QuickCrawlLink_p { crawlingMustNotMatch, CrawlingDepth, 60 * 24 * 30, // recrawlIfOlder (minutes); here: one month - -1, // domFilterDepth, if negative: no auto-filter -1, // domMaxPages, if negative: no count restriction crawlDynamic, indexText, diff --git a/htroot/api/util/getpageinfo_p.java b/htroot/api/util/getpageinfo_p.java index acd2bcb68..89bc7ad8e 100755 --- a/htroot/api/util/getpageinfo_p.java +++ b/htroot/api/util/getpageinfo_p.java @@ -81,6 +81,20 @@ public class getpageinfo_p { // put language Set languages = scraper.getContentLanguages(); prop.putXML("lang", (languages == null) ? "unknown" : languages.iterator().next()); + + // get links and put them into a semicolon-separated list + StringBuilder links = new StringBuilder(); + StringBuilder filter = new StringBuilder(); + count = 0; + for (MultiProtocolURI uri: scraper.getAnchors().keySet()) { + links.append(';').append(uri.toNormalform(true, false)); + filter.append('|').append(uri.getProtocol()).append("://").append(uri.getHost()).append(".*"); + prop.putXML("links_" + count + "_link", uri.toNormalform(true, false)); + count++; + } + prop.put("links", count); + prop.putXML("sitelist", links.length() > 0 ? links.substring(1) : ""); + prop.putXML("filter", filter.length() > 0 ? 
filter.substring(1) : ".*"); } } if(actions.indexOf("robots")>=0){ diff --git a/htroot/api/util/getpageinfo_p.xml b/htroot/api/util/getpageinfo_p.xml index 4942826da..b9590c990 100644 --- a/htroot/api/util/getpageinfo_p.xml +++ b/htroot/api/util/getpageinfo_p.xml @@ -6,9 +6,16 @@ #(robots-allowed)#0::1::#(/robots-allowed)# #[sitemap]# #[favicon]# + #[sitelist]# + #[filter]# #{tags}# #{/tags}# + + #{links}# + + #{/links}# + diff --git a/htroot/js/IndexCreate.js b/htroot/js/IndexCreate.js index fdb26ba84..b411f2261 100644 --- a/htroot/js/IndexCreate.js +++ b/htroot/js/IndexCreate.js @@ -3,12 +3,12 @@ var AJAX_ON="/env/grafics/ajax.gif"; var timeout=""; function handleResponse(){ - if(http.readyState == 4){ + if (http.readyState == 4){ var response = http.responseXML; - // getting the document title + // get the document title doctitle=""; - if(response.getElementsByTagName("title")[0].firstChild!=null){ + if (response.getElementsByTagName("title")[0].firstChild!=null){ doctitle=response.getElementsByTagName("title")[0].firstChild.nodeValue; } // document.getElementById("title").innerHTML=doctitle; @@ -23,43 +23,51 @@ function handleResponse(){ if(robotsOKspan.firstChild){ robotsOKspan.removeChild(robotsOKspan.firstChild); } - if(docrobotsOK==1){ + if (docrobotsOK==1){ img=document.createElement("img"); img.setAttribute("src", "/env/grafics/ok.png"); img.setAttribute("width", "32px"); img.setAttribute("height", "32px"); robotsOKspan.appendChild(img); - }else if(docrobotsOK==0){ + } else if(docrobotsOK==0){ img=document.createElement("img"); img.setAttribute("src", "/env/grafics/bad.png"); img.setAttribute("width", "32px"); img.setAttribute("height", "32px"); robotsOKspan.appendChild(img); robotsOKspan.appendChild(img); - }else{ + } else { robotsOKspan.appendChild(document.createTextNode("")); document.getElementById("robotsOK").innerHTML=""; } - // getting the sitemap URL contained in the robots.txt + // get the sitemap URL contained in the robots.txt if (document.getElementsByName("sitemapURL").length > 0) { sitemap=""; - if(response.getElementsByTagName("sitemap")[0].firstChild!=null){ + if (response.getElementsByTagName("sitemap")[0].firstChild!=null){ sitemap=response.getElementsByTagName("sitemap")[0].firstChild.nodeValue; } document.getElementsByName("sitemapURL")[0].value=sitemap; document.getElementById("sitemap").disabled=false; } + sitelist=""; + if (response.getElementsByTagName("sitelist")[0].firstChild!=null){ + sitelist=response.getElementsByTagName("sitelist")[0].firstChild.nodeValue; + } + document.getElementById("sitelistURLs").innerHTML = sitelist; + document.getElementById("sitelist").disabled=false; // clear the ajax image document.getElementsByName("ajax")[0].setAttribute("src", AJAX_OFF); } } -function changed(){ + +function changed() { window.clearTimeout(timeout); timeout=window.setTimeout("loadInfos()", 1500); } -function loadInfos(){ + +function loadInfos() { // displaying ajax image document.getElementsByName("ajax")[0].setAttribute("src",AJAX_ON); diff --git a/source/de/anomic/crawler/CrawlProfile.java b/source/de/anomic/crawler/CrawlProfile.java index 8dc5e13fc..23e26fa9d 100644 --- a/source/de/anomic/crawler/CrawlProfile.java +++ b/source/de/anomic/crawler/CrawlProfile.java @@ -48,7 +48,6 @@ public class CrawlProfile extends ConcurrentHashMap implements M public static final String FILTER_MUSTNOTMATCH = "nevermatch"; public static final String DEPTH = "generalDepth"; public static final String RECRAWL_IF_OLDER = "recrawlIfOlder"; - public static final String 
DOM_FILTER_DEPTH = "domFilterDepth"; public static final String DOM_MAX_PAGES = "domMaxPages"; public static final String CRAWLING_Q = "crawlingQ"; public static final String INDEX_TEXT = "indexText"; @@ -70,7 +69,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M final String mustnotmatch, final int depth, final long recrawlIfOlder /*date*/, - final int domFilterDepth, final int domMaxPages, + final int domMaxPages, final boolean crawlingQ, final boolean indexText, final boolean indexMedia, final boolean storeHTCache, final boolean storeTXCache, @@ -87,7 +86,6 @@ public class CrawlProfile extends ConcurrentHashMap implements M put(FILTER_MUSTNOTMATCH, (mustnotmatch == null) ? CrawlProfile.MATCH_NEVER : mustnotmatch); put(DEPTH, depth); put(RECRAWL_IF_OLDER, recrawlIfOlder); - put(DOM_FILTER_DEPTH, domFilterDepth); put(DOM_MAX_PAGES, domMaxPages); put(CRAWLING_Q, crawlingQ); // crawling of urls with '?' put(INDEX_TEXT, indexText); @@ -186,21 +184,6 @@ public class CrawlProfile extends ConcurrentHashMap implements M return 0L; } } - public int domFilterDepth() { - // if the depth is equal or less to this depth, - // then the current url feeds with its domain the crawl filter - // if this is -1, all domains are feeded - final String r = get(DOM_FILTER_DEPTH); - if (r == null) return Integer.MAX_VALUE; - try { - final int i = Integer.parseInt(r); - if (i < 0) return Integer.MAX_VALUE; - return i; - } catch (final NumberFormatException e) { - Log.logException(e); - return Integer.MAX_VALUE; - } - } public int domMaxPages() { // this is the maximum number of pages that are crawled for a single domain // if -1, this means no limit @@ -270,16 +253,6 @@ public class CrawlProfile extends ConcurrentHashMap implements M dp.inc(); } } - public boolean grantedDomAppearance(final String domain) { - final int max = domFilterDepth(); - if (max == Integer.MAX_VALUE) return true; - final DomProfile dp = doms.get(domain); - if (dp == null) { - return 0 < max; - } - return dp.depth <= max; - } - public boolean grantedDomCount(final String domain) { final int max = domMaxPages(); if (max == Integer.MAX_VALUE) return true; @@ -292,10 +265,6 @@ public class CrawlProfile extends ConcurrentHashMap implements M public int domSize() { return doms.size(); } - public boolean domExists(final String domain) { - if (domFilterDepth() == Integer.MAX_VALUE) return true; - return doms.containsKey(domain); - } public String domName(final boolean attr, final int index){ final Iterator> domnamesi = doms.entrySet().iterator(); diff --git a/source/de/anomic/crawler/CrawlStacker.java b/source/de/anomic/crawler/CrawlStacker.java index 82c955440..8c056ec64 100644 --- a/source/de/anomic/crawler/CrawlStacker.java +++ b/source/de/anomic/crawler/CrawlStacker.java @@ -196,7 +196,7 @@ public final class CrawlStacker { final DigestURI referrerURL = (entry.referrerhash() == null || entry.referrerhash().length == 0) ? null : nextQueue.getURL(entry.referrerhash()); // add domain to profile domain list - if ((profile.domFilterDepth() != Integer.MAX_VALUE) || (profile.domMaxPages() != Integer.MAX_VALUE)) { + if (profile.domMaxPages() != Integer.MAX_VALUE) { profile.domInc(entry.url().getHost(), (referrerURL == null) ? 
null : referrerURL.getHost().toLowerCase(), entry.depth()); } @@ -296,12 +296,6 @@ public final class CrawlStacker { return "post url not allowed"; } - // deny urls that do not match with the profile domain list - if (!(profile.grantedDomAppearance(url.getHost()))) { - if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is not listed in granted domains."); - return "url does not match domain filter"; - } - // deny urls that exceed allowed number of occurrences if (!(profile.grantedDomCount(url.getHost()))) { if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' appeared too often, a maximum of " + profile.domMaxPages() + " is allowed."); diff --git a/source/de/anomic/crawler/CrawlSwitchboard.java b/source/de/anomic/crawler/CrawlSwitchboard.java index 2b74c91b6..f90b0f40b 100644 --- a/source/de/anomic/crawler/CrawlSwitchboard.java +++ b/source/de/anomic/crawler/CrawlSwitchboard.java @@ -164,9 +164,10 @@ public final class CrawlSwitchboard { if (this.defaultProxyProfile == null) { // generate new default entry for proxy crawling - this.defaultProxyProfile = new CrawlProfile("proxy", null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, + this.defaultProxyProfile = new CrawlProfile( + "proxy", null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/, - CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, -1, false, + CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, false, true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/, true /*getConfigBool(PROXY_INDEXING_LOCAL_MEDIA, true)*/, true, true, @@ -177,38 +178,38 @@ public final class CrawlSwitchboard { if (this.defaultRemoteProfile == null) { // generate new default entry for remote crawling this.defaultRemoteProfile = new CrawlProfile(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, - -1, -1, -1, true, true, true, false, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH); + -1, -1, true, true, true, false, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH); this.profilesActiveCrawls.put(this.defaultRemoteProfile.handle().getBytes(), this.defaultRemoteProfile); } if (this.defaultTextSnippetLocalProfile == null) { // generate new default entry for snippet fetch and optional crawling this.defaultTextSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, - CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, -1, true, false, false, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH); + CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, true, false, false, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH); this.profilesActiveCrawls.put(this.defaultTextSnippetLocalProfile.handle().getBytes(), this.defaultTextSnippetLocalProfile); } if (this.defaultTextSnippetGlobalProfile == null) { // generate new default entry for snippet fetch and optional crawling this.defaultTextSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, - CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, -1, true, true, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST); + CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, true, true, 
true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST); this.profilesActiveCrawls.put(this.defaultTextSnippetGlobalProfile.handle().getBytes(), this.defaultTextSnippetGlobalProfile); } this.defaultTextSnippetGlobalProfile.setCacheStrategy(CrawlProfile.CacheStrategy.IFEXIST); if (this.defaultMediaSnippetLocalProfile == null) { // generate new default entry for snippet fetch and optional crawling this.defaultMediaSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, - CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, false, true, false, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST); + CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, true, false, false, true, false, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST); this.profilesActiveCrawls.put(this.defaultMediaSnippetLocalProfile.handle().getBytes(), this.defaultMediaSnippetLocalProfile); } if (this.defaultMediaSnippetGlobalProfile == null) { // generate new default entry for snippet fetch and optional crawling this.defaultMediaSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, - CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST); + CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, true, false, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST); this.profilesActiveCrawls.put(this.defaultMediaSnippetGlobalProfile.handle().getBytes(), this.defaultMediaSnippetGlobalProfile); } if (this.defaultSurrogateProfile == null) { // generate new default entry for surrogate parsing this.defaultSurrogateProfile = new CrawlProfile(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, - CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, -1, true, true, false, false, false, false, true, true, false, CrawlProfile.CacheStrategy.NOCACHE); + CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, true, true, false, false, false, false, true, true, false, CrawlProfile.CacheStrategy.NOCACHE); this.profilesActiveCrawls.put(this.defaultSurrogateProfile.handle().getBytes(), this.defaultSurrogateProfile); } }
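
The heart of the new sitelist mode is the must-match filter built in the Crawler_p.java and getpageinfo_p.java hunks above: every link found on the start page contributes one "protocol://host.*" pattern, and the patterns are joined with '|' into the crawl profile's regular expression. The following minimal, standalone sketch mirrors that construction, but uses java.net.URI in place of YaCy's MultiProtocolURI; the class and method names are illustrative only and do not appear in the patch.

import java.net.URI;
import java.util.Arrays;
import java.util.List;

public class SiteListFilterSketch {

    // Build a must-match regex that restricts a crawl to the hosts of the given links,
    // following the '|'-joined "protocol://host.*" scheme used in the patch.
    public static String buildMustMatchFilter(final List<URI> links) {
        final StringBuilder filter = new StringBuilder();
        for (final URI link : links) {
            if (link.getHost() == null) continue; // skip anchors without a host part
            filter.append('|')
                  .append(link.getScheme()).append("://")
                  .append(link.getHost()).append(".*");
        }
        // drop the leading '|'; an empty link list yields an empty filter
        return filter.length() > 0 ? filter.substring(1) : "";
    }

    public static void main(final String[] args) {
        final List<URI> links = Arrays.asList(
                URI.create("http://example.org/bookmarks.html"),
                URI.create("http://wiki.example.net/start"));
        // prints: http://example.org.*|http://wiki.example.net.*
        System.out.println(buildMustMatchFilter(links));
    }
}

With such a filter set as the profile's must-match pattern, each link of the start page is enqueued as its own crawl start, so the crawl stays on the listed domains without the stateful auto-dom bookkeeping that this patch removes.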