replaced the auto-dom filter with an easy-to-understand Site Link-List crawler option

- nobody understands the auto-dom filter without a lengthy introduction to how a crawler works
- nobody ever used the auto-dom filter with a crawl depth other than 1
- the auto-dom filter was buggy: the filter did not survive a restart, and the search index then filled up with unwanted content
- the function of the auto-dom filter was in fact just to load a link list from the given start URL and then start separate crawls for all of these URLs, each restricted to its own domain (see the sketch below the commit metadata)
- the new Site Link-List option shows the target URLs in real time while the start URL is typed (like the robots check) and gives transparent feedback on what it will do before it can be used
- the new option also fits into the easy site-crawl start menu

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7213 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 15 years ago
parent 63e387508c
commit f6eebb6f99
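
The heart of the new option, as the Crawler_p.java hunk further down shows, is to turn the links found on the start page into a must-match filter of the form protocol://host.* for each linked host, joined with '|', so that every linked site is crawled but the crawl cannot leave those hosts. Below is a minimal standalone sketch of that step, not the committed code: the class and method names are invented for illustration, the link list is assumed to be already extracted (the commit uses YaCy's ContentScraper and MultiProtocolURI for that), and unlike the committed code the sketch quotes the host names and drops duplicate hosts.

import java.net.URI;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;

public class SiteListFilterSketch {

    // Build a regular expression that matches any URL on one of the linked hosts.
    static String hostFilter(List<String> links) {
        Set<String> parts = new LinkedHashSet<String>(); // keeps insertion order, drops duplicate hosts
        for (String link : links) {
            try {
                URI uri = new URI(link);
                if (uri.getScheme() == null || uri.getHost() == null) continue;
                // one alternative per linked host, e.g. "http://\Qexample.org\E.*"
                parts.add(uri.getScheme() + "://" + Pattern.quote(uri.getHost()) + ".*");
            } catch (Exception e) {
                // skip malformed links instead of aborting the whole crawl start
            }
        }
        return parts.isEmpty() ? ".*" : String.join("|", parts);
    }

    public static void main(String[] args) {
        // hypothetical link list, e.g. anchors scraped from a bookmark page
        List<String> links = List.of(
                "http://example.org/a.html",
                "http://example.org/b.html",
                "https://other.example.com/index.html");
        String mustMatch = hostFilter(links);
        System.out.println(mustMatch);
        System.out.println(Pattern.matches(mustMatch, "https://other.example.com/deep/page")); // true
    }
}

In the commit itself this filter string becomes the must-match pattern of the new CrawlProfile, and every extracted link is then enqueued on the crawl stacker under that profile.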

@ -38,8 +38,7 @@
<td><strong>Must Match</strong></td> <td><strong>Must Match</strong></td>
<td><strong>Must Not Match</strong></td> <td><strong>Must Not Match</strong></td>
<td><strong>MaxAge</strong></td> <td><strong>MaxAge</strong></td>
<td><strong>Auto Filter Depth</strong></td> <td><strong>Domain Counter Content</strong></td>
<td><strong>Auto Filter Content</strong></td>
<td><strong>Max Page Per Domain</strong></td> <td><strong>Max Page Per Domain</strong></td>
<td><strong>Accept '?' URLs</strong></td> <td><strong>Accept '?' URLs</strong></td>
<td><strong>Fill Proxy Cache</strong></td> <td><strong>Fill Proxy Cache</strong></td>
@ -70,7 +69,6 @@
<td>#[mustmatch]#</td> <td>#[mustmatch]#</td>
<td>#[mustnotmatch]#</td> <td>#[mustnotmatch]#</td>
<td>#[crawlingIfOlder]#</td> <td>#[crawlingIfOlder]#</td>
<td>#[crawlingDomFilterDepth]#</td>
<td>#{crawlingDomFilterContent}##[item]#<br />#{/crawlingDomFilterContent}#</td> <td>#{crawlingDomFilterContent}##[item]#<br />#{/crawlingDomFilterContent}#</td>
<td>#[crawlingDomMaxPages]#</td> <td>#[crawlingDomMaxPages]#</td>
<td>#(withQuery)#no::yes#(/withQuery)#</td> <td>#(withQuery)#no::yes#(/withQuery)#</td>

@ -87,7 +87,6 @@ public class CrawlProfileEditor_p {
labels.add(new eentry(CrawlProfile.FILTER_MUSTNOTMATCH, "Must-Not-Match Filter", false, eentry.STRING)); labels.add(new eentry(CrawlProfile.FILTER_MUSTNOTMATCH, "Must-Not-Match Filter", false, eentry.STRING));
labels.add(new eentry(CrawlProfile.DEPTH, "Crawl Depth", false, eentry.INTEGER)); labels.add(new eentry(CrawlProfile.DEPTH, "Crawl Depth", false, eentry.INTEGER));
labels.add(new eentry(CrawlProfile.RECRAWL_IF_OLDER, "Recrawl If Older", false, eentry.INTEGER)); labels.add(new eentry(CrawlProfile.RECRAWL_IF_OLDER, "Recrawl If Older", false, eentry.INTEGER));
labels.add(new eentry(CrawlProfile.DOM_FILTER_DEPTH, "Domain Filter Depth", false, eentry.INTEGER));
labels.add(new eentry(CrawlProfile.DOM_MAX_PAGES, "Domain Max. Pages", false, eentry.INTEGER)); labels.add(new eentry(CrawlProfile.DOM_MAX_PAGES, "Domain Max. Pages", false, eentry.INTEGER));
labels.add(new eentry(CrawlProfile.CRAWLING_Q, "CrawlingQ / '?'-URLs", false, eentry.BOOLEAN)); labels.add(new eentry(CrawlProfile.CRAWLING_Q, "CrawlingQ / '?'-URLs", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.INDEX_TEXT, "Index Text", false, eentry.BOOLEAN)); labels.add(new eentry(CrawlProfile.INDEX_TEXT, "Index Text", false, eentry.BOOLEAN));
@ -245,7 +244,7 @@ public class CrawlProfileEditor_p {
prop.put(CRAWL_PROFILE_PREFIX + count + "_mustmatch", profile.mustMatchPattern().toString()); prop.put(CRAWL_PROFILE_PREFIX + count + "_mustmatch", profile.mustMatchPattern().toString());
prop.put(CRAWL_PROFILE_PREFIX + count + "_mustnotmatch", profile.mustNotMatchPattern().toString()); prop.put(CRAWL_PROFILE_PREFIX + count + "_mustnotmatch", profile.mustNotMatchPattern().toString());
prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingIfOlder", (profile.recrawlIfOlder() == 0L) ? "no re-crawl" : DateFormat.getDateTimeInstance().format(profile.recrawlIfOlder())); prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingIfOlder", (profile.recrawlIfOlder() == 0L) ? "no re-crawl" : DateFormat.getDateTimeInstance().format(profile.recrawlIfOlder()));
prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingDomFilterDepth", (profile.domFilterDepth() == Integer.MAX_VALUE) ? "inactive" : Integer.toString(profile.domFilterDepth())); prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingDomFilterDepth", "inactive");
// start contrib [MN] // start contrib [MN]
int i = 0; int i = 0;

@ -9,7 +9,6 @@
<mustmatch>#[mustmatch]#</mustmatch> <mustmatch>#[mustmatch]#</mustmatch>
<mustnotmatch>#[mustnotmatch]#</mustnotmatch> <mustnotmatch>#[mustnotmatch]#</mustnotmatch>
<crawlingIfOlder>#[crawlingIfOlder]#</crawlingIfOlder> <crawlingIfOlder>#[crawlingIfOlder]#</crawlingIfOlder>
<crawlingDomFilterDepth>#[crawlingDomFilterDepth]#</crawlingDomFilterDepth>
<crawlingDomFilterContent> <crawlingDomFilterContent>
#{crawlingDomFilterContent}# #{crawlingDomFilterContent}#
<item>#[item]#</item> <item>#[item]#</item>

@ -44,6 +44,13 @@
<input name="crawlingURL" type="text" size="41" maxlength="256" value="#[starturl]#" onkeypress="changed()" onfocus="check('url')" /> <input name="crawlingURL" type="text" size="41" maxlength="256" value="#[starturl]#" onkeypress="changed()" onfocus="check('url')" />
</td> </td>
</tr> </tr>
<tr>
<td><label for="url"><span class="nobr">From Link-List of URL</span></label>:</td>
<td><input type="radio" name="crawlingMode" id="sitelist" value="sitelist" disabled="disabled"/></td>
<td>
<div id="sitelistURLs"></div>
</td>
</tr>
<tr> <tr>
<td><label for="url"><span class="nobr">From Sitemap</span></label>:</td> <td><label for="url"><span class="nobr">From Sitemap</span></label>:</td>
<td><input type="radio" name="crawlingMode" id="sitemap" value="sitemap" disabled="disabled"/></td> <td><input type="radio" name="crawlingMode" id="sitemap" value="sitemap" disabled="disabled"/></td>
@ -154,22 +161,6 @@
If you don't know what this means, please leave this field empty. If you don't know what this means, please leave this field empty.
</td> </td>
</tr> </tr>
<tr valign="top" class="TableCellLight">
<td>Auto-Dom-Filter:</td>
<td>
<label for="crawlingDomFilterCheck">Use</label>:
<input type="checkbox" name="crawlingDomFilterCheck" id="crawlingDomFilterCheck" #(crawlingDomFilterCheck)#::checked="checked"#(/crawlingDomFilterCheck)# />&nbsp;&nbsp;
<label for="crawlingDomFilterDepth">Depth</label>:
<input name="crawlingDomFilterDepth" id="crawlingDomFilterDepth" type="text" size="2" maxlength="2" value="#[crawlingDomFilterDepth]#" />
</td>
<td>
This option will automatically create a domain-filter which limits the crawl on domains the crawler
will find on the given depth. You can use this option i.e. to crawl a page with bookmarks while
restricting the crawl on only those domains that appear on the bookmark-page. The adequate depth
for this example would be 1.<br />
The default value 0 gives no restrictions.
</td>
</tr>
<tr valign="top" class="TableCellDark"> <tr valign="top" class="TableCellDark">
<td>Maximum Pages per Domain:</td> <td>Maximum Pages per Domain:</td>
<td> <td>

@ -42,13 +42,18 @@
<input name="bookmarkTitle" id="bookmarkTitle" type="text" size="50" maxlength="256" value="" readonly="readonly" style="background:transparent; border:0px"/> <input name="bookmarkTitle" id="bookmarkTitle" type="text" size="50" maxlength="256" value="" readonly="readonly" style="background:transparent; border:0px"/>
</td> </td>
<td> <td>
<span id="robotsOK"></span> <span id="robotsOK"></span><img align="top" src="/env/grafics/empty.gif" name="ajax" alt="empty" />
<img align="top" src="/env/grafics/empty.gif" name="ajax" alt="empty" /> </td>
</td></tr><tr> </tr><tr>
<td><input type="radio" name="crawlingMode" id="sitelist" value="sitelist" disabled="disabled"
onmousedown="document.getElementById('rangeDomain').disabled=true;document.getElementById('rangeSubpath').disabled=true;document.getElementById('crawlingDomMaxCheck').disabled=true;document.getElementById('crawlingDomMaxPages').disabled=true;document.getElementById('crawlingQ').disabled=true;"/>Link-List of URL</td>
<td><div id="sitelistURLs"></div></td>
</tr><tr>
<td><input type="radio" name="crawlingMode" id="sitemap" value="sitemap" disabled="disabled" <td><input type="radio" name="crawlingMode" id="sitemap" value="sitemap" disabled="disabled"
onmousedown="document.getElementById('rangeDomain').disabled=true;document.getElementById('rangeSubpath').disabled=true;document.getElementById('crawlingDomMaxCheck').disabled=true;document.getElementById('crawlingDomMaxPages').disabled=true;document.getElementById('crawlingQ').disabled=true;"/>Sitemap URL</td> onmousedown="document.getElementById('rangeDomain').disabled=true;document.getElementById('rangeSubpath').disabled=true;document.getElementById('crawlingDomMaxCheck').disabled=true;document.getElementById('crawlingDomMaxPages').disabled=true;document.getElementById('crawlingQ').disabled=true;"/>Sitemap URL</td>
<td><input name="sitemapURL" type="text" size="41" maxlength="256" value="" readonly="readonly" style="background:transparent; border:0px"/></td> <td><input name="sitemapURL" type="text" size="41" maxlength="256" value="" readonly="readonly" style="background:transparent; border:0px"/></td>
</tr></table><br/> </tr>
</table><br/>
</dd> </dd>
<input type="hidden" name="crawlingDepth" id="crawlingDepth" value="99"> <input type="hidden" name="crawlingDepth" id="crawlingDepth" value="99">
<dt><label>Scheduler</label></dt> <dt><label>Scheduler</label></dt>

@ -60,10 +60,6 @@ import de.anomic.server.serverSwitch;
import de.anomic.yacy.yacyNewsPool; import de.anomic.yacy.yacyNewsPool;
public class Crawler_p { public class Crawler_p {
public static final String CRAWLING_MODE_URL = "url";
public static final String CRAWLING_MODE_FILE = "file";
public static final String CRAWLING_MODE_SITEMAP = "sitemap";
// this servlet does NOT create the Crawler servlet page content! // this servlet does NOT create the Crawler servlet page content!
// this servlet starts a web crawl. The interface for entering the web crawl parameters is in IndexCreate_p.html // this servlet starts a web crawl. The interface for entering the web crawl parameters is in IndexCreate_p.html
@ -102,372 +98,405 @@ public class Crawler_p {
} }
prop.put("info", "0"); prop.put("info", "0");
if (post != null) {
// a crawl start
if (post.containsKey("continue")) { if (post != null && post.containsKey("continue")) {
// continue queue // continue queue
final String queue = post.get("continue", ""); final String queue = post.get("continue", "");
if (queue.equals("localcrawler")) { if (queue.equals("localcrawler")) {
sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
} else if (queue.equals("remotecrawler")) { } else if (queue.equals("remotecrawler")) {
sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL); sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
}
} }
}
if (post.containsKey("pause")) { if (post != null && post.containsKey("pause")) {
// pause queue // pause queue
final String queue = post.get("pause", ""); final String queue = post.get("pause", "");
if (queue.equals("localcrawler")) { if (queue.equals("localcrawler")) {
sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
} else if (queue.equals("remotecrawler")) { } else if (queue.equals("remotecrawler")) {
sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL); sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
}
} }
}
if (post.containsKey("crawlingstart")) {
// init crawl if (post != null && post.containsKey("crawlingstart")) {
if (sb.peers == null) { // init crawl
prop.put("info", "3"); if (sb.peers == null) {
} else { prop.put("info", "3");
String crawlingStart = post.get("crawlingURL","").trim(); // the crawljob start url } else {
// add the prefix http:// if necessary String crawlingStart = post.get("crawlingURL","").trim(); // the crawljob start url
int pos = crawlingStart.indexOf("://"); // add the prefix http:// if necessary
if (pos == -1) crawlingStart = "http://" + crawlingStart; int pos = crawlingStart.indexOf("://");
if (pos == -1) crawlingStart = "http://" + crawlingStart;
// normalizing URL // normalize URL
DigestURI crawlingStartURL = null; DigestURI crawlingStartURL = null;
try {crawlingStartURL = new DigestURI(crawlingStart, null);} catch (final MalformedURLException e1) {} try {crawlingStartURL = new DigestURI(crawlingStart, null);} catch (final MalformedURLException e1) {}
crawlingStart = (crawlingStartURL == null) ? null : crawlingStartURL.toNormalform(true, true); crawlingStart = (crawlingStartURL == null) ? null : crawlingStartURL.toNormalform(true, true);
// set new properties // set new properties
final boolean fullDomain = post.get("range", "wide").equals("domain"); // special property in simple crawl start final boolean fullDomain = post.get("range", "wide").equals("domain"); // special property in simple crawl start
final boolean subPath = post.get("range", "wide").equals("subpath"); // special property in simple crawl start final boolean subPath = post.get("range", "wide").equals("subpath"); // special property in simple crawl start
// set the crawling filter // set the crawl filter
String newcrawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL); String newcrawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL);
String newcrawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER); String newcrawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER);
if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL; // avoid that all urls are filtered out if bad value was submitted if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL; // avoid that all urls are filtered out if bad value was submitted
// special cases: // special cases:
if (crawlingStartURL!= null && fullDomain) { if (crawlingStartURL!= null && fullDomain) {
newcrawlingMustMatch = ".*" + crawlingStartURL.getHost() + ".*"; newcrawlingMustMatch = ".*" + crawlingStartURL.getHost() + ".*";
} }
if (crawlingStart!= null && subPath && (pos = crawlingStart.lastIndexOf('/')) > 0) { if (crawlingStart!= null && subPath && (pos = crawlingStart.lastIndexOf('/')) > 0) {
newcrawlingMustMatch = crawlingStart.substring(0, pos + 1) + ".*"; newcrawlingMustMatch = crawlingStart.substring(0, pos + 1) + ".*";
} }
final boolean crawlOrder = post.get("crawlOrder", "off").equals("on"); final boolean crawlOrder = post.get("crawlOrder", "off").equals("on");
env.setConfig("crawlOrder", (crawlOrder) ? "true" : "false"); env.setConfig("crawlOrder", (crawlOrder) ? "true" : "false");
int newcrawlingdepth = Integer.parseInt(post.get("crawlingDepth", "8")); int newcrawlingdepth = Integer.parseInt(post.get("crawlingDepth", "8"));
env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth)); env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth));
if ((crawlOrder) && (newcrawlingdepth > 8)) newcrawlingdepth = 8; if ((crawlOrder) && (newcrawlingdepth > 8)) newcrawlingdepth = 8;
// recrawl // recrawl
final String recrawl = post.get("recrawl", "nodoubles"); // nodoubles, reload, scheduler final String recrawl = post.get("recrawl", "nodoubles"); // nodoubles, reload, scheduler
boolean crawlingIfOlderCheck = post.get("crawlingIfOlderCheck", "off").equals("on"); boolean crawlingIfOlderCheck = post.get("crawlingIfOlderCheck", "off").equals("on");
int crawlingIfOlderNumber = Integer.parseInt(post.get("crawlingIfOlderNumber", "-1")); int crawlingIfOlderNumber = Integer.parseInt(post.get("crawlingIfOlderNumber", "-1"));
String crawlingIfOlderUnit = post.get("crawlingIfOlderUnit","year"); // year, month, day, hour String crawlingIfOlderUnit = post.get("crawlingIfOlderUnit","year"); // year, month, day, hour
int repeat_time = Integer.parseInt(post.get("repeat_time", "-1")); int repeat_time = Integer.parseInt(post.get("repeat_time", "-1"));
final String repeat_unit = post.get("repeat_unit", "seldays"); // selminutes, selhours, seldays final String repeat_unit = post.get("repeat_unit", "seldays"); // selminutes, selhours, seldays
if (recrawl.equals("scheduler") && repeat_time > 0) { if (recrawl.equals("scheduler") && repeat_time > 0) {
// set crawlingIfOlder attributes that are appropriate for scheduled crawling // set crawlingIfOlder attributes that are appropriate for scheduled crawling
crawlingIfOlderCheck = true; crawlingIfOlderCheck = true;
crawlingIfOlderNumber = repeat_unit.equals("selminutes") ? 1 : repeat_unit.equals("selhours") ? repeat_time / 2 : repeat_time * 12; crawlingIfOlderNumber = repeat_unit.equals("selminutes") ? 1 : repeat_unit.equals("selhours") ? repeat_time / 2 : repeat_time * 12;
crawlingIfOlderUnit = "hour"; crawlingIfOlderUnit = "hour";
} else if (recrawl.equals("reload")) { } else if (recrawl.equals("reload")) {
repeat_time = -1; repeat_time = -1;
crawlingIfOlderCheck = true; crawlingIfOlderCheck = true;
} else if (recrawl.equals("nodoubles")) { } else if (recrawl.equals("nodoubles")) {
repeat_time = -1; repeat_time = -1;
crawlingIfOlderCheck = false; crawlingIfOlderCheck = false;
} }
long crawlingIfOlder = recrawlIfOlderC(crawlingIfOlderCheck, crawlingIfOlderNumber, crawlingIfOlderUnit); long crawlingIfOlder = recrawlIfOlderC(crawlingIfOlderCheck, crawlingIfOlderNumber, crawlingIfOlderUnit);
env.setConfig("crawlingIfOlder", crawlingIfOlder); env.setConfig("crawlingIfOlder", crawlingIfOlder);
// store this call as api call // store this call as api call
if (repeat_time > 0) { if (repeat_time > 0) {
// store as scheduled api call // store as scheduled api call
sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + crawlingStart, repeat_time, repeat_unit.substring(3)); sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + crawlingStart, repeat_time, repeat_unit.substring(3));
} else { } else {
// store just a protocol // store just a protocol
sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + crawlingStart); sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + crawlingStart);
} }
final boolean crawlingDomFilterCheck = post.get("crawlingDomFilterCheck", "off").equals("on");
final int crawlingDomFilterDepth = (crawlingDomFilterCheck) ? Integer.parseInt(post.get("crawlingDomFilterDepth", "-1")) : -1; final boolean crawlingDomMaxCheck = post.get("crawlingDomMaxCheck", "off").equals("on");
env.setConfig("crawlingDomFilterDepth", Integer.toString(crawlingDomFilterDepth)); final int crawlingDomMaxPages = (crawlingDomMaxCheck) ? Integer.parseInt(post.get("crawlingDomMaxPages", "-1")) : -1;
env.setConfig("crawlingDomMaxPages", Integer.toString(crawlingDomMaxPages));
final boolean crawlingDomMaxCheck = post.get("crawlingDomMaxCheck", "off").equals("on");
final int crawlingDomMaxPages = (crawlingDomMaxCheck) ? Integer.parseInt(post.get("crawlingDomMaxPages", "-1")) : -1; final boolean crawlingQ = post.get("crawlingQ", "off").equals("on");
env.setConfig("crawlingDomMaxPages", Integer.toString(crawlingDomMaxPages)); env.setConfig("crawlingQ", (crawlingQ) ? "true" : "false");
final boolean crawlingQ = post.get("crawlingQ", "off").equals("on"); final boolean indexText = post.get("indexText", "off").equals("on");
env.setConfig("crawlingQ", (crawlingQ) ? "true" : "false"); env.setConfig("indexText", (indexText) ? "true" : "false");
final boolean indexText = post.get("indexText", "off").equals("on"); final boolean indexMedia = post.get("indexMedia", "off").equals("on");
env.setConfig("indexText", (indexText) ? "true" : "false"); env.setConfig("indexMedia", (indexMedia) ? "true" : "false");
final boolean indexMedia = post.get("indexMedia", "off").equals("on"); final boolean storeHTCache = post.get("storeHTCache", "off").equals("on");
env.setConfig("indexMedia", (indexMedia) ? "true" : "false"); env.setConfig("storeHTCache", (storeHTCache) ? "true" : "false");
final boolean storeHTCache = post.get("storeHTCache", "off").equals("on"); final String cachePolicyString = post.get("cachePolicy", "iffresh");
env.setConfig("storeHTCache", (storeHTCache) ? "true" : "false"); CrawlProfile.CacheStrategy cachePolicy = CrawlProfile.CacheStrategy.IFFRESH;
if (cachePolicyString.equals("nocache")) cachePolicy = CrawlProfile.CacheStrategy.NOCACHE;
final String cachePolicyString = post.get("cachePolicy", "iffresh"); if (cachePolicyString.equals("iffresh")) cachePolicy = CrawlProfile.CacheStrategy.IFFRESH;
CrawlProfile.CacheStrategy cachePolicy = CrawlProfile.CacheStrategy.IFFRESH; if (cachePolicyString.equals("ifexist")) cachePolicy = CrawlProfile.CacheStrategy.IFEXIST;
if (cachePolicyString.equals("nocache")) cachePolicy = CrawlProfile.CacheStrategy.NOCACHE; if (cachePolicyString.equals("cacheonly")) cachePolicy = CrawlProfile.CacheStrategy.CACHEONLY;
if (cachePolicyString.equals("iffresh")) cachePolicy = CrawlProfile.CacheStrategy.IFFRESH;
if (cachePolicyString.equals("ifexist")) cachePolicy = CrawlProfile.CacheStrategy.IFEXIST; final boolean xsstopw = post.get("xsstopw", "off").equals("on");
if (cachePolicyString.equals("cacheonly")) cachePolicy = CrawlProfile.CacheStrategy.CACHEONLY; env.setConfig("xsstopw", (xsstopw) ? "true" : "false");
final boolean xsstopw = post.get("xsstopw", "off").equals("on"); final boolean xdstopw = post.get("xdstopw", "off").equals("on");
env.setConfig("xsstopw", (xsstopw) ? "true" : "false"); env.setConfig("xdstopw", (xdstopw) ? "true" : "false");
final boolean xdstopw = post.get("xdstopw", "off").equals("on"); final boolean xpstopw = post.get("xpstopw", "off").equals("on");
env.setConfig("xdstopw", (xdstopw) ? "true" : "false"); env.setConfig("xpstopw", (xpstopw) ? "true" : "false");
final boolean xpstopw = post.get("xpstopw", "off").equals("on"); final String crawlingMode = post.get("crawlingMode","url");
env.setConfig("xpstopw", (xpstopw) ? "true" : "false"); if (crawlingMode.equals("url")) {
final String crawlingMode = post.get("crawlingMode","url"); // check if pattern matches
if (crawlingMode.equals(CRAWLING_MODE_URL)) { if ((crawlingStart == null || crawlingStartURL == null) /* || (!(crawlingStart.matches(newcrawlingfilter))) */) {
// print error message
prop.put("info", "4"); //crawlfilter does not match url
prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
prop.putHTML("info_crawlingStart", crawlingStart);
} else try {
// check if pattern matches // check if the crawl filter works correctly
if ((crawlingStart == null || crawlingStartURL == null) /* || (!(crawlingStart.matches(newcrawlingfilter))) */) { Pattern.compile(newcrawlingMustMatch);
// print error message
prop.put("info", "4"); //crawlfilter does not match url // stack request
prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch); // first delete old entry, if exists
prop.putHTML("info_crawlingStart", crawlingStart); final DigestURI url = new DigestURI(crawlingStart, null);
} else try { final byte[] urlhash = url.hash();
indexSegment.urlMetadata().remove(urlhash);
sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
sb.crawlQueues.errorURL.remove(urlhash);
// stack url
sb.crawler.profilesPassiveCrawls.remove(crawlingStartURL.hash()); // if there is an old entry, delete it
final CrawlProfile pe = new CrawlProfile(
(crawlingStartURL.getHost() == null) ? Long.toHexString(System.currentTimeMillis()) : crawlingStartURL.getHost(),
crawlingStartURL,
newcrawlingMustMatch,
newcrawlingMustNotMatch,
newcrawlingdepth,
crawlingIfOlder, crawlingDomMaxPages,
crawlingQ,
indexText, indexMedia,
storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw, cachePolicy);
sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe);
final String reasonString = sb.crawlStacker.stackCrawl(new Request(
sb.peers.mySeed().hash.getBytes(),
url,
null,
"CRAWLING-ROOT",
new Date(),
pe.handle(),
0,
0,
0
));
if (reasonString == null) {
// create a bookmark from crawl start url
Set<String> tags=listManager.string2set(BookmarkHelper.cleanTagsString(post.get("bookmarkFolder","/crawlStart")));
tags.add("crawlStart");
if (post.get("createBookmark","off").equals("on")) {
bookmarksDB.Bookmark bookmark = sb.bookmarksDB.createBookmark(crawlingStart, "admin");
if(bookmark != null){
bookmark.setProperty(bookmarksDB.Bookmark.BOOKMARK_TITLE, post.get("bookmarkTitle", crawlingStart));
bookmark.setOwner("admin");
bookmark.setPublic(false);
bookmark.setTags(tags, true);
sb.bookmarksDB.saveBookmark(bookmark);
}
}
// liftoff!
prop.put("info", "8");//start msg
prop.putHTML("info_crawlingURL", (post.get("crawlingURL")));
// generate a YaCyNews if the global flag was set
if (crawlOrder) {
final Map<String, String> m = new HashMap<String, String>(pe); // must be cloned
m.remove("specificDepth");
m.remove("indexText");
m.remove("indexMedia");
m.remove("remoteIndexing");
m.remove("xsstopw");
m.remove("xpstopw");
m.remove("xdstopw");
m.remove("storeTXCache");
m.remove("storeHTCache");
m.remove("generalFilter");
m.remove("specificFilter");
m.put("intention", post.get("intention", "").replace(',', '/'));
sb.peers.newsPool.publishMyNews(sb.peers.mySeed(), yacyNewsPool.CATEGORY_CRAWL_START, m);
}
} else {
prop.put("info", "5"); //Crawling failed
prop.putHTML("info_crawlingURL", (post.get("crawlingURL")));
prop.putHTML("info_reasonString", reasonString);
sb.crawlQueues.errorURL.push(
new Request(
sb.peers.mySeed().hash.getBytes(),
crawlingStartURL,
null,
"",
new Date(),
pe.handle(),
0,
0,
0),
sb.peers.mySeed().hash.getBytes(),
new Date(),
1,
reasonString);
}
} catch (final PatternSyntaxException e) {
prop.put("info", "4"); //crawlfilter does not match url
prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
prop.putHTML("info_error", e.getMessage());
} catch (final Exception e) {
// mist
prop.put("info", "6");//Error with url
prop.putHTML("info_crawlingStart", crawlingStart);
prop.putHTML("info_error", e.getMessage());
Log.logException(e);
}
} else if (crawlingMode.equals("file")) {
if (post.containsKey("crawlingFile")) {
final String fileName = post.get("crawlingFile");
try {
// check if the crawl filter works correctly // check if the crawl filter works correctly
Pattern.compile(newcrawlingMustMatch); Pattern.compile(newcrawlingMustMatch);
final File file = new File(fileName);
// stack request final String fileString = post.get("crawlingFile$file");
// first delete old entry, if exists final ContentScraper scraper = new ContentScraper(new DigestURI(file));
final DigestURI url = new DigestURI(crawlingStart, null); final Writer writer = new TransformerWriter(null, null, scraper, null, false);
final byte[] urlhash = url.hash(); FileUtils.copy(fileString, writer);
indexSegment.urlMetadata().remove(urlhash); writer.close();
sb.crawlQueues.noticeURL.removeByURLHash(urlhash); final Map<MultiProtocolURI, String> hyperlinks = scraper.getAnchors();
sb.crawlQueues.errorURL.remove(urlhash); final DigestURI crawlURL = new DigestURI("file://" + file.toString(), null);
final CrawlProfile profile = new CrawlProfile(
// stack url fileName, crawlURL,
sb.crawler.profilesPassiveCrawls.remove(crawlingStartURL.hash()); // if there is an old entry, delete it
final CrawlProfile pe = new CrawlProfile(
(crawlingStartURL.getHost() == null) ? Long.toHexString(System.currentTimeMillis()) : crawlingStartURL.getHost(),
crawlingStartURL,
newcrawlingMustMatch, newcrawlingMustMatch,
newcrawlingMustNotMatch, CrawlProfile.MATCH_NEVER,
newcrawlingdepth, newcrawlingdepth,
crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages, crawlingIfOlder,
crawlingDomMaxPages,
crawlingQ, crawlingQ,
indexText, indexMedia, indexText,
storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw, cachePolicy); indexMedia,
sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe); storeHTCache,
final String reasonString = sb.crawlStacker.stackCrawl(new Request( true,
sb.peers.mySeed().hash.getBytes(), crawlOrder,
url, xsstopw, xdstopw, xpstopw,
null, cachePolicy);
"CRAWLING-ROOT", sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile);
new Date(), sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
pe.handle(), final Iterator<Map.Entry<MultiProtocolURI, String>> linkiterator = hyperlinks.entrySet().iterator();
0, DigestURI nexturl;
0, while (linkiterator.hasNext()) {
0 final Map.Entry<MultiProtocolURI, String> e = linkiterator.next();
)); if (e.getKey() == null) continue;
nexturl = new DigestURI(e.getKey());
if (reasonString == null) { sb.crawlStacker.enqueueEntry(new Request(
// create a bookmark from crawl start url sb.peers.mySeed().hash.getBytes(),
Set<String> tags=listManager.string2set(BookmarkHelper.cleanTagsString(post.get("bookmarkFolder","/crawlStart"))); nexturl,
tags.add("crawlStart"); null,
if (post.get("createBookmark","off").equals("on")) { e.getValue(),
bookmarksDB.Bookmark bookmark = sb.bookmarksDB.createBookmark(crawlingStart, "admin");
if(bookmark != null){
bookmark.setProperty(bookmarksDB.Bookmark.BOOKMARK_TITLE, post.get("bookmarkTitle", crawlingStart));
bookmark.setOwner("admin");
bookmark.setPublic(false);
bookmark.setTags(tags, true);
sb.bookmarksDB.saveBookmark(bookmark);
}
}
// liftoff!
prop.put("info", "8");//start msg
prop.putHTML("info_crawlingURL", (post.get("crawlingURL")));
// generate a YaCyNews if the global flag was set
if (crawlOrder) {
final Map<String, String> m = new HashMap<String, String>(pe); // must be cloned
m.remove("specificDepth");
m.remove("indexText");
m.remove("indexMedia");
m.remove("remoteIndexing");
m.remove("xsstopw");
m.remove("xpstopw");
m.remove("xdstopw");
m.remove("storeTXCache");
m.remove("storeHTCache");
m.remove("generalFilter");
m.remove("specificFilter");
m.put("intention", post.get("intention", "").replace(',', '/'));
sb.peers.newsPool.publishMyNews(sb.peers.mySeed(), yacyNewsPool.CATEGORY_CRAWL_START, m);
}
} else {
prop.put("info", "5"); //Crawling failed
prop.putHTML("info_crawlingURL", (post.get("crawlingURL")));
prop.putHTML("info_reasonString", reasonString);
sb.crawlQueues.errorURL.push(
new Request(
sb.peers.mySeed().hash.getBytes(),
crawlingStartURL,
null,
"",
new Date(),
pe.handle(),
0,
0,
0),
sb.peers.mySeed().hash.getBytes(),
new Date(), new Date(),
1, profile.handle(),
reasonString); 0,
0,
0
));
} }
} catch (final PatternSyntaxException e) { } catch (final PatternSyntaxException e) {
prop.put("info", "4"); //crawlfilter does not match url prop.put("info", "4"); //crawlfilter does not match url
prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch); prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
prop.putHTML("info_error", e.getMessage()); prop.putHTML("info_error", e.getMessage());
} catch (final Exception e) { } catch (final Exception e) {
// mist // mist
prop.put("info", "6");//Error with url prop.put("info", "7");//Error with file
prop.putHTML("info_crawlingStart", crawlingStart); prop.putHTML("info_crawlingStart", fileName);
prop.putHTML("info_error", e.getMessage()); prop.putHTML("info_error", e.getMessage());
Log.logException(e); Log.logException(e);
} }
sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
}
} else if (crawlingMode.equals("sitemap")) {
String sitemapURLStr = post.get("sitemapURL","");
try {
final DigestURI sitemapURL = new DigestURI(sitemapURLStr, null);
final CrawlProfile pe = new CrawlProfile(
sitemapURLStr, sitemapURL,
newcrawlingMustMatch,
CrawlProfile.MATCH_NEVER,
newcrawlingdepth,
crawlingIfOlder, crawlingDomMaxPages,
crawlingQ,
indexText, indexMedia,
storeHTCache, true, crawlOrder,
xsstopw, xdstopw, xpstopw,
cachePolicy);
sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe);
final SitemapImporter importer = new SitemapImporter(sb, sitemapURL, pe);
importer.start();
} catch (final Exception e) {
// mist
prop.put("info", "6");//Error with url
prop.putHTML("info_crawlingStart", sitemapURLStr);
prop.putHTML("info_error", e.getMessage());
Log.logException(e);
}
} else if (crawlingMode.equals("sitelist")) {
try {
final DigestURI sitelistURL = new DigestURI(crawlingStart, null);
// download document
ContentScraper scraper = null;
scraper = sb.loader.parseResource(sitelistURL, CrawlProfile.CacheStrategy.IFFRESH);
String title = scraper.getTitle();
// String description = scraper.getDescription();
} else if (crawlingMode.equals(CRAWLING_MODE_FILE)) { // get links and generate filter
if (post.containsKey("crawlingFile")) { StringBuilder filter = new StringBuilder();
// getting the name of the uploaded file final Map<MultiProtocolURI, String> hyperlinks = scraper.getAnchors();
final String fileName = post.get("crawlingFile"); for (MultiProtocolURI uri: hyperlinks.keySet()) {
try { filter.append('|').append(uri.getProtocol()).append("://").append(uri.getHost()).append(".*");
// check if the crawl filter works correctly
Pattern.compile(newcrawlingMustMatch);
// loading the file content
final File file = new File(fileName);
// getting the content of the bookmark file
final String fileString = post.get("crawlingFile$file");
// parsing the bookmark file and fetching the headline and contained links
final ContentScraper scraper = new ContentScraper(new DigestURI(file));
//OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
final Writer writer = new TransformerWriter(null,null,scraper,null,false);
FileUtils.copy(fileString, writer);
writer.close();
//String headline = scraper.getHeadline();
final Map<MultiProtocolURI, String> hyperlinks = scraper.getAnchors();
// creating a crawler profile
final DigestURI crawlURL = new DigestURI("file://" + file.toString(), null);
final CrawlProfile profile = new CrawlProfile(
fileName, crawlURL,
newcrawlingMustMatch,
CrawlProfile.MATCH_NEVER,
newcrawlingdepth,
crawlingIfOlder,
crawlingDomFilterDepth,
crawlingDomMaxPages,
crawlingQ,
indexText,
indexMedia,
storeHTCache,
true,
crawlOrder,
xsstopw, xdstopw, xpstopw,
cachePolicy);
sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile);
// pause local crawl here
sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
// loop through the contained links
final Iterator<Map.Entry<MultiProtocolURI, String>> linkiterator = hyperlinks.entrySet().iterator();
DigestURI nexturl;
while (linkiterator.hasNext()) {
final Map.Entry<MultiProtocolURI, String> e = linkiterator.next();
if (e.getKey() == null) continue;
nexturl = new DigestURI(e.getKey());
// enqueuing the url for crawling
sb.crawlStacker.enqueueEntry(new Request(
sb.peers.mySeed().hash.getBytes(),
nexturl,
null,
e.getValue(),
new Date(),
profile.handle(),
0,
0,
0
));
}
} catch (final PatternSyntaxException e) {
// print error message
prop.put("info", "4"); //crawlfilter does not match url
prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
prop.putHTML("info_error", e.getMessage());
} catch (final Exception e) {
// mist
prop.put("info", "7");//Error with file
prop.putHTML("info_crawlingStart", fileName);
prop.putHTML("info_error", e.getMessage());
Log.logException(e);
}
sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
} }
} else if (crawlingMode.equals(CRAWLING_MODE_SITEMAP)) { newcrawlingMustMatch = filter.length() > 0 ? filter.substring(1) : "";
String sitemapURLStr = null;
try { // put links onto crawl queue
// getting the sitemap URL final CrawlProfile profile = new CrawlProfile(
sitemapURLStr = post.get("sitemapURL",""); title == null || title.length() == 0 ? sitelistURL.getHost() : title,
final DigestURI sitemapURL = new DigestURI(sitemapURLStr, null); sitelistURL,
newcrawlingMustMatch,
// create a new profile CrawlProfile.MATCH_NEVER,
final CrawlProfile pe = new CrawlProfile( newcrawlingdepth,
sitemapURLStr, sitemapURL, crawlingIfOlder,
newcrawlingMustMatch, crawlingDomMaxPages,
CrawlProfile.MATCH_NEVER, crawlingQ,
newcrawlingdepth, indexText,
crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages, indexMedia,
crawlingQ, storeHTCache,
indexText, indexMedia, true,
storeHTCache, true, crawlOrder, crawlOrder,
xsstopw, xdstopw, xpstopw, xsstopw, xdstopw, xpstopw,
cachePolicy); cachePolicy);
sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe); sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile);
sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
// create a new sitemap importer final Iterator<Map.Entry<MultiProtocolURI, String>> linkiterator = hyperlinks.entrySet().iterator();
final SitemapImporter importer = new SitemapImporter(sb, new DigestURI(sitemapURLStr, null), pe); DigestURI nexturl;
importer.start(); while (linkiterator.hasNext()) {
final Map.Entry<MultiProtocolURI, String> e = linkiterator.next();
} catch (final Exception e) { if (e.getKey() == null) continue;
// mist nexturl = new DigestURI(e.getKey());
prop.put("info", "6");//Error with url // remove the url from the database to be prepared to crawl them again
prop.putHTML("info_crawlingStart", sitemapURLStr); final byte[] urlhash = nexturl.hash();
prop.putHTML("info_error", e.getMessage()); indexSegment.urlMetadata().remove(urlhash);
Log.logException(e); sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
} sb.crawlQueues.errorURL.remove(urlhash);
sb.crawlStacker.enqueueEntry(new Request(
sb.peers.mySeed().hash.getBytes(),
nexturl,
null,
e.getValue(),
new Date(),
profile.handle(),
0,
0,
0
));
}
} catch (final Exception e) {
// mist
prop.put("info", "6");//Error with url
prop.putHTML("info_crawlingStart", crawlingStart);
prop.putHTML("info_error", e.getMessage());
Log.logException(e);
} }
} }
} }
}
if (post.containsKey("crawlingPerformance")) {
setPerformance(sb, post); if (post != null && post.containsKey("crawlingPerformance")) {
} setPerformance(sb, post);
} }
// performance settings // performance settings

@ -152,7 +152,6 @@ public class QuickCrawlLink_p {
crawlingMustNotMatch, crawlingMustNotMatch,
CrawlingDepth, CrawlingDepth,
60 * 24 * 30, // recrawlIfOlder (minutes); here: one month 60 * 24 * 30, // recrawlIfOlder (minutes); here: one month
-1, // domFilterDepth, if negative: no auto-filter
-1, // domMaxPages, if negative: no count restriction -1, // domMaxPages, if negative: no count restriction
crawlDynamic, crawlDynamic,
indexText, indexText,

@ -81,6 +81,20 @@ public class getpageinfo_p {
// put language // put language
Set<String> languages = scraper.getContentLanguages(); Set<String> languages = scraper.getContentLanguages();
prop.putXML("lang", (languages == null) ? "unknown" : languages.iterator().next()); prop.putXML("lang", (languages == null) ? "unknown" : languages.iterator().next());
// get links and put them into a semicolon-separated list
StringBuilder links = new StringBuilder();
StringBuilder filter = new StringBuilder();
count = 0;
for (MultiProtocolURI uri: scraper.getAnchors().keySet()) {
links.append(';').append(uri.toNormalform(true, false));
filter.append('|').append(uri.getProtocol()).append("://").append(uri.getHost()).append(".*");
prop.putXML("links_" + count + "_link", uri.toNormalform(true, false));
count++;
}
prop.put("links", count);
prop.putXML("sitelist", links.length() > 0 ? links.substring(1) : "");
prop.putXML("filter", filter.length() > 0 ? filter.substring(1) : ".*");
} }
} }
if(actions.indexOf("robots")>=0){ if(actions.indexOf("robots")>=0){

@ -6,9 +6,16 @@
<robots>#(robots-allowed)#0::1::#(/robots-allowed)#</robots> <robots>#(robots-allowed)#0::1::#(/robots-allowed)#</robots>
<sitemap>#[sitemap]#</sitemap> <sitemap>#[sitemap]#</sitemap>
<favicon>#[favicon]#</favicon> <favicon>#[favicon]#</favicon>
<sitelist>#[sitelist]#</sitelist>
<filter>#[filter]#</filter>
<tags> <tags>
#{tags}# #{tags}#
<tag name="#[tag]#" /> <tag name="#[tag]#" />
#{/tags}# #{/tags}#
</tags> </tags>
<links>
#{links}#
<link name="#[link]#" />
#{/links}#
</links>
</pageinfo> </pageinfo>

@ -3,12 +3,12 @@ var AJAX_ON="/env/grafics/ajax.gif";
var timeout=""; var timeout="";
function handleResponse(){ function handleResponse(){
if(http.readyState == 4){ if (http.readyState == 4){
var response = http.responseXML; var response = http.responseXML;
// getting the document title // get the document title
doctitle=""; doctitle="";
if(response.getElementsByTagName("title")[0].firstChild!=null){ if (response.getElementsByTagName("title")[0].firstChild!=null){
doctitle=response.getElementsByTagName("title")[0].firstChild.nodeValue; doctitle=response.getElementsByTagName("title")[0].firstChild.nodeValue;
} }
// document.getElementById("title").innerHTML=doctitle; // document.getElementById("title").innerHTML=doctitle;
@ -23,43 +23,51 @@ function handleResponse(){
if(robotsOKspan.firstChild){ if(robotsOKspan.firstChild){
robotsOKspan.removeChild(robotsOKspan.firstChild); robotsOKspan.removeChild(robotsOKspan.firstChild);
} }
if(docrobotsOK==1){ if (docrobotsOK==1){
img=document.createElement("img"); img=document.createElement("img");
img.setAttribute("src", "/env/grafics/ok.png"); img.setAttribute("src", "/env/grafics/ok.png");
img.setAttribute("width", "32px"); img.setAttribute("width", "32px");
img.setAttribute("height", "32px"); img.setAttribute("height", "32px");
robotsOKspan.appendChild(img); robotsOKspan.appendChild(img);
}else if(docrobotsOK==0){ } else if(docrobotsOK==0){
img=document.createElement("img"); img=document.createElement("img");
img.setAttribute("src", "/env/grafics/bad.png"); img.setAttribute("src", "/env/grafics/bad.png");
img.setAttribute("width", "32px"); img.setAttribute("width", "32px");
img.setAttribute("height", "32px"); img.setAttribute("height", "32px");
robotsOKspan.appendChild(img); robotsOKspan.appendChild(img);
robotsOKspan.appendChild(img); robotsOKspan.appendChild(img);
}else{ } else {
robotsOKspan.appendChild(document.createTextNode("")); robotsOKspan.appendChild(document.createTextNode(""));
document.getElementById("robotsOK").innerHTML=""; document.getElementById("robotsOK").innerHTML="";
} }
// getting the sitemap URL contained in the robots.txt // get the sitemap URL contained in the robots.txt
if (document.getElementsByName("sitemapURL").length > 0) { if (document.getElementsByName("sitemapURL").length > 0) {
sitemap=""; sitemap="";
if(response.getElementsByTagName("sitemap")[0].firstChild!=null){ if (response.getElementsByTagName("sitemap")[0].firstChild!=null){
sitemap=response.getElementsByTagName("sitemap")[0].firstChild.nodeValue; sitemap=response.getElementsByTagName("sitemap")[0].firstChild.nodeValue;
} }
document.getElementsByName("sitemapURL")[0].value=sitemap; document.getElementsByName("sitemapURL")[0].value=sitemap;
document.getElementById("sitemap").disabled=false; document.getElementById("sitemap").disabled=false;
} }
sitelist="";
if (response.getElementsByTagName("sitelist")[0].firstChild!=null){
sitelist=response.getElementsByTagName("sitelist")[0].firstChild.nodeValue;
}
document.getElementById("sitelistURLs").innerHTML = sitelist;
document.getElementById("sitelist").disabled=false;
// clear the ajax image // clear the ajax image
document.getElementsByName("ajax")[0].setAttribute("src", AJAX_OFF); document.getElementsByName("ajax")[0].setAttribute("src", AJAX_OFF);
} }
} }
function changed(){
function changed() {
window.clearTimeout(timeout); window.clearTimeout(timeout);
timeout=window.setTimeout("loadInfos()", 1500); timeout=window.setTimeout("loadInfos()", 1500);
} }
function loadInfos(){
function loadInfos() {
// displaying ajax image // displaying ajax image
document.getElementsByName("ajax")[0].setAttribute("src",AJAX_ON); document.getElementsByName("ajax")[0].setAttribute("src",AJAX_ON);

@ -48,7 +48,6 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
public static final String FILTER_MUSTNOTMATCH = "nevermatch"; public static final String FILTER_MUSTNOTMATCH = "nevermatch";
public static final String DEPTH = "generalDepth"; public static final String DEPTH = "generalDepth";
public static final String RECRAWL_IF_OLDER = "recrawlIfOlder"; public static final String RECRAWL_IF_OLDER = "recrawlIfOlder";
public static final String DOM_FILTER_DEPTH = "domFilterDepth";
public static final String DOM_MAX_PAGES = "domMaxPages"; public static final String DOM_MAX_PAGES = "domMaxPages";
public static final String CRAWLING_Q = "crawlingQ"; public static final String CRAWLING_Q = "crawlingQ";
public static final String INDEX_TEXT = "indexText"; public static final String INDEX_TEXT = "indexText";
@ -70,7 +69,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
final String mustnotmatch, final String mustnotmatch,
final int depth, final int depth,
final long recrawlIfOlder /*date*/, final long recrawlIfOlder /*date*/,
final int domFilterDepth, final int domMaxPages, final int domMaxPages,
final boolean crawlingQ, final boolean crawlingQ,
final boolean indexText, final boolean indexMedia, final boolean indexText, final boolean indexMedia,
final boolean storeHTCache, final boolean storeTXCache, final boolean storeHTCache, final boolean storeTXCache,
@ -87,7 +86,6 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
put(FILTER_MUSTNOTMATCH, (mustnotmatch == null) ? CrawlProfile.MATCH_NEVER : mustnotmatch); put(FILTER_MUSTNOTMATCH, (mustnotmatch == null) ? CrawlProfile.MATCH_NEVER : mustnotmatch);
put(DEPTH, depth); put(DEPTH, depth);
put(RECRAWL_IF_OLDER, recrawlIfOlder); put(RECRAWL_IF_OLDER, recrawlIfOlder);
put(DOM_FILTER_DEPTH, domFilterDepth);
put(DOM_MAX_PAGES, domMaxPages); put(DOM_MAX_PAGES, domMaxPages);
put(CRAWLING_Q, crawlingQ); // crawling of urls with '?' put(CRAWLING_Q, crawlingQ); // crawling of urls with '?'
put(INDEX_TEXT, indexText); put(INDEX_TEXT, indexText);
@ -186,21 +184,6 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
return 0L; return 0L;
} }
} }
public int domFilterDepth() {
// if the depth is equal or less to this depth,
// then the current url feeds with its domain the crawl filter
// if this is -1, all domains are feeded
final String r = get(DOM_FILTER_DEPTH);
if (r == null) return Integer.MAX_VALUE;
try {
final int i = Integer.parseInt(r);
if (i < 0) return Integer.MAX_VALUE;
return i;
} catch (final NumberFormatException e) {
Log.logException(e);
return Integer.MAX_VALUE;
}
}
public int domMaxPages() { public int domMaxPages() {
// this is the maximum number of pages that are crawled for a single domain // this is the maximum number of pages that are crawled for a single domain
// if -1, this means no limit // if -1, this means no limit
@ -270,16 +253,6 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
dp.inc(); dp.inc();
} }
} }
public boolean grantedDomAppearance(final String domain) {
final int max = domFilterDepth();
if (max == Integer.MAX_VALUE) return true;
final DomProfile dp = doms.get(domain);
if (dp == null) {
return 0 < max;
}
return dp.depth <= max;
}
public boolean grantedDomCount(final String domain) { public boolean grantedDomCount(final String domain) {
final int max = domMaxPages(); final int max = domMaxPages();
if (max == Integer.MAX_VALUE) return true; if (max == Integer.MAX_VALUE) return true;
@ -292,10 +265,6 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
public int domSize() { public int domSize() {
return doms.size(); return doms.size();
} }
public boolean domExists(final String domain) {
if (domFilterDepth() == Integer.MAX_VALUE) return true;
return doms.containsKey(domain);
}
public String domName(final boolean attr, final int index){ public String domName(final boolean attr, final int index){
final Iterator<Map.Entry<String, DomProfile>> domnamesi = doms.entrySet().iterator(); final Iterator<Map.Entry<String, DomProfile>> domnamesi = doms.entrySet().iterator();

@ -196,7 +196,7 @@ public final class CrawlStacker {
final DigestURI referrerURL = (entry.referrerhash() == null || entry.referrerhash().length == 0) ? null : nextQueue.getURL(entry.referrerhash()); final DigestURI referrerURL = (entry.referrerhash() == null || entry.referrerhash().length == 0) ? null : nextQueue.getURL(entry.referrerhash());
// add domain to profile domain list // add domain to profile domain list
if ((profile.domFilterDepth() != Integer.MAX_VALUE) || (profile.domMaxPages() != Integer.MAX_VALUE)) { if (profile.domMaxPages() != Integer.MAX_VALUE) {
profile.domInc(entry.url().getHost(), (referrerURL == null) ? null : referrerURL.getHost().toLowerCase(), entry.depth()); profile.domInc(entry.url().getHost(), (referrerURL == null) ? null : referrerURL.getHost().toLowerCase(), entry.depth());
} }
@ -296,12 +296,6 @@ public final class CrawlStacker {
return "post url not allowed"; return "post url not allowed";
} }
// deny urls that do not match with the profile domain list
if (!(profile.grantedDomAppearance(url.getHost()))) {
if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is not listed in granted domains.");
return "url does not match domain filter";
}
// deny urls that exceed allowed number of occurrences // deny urls that exceed allowed number of occurrences
if (!(profile.grantedDomCount(url.getHost()))) { if (!(profile.grantedDomCount(url.getHost()))) {
if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' appeared too often, a maximum of " + profile.domMaxPages() + " is allowed."); if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' appeared too often, a maximum of " + profile.domMaxPages() + " is allowed.");

@ -164,9 +164,10 @@ public final class CrawlSwitchboard {
if (this.defaultProxyProfile == null) { if (this.defaultProxyProfile == null) {
// generate new default entry for proxy crawling // generate new default entry for proxy crawling
this.defaultProxyProfile = new CrawlProfile("proxy", null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, this.defaultProxyProfile = new CrawlProfile(
"proxy", null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER,
0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/, 0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, -1, false, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, false,
true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/, true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/,
true /*getConfigBool(PROXY_INDEXING_LOCAL_MEDIA, true)*/, true /*getConfigBool(PROXY_INDEXING_LOCAL_MEDIA, true)*/,
true, true, true, true,
@ -177,38 +178,38 @@ public final class CrawlSwitchboard {
if (this.defaultRemoteProfile == null) { if (this.defaultRemoteProfile == null) {
// generate new default entry for remote crawling // generate new default entry for remote crawling
this.defaultRemoteProfile = new CrawlProfile(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, this.defaultRemoteProfile = new CrawlProfile(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
-1, -1, -1, true, true, true, false, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH); -1, -1, true, true, true, false, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH);
this.profilesActiveCrawls.put(this.defaultRemoteProfile.handle().getBytes(), this.defaultRemoteProfile); this.profilesActiveCrawls.put(this.defaultRemoteProfile.handle().getBytes(), this.defaultRemoteProfile);
} }
if (this.defaultTextSnippetLocalProfile == null) { if (this.defaultTextSnippetLocalProfile == null) {
// generate new default entry for snippet fetch and optional crawling // generate new default entry for snippet fetch and optional crawling
this.defaultTextSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, this.defaultTextSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, -1, true, false, false, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH); CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, true, false, false, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH);
this.profilesActiveCrawls.put(this.defaultTextSnippetLocalProfile.handle().getBytes(), this.defaultTextSnippetLocalProfile); this.profilesActiveCrawls.put(this.defaultTextSnippetLocalProfile.handle().getBytes(), this.defaultTextSnippetLocalProfile);
} }
if (this.defaultTextSnippetGlobalProfile == null) { if (this.defaultTextSnippetGlobalProfile == null) {
// generate new default entry for snippet fetch and optional crawling // generate new default entry for snippet fetch and optional crawling
this.defaultTextSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, this.defaultTextSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, -1, true, true, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST); CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, true, true, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(this.defaultTextSnippetGlobalProfile.handle().getBytes(), this.defaultTextSnippetGlobalProfile); this.profilesActiveCrawls.put(this.defaultTextSnippetGlobalProfile.handle().getBytes(), this.defaultTextSnippetGlobalProfile);
} }
this.defaultTextSnippetGlobalProfile.setCacheStrategy(CrawlProfile.CacheStrategy.IFEXIST); this.defaultTextSnippetGlobalProfile.setCacheStrategy(CrawlProfile.CacheStrategy.IFEXIST);
if (this.defaultMediaSnippetLocalProfile == null) { if (this.defaultMediaSnippetLocalProfile == null) {
// generate new default entry for snippet fetch and optional crawling // generate new default entry for snippet fetch and optional crawling
this.defaultMediaSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, this.defaultMediaSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, false, true, false, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST); CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, true, false, false, true, false, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(this.defaultMediaSnippetLocalProfile.handle().getBytes(), this.defaultMediaSnippetLocalProfile); this.profilesActiveCrawls.put(this.defaultMediaSnippetLocalProfile.handle().getBytes(), this.defaultMediaSnippetLocalProfile);
} }
if (this.defaultMediaSnippetGlobalProfile == null) { if (this.defaultMediaSnippetGlobalProfile == null) {
// generate new default entry for snippet fetch and optional crawling // generate new default entry for snippet fetch and optional crawling
this.defaultMediaSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, this.defaultMediaSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST); CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, true, false, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(this.defaultMediaSnippetGlobalProfile.handle().getBytes(), this.defaultMediaSnippetGlobalProfile); this.profilesActiveCrawls.put(this.defaultMediaSnippetGlobalProfile.handle().getBytes(), this.defaultMediaSnippetGlobalProfile);
} }
if (this.defaultSurrogateProfile == null) { if (this.defaultSurrogateProfile == null) {
// generate new default entry for surrogate parsing // generate new default entry for surrogate parsing
this.defaultSurrogateProfile = new CrawlProfile(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, this.defaultSurrogateProfile = new CrawlProfile(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, -1, true, true, false, false, false, false, true, true, false, CrawlProfile.CacheStrategy.NOCACHE); CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, true, true, false, false, false, false, true, true, false, CrawlProfile.CacheStrategy.NOCACHE);
this.profilesActiveCrawls.put(this.defaultSurrogateProfile.handle().getBytes(), this.defaultSurrogateProfile); this.profilesActiveCrawls.put(this.defaultSurrogateProfile.handle().getBytes(), this.defaultSurrogateProfile);
} }
} }
