- crawl profile: don't add null values

- added settings and statistics for the URL fetcher's 'server' mode
- added a dedicated stack for fetchable URLs
- added the possibility to fill the stack by shifting URLs from the peer's queues, via POST (addurls=$count and url$num=$url; see the sketch below), or via file upload
- added "htroot" to the classpath of the Linux start script

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3370 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
karlchenofhell 18 years ago
parent a46dc43f45
commit d114a0136e
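
A minimal sketch of the POST interface named in the commit message: addurls carries the number of URLs and url0 .. url(addurls-1) carry the URLs themselves (parameter names are taken from the commit message and the addURLs(serverObjects, ...) method below). The host/port and the plain HttpURLConnection usage are assumptions for illustration; CrawlURLFetchStack_p is an admin-protected servlet, so a real request additionally needs authentication.

// Hedged sketch: filling the URL fetcher stack via POST.
// Assumes a YaCy peer listening on localhost:8080; authentication omitted.
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;

public class AddUrlsExample {
    public static void main(String[] args) throws Exception {
        // addurls gives the count, url0..urlN-1 the URLs to be stacked
        String body = "addurls=2"
                + "&url0=" + URLEncoder.encode("http://www.example.org/", "UTF-8")
                + "&url1=" + URLEncoder.encode("http://www.example.net/", "UTF-8");
        HttpURLConnection conn = (HttpURLConnection)
                new URL("http://localhost:8080/CrawlURLFetchStack_p.html").openConnection();
        conn.setRequestMethod("POST");
        conn.setDoOutput(true);
        conn.setRequestProperty("Content-Type", "application/x-www-form-urlencoded");
        OutputStream out = conn.getOutputStream();
        out.write(body.getBytes("UTF-8"));
        out.close();
        System.out.println("HTTP response code: " + conn.getResponseCode());
    }
}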

@@ -10,6 +10,7 @@ Roland Ramthun
Alexander Schier (Allo)
Matthias Söhnholz
Jan Sandbrink (NN)
Franz Brausse (FB, karlchenofhell)
Designers:
=========
@@ -21,6 +22,7 @@ Packagers:
slick
Alexander Schier
Oliver Wunder (daburna)
Franz Brausse
Translators:
============

@@ -0,0 +1,70 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>YaCy '#[clientname]#': URL Fetcher Stack Management</title>
#%env/templates/metas.template%#
</head>
<body id="CrawlURLFetchStack_p">
#%env/templates/header.template%#
#%env/templates/submenuCrawlURLFetch.template%#
<h2>Manage stack for remote URL fetches</h2>
#(addedUrls)#::<span class="success">Added #[added]# URLs!</span>#(/addedUrls)#
<form method="post" action="CrawlURLFetchStack_p.html" enctype="multipart/form-data">
<fieldset><legend>Statistics</legend>
<dl>
<dt>Currently stacked URLs:</dt><dd>#[urlCount]#</dd>
<dt>Total fetched / added URLs:</dt><dd>#[totalFetched]# / #[totalAdded]#</dd>
#{peers}#
<dt>Fetched from #[peer]#</dt><dd>#[amount]#</dd>#{/peers}#
</dl>
</fieldset>
<fieldset><legend>Settings</legend>
<dl>
<dt><label for="maxSize">Maximum URLs for each transfer</label>:</dt>
<dd>
<input type="text" name="maxSize" id="maxSize" value="#[maxSize]#" maxlength="3" size="3" />
<input type="submit" name="setMaxSize" value="Set" />
#(set)#::
<span class="success">Set max. size for each transfer to #[value]#</span>::
<span class="error">Setting max. size for each transfer to #[value]# was unsuccessful: may not be negative</span>#(/set)#
</dd>
</dl>
</fieldset>
<fieldset><legend>Add URLs to stack</legend>
<dl>
<dt><label for="shiftloc">Shift URLs from Local Crawler</label>:</dt>
<dd>
<input type="text" name="shiftloc" id="shiftloc" value="#[locurlsVal]#" size="5" maxlength="5" style="text-align: right;" />
of <span class="tt">#[locurls]#</span> URLs
<input type="submit" name="shiftlcq" value="Shift" />
#(shiftloc)#::
<span class="success">Shifted #[value]# URLs from Local Crawler Queue to URL Fetcher Stack</span>::
<span class="error">Shifting URLs from Local Crawler Queue to URL Fetcher Stack was unsuccessful: #[error]#</span>#(/shiftloc)#
</dd>
<dt><label for="shiftrem">Shift URLs from Remote Crawler</label>:</dt>
<dd>
<input type="text" name="shiftrem" id="shiftrem" value="#[remurlsVal]#" size="5" maxlength="5" style="text-align: right;" />
of <span class="tt">#[remurls]#</span> URLs
<input type="submit" name="shiftrcq" value="Shift" />
#(shiftrem)#::
<span class="success">Shifted #[value]# URLs from Remote Crawler Queue to URL Fetcher Stack</span>::
<span class="error">Shifting URLs from Remote Crawler Queue to URL Fetcher Stack was unsuccessful: #[error]#</span>#(/shiftrem)#
</dd>
<dt><label for="upload">Upload URL-List</label>:</dt>
<dd>
<input type="file" name="upload" id="upload" /> #(uploadError)#::&nbsp;<span class="error">No file entered for upload</span>#(/uploadError)#<br />
<input type="radio" name="uploadType" id="plain" value="plain" checked="checked" /> <label for="plain">Plain text, line-seperated</label><br />
<input type="radio" name="uploadType" id="html" value="html" disabled="disabled" /> <label for="html">HTML file, links will be added</label><br />
<input type="submit" name="subupload" value="Upload File" />
#(upload)#::
<span class="success">Added #[added]# and rejected #[failed]# URLs from uploaded file successfully</span>::
<span class="error">An internal error occured processing the uploaded file: #[error]#</span>#(/upload)#
</dd>
</dl>
</fieldset>
</form>
#%env/templates/footer.template%#
</body>
</html>

@@ -0,0 +1,216 @@
// CrawlURLFetchStack_p.java
// -------------------------------------
// part of YACY
//
// (C) 2007 by Franz Brausse
//
// last change: $LastChangedDate: $ by $LastChangedBy: $
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this software or this documentation. The usage of this software
// is at your own risk. The installation and usage (starting/running) of this
// software may allow other people or applications to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follow this copyright notice here, but changes must not be
// done inside the copyright notice above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
import java.io.File;
import java.io.IOException;
import java.io.Writer;
import java.net.MalformedURLException;
import java.util.HashMap;
import java.util.Iterator;
import de.anomic.data.URLFetcherStack;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterWriter;
import de.anomic.http.httpHeader;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverFileUtils;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.server.logging.serverLog;
public class CrawlURLFetchStack_p {
public static final HashMap /* of PeerName, sent URLs */ fetchMap = new HashMap();
private static URLFetcherStack stack = null;
public static int maxURLsPerFetch = 50;
public static URLFetcherStack getURLFetcherStack(serverSwitch env) {
if (stack == null) try {
stack = new URLFetcherStack(env.getConfig(plasmaSwitchboard.DBPATH, plasmaSwitchboard.DBPATH_DEFAULT));
} catch (IOException e) {
serverLog.logSevere("URLFETCHER", "Couldn't initialize URL stack: " + e.getMessage());
}
return stack;
}
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
final serverObjects prop = new serverObjects();
plasmaSwitchboard sb = (plasmaSwitchboard)env;
if (post != null) {
if (post.containsKey("addurls")) {
prop.put("addedUrls", 1);
prop.put("addedUrls_added", addURLs(post, post.getInt("addurls", -1), getURLFetcherStack(env)));
}
else if (post.containsKey("setMaxSize")) {
final int count = post.getInt("maxSize", maxURLsPerFetch);
if (count > 0) {
maxURLsPerFetch = count;
prop.put("set", 1);
prop.put("set_value", maxURLsPerFetch);
} else {
prop.put("set", 2);
prop.put("set_value", count);
}
}
else if (post.containsKey("shiftlcq")) {
int count = Math.min(post.getInt("shiftloc", 0), sb.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE));
try {
shiftFromNotice(sb.noticeURL, plasmaCrawlNURL.STACK_TYPE_CORE, getURLFetcherStack(env), count);
prop.put("shiftloc", 1);
prop.put("shiftloc_value", count);
} catch (IOException e) {
prop.put("shiftloc", 2);
prop.put("shiftloc_error", e.getMessage());
}
}
else if (post.containsKey("shiftrcq")) {
int count = post.getInt("shiftrem", 0);
try {
shiftFromNotice(sb.noticeURL, plasmaCrawlNURL.STACK_TYPE_LIMIT, getURLFetcherStack(env), count);
prop.put("shiftrem", 1);
prop.put("shiftrem_value", count);
} catch (IOException e) {
prop.put("shiftrem", 2);
prop.put("shiftrem_error", e.getMessage());
}
}
else if (post.containsKey("subupload")) {
if (post.get("upload", "").length() == 0) {
prop.put("uploadError", 1);
} else {
final File file = new File(post.get("upload", ""));
final String content = new String((byte[])post.get("upload$file"));
final String type = post.get("uploadType", "");
if (type.equals("plain")) {
prop.put("upload_added", addURLs(content.split("\n"), getURLFetcherStack(env)));
prop.put("upload_failed", 0);
prop.put("upload", 1);
} else if (type.equals("html")) {
try {
final htmlFilterContentScraper scraper = new htmlFilterContentScraper(new URL(file));
final Writer writer = new htmlFilterWriter(null, null, scraper, null, false);
serverFileUtils.write(content, writer);
writer.close();
final Iterator it = ((HashMap)scraper.getAnchors()).keySet().iterator();
int added = 0, failed = 0;
String url;
while (it.hasNext()) try {
url = (String)it.next();
getURLFetcherStack(env).push(new URL(url));
added++;
} catch (MalformedURLException e) { failed++; }
prop.put("upload", 1);
prop.put("upload_added", added);
prop.put("upload_failed", failed);
} catch (Exception e) {
e.printStackTrace();
prop.put("upload", 2);
prop.put("upload_error", e.getMessage());
}
}
}
}
}
putFetched(prop);
prop.put("urlCount", getURLFetcherStack(env).size());
prop.put("totalFetched", getURLFetcherStack(env).getPopped());
prop.put("totalAdded", getURLFetcherStack(env).getPushed());
prop.put("maxSize", maxURLsPerFetch);
prop.put("locurls", sb.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE));
prop.put("remurls", sb.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT));
prop.put("locurlsVal", Math.min(sb.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE), 500));
prop.put("remurlsVal", Math.min(sb.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT), 500));
return prop;
}
private static void putFetched(serverObjects prop) {
Iterator it = fetchMap.keySet().iterator();
int count = 0;
while (it.hasNext()) {
String key = (String)it.next();
prop.put("peers_" + count + "_peer", key);
prop.put("peers_" + count + "_amount", ((Integer)fetchMap.get(key)).intValue());
count++;
}
prop.put("peers", count);
}
private static int addURLs(String[] urls, URLFetcherStack stack) {
int count = 0; // count successfully stacked URLs
for (int i=0; i<urls.length; i++) try {
if (urls[i].length() == 0) continue;
stack.push(new URL(urls[i]));
count++;
} catch (MalformedURLException e) { /* ignore this */ }
return count;
}
private static void shiftFromNotice(plasmaCrawlNURL nurl, int fromStackType, URLFetcherStack stack, int count) throws IOException {
plasmaCrawlNURL.Entry entry;
for (int i=0; i<count; i++) {
entry = nurl.pop(fromStackType);
stack.push(entry.url());
}
}
private static int addURLs(serverObjects post, int amount, URLFetcherStack stack) {
int count = 0;
String url;
for (int i=0; i<amount; i++) {
url = post.get("url" + count++, null);
if (url == null || url.length() == 0) continue;
try {
stack.push(new URL(url));
count++;
} catch (MalformedURLException e) {
serverLog.logInfo("URLFETCHER", "retrieved invalid url for adding to the stack: " + url);
}
}
return count;
}
}

@@ -1,11 +1,12 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>YaCy '#[clientname]#': Local Cache Management</title>
<title>YaCy '#[clientname]#': URL Fetcher Management</title>
#%env/templates/metas.template%#
</head>
<body id="CrawlURLFetch_p">
#%env/templates/header.template%#
#%env/templates/submenuCrawlURLFetch.template%#
<h2>URL-Fetcher</h2>
<form method="post" action="CrawlURLFetch_p.html" enctype="multipart/form-data">
<fieldset><legend>Fetch new URLs to crawl</legend>
@@ -38,6 +39,7 @@
<option value="random" selected="selected">Choose a random peer</option>#{peers}#
<option value="#[hash]#">#[name]#</option>#{/peers}#
</select>
<input type="submit" name="checkPeerURLCount" value="Check URL count" />
&nbsp;<label for="amount">Amount of URLs to request</label>:
<input type="text" name="amount" id="amount" value="50" maxlength="3" size="3" />
#(peerError)#::
@@ -53,9 +55,9 @@
<label for="frequency">every</label>
&nbsp;<input type="text" name="frequency" id="frequency" size="2" style="text-align: right;" maxlength="2"/>
<select name="freq_type">
<option value="weeks">Weeks</option>
<option value="days" selected="selected">Days</option>
<option value="hours">Hours</option>
<option value="days">Days</option>
<option value="hours" selected="selected">Hours</option>
<option value="minutes">Minutes</option>
</select>
#(freqError)#::&nbsp;<span class="error">Invalid period, fetching only once</span>#(/freqError)#
</dd>

@@ -1,4 +1,43 @@
// CrawlURLFetch_p.java
// -------------------------------------
// part of YACY
//
// (C) 2007 by Franz Brausse
//
// last change: $LastChangedDate: $ by $LastChangedBy: $
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this softare or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follow this copyright notice here, but changes must not be
// done inside the copyright notice above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
import java.io.IOException;
import java.net.MalformedURLException;
@@ -17,6 +56,7 @@ import de.anomic.plasma.plasmaCrawlProfile;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverSwitch;
import de.anomic.http.httpHeader;
import de.anomic.http.httpRemoteProxyConfig;
import de.anomic.http.httpc;
import de.anomic.server.serverObjects;
import de.anomic.server.logging.serverLog;
@@ -41,16 +81,11 @@ public class CrawlURLFetch_p {
private static plasmaCrawlProfile.entry profile = null;
private static ArrayList savedURLs = new ArrayList();
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
serverObjects prop = new serverObjects();
prop.put("host", "");
listURLs(prop); // List previously saved URLs for easy selection
listPeers(prop); // List known hosts
public static plasmaCrawlProfile.entry getCrawlProfile(serverSwitch env) {
if (profile == null) {
profile = ((plasmaSwitchboard)env).profiles.newEntry(
"URLFetcher", // Name
null, // URL
"", // URL
".*", ".*", // General / specific filter
0, 0, // General / specific depth
-1, -1, -1, // Recrawl / Dom-filter depth / Dom-max-pages
@@ -60,6 +95,20 @@ public class CrawlURLFetch_p {
false, // Remote indexing
true, false, false); // Exclude static / dynamic / parent stopwords
}
return profile;
}
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
serverObjects prop = new serverObjects();
prop.put("host", "");
// List previously saved URLs for easy selection
listURLs(prop);
// List known hosts
listPeers(prop,
post != null && post.containsKey("checkPeerURLCount"),
((plasmaSwitchboard)env).remoteProxyConfig);
if (post != null) {
if (post.containsKey("start")) {
@@ -82,59 +131,65 @@ public class CrawlURLFetch_p {
if (fetcher != null) fetcher.interrupt();
fetcher = null;
if (post.get("source", "").equals("peer") &&
post.get("peerhash", "").equals("random")) {
fetcher = new URLFetcher(
env,
profile,
count,
frequency);
} else {
URL url = null;
if (post.get("source", "").equals("url")) {
try {
url = new URL(post.get("host", null));
if (!savedURLs.contains(url.toNormalform()))
savedURLs.add(url.toNormalform());
prop.put("host", post.get("host", url.toString()));
} catch (MalformedURLException e) {
prop.put("host", post.get("host", ""));
prop.put("hostError", ERR_HOST_MALFORMED_URL);
}
} else if (post.get("source", "").equals("savedURL")) {
try {
url = new URL(post.get("saved", ""));
} catch (MalformedURLException e) {
/* should never appear, except for invalid input, see above */
}
} else if (post.get("source", "").equals("peer")) {
yacySeed ys = null;
try {
ys = yacyCore.seedDB.get(post.get("peerhash", ""));
try {
if (post.get("source", "").equals("peer") &&
post.get("peerhash", "").equals("random")) {
fetcher = new URLFetcher(
env,
getCrawlProfile(env),
count,
frequency);
} else {
URL url = null;
if (post.get("source", "").equals("url")) {
try {
url = new URL(post.get("host", null));
if (!savedURLs.contains(url.toNormalform()))
savedURLs.add(url.toNormalform());
prop.put("host", post.get("host", url.toString()));
} catch (MalformedURLException e) {
prop.put("host", post.get("host", ""));
prop.put("hostError", ERR_HOST_MALFORMED_URL);
}
} else if (post.get("source", "").equals("savedURL")) {
try {
url = new URL(post.get("saved", ""));
} catch (MalformedURLException e) {
/* should never appear, except for invalid input, see above */
}
} else if (post.get("source", "").equals("peer")) {
yacySeed ys = null;
ys = yacyCore.seedDB.get(post.get("peerhash", null));
if (ys != null) {
url = new URL("http://" + ys.getAddress() + URLFetcher.LIST_SERVLET);
if ((url = URLFetcher.getListServletURL(
ys.getAddress(),
URLFetcher.MODE_LIST,
count,
yacyCore.seedDB.mySeed.hash)) == null) {
prop.put("peerError", ERR_PEER_GENERAL_CONN);
prop.put("peerError_hash", post.get("peerhash", ""));
prop.put("peerError_name", ys.getName());
}
} else {
prop.put("peerError", ERR_PEER_OFFLINE);
prop.put("peerError_hash", post.get("peerhash", ""));
}
} catch (MalformedURLException e) {
prop.put("peerError", ERR_PEER_GENERAL_CONN);
prop.put("peerError_hash", post.get("peerhash", ""));
prop.put("peerError_name", ys.getName());
}
if (url != null) {
fetcher = new URLFetcher(
env,
getCrawlProfile(env),
url,
count,
frequency);
}
}
if (url != null) {
fetcher = new URLFetcher(
env,
profile,
url,
count,
frequency);
}
if (fetcher != null)
fetcher.start();
} catch (IOException e) {
e.printStackTrace();
}
if (fetcher != null)
fetcher.start();
}
else if (post.containsKey("stop")) {
if (fetcher != null) {
@@ -145,22 +200,26 @@
}
else if (post.containsKey("restart")) {
if (fetcher != null) {
fetcher.interrupt();
if (fetcher.url == null) {
fetcher = new URLFetcher(
env,
profile,
fetcher.count,
fetcher.delay);
} else {
fetcher = new URLFetcher(
env,
profile,
fetcher.url,
fetcher.count,
fetcher.delay);
try {
fetcher.interrupt();
if (fetcher.url == null) {
fetcher = new URLFetcher(
env,
getCrawlProfile(env),
fetcher.count,
fetcher.delay);
} else {
fetcher = new URLFetcher(
env,
getCrawlProfile(env),
fetcher.url,
fetcher.count,
fetcher.delay);
}
fetcher.start();
} catch (IOException e) {
e.printStackTrace();
}
fetcher.start();
} else {
prop.put("threadError", ERR_THREAD_RESUME);
}
@@ -204,7 +263,7 @@ public class CrawlURLFetch_p {
return savedURLs.size();
}
private static int listPeers(serverObjects prop) {
private static int listPeers(serverObjects prop, boolean checkURLCount, httpRemoteProxyConfig theRemoteProxyConfig) {
int peerCount = 0;
if (yacyCore.seedDB != null && yacyCore.seedDB.sizeConnected() > 0) {
prop.put("peersKnown", 1);
@@ -213,14 +272,15 @@
final Enumeration e = yacyCore.seedDB.seedsConnected(true, false, null, yacyVersion.YACY_PROVIDES_CRAWLS_VIA_LIST_HTML);
while (e.hasMoreElements()) {
yacySeed seed = (yacySeed) e.nextElement();
if (seed != null) hostList.put(seed.get(yacySeed.NAME, "nameless"),seed.hash);
if (seed != null && (!checkURLCount || getURLs2Fetch(seed, theRemoteProxyConfig) > 0))
hostList.put(seed.get(yacySeed.NAME, "nameless"), seed.hash);
}
String peername;
while ((peername = (String) hostList.firstKey()) != null) {
final String Hash = (String) hostList.get(peername);
if (Hash.equals(yacyCore.seedDB.mySeed.hash)) continue;
prop.put("peersKnown_peers_" + peerCount + "_hash", Hash);
final String hash = (String) hostList.get(peername);
if (hash.equals(yacyCore.seedDB.mySeed.hash)) continue;
prop.put("peersKnown_peers_" + peerCount + "_hash", hash);
prop.put("peersKnown_peers_" + peerCount + "_name", peername);
hostList.remove(peername);
peerCount++;
@@ -233,15 +293,37 @@
return peerCount;
}
private static int getURLs2Fetch(yacySeed seed, httpRemoteProxyConfig theRemoteProxyConfig) {
try {
String answer = new String(httpc.wget(
URLFetcher.getListServletURL(seed.getAddress(), URLFetcher.MODE_COUNT, 0, null),
seed.getIP(),
5000,
null, null,
theRemoteProxyConfig));
if (answer.matches("\\d+"))
return Integer.parseInt(answer);
else {
System.err.println("RETRIEVED INVALID ANSWER FROM " + seed.getName() + ": '" + answer + "'");
return -1;
}
} catch (MalformedURLException e) {
/* should not happen */
return -3;
} catch (IOException e) {
return -2;
}
}
private static long getDate(String count, String type) {
long r = 0;
if (count != null && count.matches("\\d+")) r = Long.parseLong(count);
if (r < 1) return -1;
r *= 3600000;
if (type.equals("weeks")) return r * 24 * 7;
else if (type.equals("days")) return r * 24;
else if (type.equals("hours")) return r;
r *= 60000;
if (type.equals("days")) return r * 60 * 24;
else if (type.equals("hours")) return r * 60;
else if (type.equals("minutes")) return r;
else return -1;
}
@@ -250,7 +332,8 @@ public class CrawlURLFetch_p {
public static final long DELAY_ONCE = -1;
public static final long DELAY_SELF_DET = 0;
private static final String LIST_SERVLET = "/yacy/list.html?list=queueUrls";
public static final int MODE_LIST = 0;
public static final int MODE_COUNT = 1;
public static int totalRuns = 0;
public static int totalFetchedURLs = 0;
@@ -271,12 +354,35 @@
public boolean paused = false;
public static URL getListServletURL(String host, int mode, int count, String peerHash) {
String r = "http://" + host + "/yacy/list.html?list=queueUrls&display=";
switch (mode) {
case MODE_LIST: r += "list"; break;
case MODE_COUNT: r += "count"; break;
}
if (count > 0) r += "&count=" + count;
if (peerHash != null && peerHash.length() > 0) {
r += "&iam=" + peerHash;
} else if (mode == MODE_LIST) {
r += "&iam=" + yacyCore.seedDB.mySeed.hash;
}
try {
return new URL(r);
} catch (MalformedURLException e) {
return null;
}
}
public URLFetcher(
serverSwitch env,
plasmaCrawlProfile.entry profile,
URL url,
int count,
long delayMs) {
long delayMs) throws IOException {
if (env == null || profile == null || url == null)
throw new NullPointerException("env, profile or url must not be null");
this.sb = (plasmaSwitchboard)env;
@@ -291,7 +397,7 @@
serverSwitch env,
plasmaCrawlProfile.entry profile,
int count,
long delayMs) {
long delayMs) throws IOException {
if (env == null || profile == null)
throw new NullPointerException("env or profile must not be null");
this.sb = (plasmaSwitchboard)env;
@@ -317,6 +423,7 @@
totalFetchedURLs += stackURLs(getURLs(url));
this.lastRun = System.currentTimeMillis() - start;
totalRuns++;
serverLog.logInfo(this.getName(), "Loaded " + this.lastFetchedURLs + " URLs from " + url + " in " + this.lastRun + " ms into stackcrawler.");
if (this.delay < 0 || isInterrupted()) {
return;
} else synchronized (this) {
@@ -347,9 +454,7 @@
}
if (ys == null) return null;
try {
return new URL("http://" + ys.getAddress() + LIST_SERVLET + "&count=" + this.count);
} catch (MalformedURLException ee) { return null; }
return getListServletURL(ys.getAddress(), MODE_LIST, this.count, yacyCore.seedDB.mySeed.hash);
}
private int stackURLs(String[] urls) throws InterruptedException {
@@ -359,7 +464,6 @@
String reason;
for (int i=0; i<urls.length && !isInterrupted(); i++) {
if (urls[i].trim().length() == 0) continue;
serverLog.logFine(this.getName(), "stacking " + urls[i]);
reason = this.sb.sbStackCrawlThread.stackCrawl(
urls[i],
null,
@@ -369,8 +473,10 @@
this.profile.generalDepth(),
this.profile);
if (reason == null) {
serverLog.logFine(this.getName(), "stacked " + urls[i]);
this.lastFetchedURLs++;
} else {
serverLog.logFine(this.getName(), "error on stacking " + urls[i] + ": " + reason);
this.lastFailed++;
totalFailed++;
this.failed.put(urls[i], reason);

@@ -0,0 +1,7 @@
<div class="SubMenu">
<h3>URL Fetcher Menu</h3>
<ul class="SubMenu">
<li><a href="/CrawlURLFetch_p.html" class="MenuItemLink lock">URL Fetcher</a></li>
<li><a href="/CrawlURLFetchStack_p.html" class="MenuItemLink lock">URL Stack</a></li>
</ul>
</div>

@@ -48,18 +48,21 @@
// javac -classpath .:../../classes list.java
// if the shell's current path is HTROOT
// contains contributions by [FB] to support listing URLs for URL Fetcher
import java.io.File;
import java.io.IOException;
import de.anomic.data.URLFetcherStack;
import de.anomic.data.listManager;
import de.anomic.data.wikiCode;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.net.URL;
import de.anomic.server.serverCore;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacySeed;
public final class list {
@@ -72,6 +75,7 @@ public final class list {
final String col = post.get("col", "");
final File listsPath = new File(ss.getRootPath(),ss.getConfig("listsPath", "DATA/LISTS"));
final yacySeed otherPeer = yacyCore.seedDB.get(post.get("iam", null));
final String otherPeerName = (otherPeer == null) ? "unknown" : otherPeer.get(yacySeed.NAME, "unknown"); // avoid NPE when the requesting peer is not in the seed DB
if (col.equals("black")) {
final StringBuffer out = new StringBuffer();
@@ -89,27 +93,39 @@
} // if filenamesarray.length > 0
prop.put("list",out);
} else if (col.length() == 0 && post.get("list", "").equals("queueUrls")) {
// list urls from remote crawler queue for other peers
int count = 50;
if (post.get("count", "").length() > 0 && post.get("count", "").matches("\\d+"))
count = Integer.parseInt(post.get("count", ""));
final StringBuffer sb = new StringBuffer();
plasmaCrawlNURL.Entry entry;
for (int i=0; i<count && count - i<((plasmaSwitchboard)ss).noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT); i++) {
try {
entry = ((plasmaSwitchboard)ss).noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_LIMIT);
sb.append(wikiCode.deReplaceHTMLEntities(entry.url().toNormalform())).append("\n");
} catch (IOException e) {
serverLog.logSevere("/yacy/list.html", "CANNOT FETCH ENTRY " + i + "/" + count + ": " + e.getMessage());
}
// start contrib by [FB]
else if (col.length() == 0 && post.get("list", "").equals("queueUrls")) {
final URLFetcherStack db = CrawlURLFetchStack_p.getURLFetcherStack(ss);
final String display = post.get("display", "list");
if (display.equals("list")) {
// list urls from remote crawler queue for other peers
final int count = Math.min(post.getInt("count", 50), CrawlURLFetchStack_p.maxURLsPerFetch);
if (count > 0 && db.size() > 0) {
final StringBuffer sb = new StringBuffer();
URL url;
int cnt = 0;
for (int i=0; i<count; i++) {
if ((url = db.pop()) == null) continue;
sb.append(wikiCode.deReplaceHTMLEntities(url.toNormalform())).append("\n");
cnt++;
}
prop.put("list", sb);
CrawlURLFetchStack_p.fetchMap.put(otherPeerName, new Integer(cnt));
serverLog.logInfo("URLFETCHER", "sent " + cnt + " URLs to peer " + otherPeerName);
} else {
prop.put("list", "");
serverLog.logInfo("URLFETCHER", "couldn't satisfy URL request of " + otherPeerName + ": stack is empty");
}
} else if (display.equals("count")) {
prop.put("list", db.size());
}
prop.put("list", sb);
// end contrib by [FB]
} else {
prop.putASIS("list","");
}
return prop;
}
}
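
The hunk above wires list.java into the fetcher protocol: a peer requests /yacy/list.html?list=queueUrls&display=count to learn how many URLs are stacked (a plain decimal answer, as getURLs2Fetch above expects), or display=list to pop up to maxURLsPerFetch URLs, one per line. A minimal client-side sketch under stated assumptions; "peer.example.org:8080" is a placeholder address, and YaCy itself goes through the httpc wrapper with its proxy configuration instead of plain JDK networking.

// Hedged sketch: querying another peer's fetchable-URL count via the list servlet.
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;

public class ListServletCountExample {
    public static void main(String[] args) throws Exception {
        URL countUrl = new URL("http://peer.example.org:8080/yacy/list.html?list=queueUrls&display=count");
        BufferedReader in = new BufferedReader(new InputStreamReader(countUrl.openStream()));
        String answer = in.readLine();   // the servlet answers with a plain number
        in.close();
        System.out.println("URLs available for fetching: " + answer);
    }
}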

@@ -0,0 +1,138 @@
// URLFetcherStack.java
// -------------------------------------
// part of YACY
//
// (C) 2007 by Franz Brausse
//
// last change: $LastChangedDate: $ by $LastChangedBy: $
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this software or this documentation. The usage of this software
// is at your own risk. The installation and usage (starting/running) of this
// software may allow other people or applications to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follow this copyright notice here, but changes must not be
// done inside the copyright notice above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
package de.anomic.data;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Iterator;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroStack;
import de.anomic.net.URL;
import de.anomic.server.logging.serverLog;
public class URLFetcherStack {
public static final String DBFILE = "urlRemote2.stack";
private static final kelondroRow rowdef = new kelondroRow(
"String urlstring-256",
kelondroBase64Order.enhancedCoder,
0
);
private final kelondroStack db;
private final serverLog log;
private int popped = 0;
private int pushed = 0;
public URLFetcherStack(String path) throws IOException {
this.db = new kelondroStack(
new File(path + File.separator + DBFILE),
rowdef);
this.log = new serverLog("URLFETCHERSTACK");
}
public int getPopped() { return this.popped; }
public int getPushed() { return this.pushed; }
public void clearStat() { this.popped = 0; this.pushed = 0; }
public void finalize() throws Throwable {
this.db.close();
}
public boolean push(URL url) {
try {
this.db.push(this.db.row().newEntry(
new byte[][] { url.toNormalform().getBytes() }
));
this.pushed++;
return true;
} catch (IOException e) {
this.log.logSevere("error storing entry", e);
return false;
}
}
public URL pop() {
try {
kelondroRow.Entry r = this.db.pop();
if (r == null) return null;
final String url = r.getColString(0, null);
try {
this.popped++;
return new URL(url);
} catch (MalformedURLException e) {
this.log.logSevere("found invalid URL-entry: " + url);
return null;
}
} catch (IOException e) {
this.log.logSevere("error retrieving entry", e);
return null;
}
}
public String[] top(int count) {
try {
final ArrayList ar = new ArrayList();
Iterator it = db.contentRows(500);
kelondroRow.EntryIndex ei;
for (int i=0; i<count && it.hasNext(); i++) {
ei = (kelondroRow.EntryIndex)it.next();
if (ei == null) continue;
ar.add(ei.getColString(0, null));
}
return (String[])ar.toArray(new String[ar.size()]);
} catch (kelondroException e) {
this.log.logSevere("error retrieving entry", e);
return null;
}
}
public int size() {
return this.db.size();
}
}
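
A short usage sketch for the URLFetcherStack class above, assuming the YaCy classes are on the classpath and that the given directory exists and is writable ("DATA/PLASMADB" is an assumed path; the servlet passes the configured DB path instead):

// Hedged usage sketch for URLFetcherStack.
import de.anomic.data.URLFetcherStack;
import de.anomic.net.URL;

public class URLFetcherStackExample {
    public static void main(String[] args) throws Exception {
        URLFetcherStack stack = new URLFetcherStack("DATA/PLASMADB");
        stack.push(new URL("http://www.example.org/")); // returns false on I/O error
        System.out.println("stacked URLs: " + stack.size());
        URL next = stack.pop();                         // null when the stack is empty
        System.out.println("popped: " + next + ", total popped so far: " + stack.getPopped());
    }
}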

@@ -258,27 +258,28 @@ public class plasmaCrawlProfile {
boolean storeHTCache, boolean storeTXCache,
boolean remoteIndexing,
boolean xsstopw, boolean xdstopw, boolean xpstopw) {
if (name == null || name.length() == 0) throw new NullPointerException("name must not be null");
String handle = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(Long.toString(System.currentTimeMillis()))).substring(0, crawlProfileHandleLength);
mem = new HashMap();
mem.put("handle", handle);
mem.put("name", name);
mem.put("startURL", startURL);
mem.put("generalFilter", generalFilter);
mem.put("specificFilter", specificFilter);
mem.put("generalDepth", Integer.toString(generalDepth));
mem.put("specificDepth", Integer.toString(specificDepth));
mem.put("recrawlIfOlder", Integer.toString(recrawlIfOlder));
mem.put("domFilterDepth", Integer.toString(domFilterDepth));
mem.put("domMaxPages", Integer.toString(domMaxPages));
mem.put("crawlingQ", (crawlingQ) ? "true" : "false"); // crawling of urls with '?'
mem.put("indexText", (indexText) ? "true" : "false");
mem.put("indexMedia", (indexMedia) ? "true" : "false");
mem.put("storeHTCache", (storeHTCache) ? "true" : "false");
mem.put("storeTXCache", (storeTXCache) ? "true" : "false");
mem.put("remoteIndexing", (remoteIndexing) ? "true" : "false");
mem.put("xsstopw", (xsstopw) ? "true" : "false"); // exclude static stop-words
mem.put("xdstopw", (xdstopw) ? "true" : "false"); // exclude dynamic stop-word
mem.put("xpstopw", (xpstopw) ? "true" : "false"); // exclude parent stop-words
mem.put("handle", handle);
mem.put("name", name);
mem.put("startURL", (startURL == null) ? "" : startURL);
mem.put("generalFilter", (generalFilter == null) ? ".*" : generalFilter);
mem.put("specificFilter", (specificFilter == null) ? ".*" : specificFilter);
mem.put("generalDepth", Integer.toString(generalDepth));
mem.put("specificDepth", Integer.toString(specificDepth));
mem.put("recrawlIfOlder", Integer.toString(recrawlIfOlder));
mem.put("domFilterDepth", Integer.toString(domFilterDepth));
mem.put("domMaxPages", Integer.toString(domMaxPages));
mem.put("crawlingQ", (crawlingQ) ? "true" : "false"); // crawling of urls with '?'
mem.put("indexText", (indexText) ? "true" : "false");
mem.put("indexMedia", (indexMedia) ? "true" : "false");
mem.put("storeHTCache", (storeHTCache) ? "true" : "false");
mem.put("storeTXCache", (storeTXCache) ? "true" : "false");
mem.put("remoteIndexing", (remoteIndexing) ? "true" : "false");
mem.put("xsstopw", (xsstopw) ? "true" : "false"); // exclude static stop-words
mem.put("xdstopw", (xdstopw) ? "true" : "false"); // exclude dynamic stop-word
mem.put("xpstopw", (xpstopw) ? "true" : "false"); // exclude parent stop-words
doms = new HashMap();
}

@@ -93,7 +93,7 @@
CLASSPATH=""
for N in lib/*.jar; do CLASSPATH="$CLASSPATH$N:"; done
for N in libx/*.jar; do CLASSPATH="$CLASSPATH$N:"; done
CLASSPATH="classes:.:$CLASSPATH"
CLASSPATH="classes:.:htroot:$CLASSPATH"
cmdline="";
