@@ -27,23 +27,25 @@
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.Writer;
import java.net.MalformedURLException;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.document.Document;
import net.yacy.document.Parser.Failure;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.TransformerWriter;
import net.yacy.kelondro.data.meta.DigestURI;
@@ -138,13 +140,6 @@ public class Crawler_p {
if (sb.peers == null) {
prop.put("info", "3");
} else {
String crawlingStart = post.get("crawlingURL", "").trim(); // the crawljob start url
// add the prefix http:// if necessary
int pos = crawlingStart.indexOf("://", 0);
if (pos == -1) {
if (crawlingStart.startsWith("www")) crawlingStart = "http://" + crawlingStart;
if (crawlingStart.startsWith("ftp")) crawlingStart = "ftp://" + crawlingStart;
}
// remove crawlingFileContent before we record the call
String crawlingFileName = post.get("crawlingFile");
@@ -159,19 +154,48 @@ public class Crawler_p {
post.remove("crawlingFile$file");
}
// normalize URL
DigestURI crawlingStartURL = null;
if (crawlingFile == null) try { crawlingStartURL = new DigestURI(crawlingStart); } catch (final MalformedURLException e1) { Log.logException(e1); }
crawlingStart = (crawlingStartURL == null) ? null : crawlingStartURL.toNormalform(true, true);
// set new properties
// prepare some filter that are adjusted in case that this is wanted
boolean storeHTCache = "on".equals(post.get("storeHTCache", "on"));
String newcrawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL_STRING);
String newcrawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL_STRING; // avoid that all urls are filtered out if bad value was submitted
final boolean fullDomain = "domain".equals(post.get("range", "wide")); // special property in simple crawl start
final boolean subPath = "subpath".equals(post.get("range", "wide")); // special property in simple crawl start
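// the crawl start field may now carry more than one root URL: entries are separated
// by newlines or by the '|' character, and each entry becomes its own crawl root below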
String crawlingStart0 = post.get("crawlingURL", "").trim(); // the crawljob start url
String[] rootURLs0 = crawlingStart0.indexOf('\n') > 0 || crawlingStart0.indexOf('\r') > 0 ? crawlingStart0.split("[\\r\\n]+") : crawlingStart0.split(Pattern.quote("|"));
Set<DigestURI> rootURLs = new HashSet<DigestURI>();
String crawlName = "";
if (crawlingFile == null) for (String crawlingStart: rootURLs0) {
if (crawlingStart == null || crawlingStart.length() == 0) continue;
// add the prefix http:// if necessary
int pos = crawlingStart.indexOf("://", 0);
if (pos == -1) {
if (crawlingStart.startsWith("www")) crawlingStart = "http://" + crawlingStart;
if (crawlingStart.startsWith("ftp")) crawlingStart = "ftp://" + crawlingStart;
}
try {
DigestURI crawlingStartURL = new DigestURI(crawlingStart);
rootURLs.add(crawlingStartURL);
crawlName += crawlingStartURL.getHost() + "_";
if (fullDomain) {
newcrawlingMustMatch = CrawlProfile.mustMatchFilterFullDomain(crawlingStartURL);
if (subPath) newcrawlingMustMatch = newcrawlingMustMatch.substring(0, newcrawlingMustMatch.length() - 2) + crawlingStartURL.getPath() + ".*";
}
if (crawlingStart != null && subPath && (pos = crawlingStart.lastIndexOf('/')) > 0) {
newcrawlingMustMatch = crawlingStart.substring(0, pos + 1) + ".*";
}
if (crawlingStartURL != null && (crawlingStartURL.isFile() || crawlingStartURL.isSMB())) storeHTCache = false;
} catch (MalformedURLException e) {
Log.logException(e);
}
}
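// the crawl name is built from the concatenated root hosts and capped at 80 characters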
if (crawlName.length() > 80) crawlName = crawlName.substring(0, 80);
if (crawlName.endsWith("_")) crawlName = crawlName.substring(0, crawlName.length() - 1);
// set the crawl filter
String newcrawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL_STRING);
final String newcrawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL_STRING; // avoid that all urls are filtered out if bad value was submitted
String ipMustMatch = post.get("ipMustmatch", CrawlProfile.MATCH_ALL_STRING);
final String ipMustNotMatch = post.get("ipMustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
if (ipMustMatch.length() < 2) ipMustMatch = CrawlProfile.MATCH_ALL_STRING;
@@ -180,15 +204,6 @@ public class Crawler_p {
sb.setConfig("crawlingIPMustNotMatch", ipMustNotMatch);
if (countryMustMatch.length() > 0) sb.setConfig("crawlingCountryMustMatch", countryMustMatch);
// special cases:
if (crawlingStartURL != null && fullDomain) {
newcrawlingMustMatch = CrawlProfile.mustMatchFilterFullDomain(crawlingStartURL);
if (subPath) newcrawlingMustMatch = newcrawlingMustMatch.substring(0, newcrawlingMustMatch.length() - 2) + crawlingStartURL.getPath() + ".*";
}
if (crawlingStart != null && subPath && (pos = crawlingStart.lastIndexOf('/')) > 0) {
newcrawlingMustMatch = crawlingStart.substring(0, pos + 1) + ".*";
}
final boolean crawlOrder = post.get("crawlOrder", "off").equals("on");
env.setConfig("crawlOrder", crawlOrder);
@@ -196,7 +211,7 @@ public class Crawler_p {
env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth));
if ((crawlOrder) && (newcrawlingdepth > 8)) newcrawlingdepth = 8;
final boolean directDocByURL = "on".equals(post.get("directDocByURL", "on")); // catch also all linked media documents without loading them
boolean directDocByURL = "on".equals(post.get("directDocByURL", "on")); // catch also all linked media documents without loading them
env.setConfig("crawlingDirectDocByURL", directDocByURL);
final String collection = post.get("collection", sb.getConfig("collection", "user"));
@@ -228,17 +243,17 @@ public class Crawler_p {
// store this call as api call
if (repeat_time > 0) {
// store as scheduled api call
sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + ((crawlingStart == null) ? post.get("crawlingFile", "") : crawlingStart), repeat_time, repeat_unit.substring(3));
sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + ((rootURLs.size() == 0) ? post.get("crawlingFile", "") : rootURLs.iterator().next().toNormalform(true, false)), repeat_time, repeat_unit.substring(3));
} else {
// store just a protocol
sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + ((crawlingStart == null) ? post.get("crawlingFile", "") : crawlingStart));
sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + ((rootURLs.size() == 0) ? post.get("crawlingFile", "") : rootURLs.iterator().next().toNormalform(true, false)));
}
final boolean crawlingDomMaxCheck = "on".equals(post.get("crawlingDomMaxCheck", "off"));
final int crawlingDomMaxPages = (crawlingDomMaxCheck) ? post.getInt("crawlingDomMaxPages", -1) : -1;
env.setConfig("crawlingDomMaxPages", Integer.toString(crawlingDomMaxPages));
final boolean crawlingQ = "on".equals(post.get("crawlingQ", "off"));
boolean crawlingQ = "on".equals(post.get("crawlingQ", "off"));
env.setConfig("crawlingQ", crawlingQ);
final boolean indexText = "on".equals(post.get("indexText", "on"));
@@ -247,8 +262,6 @@ public class Crawler_p {
final boolean indexMedia = "on".equals(post.get("indexMedia", "on"));
env.setConfig("indexMedia", indexMedia);
boolean storeHTCache = "on".equals(post.get("storeHTCache", "on"));
if (crawlingStartURL != null && (crawlingStartURL.isFile() || crawlingStartURL.isSMB())) storeHTCache = false;
env.setConfig("storeHTCache", storeHTCache);
CacheStrategy cachePolicy = CacheStrategy.parse(post.get("cachePolicy", "iffresh"));
@@ -263,81 +276,72 @@ public class Crawler_p {
final boolean xpstopw = "on".equals(post.get("xpstopw", "off"));
env.setConfig("xpstopw", xpstopw);
final String crawlingMode = post.get("crawlingMode", "url");
if (crawlingStart != null && crawlingStart.startsWith("ftp")) {
String crawlingMode = post.get("crawlingMode", "url");
if ("file".equals(crawlingMode) && post.containsKey("crawlingFile")) {
newcrawlingMustNotMatch = CrawlProfile.MATCH_NEVER_STRING;
directDocByURL = false;
}
if ("sitemap".equals(crawlingMode)) {
newcrawlingMustMatch = CrawlProfile.MATCH_ALL_STRING;
newcrawlingMustNotMatch = CrawlProfile.MATCH_NEVER_STRING;
newcrawlingdepth = 0;
directDocByURL = false;
crawlingQ = true;
}
if ("sitelist".equals(crawlingMode)) {
newcrawlingMustNotMatch = CrawlProfile.MATCH_NEVER_STRING;
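// in sitelist mode the given root URLs are link lists: each list document is downloaded,
// its anchors replace the root URL set, and the crawl then continues in plain "url" mode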
Set<DigestURI> newRootURLs = new HashSet<DigestURI>();
for (DigestURI sitelistURL: rootURLs) {
// download document
Document scraper;
try {
scraper = sb.loader.loadDocument(sitelistURL, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay);
// get links and generate filter
for (MultiProtocolURI u: scraper.getAnchors().keySet()) {
newRootURLs.add(new DigestURI(u));
}
} catch (IOException e) {
Log.logException(e);
}
}
rootURLs = newRootURLs;
crawlingMode = "url";
if ((fullDomain || subPath) && newcrawlingdepth > 0) newcrawlingMustMatch = CrawlProfile.MATCH_ALL_STRING; // to prevent that there is a restriction on the original urls
}
// compute mustmatch filter according to rootURLs
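// if the user supplied an own mustmatch pattern, it is combined with the generated site/subpath
// filter as "(?=(<user>))(<site>)": the lookahead requires the user pattern and the second group
// requires the site filter, so a URL is accepted only if it satisfies both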
if ((fullDomain || subPath) && newcrawlingdepth > 0) {
String siteFilter = ".*";
if (fullDomain) {
siteFilter = siteFilter(rootURLs);
} else if (subPath) {
siteFilter = subpathFilter(rootURLs);
}
newcrawlingMustMatch = CrawlProfile.MATCH_ALL_STRING.equals(newcrawlingMustMatch) ? siteFilter : "(?=(" + newcrawlingMustMatch + "))(" + siteFilter + ")";
}
// check if the crawl filter works correctly
try {
Pattern.compile(newcrawlingMustMatch);
final CrawlProfile profile = new CrawlProfile(
crawlingStart,
crawlingStartURL,
newcrawlingMustMatch,
newcrawlingMustNotMatch,
ipMustMatch,
ipMustNotMatch,
countryMustMatch,
newcrawlingdepth,
directDocByURL,
crawlingIfOlder,
crawlingDomMaxPages,
crawlingQ,
indexText,
indexMedia,
storeHTCache,
crawlOrder,
xsstopw,
xdstopw,
xpstopw,
cachePolicy,
collection);
sb.crawler.putActive(profile.handle().getBytes(), profile);
sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
final DigestURI url = crawlingStartURL;
sb.crawlStacker.enqueueEntriesFTP(sb.peers.mySeed().hash.getBytes(), profile.handle(), url.getHost(), url.getPort(), false);
} catch (final PatternSyntaxException e) {
prop.put("info", "4"); // crawlfilter does not match url
prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
prop.putHTML("info_error", e.getMessage());
} catch (final Exception e) {
// mist
prop.put("info", "7"); // Error with file
prop.putHTML("info_crawlingStart", crawlingStart);
prop.putHTML("info_error", e.getMessage());
Log.logException(e);
}
sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
} else if ("url".equals(crawlingMode)) {
// check if pattern matches
if ((crawlingStart == null || crawlingStartURL == null) /* || (!(crawlingStart.matches(newcrawlingfilter))) */ ) {
// print error message
try {
Pattern.compile(newcrawlingMustNotMatch);
} catch (final PatternSyntaxException e) {
prop.put("info", "4"); // crawlfilter does not match url
prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
prop.putHTML("info_crawlingStart", crawlingStart);
} else try {
// check if the crawl filter works correctly
Pattern.compile(newcrawlingMustMatch);
// stack request
// first delete old entry, if exists
final DigestURI url = new DigestURI(crawlingStart);
final byte[] urlhash = url.hash();
sb.index.fulltext().remove(urlhash);
sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
sb.crawlQueues.errorURL.remove(urlhash);
// get a scraper to get the title
final Document scraper = sb.loader.loadDocument(url, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay);
final String title = scraper == null ? url.toNormalform(true, true) : scraper.dc_title();
final String description = scraper.dc_description();
prop.putHTML("info_newcrawlingfilter", newcrawlingMustNotMatch);
prop.putHTML("info_error", e.getMessage());
}
// stack url
sb.crawler.removePassive(crawlingStartURL.hash()); // if there is an old entry, delete it
final CrawlProfile pe = new CrawlProfile(
(crawlingStartURL.getHost() == null) ? crawlingStartURL.toNormalform(true, false) : crawlingStartURL.getHost(),
crawlingStartURL,
// prepare a new crawling profile
final CrawlProfile profile = new CrawlProfile(
crawlName,
newcrawlingMustMatch,
newcrawlingMustNotMatch,
ipMustMatch,
@@ -348,7 +352,8 @@ public class Crawler_p {
crawlingIfOlder,
crawlingDomMaxPages,
crawlingQ,
indexText, indexMedia,
indexText,
indexMedia,
storeHTCache,
crawlOrder,
xsstopw,
@@ -356,57 +361,33 @@ public class Crawler_p {
xpstopw,
cachePolicy,
collection);
sb.crawler.putActive(pe.handle().getBytes(), pe);
final String reasonString = sb.crawlStacker.stackCrawl(new Request(
sb.peers.mySeed().hash.getBytes(),
url,
null,
"CRAWLING-ROOT",
new Date(),
pe.handle(),
0,
0,
0,
0
));
byte[] handle = ASCII.getBytes(profile.handle());
if (reasonString == null) {
// create a bookmark from crawl start url
//final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString(post.get("bookmarkFolder","/crawlStart")));
final Set<String> tags = ListManager.string2set(BookmarkHelper.cleanTagsString("/crawlStart"));
tags.add("crawlStart");
final String[] keywords = scraper.dc_subject();
if (keywords != null) {
for (final String k: keywords) {
final String kk = BookmarkHelper.cleanTagsString(k);
if (kk.length() > 0) tags.add(kk);
}
}
String tagStr = tags.toString();
if (tagStr.length() > 2 && tagStr.startsWith("[") && tagStr.endsWith("]")) tagStr = tagStr.substring(1, tagStr.length() - 2);
if ("url".equals(crawlingMode)) {
if (rootURLs.size() == 0) {
prop.put("info", "5"); // Crawling failed
prop.putHTML("info_crawlingURL", "(no url given)");
prop.putHTML("info_reasonString", "you must submit at least one crawl url");
} else {
// we will create always a bookmark to use this to track crawled hosts
final BookmarksDB.Bookmark bookmark = sb.bookmarksDB.createBookmark(crawlingStart, "admin");
if (bookmark != null) {
bookmark.setProperty(BookmarksDB.Bookmark.BOOKMARK_TITLE, title);
bookmark.setProperty(BookmarksDB.Bookmark.BOOKMARK_DESCRIPTION, description);
bookmark.setOwner("admin");
bookmark.setPublic(false);
bookmark.setTags(tags, true);
sb.bookmarksDB.saveBookmark(bookmark);
// stack requests
sb.crawler.putActive(handle, profile);
sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
Set<DigestURI> successurls = new HashSet<DigestURI>();
Map<DigestURI, String> failurls = new HashMap<DigestURI, String>();
String failreason;
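// each root URL is stacked individually; successfully stacked URLs are collected in
// successurls, failed ones in failurls together with their per-URL fail reason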
for (DigestURI url: rootURLs) {
if ((failreason = stackUrl(sb, profile, url)) == null) successurls.add(url); else failurls.put(url, failreason);
}
// do the same for ymarks
// TODO: could a non admin user add crawls?
sb.tables.bookmarks.createBookmark(sb.loader, url, YMarkTables.USER_ADMIN, true, "crawlStart", "/Crawl Start");
if (failurls.size() == 0) {
// liftoff!
prop.put("info", "8"); // start msg
prop.put("info", "8");
prop.putHTML("info_crawlingURL", post.get("crawlingURL"));
// generate a YaCyNews if the global flag was set
if (!sb.isRobinsonMode() && crawlOrder) {
final Map<String, String> m = new HashMap<String, String>(pe); // must be cloned
final Map<String, String> m = new HashMap<String, String>(profile); // must be cloned
m.remove("specificDepth");
m.remove("indexText");
m.remove("indexMedia");
@@ -422,18 +403,16 @@ public class Crawler_p {
sb.peers.newsPool.publishMyNews(sb.peers.mySeed(), NewsPool.CATEGORY_CRAWL_START, m);
}
} else {
prop.put("info", "5"); // Crawling failed
prop.putHTML("info_crawlingURL", (post.get("crawlingURL")));
prop.putHTML("info_reasonString", reasonString);
StringBuilder fr = new StringBuilder();
for (Map.Entry<DigestURI, String> failure: failurls.entrySet()) {
sb.crawlQueues.errorURL.push(
new Request(
sb.peers.mySeed().hash.getBytes(),
crawlingStartURL,
failure.getKey(),
null,
"",
new Date(),
pe.handle(),
profile.handle(),
0,
0,
0,
@@ -442,20 +421,31 @@ public class Crawler_p {
new Date(),
1,
FailCategory.FINAL_LOAD_CONTEXT,
reasonString, -1);
failure.getValue(), -1);
fr.append(failure.getValue()).append('/');
}
} catch (final PatternSyntaxException e) {
prop.put("info", "4"); // crawlfilter does not match url
prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
prop.putHTML("info_error", e.getMessage());
prop.put("info", "5"); // Crawling failed
prop.putHTML("info_crawlingURL", (post.get("crawlingURL")));
prop.putHTML("info_reasonString", fr.toString());
}
if (successurls.size() > 0) sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
}
} else if ("sitemap".equals(crawlingMode)) {
final String sitemapURLStr = post.get("sitemapURL", "");
try {
final DigestURI sitemapURL = new DigestURI(sitemapURLStr);
sb.crawler.putActive(handle, profile);
final SitemapImporter importer = new SitemapImporter(sb, sitemapURL, profile);
importer.start();
sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
} catch (final Exception e) {
// mist
prop.put("info", "6"); // Error with url
prop.putHTML("info_crawlingStart", crawlingStart);
prop.putHTML("info_crawlingStart", sitemapURLStr);
prop.putHTML("info_error", e.getMessage());
Log.logInfo("Crawler_p", "start url rejected: " + e.getMessage());
Log.logException(e);
}
} else if ("file".equals(crawlingMode)) {
if (post.containsKey("crawlingFile")) {
final String crawlingFileContent = post.get("crawlingFile$file", "");
@@ -481,30 +471,7 @@ public class Crawler_p {
}
}
final DigestURI crawlURL = new DigestURI("file://" + crawlingFile.toString());
final CrawlProfile profile = new CrawlProfile(
crawlingFileName,
crawlURL,
newcrawlingMustMatch,
CrawlProfile.MATCH_NEVER_STRING,
ipMustMatch,
ipMustNotMatch,
countryMustMatch,
newcrawlingdepth,
false,
crawlingIfOlder,
crawlingDomMaxPages,
crawlingQ,
indexText,
indexMedia,
storeHTCache,
crawlOrder,
xsstopw,
xdstopw,
xpstopw,
cachePolicy,
collection);
sb.crawler.putActive(profile.handle().getBytes(), profile);
sb.crawler.putActive(handle, profile);
sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
sb.crawlStacker.enqueueEntriesAsynchronous(sb.peers.mySeed().hash.getBytes(), profile.handle(), hyperlinks);
} catch (final PatternSyntaxException e) {
@@ -520,110 +487,6 @@ public class Crawler_p {
}
sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
}
} else if ( "sitemap" . equals ( crawlingMode ) ) {
final String sitemapURLStr = post . get ( "sitemapURL" , "" ) ;
try {
final DigestURI sitemapURL = new DigestURI ( sitemapURLStr ) ;
final CrawlProfile pe = new CrawlProfile (
sitemapURLStr ,
sitemapURL ,
CrawlProfile . MATCH_ALL_STRING ,
CrawlProfile . MATCH_NEVER_STRING ,
ipMustMatch ,
ipMustNotMatch ,
countryMustMatch ,
0 ,
false ,
crawlingIfOlder ,
crawlingDomMaxPages ,
true ,
indexText ,
indexMedia ,
storeHTCache ,
crawlOrder ,
xsstopw ,
xdstopw ,
xpstopw ,
cachePolicy ,
collection ) ;
sb . crawler . putActive ( pe . handle ( ) . getBytes ( ) , pe ) ;
final SitemapImporter importer = new SitemapImporter ( sb , sitemapURL , pe ) ;
importer . start ( ) ;
} catch ( final Exception e ) {
// mist
prop . put ( "info" , "6" ) ; //Error with url
prop . putHTML ( "info_crawlingStart" , sitemapURLStr ) ;
prop . putHTML ( "info_error" , e . getMessage ( ) ) ;
Log . logException ( e ) ;
}
} else if ( "sitelist" . equals ( crawlingMode ) ) {
try {
final DigestURI sitelistURL = new DigestURI ( crawlingStart ) ;
// download document
Document scraper = sb . loader . loadDocument ( sitelistURL , CacheStrategy . IFFRESH , BlacklistType . CRAWLER , CrawlQueues . queuedMinLoadDelay ) ;
// String title = scraper.getTitle();
// String description = scraper.getDescription();
// get links and generate filter
final Map < MultiProtocolURI , Properties > hyperlinks = scraper . getAnchors ( ) ;
if ( fullDomain & & newcrawlingdepth > 0 ) newcrawlingMustMatch = siteFilter ( hyperlinks . keySet ( ) ) ;
// put links onto crawl queue
final CrawlProfile profile = new CrawlProfile (
sitelistURL . getHost ( ) ,
sitelistURL ,
newcrawlingMustMatch ,
CrawlProfile . MATCH_NEVER_STRING ,
ipMustMatch ,
ipMustNotMatch ,
countryMustMatch ,
newcrawlingdepth ,
directDocByURL ,
crawlingIfOlder ,
crawlingDomMaxPages ,
crawlingQ ,
indexText ,
indexMedia ,
storeHTCache ,
crawlOrder ,
xsstopw ,
xdstopw ,
xpstopw ,
cachePolicy ,
collection ) ;
sb . crawler . putActive ( profile . handle ( ) . getBytes ( ) , profile ) ;
sb . pauseCrawlJob ( SwitchboardConstants . CRAWLJOB_LOCAL_CRAWL ) ;
final Iterator < Map . Entry < MultiProtocolURI , Properties > > linkiterator = hyperlinks . entrySet ( ) . iterator ( ) ;
DigestURI nexturl ;
while ( linkiterator . hasNext ( ) ) {
final Map . Entry < MultiProtocolURI , Properties > e = linkiterator . next ( ) ;
if ( e . getKey ( ) = = null ) continue ;
nexturl = new DigestURI ( e . getKey ( ) ) ;
// remove the url from the database to be prepared to crawl them again
final byte [ ] urlhash = nexturl . hash ( ) ;
sb . index . fulltext ( ) . remove ( urlhash ) ;
sb . crawlQueues . noticeURL . removeByURLHash ( urlhash ) ;
sb . crawlQueues . errorURL . remove ( urlhash ) ;
sb . crawlStacker . enqueueEntry ( new Request (
sb . peers . mySeed ( ) . hash . getBytes ( ) ,
nexturl ,
null ,
e . getValue ( ) . getProperty ( "name" , "" ) ,
new Date ( ) ,
profile . handle ( ) ,
0 ,
0 ,
0 ,
0
) ) ;
}
} catch ( final Exception e ) {
// mist
prop . put ( "info" , "6" ) ; //Error with url
prop . putHTML ( "info_crawlingStart" , crawlingStart ) ;
prop . putHTML ( "info_error" , e . getMessage ( ) ) ;
Log . logException ( e ) ;
}
}
}
}
@ -661,6 +524,106 @@ public class Crawler_p {
return prop ;
}
/**
 * stack the url to the crawler
 * @param sb
 * @param profile
 * @param url
 * @return null if this was ok. If this failed, return a string with a fail reason
 */
private static String stackUrl(Switchboard sb, CrawlProfile profile, DigestURI url) {
byte[] handle = ASCII.getBytes(profile.handle());
// remove url from the index to be prepared for a re-crawl
final byte[] urlhash = url.hash();
sb.index.fulltext().remove(urlhash);
sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
sb.crawlQueues.errorURL.remove(urlhash);
// special handling of ftp protocol
if (url.isFTP()) {
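// FTP start URLs are not scraped like HTML documents; enqueueEntriesFTP walks the
// FTP server's directory listing and stacks the entries found there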
try {
sb.crawler.putActive(handle, profile);
sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
sb.crawlStacker.enqueueEntriesFTP(sb.peers.mySeed().hash.getBytes(), profile.handle(), url.getHost(), url.getPort(), false);
return null;
} catch (final Exception e) {
// mist
Log.logException(e);
return "problem crawling an ftp site: " + e.getMessage();
}
}
// get a scraper to get the title
Document scraper;
try {
scraper = sb.loader.loadDocument(url, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay);
} catch (IOException e) {
Log.logException(e);
return "scraper cannot load URL: " + e.getMessage();
}
final String title = scraper == null ? url.toNormalform(true, true) : scraper.dc_title();
final String description = scraper.dc_description();
// add the url to the crawl stack
sb.crawler.removePassive(handle); // if there is an old entry, delete it
sb.crawler.putActive(handle, profile);
final String reasonString = sb.crawlStacker.stackCrawl(new Request(
sb.peers.mySeed().hash.getBytes(),
url,
null,
"CRAWLING-ROOT",
new Date(),
profile.handle(),
0,
0,
0,
0
));
if (reasonString != null) return reasonString;
// create a bookmark from crawl start url
//final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString(post.get("bookmarkFolder","/crawlStart")));
final Set<String> tags = ListManager.string2set(BookmarkHelper.cleanTagsString("/crawlStart"));
tags.add("crawlStart");
final String[] keywords = scraper.dc_subject();
if (keywords != null) {
for (final String k: keywords) {
final String kk = BookmarkHelper.cleanTagsString(k);
if (kk.length() > 0) tags.add(kk);
}
}
String tagStr = tags.toString();
if (tagStr.length() > 2 && tagStr.startsWith("[") && tagStr.endsWith("]")) tagStr = tagStr.substring(1, tagStr.length() - 2);
// we will create always a bookmark to use this to track crawled hosts
final BookmarksDB.Bookmark bookmark = sb.bookmarksDB.createBookmark(url.toNormalform(true, false), "admin");
if (bookmark != null) {
bookmark.setProperty(BookmarksDB.Bookmark.BOOKMARK_TITLE, title);
bookmark.setProperty(BookmarksDB.Bookmark.BOOKMARK_DESCRIPTION, description);
bookmark.setOwner("admin");
bookmark.setPublic(false);
bookmark.setTags(tags, true);
sb.bookmarksDB.saveBookmark(bookmark);
}
// do the same for ymarks
// TODO: could a non admin user add crawls?
try {
sb.tables.bookmarks.createBookmark(sb.loader, url, YMarkTables.USER_ADMIN, true, "crawlStart", "/Crawl Start");
} catch (IOException e) {
Log.logException(e);
} catch (Failure e) {
Log.logException(e);
}
// that was ok
return null;
}
private static long recrawlIfOlderC(final boolean recrawlIfOlderCheck, final int recrawlIfOlderNumber, final String crawlingIfOlderUnit) {
if (!recrawlIfOlderCheck) return 0L;
if ("year".equals(crawlingIfOlderUnit)) return System.currentTimeMillis() - recrawlIfOlderNumber * 1000L * 60L * 60L * 24L * 365L;
@@ -682,7 +645,7 @@ public class Crawler_p {
sb.setPerformance(wantedPPM);
}
private static String siteFilter(final Set<MultiProtocolURI> uris) {
private static String siteFilter(final Set<? extends MultiProtocolURI> uris) {
final StringBuilder filter = new StringBuilder();
final Set<String> filterSet = new HashSet<String>();
for (final MultiProtocolURI uri: uris) {
@@ -697,7 +660,7 @@ public class Crawler_p {
return filter.length() > 0 ? filter.substring(1) : "";
}
private static String subpathFilter(final Set<MultiProtocolURI> uris) {
private static String subpathFilter(final Set<? extends MultiProtocolURI> uris) {
final StringBuilder filter = new StringBuilder();
final Set<String> filterSet = new HashSet<String>();
for (final MultiProtocolURI uri: uris) {