- added a delete button in host browser to delete a complete subpath

- removed storage of default collection name - default is now "user" - made stacking of crawl start points concurrent
13 years ago · f8f05ecba7
parent 0716a24737
commit f8f05ecba7
9 changed files with 193 additions and 154 deletions
--- a/htroot/CrawlStartExpert_p.java
+++ b/htroot/CrawlStartExpert_p.java
@ -85,7 +85,7 @@ public class CrawlStartExpert_p {

        boolean collectionEnabled = sb.index.fulltext().getSolrScheme().isEmpty() || sb.index.fulltext().getSolrScheme().contains(YaCySchema.collection_sxt);
        prop.put("collectionEnabled", collectionEnabled ? 1 : 0);
-        prop.put("collection", collectionEnabled ? sb.getConfig("collection", "user") : "");
+        prop.put("collection", collectionEnabled ? "user" : "");

        // return rewrite properties
        return prop;
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@ -43,13 +43,8 @@ import net.yacy.crawler.data.CrawlQueues;
 import net.yacy.crawler.data.ZURL.FailCategory;
 import net.yacy.crawler.retrieval.Request;
 import net.yacy.crawler.retrieval.SitemapImporter;
-import net.yacy.data.BookmarkHelper;
-import net.yacy.data.BookmarksDB;
-import net.yacy.data.ListManager;
 import net.yacy.data.WorkTables;
-import net.yacy.data.ymark.YMarkTables;
 import net.yacy.document.Document;
-import net.yacy.document.Parser.Failure;
 import net.yacy.document.parser.html.ContentScraper;
 import net.yacy.document.parser.html.TransformerWriter;
 import net.yacy.kelondro.data.meta.DigestURI;
@ -212,7 +207,7 @@ public class Crawler_p {
                boolean directDocByURL = "on".equals(post.get("directDocByURL", "off")); // catch also all linked media documents without loading them
                env.setConfig("crawlingDirectDocByURL", directDocByURL);

-                final String collection = post.get("collection", sb.getConfig("collection", "user"));
+                final String collection = post.get("collection", "user");
                env.setConfig("collection", collection);

                // recrawl
@ -376,13 +371,10 @@ public class Crawler_p {
                        // stack requests
                        sb.crawler.putActive(handle, profile);
                        sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
-                        Set<DigestURI> successurls = new HashSet<DigestURI>();
-                        Map<DigestURI,String> failurls = new HashMap<DigestURI, String>();
-                        String failreason;
-                        for (DigestURI url: rootURLs) {
-                            if ((failreason = stackUrl(sb, profile, url)) == null) successurls.add(url); else failurls.put(url, failreason);
-                        }
-                        
+                        final Set<DigestURI> successurls = new HashSet<DigestURI>();
+                        final Map<DigestURI,String> failurls = new HashMap<DigestURI, String>();
+                        sb.stackURLs(rootURLs, profile, successurls, failurls);
+
                        if (failurls.size() == 0) {
                            // liftoff!
                            prop.put("info", "8");
@ -552,106 +544,6 @@ public class Crawler_p {
        return prop;
    }

-    /**
-     * stack the url to the crawler
-     * @param sb
-     * @param profile
-     * @param url
-     * @return null if this was ok. If this failed, return a string with a fail reason
-     */
-    private static String stackUrl(Switchboard sb, CrawlProfile profile, DigestURI url) {
-        
-        byte[] handle = ASCII.getBytes(profile.handle());
-
-        // remove url from the index to be prepared for a re-crawl
-        final byte[] urlhash = url.hash();
-        sb.index.fulltext().remove(urlhash);
-        sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
-        sb.crawlQueues.errorURL.remove(urlhash);
-        
-        // special handling of ftp protocol
-        if (url.isFTP()) {
-            try {
-                sb.crawler.putActive(handle, profile);
-                sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
-                sb.crawlStacker.enqueueEntriesFTP(sb.peers.mySeed().hash.getBytes(), profile.handle(), url.getHost(), url.getPort(), false);
-                return null;
-            } catch (final Exception e) {
-                // mist
-                Log.logException(e);
-                return "problem crawling an ftp site: " + e.getMessage();
-            }
-        }
-
-        // get a scraper to get the title
-        Document scraper;
-        try {
-            scraper = sb.loader.loadDocument(url, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay);
-        } catch (IOException e) {
-            Log.logException(e);
-            return "scraper cannot load URL: " + e.getMessage();
-        }
-        
-        final String title = scraper == null ? url.toNormalform(true) : scraper.dc_title();
-        final String description = scraper.dc_description();
-
-        // add the url to the crawl stack
-        sb.crawler.removePassive(handle); // if there is an old entry, delete it
-        sb.crawler.putActive(handle, profile);
-        final String reasonString = sb.crawlStacker.stackCrawl(new Request(
-                sb.peers.mySeed().hash.getBytes(),
-                url,
-                null,
-                "CRAWLING-ROOT",
-                new Date(),
-                profile.handle(),
-                0,
-                0,
-                0,
-                0
-                ));
-        
-        if (reasonString != null) return reasonString;
-        
-        // create a bookmark from crawl start url
-        //final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString(post.get("bookmarkFolder","/crawlStart")));
-        final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString("/crawlStart"));
-        tags.add("crawlStart");
-        final String[] keywords = scraper.dc_subject();
-        if (keywords != null) {
-            for (final String k: keywords) {
-                final String kk = BookmarkHelper.cleanTagsString(k);
-                if (kk.length() > 0) tags.add(kk);
-            }
-        }
-        String tagStr = tags.toString();
-        if (tagStr.length() > 2 && tagStr.startsWith("[") && tagStr.endsWith("]")) tagStr = tagStr.substring(1, tagStr.length() - 2);
-
-        // we will create always a bookmark to use this to track crawled hosts
-        final BookmarksDB.Bookmark bookmark = sb.bookmarksDB.createBookmark(url.toNormalform(true), "admin");
-        if (bookmark != null) {
-            bookmark.setProperty(BookmarksDB.Bookmark.BOOKMARK_TITLE, title);
-            bookmark.setProperty(BookmarksDB.Bookmark.BOOKMARK_DESCRIPTION, description);
-            bookmark.setOwner("admin");
-            bookmark.setPublic(false);
-            bookmark.setTags(tags, true);
-            sb.bookmarksDB.saveBookmark(bookmark);
-        }
-
-        // do the same for ymarks
-        // TODO: could a non admin user add crawls?
-        try {
-            sb.tables.bookmarks.createBookmark(sb.loader, url, YMarkTables.USER_ADMIN, true, "crawlStart", "/Crawl Start");
-        } catch (IOException e) {
-            Log.logException(e);
-        } catch (Failure e) {
-            Log.logException(e);
-        }
-
-        // that was ok
-        return null;
-    }
-    
    private static long recrawlIfOlderC(final boolean recrawlIfOlderCheck, final int recrawlIfOlderNumber, final String crawlingIfOlderUnit) {
        if (!recrawlIfOlderCheck) return 0L;
        if ("year".equals(crawlingIfOlderUnit)) return System.currentTimeMillis() - recrawlIfOlderNumber * 1000L * 60L * 60L * 24L * 365L;
--- a/htroot/HostBrowser.html
+++ b/htroot/HostBrowser.html
@ -64,7 +64,9 @@ function updatepage(str) {
    <fieldset class="yacys">
      Host/URL: 
      <input id="search" type="text" name="path" value="#[path]#" size="40" maxlength="250" />
-      <input type="submit" name="list" value="Browse Host" class="submitready" style="width:240px;"/><br />
+      <input type="submit" name="list" value="Browse Host" class="submitready" style="width:240px;"/>
+      #(delete)#::<input type="submit" name="delete" value="Delete Subpath" class="submitready" style="width:240px;" onclick="return confirm('Confirm Deletion')"/>#(/delete)#
+      <br />
      <div id="searchresults"></div>
    </fieldset>
    </form>
--- a/htroot/HostBrowser.java
+++ b/htroot/HostBrowser.java
@ -20,10 +20,12 @@

 import java.io.IOException;
 import java.net.MalformedURLException;
+import java.util.ArrayList;
 import java.util.Date;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
+import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.TreeMap;
@ -91,6 +93,8 @@ public class HostBrowser {
            !path.startsWith("smb://") &&
            !path.startsWith("file://"))) { path = "http://" + path; }
        prop.putHTML("path", path);
+        prop.put("delete", admin && path.length() > 0 ? 1 : 0);
+        
        DigestURI pathURI = null;
        try {pathURI = new DigestURI(path);} catch (MalformedURLException e) {}

@ -145,6 +149,12 @@ public class HostBrowser {
        }
        
        if (path.length() > 0) {
+            boolean delete = false;
+            if (admin && post.containsKey("delete")) {
+                // delete the complete path!! That includes everything that matches with this prefix.
+                delete = true;
+            }
+            
            boolean complete = post.getBoolean("complete");
            if (complete) { // we want only root paths for complete lists
                p = path.indexOf('/', 10);
@ -174,10 +184,19 @@ public class HostBrowser {
                Set<String> inboundLinks = new HashSet<String>();
                Map<String, ReversibleScoreMap<String>> outboundHosts = new HashMap<String, ReversibleScoreMap<String>>();
                int hostsize = 0;
+                final List<byte[]> deleteIDs = new ArrayList<byte[]>();
                while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
                    String u = (String) doc.getFieldValue(YaCySchema.sku.getSolrFieldName());
                    hostsize++;
-                    if (complete || u.startsWith(path)) storedDocs.add(u);
+                    if (u.startsWith(path)) {
+                        if (delete) {
+                            deleteIDs.add(ASCII.getBytes((String) doc.getFieldValue(YaCySchema.id.name())));
+                        } else {
+                            storedDocs.add(u);
+                        }
+                    } else if (complete) {
+                        storedDocs.add(u);
+                    }
                    // collect inboundlinks to browse the host
                    Iterator<String> links = URIMetadataNode.getLinks(doc, true);
                    while (links.hasNext()) {
@ -202,6 +221,7 @@ public class HostBrowser {
                        } catch (MalformedURLException e) {}
                    }
                }
+                if (deleteIDs.size() > 0) sb.index.fulltext().removeConcurrently(deleteIDs);
                
                // now combine both lists into one
                Map<String, Boolean> files = new HashMap<String, Boolean>();
--- a/source/net/yacy/kelondro/util/FileUtils.java
+++ b/source/net/yacy/kelondro/util/FileUtils.java
@ -396,6 +396,10 @@ public final class FileUtils {
        return mb;
    }

+    private final static Pattern ps = Pattern.compile("\\\\");
+    private final static Pattern pn = Pattern.compile("\\n");
+    private final static Pattern pe = Pattern.compile("=");
+    
    public static void saveMap(final File file, final Map<String, String> props, final String comment) {
        PrintWriter pw = null;
        final File tf = new File(file.toString() + "." + (System.currentTimeMillis() % 1000));
@ -406,12 +410,16 @@ public final class FileUtils {
            for ( final Map.Entry<String, String> entry : props.entrySet() ) {
                key = entry.getKey();
                if ( key != null ) {
-                    key = key.replace("\\", "\\\\").replace("\n", "\\n").replace("=", "\\=");
+                    // Matcher.replaceAll treats '\' in the replacement string as an escape character,
+                    // so every backslash that shall appear in the output must be doubled once more;
+                    // this keeps the escaping identical to the String.replace chain it replaces
+                    key = ps.matcher(key).replaceAll("\\\\\\\\"); // backslash -> two backslashes
+                    key = pn.matcher(key).replaceAll("\\\\n");     // line feed -> backslash + 'n'
+                    key = pe.matcher(key).replaceAll("\\\\=");     // '='       -> backslash + '='
                }
                if ( entry.getValue() == null ) {
                    value = "";
                } else {
-                    value = entry.getValue().replace("\\", "\\\\").replace("\n", "\\n");
+                    value = entry.getValue();
+                    // same escaping as for the key, except that '=' stays untouched in values
+                    value = ps.matcher(value).replaceAll("\\\\\\\\");
+                    value = pn.matcher(value).replaceAll("\\\\n");
                }
                pw.println(key + "=" + value);
            }
@ -432,7 +440,7 @@ public final class FileUtils {
            // ignore
        }
    }
-
+    
    public static void saveMapB(final File file, final Map<String, byte[]> props, final String comment) {
        HashMap<String, String> m = new HashMap<String, String>();
        for (Map.Entry<String, byte[]> e: props.entrySet()) m.put(e.getKey(), UTF8.String(e.getValue()));
--- a/source/net/yacy/peers/Protocol.java
+++ b/source/net/yacy/peers/Protocol.java
@ -1061,7 +1061,7 @@ public final class Protocol
        // evaluate result
        List<URIMetadataNode> container = new ArrayList<URIMetadataNode>();
 		if (docList.size() > 0) {// create containers
-            Network.log.logInfo("SEARCH (solr), returned " + docList.size() + " documents from " + (target == null ? "shard" : ("peer " + target.hash + ":" + target.getName()))) ;
+            Network.log.logInfo("SEARCH (solr), returned " + docList.size() + " out of " + docList.getNumFound() + " documents from " + (target == null ? "shard" : ("peer " + target.hash + ":" + target.getName()))) ;

        	int term = count;
            for (final SolrDocument doc: docList) {
--- a/source/net/yacy/peers/RemoteSearch.java
+++ b/source/net/yacy/peers/RemoteSearch.java
@ -293,7 +293,7 @@ public class RemoteSearch extends Thread {
                }
            }
        };
-        if (targetPeer == null) solr.run(); else solr.start();
+        /*if (targetPeer == null) solr.run(); else*/ solr.start();
        return solr;
    }

--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@ -60,6 +60,7 @@ import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.Properties;
+import java.util.Set;
 import java.util.SortedMap;
 import java.util.SortedSet;
 import java.util.TreeMap;
@ -124,6 +125,7 @@ import net.yacy.crawler.retrieval.Response;
 import net.yacy.crawler.robots.RobotsTxt;
 import net.yacy.data.BlogBoard;
 import net.yacy.data.BlogBoardComments;
+import net.yacy.data.BookmarkHelper;
 import net.yacy.data.BookmarksDB;
 import net.yacy.data.ListManager;
 import net.yacy.data.MessageBoard;
@ -133,11 +135,13 @@ import net.yacy.data.WorkTables;
 import net.yacy.data.wiki.WikiBoard;
 import net.yacy.data.wiki.WikiCode;
 import net.yacy.data.wiki.WikiParser;
+import net.yacy.data.ymark.YMarkTables;
 import net.yacy.document.Condenser;
 import net.yacy.document.Document;
 import net.yacy.document.LibraryProvider;
 import net.yacy.document.Parser;
 import net.yacy.document.TextParser;
+import net.yacy.document.Parser.Failure;
 import net.yacy.document.content.DCEntry;
 import net.yacy.document.content.SurrogateReader;
 import net.yacy.document.importer.OAIListFriendsLoader;
@ -195,8 +199,7 @@ import net.yacy.utils.crypt;
 import com.google.common.io.Files;


-public final class Switchboard extends serverSwitch
-{
+public final class Switchboard extends serverSwitch {

    // load slots
    public static int xstackCrawlSlots = 2000;
@ -269,18 +272,12 @@ public final class Switchboard extends serverSwitch

    private final Semaphore shutdownSync = new Semaphore(0);
    private boolean terminate = false;
-
-    //private Object  crawlingPausedSync = new Object();
-    //private boolean crawlingIsPaused = false;
-
+    private static Switchboard sb;
    public HashMap<String, Object[]> crawlJobsStatus = new HashMap<String, Object[]>();

-    private static Switchboard sb = null;
-
-    public Switchboard(final File dataPath, final File appPath, final String initPath, final String configPath)
-        throws IOException {
+    public Switchboard(final File dataPath, final File appPath, final String initPath, final String configPath) throws IOException {
        super(dataPath, appPath, initPath, configPath);
-
+        sb = this;
        // check if port is already occupied
        final int port = getConfigInt("port", 8090);
        try {
@ -294,7 +291,6 @@ public final class Switchboard extends serverSwitch
        }

        MemoryTracker.startSystemProfiling();
-        sb = this;

        // set loglevel and log
        setLog(new Log("SWITCHBOARD"));
@ -374,9 +370,9 @@ public final class Switchboard extends serverSwitch
        // start indexing management
        this.log.logConfig("Starting Indexing Management");
        final String networkName = getConfig(SwitchboardConstants.NETWORK_NAME, "");
-        final long fileSizeMax = (OS.isWindows) ? sb.getConfigLong("filesize.max.win", Integer.MAX_VALUE) : sb.getConfigLong( "filesize.max.other", Integer.MAX_VALUE);
-        final int redundancy = (int) sb.getConfigLong("network.unit.dhtredundancy.senior", 1);
-        final int partitionExponent = (int) sb.getConfigLong("network.unit.dht.partitionExponent", 0);
+        final long fileSizeMax = (OS.isWindows) ? this.getConfigLong("filesize.max.win", Integer.MAX_VALUE) : this.getConfigLong( "filesize.max.other", Integer.MAX_VALUE);
+        final int redundancy = (int) this.getConfigLong("network.unit.dhtredundancy.senior", 1);
+        final int partitionExponent = (int) this.getConfigLong("network.unit.dht.partitionExponent", 0);
        this.networkRoot = new File(new File(indexPath, networkName), "NETWORK");
        this.queuesRoot = new File(new File(indexPath, networkName), "QUEUES");
        this.networkRoot.mkdirs();
@ -1022,7 +1018,7 @@ public final class Switchboard extends serverSwitch
                "this is the content control import thread",
                null,
                new InstantBusyThread(
-                    new ContentControlImportThread(sb),
+                    new ContentControlImportThread(this),
                    "run",
                    SwitchboardConstants.PEER_PING_METHOD_JOBCOUNT,
                    SwitchboardConstants.PEER_PING_METHOD_FREEMEM,
@ -1037,7 +1033,7 @@ public final class Switchboard extends serverSwitch
                "this is the content control filter update thread",
                null,
                new InstantBusyThread(
-                    new ContentControlFilterUpdateThread(sb),
+                    new ContentControlFilterUpdateThread(this),
                    "run",
                    SwitchboardConstants.PEER_PING_METHOD_JOBCOUNT,
                    SwitchboardConstants.PEER_PING_METHOD_FREEMEM,
@ -1063,7 +1059,6 @@ public final class Switchboard extends serverSwitch
        this.trail = new LinkedBlockingQueue<String>();

        this.log.logConfig("Finished Switchboard Initialization");
-        sb = this;
    }

    public int getIndexingProcessorsQueueSize() {
@ -1235,10 +1230,9 @@ public final class Switchboard extends serverSwitch
            final int wordCacheMaxCount =
                (int) getConfigLong(SwitchboardConstants.WORDCACHE_MAX_COUNT, 20000);
            final long fileSizeMax =
-                (OS.isWindows) ? sb.getConfigLong("filesize.max.win", Integer.MAX_VALUE) : sb
-                    .getConfigLong("filesize.max.other", Integer.MAX_VALUE);
-            final int redundancy = (int) sb.getConfigLong("network.unit.dhtredundancy.senior", 1);
-            final int partitionExponent = (int) sb.getConfigLong("network.unit.dht.partitionExponent", 0);
+                (OS.isWindows) ? this.getConfigLong("filesize.max.win", Integer.MAX_VALUE) : this.getConfigLong("filesize.max.other", Integer.MAX_VALUE);
+            final int redundancy = (int) this.getConfigLong("network.unit.dhtredundancy.senior", 1);
+            final int partitionExponent = (int) this.getConfigLong("network.unit.dht.partitionExponent", 0);
            final String networkName = getConfig(SwitchboardConstants.NETWORK_NAME, "");
            this.networkRoot = new File(new File(indexPrimaryPath, networkName), "NETWORK");
            this.queuesRoot = new File(new File(indexPrimaryPath, networkName), "QUEUES");
@ -1543,7 +1537,7 @@ public final class Switchboard extends serverSwitch
    public RankingProfile getRanking() {
        return (getConfig("rankingProfile", "").isEmpty())
            ? new RankingProfile(Classification.ContentDomain.TEXT)
-            : new RankingProfile("", crypt.simpleDecode(sb.getConfig("rankingProfile", "")));
+            : new RankingProfile("", crypt.simpleDecode(this.getConfig("rankingProfile", "")));
    }

    /**
@ -1970,7 +1964,7 @@ public final class Switchboard extends serverSwitch

            // clear caches if necessary
            if ( !MemoryControl.request(8000000L, false) ) {
-                sb.index.fulltext().clearCache();
+                this.index.fulltext().clearCache();
                SearchEventCache.cleanupEvents(false);
                this.trail.clear();
            }
@ -2246,7 +2240,7 @@ public final class Switchboard extends serverSwitch
            this.clusterhashes = this.peers.clusterHashes(getConfig("cluster.peers.yacydomain", ""));

            // check if we are reachable and try to map port again if not (e.g. when router rebooted)
-            if ( getConfigBool(SwitchboardConstants.UPNP_ENABLED, false) && sb.peers.mySeed().isJunior() ) {
+            if ( getConfigBool(SwitchboardConstants.UPNP_ENABLED, false) && this.peers.mySeed().isJunior() ) {
                UPnP.addPortMapping();
            }

@ -2698,6 +2692,122 @@ public final class Switchboard extends serverSwitch
        }
    }

+    /**
+     * Stack a set of crawl start URLs concurrently, using one worker thread per URL.
+     * The calling thread waits up to five seconds for each worker and then copies the
+     * outcomes that are available into the given result collections. Workers never touch
+     * those collections themselves, so the caller may pass plain, unsynchronized
+     * collections and read them right after this method returns.
+     * A URL whose worker has not finished in time is reported in neither collection.
+     * @param rootURLs the crawl start URLs
+     * @param profile the crawl profile which is used for all start URLs
+     * @param successurls receives every URL that was stacked without a fail reason
+     * @param failurls receives every URL that could not be stacked, mapped to its fail reason
+     */
+    public void stackURLs(Set<DigestURI> rootURLs, final CrawlProfile profile, final Set<DigestURI> successurls, final Map<DigestURI,String> failurls) {
+        // outcome per url; a null value means that stacking was successful
+        final Map<DigestURI, String> results = new HashMap<DigestURI, String>();
+        final List<Thread> stackthreads = new ArrayList<Thread>(); // do this concurrently
+        for (final DigestURI url: rootURLs) {
+            final Thread t = new Thread() {
+                @Override
+                public void run() {
+                    String failreason;
+                    try {
+                        failreason = Switchboard.this.stackUrl(profile, url);
+                    } catch (final RuntimeException e) {
+                        // a crashed worker must show up as a failure, not silently vanish
+                        Log.logException(e);
+                        failreason = "stacking failed: " + e;
+                    }
+                    synchronized (results) {results.put(url, failreason);}
+                }
+            };
+            t.start();
+            stackthreads.add(t);
+        }
+        try {
+            for (final Thread t: stackthreads) t.join(5000);
+        } catch (final InterruptedException e) {
+            // stop waiting, but keep the interrupt visible for the caller
+            Thread.currentThread().interrupt();
+        }
+        // hand over what is known so far; late workers only write into the private map
+        synchronized (results) {
+            for (final Map.Entry<DigestURI, String> result: results.entrySet()) {
+                if (result.getValue() == null) {
+                    successurls.add(result.getKey());
+                } else {
+                    failurls.put(result.getKey(), result.getValue());
+                }
+            }
+        }
+    }
+    
+    
+    /**
+     * Stack the url to the crawler.
+     * The url is first removed from the fulltext index, the notice queue and the error queue
+     * to be prepared for a re-crawl. FTP urls are handed over to the ftp enqueue method of the
+     * crawl stacker. For all other urls the document is loaded to obtain title, description and
+     * keywords, the url is stacked as "CRAWLING-ROOT" request and finally a bookmark is created
+     * in the bookmarks database and in the ymark tables.
+     * @param profile the crawl profile which is used for the url
+     * @param url the crawl start url
+     * @return null if this was ok. If this failed, return a string with a fail reason
+     */
+    public String stackUrl(CrawlProfile profile, DigestURI url) {
+
+        byte[] handle = ASCII.getBytes(profile.handle());
+
+        // remove url from the index to be prepared for a re-crawl
+        final byte[] urlhash = url.hash();
+        this.index.fulltext().remove(urlhash);
+        this.crawlQueues.noticeURL.removeByURLHash(urlhash);
+        this.crawlQueues.errorURL.remove(urlhash);
+
+        // special handling of ftp protocol
+        if (url.isFTP()) {
+            try {
+                this.crawler.putActive(handle, profile);
+                this.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
+                this.crawlStacker.enqueueEntriesFTP(this.peers.mySeed().hash.getBytes(), profile.handle(), url.getHost(), url.getPort(), false);
+                return null;
+            } catch (final Exception e) {
+                // the enqueueing failed; report the reason to the caller
+                Log.logException(e);
+                return "problem crawling an ftp site: " + e.getMessage();
+            }
+        }
+
+        // get a scraper to get the title
+        Document scraper;
+        try {
+            scraper = this.loader.loadDocument(url, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay);
+        } catch (IOException e) {
+            Log.logException(e);
+            return "scraper cannot load URL: " + e.getMessage();
+        }
+
+        // the scraper may be null; every access to it must be guarded, not only the title
+        final String title = scraper == null ? url.toNormalform(true) : scraper.dc_title();
+        final String description = scraper == null ? "" : scraper.dc_description();
+        final String[] keywords = scraper == null ? null : scraper.dc_subject();
+
+        // add the url to the crawl stack
+        this.crawler.removePassive(handle); // if there is an old entry, delete it
+        this.crawler.putActive(handle, profile);
+        final String reasonString = this.crawlStacker.stackCrawl(new Request(
+                this.peers.mySeed().hash.getBytes(),
+                url,
+                null,
+                "CRAWLING-ROOT",
+                new Date(),
+                profile.handle(),
+                0,
+                0,
+                0,
+                0
+                ));
+
+        if (reasonString != null) return reasonString;
+
+        // create a bookmark from crawl start url
+        //final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString(post.get("bookmarkFolder","/crawlStart")));
+        final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString("/crawlStart"));
+        tags.add("crawlStart");
+        if (keywords != null) {
+            for (final String k: keywords) {
+                final String kk = BookmarkHelper.cleanTagsString(k);
+                if (kk.length() > 0) tags.add(kk);
+            }
+        }
+
+        // we will create always a bookmark to use this to track crawled hosts
+        final BookmarksDB.Bookmark bookmark = this.bookmarksDB.createBookmark(url.toNormalform(true), "admin");
+        if (bookmark != null) {
+            bookmark.setProperty(BookmarksDB.Bookmark.BOOKMARK_TITLE, title);
+            bookmark.setProperty(BookmarksDB.Bookmark.BOOKMARK_DESCRIPTION, description);
+            bookmark.setOwner("admin");
+            bookmark.setPublic(false);
+            bookmark.setTags(tags, true);
+            this.bookmarksDB.saveBookmark(bookmark);
+        }
+
+        // do the same for ymarks
+        // TODO: could a non admin user add crawls?
+        try {
+            this.tables.bookmarks.createBookmark(this.loader, url, YMarkTables.USER_ADMIN, true, "crawlStart", "/Crawl Start");
+        } catch (IOException e) {
+            Log.logException(e);
+        } catch (Failure e) {
+            Log.logException(e);
+        }
+
+        // that was ok
+        return null;
+    }
+    
    /**
     * load the content of a URL, parse the content and add the content to the index This process is started
     * concurrently. The method returns immediately after the call.
@ -2718,7 +2828,7 @@ public final class Switchboard extends serverSwitch
            return; // don't do double-work
        }
        final Request request = this.loader.request(url, true, true);
-        final CrawlProfile profile = sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
+        final CrawlProfile profile = this.crawler.getActive(ASCII.getBytes(request.profileHandle()));
        final String acceptedError = this.crawlStacker.checkAcceptance(url, profile, 0);
        final String urls = url.toNormalform(true);
        if ( acceptedError != null ) {
@ -2793,7 +2903,7 @@ public final class Switchboard extends serverSwitch
            return; // don't do double-work
        }
        final Request request = this.loader.request(url, true, true);
-        final CrawlProfile profile = sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
+        final CrawlProfile profile = this.crawler.getActive(ASCII.getBytes(request.profileHandle()));
        final String acceptedError = this.crawlStacker.checkAcceptance(url, profile, 0);
        if (acceptedError != null) {
            this.log.logInfo("addToCrawler: cannot load "
@ -2804,9 +2914,9 @@ public final class Switchboard extends serverSwitch
        }
        final String s;
        if (asglobal) {
-            s = sb.crawlQueues.noticeURL.push(StackType.GLOBAL, request, this.robots);
+            s = this.crawlQueues.noticeURL.push(StackType.GLOBAL, request, this.robots);
        } else {
-            s = sb.crawlQueues.noticeURL.push(StackType.LOCAL, request, this.robots);
+            s = this.crawlQueues.noticeURL.push(StackType.LOCAL, request, this.robots);
        }

        if (s != null) {
@ -3179,7 +3289,7 @@ public final class Switchboard extends serverSwitch
                    if (links != null) {
                        if (links.size() < 1000) { // limit to 1000 to skip large index pages
                            final Iterator<MultiProtocolURI> i = links.keySet().iterator();
-                            final boolean globalcrawljob = sb.getConfigBool("heuristic.searchresults.crawlglobal",false);
+                            final boolean globalcrawljob = Switchboard.this.getConfigBool("heuristic.searchresults.crawlglobal",false);
                            while (i.hasNext()) {
                                url = DigestURI.toDigestURI(i.next());
                                boolean islocal = url.getHost().contentEquals(startUrl.getHost());
@ -3239,7 +3349,7 @@ public final class Switchboard extends serverSwitch
                searchEvent.getRankingResult().oneFeederStarted();
                try {
                    final Response response =
-                        sb.loader.load(sb.loader.request(url, true, false), CacheStrategy.NOCACHE, BlacklistType.SEARCH, TextSnippet.snippetMinLoadDelay);
+                        Switchboard.this.loader.load(Switchboard.this.loader.request(url, true, false), CacheStrategy.NOCACHE, BlacklistType.SEARCH, TextSnippet.snippetMinLoadDelay);
                    final byte[] resource = (response == null) ? null : response.getContent();
                    //System.out.println("BLEKKO: " + UTF8.String(resource));
                    rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);
@ -3337,7 +3447,7 @@ public final class Switchboard extends serverSwitch
            if ( Thread.currentThread().isInterrupted() ) {
                break;
            }
-            seedListFileURL = sb.getConfig("network.unit.bootstrap.seedlist" + c, "");
+            seedListFileURL = this.getConfig("network.unit.bootstrap.seedlist" + c, "");
            if ( seedListFileURL.isEmpty() ) {
                break;
            }
--- a/source/net/yacy/search/index/Fulltext.java
+++ b/source/net/yacy/search/index/Fulltext.java
@ -295,6 +295,13 @@ public final class Fulltext implements Iterable<byte[]> {
        if (MemoryControl.shortStatus()) clearCache();
    }

+    /**
+     * Remove a list of documents in a background thread.
+     * The method returns immediately; the solr commit is done by the background thread
+     * after the last document was removed, so that the commit covers all deletions.
+     * @param deleteIDs the url hashes of the documents which shall be removed
+     */
+    public void removeConcurrently(final List<byte[]> deleteIDs) {
+        if (deleteIDs == null || deleteIDs.isEmpty()) return;
+        new Thread() {
+            @Override
+            public void run() {
+                for (final byte[] id: deleteIDs) {Fulltext.this.remove(id);}
+                // commit only after all removals are done, otherwise they are not covered by it
+                Fulltext.this.solr.commit();
+            }
+        }.start();
+    }
+    
    public boolean remove(final byte[] urlHash) {
        if (urlHash == null) return false;
        try {
@ -720,7 +727,7 @@ public final class Fulltext implements Iterable<byte[]> {
    }

    /**
-     * using a fragment of the url hash (5 bytes: bytes 6 to 10) it is possible to address all urls from a specific domain
+     * using a fragment of the url hash (6 bytes: bytes 6 to 11) it is possible to address all urls from a specific domain
     * here such a fragment can be used to delete all these domains at once
     * @param hosthash
     * @return number of deleted domains