changes towards the new index storage scheme:

- replaced usage of the temporary IndexEntity with EntryContainer
- added more attributes to the word index
- added exact-string search (using quotes in the query; see the sketch below)
- disabled writing into WORDS during search; EntryContainers are used instead


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1485 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 19 years ago
parent c81ad1bf34
commit 03c65742ba
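
A minimal sketch (not the committed code) of how the quoted-query path turns into exact-phrase matching: index.java below sets maxDistance = 1 for quoted queries and hands it through plasmaSearchQuery into the container join. Entry, posInText, and the urlHash-keyed maps here are simplified stand-ins for plasmaWordIndexEntry and plasmaWordIndexEntryContainer, not their real API:

    import java.util.HashMap;
    import java.util.Map;

    final class PhraseJoinSketch {
        // simplified stand-in for plasmaWordIndexEntry: one word occurrence in one URL
        static final class Entry {
            final String urlHash;
            final int posInText;
            Entry(String urlHash, int posInText) { this.urlHash = urlHash; this.posInText = posInText; }
        }
        // conjunction of two word containers keyed by urlHash: keep a URL only if it
        // carries both words and their positions are at most maxDistance apart
        static Map<String, Entry> join(Map<String, Entry> a, Map<String, Entry> b, int maxDistance) {
            Map<String, Entry> joined = new HashMap<String, Entry>();
            for (Map.Entry<String, Entry> e : a.entrySet()) {
                Entry other = b.get(e.getKey());
                if (other == null) continue; // URL must appear in both containers
                if (Math.abs(e.getValue().posInText - other.posInText) <= maxDistance) {
                    joined.put(e.getKey(), e.getValue());
                }
            }
            return joined;
        }
    }

With an unquoted query maxDistance stays Integer.MAX_VALUE, so the position test filters nothing; with quotes it becomes 1 and only URLs where the words are adjacent survive the conjunction.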

@@ -62,6 +62,7 @@ import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaURL;
import de.anomic.plasma.plasmaWordIndexEntity;
import de.anomic.plasma.plasmaWordIndexEntry;
import de.anomic.plasma.plasmaWordIndexEntryContainer;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.yacy.yacyClient;
@@ -255,12 +256,12 @@ public class IndexControl_p {
}
prop.put("urlstring", "");
prop.put("urlhash", "");
plasmaWordIndexEntity[] indexes = new plasmaWordIndexEntity[1];
plasmaWordIndexEntryContainer[] indexes = new plasmaWordIndexEntryContainer[1];
String result;
long starttime = System.currentTimeMillis();
indexes[0] = switchboard.wordIndex.getEntity(keyhash, true, -1);
indexes[0] = switchboard.wordIndex.getContainer(keyhash, true, -1);
// built urlCache
Iterator urlIter = indexes[0].elements(true);
Iterator urlIter = indexes[0].entries();
HashMap knownURLs = new HashMap();
HashSet unknownURLEntries = new HashSet();
plasmaWordIndexEntry indexEntry;
@@ -282,9 +283,7 @@ public class IndexControl_p {
// now delete all entries that have no url entry
Iterator hashIter = unknownURLEntries.iterator();
while (hashIter.hasNext()) {
try {
indexes[0].removeEntry((String) hashIter.next(), false);
} catch (IOException e) {}
indexes[0].remove((String) hashIter.next());
}
// use whats remaining
String gzipBody = switchboard.getConfig("indexControl.gzipBody","false");
@@ -296,7 +295,8 @@ public class IndexControl_p {
"true".equalsIgnoreCase(gzipBody),
timeout);
prop.put("result", (result == null) ? ("Successfully transferred " + indexes[0].size() + " words in " + ((System.currentTimeMillis() - starttime) / 1000) + " seconds") : result);
try {indexes[0].close();} catch (IOException e) {}
indexes[0] = null;
indexes = null;
}
// generate list
@@ -431,15 +431,15 @@ public class IndexControl_p {
public static String genUrlList(plasmaSwitchboard switchboard, String keyhash, String keystring) {
// search for a word hash and generate a list of url links
plasmaWordIndexEntity index = null;
plasmaWordIndexEntryContainer index = null;
try {
index = switchboard.wordIndex.getEntity(keyhash, true, -1);
index = switchboard.wordIndex.getContainer(keyhash, true, -1);
final StringBuffer result = new StringBuffer(1024);
if (index.size() == 0) {
result.append("No URL entries related to this word hash <span class=\"tt\">").append(keyhash).append("</span>.");
} else {
final Iterator en = index.elements(true);
final Iterator en = index.entries();
result.append("URL entries related to this word hash <span class=\"tt\">").append(keyhash).append("</span><br><br>");
result.append("<form action=\"IndexControl_p.html\" method=\"post\" enctype=\"multipart/form-data\">");
String us;
@@ -497,13 +497,12 @@ public class IndexControl_p {
.append("<span class=\"small\">for every resolveable and deleted URL reference, delete the same reference at every other word where the reference exists (very extensive, but prevents further unresolved references)</span>")
.append("</td></tr></table></fieldset></form><br>");
}
index.close();
index = null;
return result.toString();
} catch (IOException e) {
return "";
} finally {
if (index != null) try { index.close(); index = null; } catch (Exception e) {};
if (index != null) index = null;
}
}

@@ -463,7 +463,7 @@ public class dir {
"AAAAAAAAAAAA", /*referrer*/
0, /*copycount*/
false, /*localneed*/
condenser.RESULT_INFORMATION_VALUE,
condenser.RESULT_WORD_ENTROPHY,
"**", /*language*/
plasmaWordIndexEntry.DT_SHARE, /*doctype*/
phrase.length(), /*size*/

@@ -126,7 +126,12 @@ public class index {
// SEARCH
// process search words
final String querystring = post.get("search", "");
int maxDistance = Integer.MAX_VALUE;
String querystring = post.get("search", "").trim();
if ((querystring.charAt(0) == '"') && (querystring.charAt(querystring.length() - 1) == '"')) {
querystring = querystring.substring(1, querystring.length() - 1).trim();
maxDistance = 1;
}
if (sb.facilityDB != null) try { sb.facilityDB.update("zeitgeist", querystring, post); } catch (Exception e) {}
final TreeSet query = plasmaSearchQuery.cleanQuery(querystring);
// filter out stopwords
@@ -172,7 +177,7 @@ public class index {
}
// do the search
plasmaSearchQuery thisSearch = new plasmaSearchQuery(query, new String[]{order1, order2, order3}, count, searchtime, urlmask, referer,
plasmaSearchQuery thisSearch = new plasmaSearchQuery(query, maxDistance, new String[]{order1, order2, order3}, count, searchtime, urlmask, referer,
((global) && (yacyonline) && (!(env.getConfig("last-search","").equals(querystring)))) ? plasmaSearchQuery.SEARCHDOM_GLOBALDHT : plasmaSearchQuery.SEARCHDOM_LOCAL,
"", 20);
final serverObjects prop = sb.searchFromLocal(thisSearch);
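
A caveat on the hunk above (an observation, not part of the commit): querystring.charAt(0) runs before any length check, so an empty search field throws StringIndexOutOfBoundsException, and a query consisting of a single quote character would too. A guarded variant of the same test:

    final class QuerySketch {
        // true only for a non-trivial query wrapped in double quotes
        static boolean isPhraseQuery(String q) {
            return q.length() > 1 && q.charAt(0) == '"' && q.charAt(q.length() - 1) == '"';
        }
        // strip the surrounding quotes; the caller then sets maxDistance = 1
        static String stripQuotes(String q) {
            return isPhraseQuery(q) ? q.substring(1, q.length() - 1).trim() : q;
        }
    }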

@@ -47,7 +47,6 @@
// javac -classpath .:../../Classes search.java
// if the shell's current path is htroot/yacy
import java.io.IOException;
import java.util.HashSet;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaCrawlLURL;
@@ -81,6 +80,7 @@ public final class search {
// final String fwden = post.get("fwden", ""); // forward deny, a list of seed hashes. They may NOT be target of forward hopping
final long duetime= post.getLong("duetime", 3000);
final int count = post.getInt("count", 10); // maximum number of wanted results
final int maxdist= post.getInt("maxdist", Integer.MAX_VALUE);
// final boolean global = ((String) post.get("resource", "global")).equals("global"); // if true, then result may consist of answers from other peers
// Date remoteTime = yacyCore.parseUniversalDate((String) post.get(yacySeed.MYTIME)); // read remote time
@@ -103,7 +103,7 @@ public final class search {
}
final long timestamp = System.currentTimeMillis();
plasmaSearchQuery squery = new plasmaSearchQuery(keyhashes, new String[]{plasmaSearchQuery.ORDER_YBR, plasmaSearchQuery.ORDER_DATE, plasmaSearchQuery.ORDER_QUALITY},
plasmaSearchQuery squery = new plasmaSearchQuery(keyhashes, maxdist, new String[]{plasmaSearchQuery.ORDER_YBR, plasmaSearchQuery.ORDER_DATE, plasmaSearchQuery.ORDER_QUALITY},
count, duetime, ".*");
squery.domType = plasmaSearchQuery.SEARCHDOM_LOCAL;
@@ -114,11 +114,8 @@ public final class search {
plasmaSearchEvent theSearch = new plasmaSearchEvent(squery, yacyCore.log, sb.wordIndex, sb.urlPool.loadedURL, sb.snippetCache);
plasmaSearchResult acc = null;
int idxc = 0;
try {
idxc = theSearch.localSearch();
acc = theSearch.order();
} catch (IOException e) {
}
// result is a List of urlEntry elements
if ((idxc == 0) || (acc == null)) {

@@ -75,6 +75,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
linkTags0.add("img");
linkTags0.add("base");
linkTags0.add("frame");
linkTags0.add("meta");
linkTags1 = new TreeSet(insensitiveCollator);
linkTags1.add("a");
@@ -88,6 +89,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
// class variables: collectors for links
private HashMap anchors;
private HashMap images;
private HashMap metas;
private String title;
//private String headline;
private List[] headlines;
@@ -101,6 +103,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
this.root = root;
this.anchors = new HashMap();
this.images = new HashMap();
this.metas = new HashMap();
this.title = "";
this.headlines = new ArrayList[4];
for (int i = 0; i < 4; i++) headlines[i] = new ArrayList();
@@ -194,6 +197,11 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
}
}
public static final String splitrex = " |/|\\(|\\)|-|\\:|_|\\.|,|\\?|!|'|" + '"';
public static String[] urlComps(String normalizedURL) {
return normalizedURL.toLowerCase().split(splitrex); // word components of the url
}
private String absolutePath(String relativePath) {
try {
return urlNormalform(new URL(root, relativePath));
@@ -206,6 +214,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
if (tagname.equalsIgnoreCase("img")) images.put(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("alt",""));
if (tagname.equalsIgnoreCase("base")) try {root = new URL(tagopts.getProperty("href", ""));} catch (MalformedURLException e) {}
if (tagname.equalsIgnoreCase("frame")) anchors.put(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("name",""));
if (tagname.equalsIgnoreCase("meta")) metas.put((tagopts.getProperty("name", "")).toLowerCase(), tagopts.getProperty("content",""));
}
public void scrapeTag1(String tagname, Properties tagopts, byte[] text) {
@@ -252,10 +261,16 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
// construct a title string, even if the document has no title
// if there is one, return it
if (title.length() > 0) return title;
// othervise take any headline
for (int i = 0; i < 4; i++) {
if (headlines[i].size() > 0) return (String) headlines[i].get(0);
}
// take description tag
String s = getDescription();
if (s.length() > 0) return s;
// extract headline from content
if (content.length() > 80) return cleanLine(new String(content.getBytes(), 0, 80));
return cleanLine(content.trim().toString());
@@ -280,6 +295,45 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
return images;
}
public Map getMetas() {
return metas;
}
public String getDescription() {
String s = (String) metas.get("description");
if (s == null) return ""; else return s;
}
public String getContentType() {
String s = (String) metas.get("content-type");
if (s == null) return ""; else return s;
}
public String getCopyright() {
String s = (String) metas.get("copyright");
if (s == null) return ""; else return s;
}
public String[] getContentLanguages() {
String s = (String) metas.get("content-language");
if (s == null) s = "";
return s.split(" |,");
}
public String[] getKeywords() {
String s = (String) metas.get("keywords");
if (s == null) s = "";
if (s.length() == 0) {
return getTitle().toLowerCase().split(splitrex);
} else {
return s.split(" |,");
}
}
/*
* (non-Javadoc)
* @see de.anomic.htmlFilter.htmlFilterScraper#close()
*/
public void close() {
// free resources
super.close();
@@ -298,6 +352,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
}
System.out.println("ANCHORS :" + anchors.toString());
System.out.println("IMAGES :" + images.toString());
System.out.println("METAS :" + metas.toString());
System.out.println("TEXT :" + new String(content.getBytes()));
}

@@ -850,11 +850,11 @@ public class kelondroTree extends kelondroRecords implements kelondroIndex {
this.rot = rotating;
ii = new nodeIterator(asc, rot, start);
nextNode = (ii.hasNext()) ? (Node) ii.next() : null;
if (nextNode != null) {
if ((nextNode != null) && (nextNode.getKey() != null)) {
int c = objectOrder.compare(firstKey, nextNode.getKey());
if ((c > 0) && (asc)) {
// firstKey > nextNode.getKey()
log.logWarning("CORRECTING ITERATOR: firstKey=" + new String(firstKey) + ", nextNode=" + new String(nextNode.getKey()));
if (log != null) log.logWarning("CORRECTING ITERATOR: firstKey=" + new String(firstKey) + ", nextNode=" + new String(nextNode.getKey()));
nextNode = (ii.hasNext()) ? (Node) ii.next() : null;
}
if ((c < 0) && (!(asc))) {

@@ -83,9 +83,6 @@ public final class plasmaCondenser {
public int RESULT_NUMB_SENTENCES = -1;
public int RESULT_DIFF_SENTENCES = -1;
public int RESULT_SIMI_SENTENCES = -1;
public int RESULT_AVERAGE_WORD_OCC = -1;
public int RESULT_INFORMATION_VALUE = -1;
public plasmaCondenser(InputStream text) {
this(text, 3, 2);
@@ -357,8 +354,7 @@ public final class plasmaCondenser {
this.RESULT_NUMB_SENTENCES = allsentencecounter;
this.RESULT_DIFF_SENTENCES = sentenceHandleCount;
this.RESULT_SIMI_SENTENCES = sentences.size();
this.RESULT_AVERAGE_WORD_OCC = (words.size() == 0) ? 0 : (allwordcounter / words.size());
this.RESULT_INFORMATION_VALUE = (allwordcounter == 0) ? 0 : (wordenum.count() * words.size() / allwordcounter / 16);
//this.RESULT_INFORMATION_VALUE = (allwordcounter == 0) ? 0 : (wordenum.count() * words.size() / allwordcounter / 16);
}
public void print() {

@@ -176,7 +176,7 @@ public final class plasmaCrawlLURL extends plasmaURL {
gcrawlResultStack.add(urlHash + initiatorHash + executorHash);
}
public synchronized Entry getEntry(String hash, plasmaWordIndexEntry searchedWord) throws IOException {
public Entry getEntry(String hash, plasmaWordIndexEntry searchedWord) throws IOException {
return new Entry(hash, searchedWord);
}
@@ -399,7 +399,15 @@ public final class plasmaCrawlLURL extends plasmaURL {
private int size;
private int wordCount;
private String snippet;
private plasmaWordIndexEntry word;
private plasmaWordIndexEntry word; // this is only used if the url is transported via remote search requests
// more needed attributes:
// - author / copyright owner
// - keywords
// - phrasecount, total number of phrases
// - boolean: URL attributes
// - int: # of outlinks to same domain
// - int: # of outlinks to outside domain
public Entry(URL url, String descr, Date moddate, Date loaddate, String referrerHash, int copyCount, boolean localNeed, int quality, String language, char doctype, int size, int wordCount) {
// create new entry and store it into database

@@ -63,7 +63,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
private plasmaWordIndex wordIndex;
private plasmaCrawlLURL urlStore;
private plasmaSnippetCache snippetCache;
private plasmaWordIndexEntity rcLocal, rcGlobal; // caches for results
private plasmaWordIndexEntryContainer rcLocal, rcGlobal; // caches for results
private plasmaSearchProfile profileLocal, profileGlobal;
private yacySearch[] searchThreads;
@@ -73,8 +73,8 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
this.query = query;
this.urlStore = urlStore;
this.snippetCache = snippetCache;
this.rcLocal = new plasmaWordIndexEntity(null);
this.rcGlobal = new plasmaWordIndexEntity(null);
this.rcLocal = new plasmaWordIndexEntryContainer(null);
this.rcGlobal = new plasmaWordIndexEntryContainer(null);
if (query.domType == plasmaSearchQuery.SEARCHDOM_GLOBALDHT) {
this.profileLocal = new plasmaSearchProfile(4 * query.maximumTime / 10, query.wantedResults);
this.profileGlobal = new plasmaSearchProfile(6 * query.maximumTime / 10, query.wantedResults);
@@ -114,7 +114,6 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
int globalContributions = globalSearch(fetchpeers);
log.logFine("SEARCH TIME AFTER GLOBAL-TRIGGER TO " + fetchpeers + " PEERS: " + ((System.currentTimeMillis() - start) / 1000) + " seconds");
try {
// combine the result and order
plasmaSearchResult result = order();
result.globalContributions = globalContributions;
@@ -125,57 +124,46 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
//serverInstantThread.oneTimeJob(this, "flushResults", log, 0);
// clean up
if ((rcLocal != null) && (!(rcLocal.isTMPEntity()))) rcLocal.close();
rcLocal = null;
// return search result
log.logFine("SEARCHRESULT: " + profileLocal.reportToString());
lastEvent = this;
return result;
} catch (IOException e) {
return null;
}
} else {
// do a local search
//long start = System.currentTimeMillis();
try {
localSearch();
plasmaSearchResult result = order();
result.localContributions = rcLocal.size();
// clean up
if ((rcLocal != null) && (!(rcLocal.isTMPEntity()))) rcLocal.close();
rcLocal = null;
// return search result
log.logFine("SEARCHRESULT: " + profileLocal.reportToString());
lastEvent = this;
return result;
} catch (IOException e) {
return null;
}
}
}
public int localSearch() throws IOException {
public int localSearch() {
// search for the set of hashes and return an array of urlEntry elements
// retrieve entities that belong to the hashes
profileLocal.startTimer();
Set entities = wordIndex.getEntities(query.queryHashes, true, true, profileLocal.getTargetTime(plasmaSearchProfile.PROCESS_COLLECTION));
if (entities.size() < query.size()) entities = null; // prevent that only a subset is returned
Set containers = wordIndex.getContainers(query.queryHashes, true, true, profileLocal.getTargetTime(plasmaSearchProfile.PROCESS_COLLECTION));
if (containers.size() < query.size()) containers = null; // prevent that only a subset is returned
profileLocal.setYieldTime(plasmaSearchProfile.PROCESS_COLLECTION);
profileLocal.setYieldCount(plasmaSearchProfile.PROCESS_COLLECTION, (entities == null) ? 0 : entities.size());
profileLocal.setYieldCount(plasmaSearchProfile.PROCESS_COLLECTION, (containers == null) ? 0 : containers.size());
// since this is a conjunction we return an empty entity if any word is not known
if (entities == null) {
rcLocal = new plasmaWordIndexEntity(null);
if (containers == null) {
rcLocal = new plasmaWordIndexEntryContainer(null);
return 0;
}
// join the result
profileLocal.startTimer();
rcLocal = plasmaWordIndexEntity.joinEntities(entities, profileLocal.getTargetTime(plasmaSearchProfile.PROCESS_JOIN));
rcLocal = plasmaWordIndexEntryContainer.joinContainer(containers, profileLocal.getTargetTime(plasmaSearchProfile.PROCESS_JOIN), query.maxDistance);
profileLocal.setYieldTime(plasmaSearchProfile.PROCESS_JOIN);
profileLocal.setYieldCount(plasmaSearchProfile.PROCESS_JOIN, rcLocal.size());
@@ -190,7 +178,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
log.logFine("STARTING " + fetchpeers + " THREADS TO CATCH EACH " + profileGlobal.getTargetCount(plasmaSearchProfile.PROCESS_POSTSORT) + " URLs WITHIN " + (profileGlobal.duetime() / 1000) + " SECONDS");
long timeout = System.currentTimeMillis() + profileGlobal.duetime() + 4000;
searchThreads = yacySearch.searchHashes(query.queryHashes, urlStore, rcGlobal, fetchpeers, plasmaSwitchboard.urlBlacklist, snippetCache, profileGlobal);
searchThreads = yacySearch.searchHashes(query.queryHashes, query.maxDistance, urlStore, rcGlobal, fetchpeers, plasmaSwitchboard.urlBlacklist, snippetCache, profileGlobal);
// wait until wanted delay passed or wanted result appeared
while (System.currentTimeMillis() < timeout) {
@@ -204,20 +192,20 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
return rcGlobal.size();
}
public plasmaSearchResult order() throws IOException {
public plasmaSearchResult order() {
// we collect the urlhashes and construct a list with urlEntry objects
// attention: if minEntries is too high, this method will not terminate within the maxTime
plasmaWordIndexEntity searchResult = new plasmaWordIndexEntity(null);
searchResult.merge(rcLocal, -1);
searchResult.merge(rcGlobal, -1);
plasmaWordIndexEntryContainer searchResult = new plasmaWordIndexEntryContainer(null);
searchResult.add(rcLocal);
searchResult.add(rcGlobal);
long preorderTime = profileLocal.getTargetTime(plasmaSearchProfile.PROCESS_PRESORT);
long postorderTime = profileLocal.getTargetTime(plasmaSearchProfile.PROCESS_POSTSORT);
profileLocal.startTimer();
plasmaSearchPreOrder preorder = new plasmaSearchPreOrder(query);
preorder.addEntity(searchResult, preorderTime);
preorder.addContainer(searchResult, preorderTime);
profileLocal.setYieldTime(plasmaSearchProfile.PROCESS_PRESORT);
profileLocal.setYieldCount(plasmaSearchProfile.PROCESS_PRESORT, rcLocal.size());
@@ -289,19 +277,13 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
Iterator hashi = query.queryHashes.iterator();
while (hashi.hasNext()) {
wordHash = (String) hashi.next();
Iterator i = rcGlobal.elements(true);
plasmaWordIndexEntry entry;
plasmaWordIndexEntryContainer container = new plasmaWordIndexEntryContainer(wordHash, rcGlobal.size());
while (i.hasNext()) {
entry = (plasmaWordIndexEntry) i.next();
container.add(entry, System.currentTimeMillis());
}
wordIndex.addEntries(container, true);
log.logFine("FLUSHED " + wordHash + ": " + container.size() + " url entries");
rcGlobal.setWordHash(wordHash);
wordIndex.addEntries(rcGlobal, true);
log.logFine("FLUSHED " + wordHash + ": " + rcGlobal.size() + " url entries");
}
// the rcGlobal was flushed, empty it
count += rcGlobal.size();
rcGlobal.deleteComplete();
rcGlobal.clear();
}
// wait a little bit before trying again
try {Thread.sleep(3000);} catch (InterruptedException e) {}

@@ -116,8 +116,8 @@ public final class plasmaSearchPreOrder {
return (plasmaWordIndexEntry) pageAcc.remove(top);
}
public void addEntity(plasmaWordIndexEntity entity, long maxTime) {
Iterator i = entity.elements(true);
public void addContainer(plasmaWordIndexEntryContainer container, long maxTime) {
Iterator i = container.entries();
long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
plasmaWordIndexEntry entry;
while (i.hasNext()) {

@@ -72,12 +72,14 @@ public final class plasmaSearchQuery {
public int domType;
public String domGroupName;
public int domMaxTargets;
public int maxDistance;
public plasmaSearchQuery(Set queryWords,
public plasmaSearchQuery(Set queryWords, int maxDistance,
String[] order, int wantedResults, long maximumTime, String urlMask,
String referrer,
int domType, String domGroupName, int domMaxTargets) {
this.queryWords = queryWords;
this.maxDistance = maxDistance;
this.queryHashes = words2hashes(queryWords);
this.order = order;
this.wantedResults = wantedResults;
@@ -89,9 +91,10 @@ public final class plasmaSearchQuery {
this.domMaxTargets = domMaxTargets;
}
public plasmaSearchQuery(Set queryHashes,
public plasmaSearchQuery(Set queryHashes, int maxDistance,
String[] order, int wantedResults, long maximumTime, String urlMask) {
this.queryWords = null;
this.maxDistance = maxDistance;
this.queryHashes = queryHashes;
this.order = order;
this.wantedResults = wantedResults;

@@ -54,11 +54,10 @@ import java.net.MalformedURLException;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.server.serverCodings;
import de.anomic.htmlFilter.htmlFilterContentScraper;
public final class plasmaSearchResult {
public static final String splitrex = " |/|\\(|\\)|-|\\:|_|\\.|,|\\?|!|'|" + '"';
private TreeMap pageAcc; // key = order hash; value = plasmaLURL.entry
private kelondroMScoreCluster ref; // reference score computation for the commonSense heuristic
private ArrayList results; // this is a buffer for plasmaWordIndexEntry + plasmaCrawlLURL.entry - objects
@@ -111,8 +110,8 @@ public final class plasmaSearchResult {
URL url = page.url();
String descr = page.descr();
if ((url == null) || (descr == null)) return;
String[] urlcomps = url.toString().toLowerCase().split(splitrex); // word components of the url
String[] descrcomps = descr.toLowerCase().split(splitrex); // words in the description
String[] urlcomps = htmlFilterContentScraper.urlComps(url.toString()); // word components of the url
String[] descrcomps = descr.toLowerCase().split(htmlFilterContentScraper.splitrex); // words in the description
// store everything
Object[] resultVector = new Object[] {indexEntry, page, urlcomps, descrcomps};

@@ -1285,7 +1285,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
yacyCore.seedDB.mySeed.hash,
referrerHash,
0, true,
condenser.RESULT_INFORMATION_VALUE,
condenser.RESULT_WORD_ENTROPHY,
plasmaWordIndexEntry.language(entry.url()),
plasmaWordIndexEntry.docType(document.getMimeType()),
(int) entry.size(),
@@ -1313,15 +1313,11 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
} else {
HashMap urlCache = new HashMap(1);
urlCache.put(newEntry.hash(),newEntry);
ArrayList tmpEntities = new ArrayList(condenser.RESULT_SIMI_WORDS);
ArrayList tmpContainers = new ArrayList(condenser.RESULT_SIMI_WORDS);
String language = plasmaWordIndexEntry.language(entry.url());
char doctype = plasmaWordIndexEntry.docType(document.getMimeType());
int quality = 0;
try {
quality = condenser.RESULT_INFORMATION_VALUE;
} catch (NumberFormatException e) {
System.out.println("INTERNAL ERROR WITH CONDENSER.INFORMATION_VALUE: " + e.toString() + ": in URL " + newEntry.url().toString());
}
int urlLength = newEntry.url().toString().length();
int urlComps = htmlFilterContentScraper.urlComps(newEntry.url().toString()).length;
// iterate over all words
Iterator i = condenser.words();
@@ -1332,8 +1328,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
String word = (String) wentry.getKey();
wordStat = (plasmaCondenser.wordStatProp) wentry.getValue();
String wordHash = plasmaWordIndexEntry.word2hash(word);
plasmaWordIndexEntity wordIdxEntity = new plasmaWordIndexEntity(wordHash);
plasmaWordIndexEntryContainer wordIdxContainer = new plasmaWordIndexEntryContainer(wordHash);
plasmaWordIndexEntry wordIdxEntry = new plasmaWordIndexEntry(urlHash,
urlLength, urlComps,
wordStat.count,
condenser.RESULT_SIMI_WORDS,
condenser.RESULT_SIMI_SENTENCES,
@@ -1344,26 +1341,25 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
newEntry.size(),
docDate.getTime(),
System.currentTimeMillis(),
quality, language, doctype, true);
wordIdxEntity.addEntry(wordIdxEntry);
tmpEntities.add(wordIdxEntity);
condenser.RESULT_WORD_ENTROPHY,
language,
doctype,
true);
wordIdxContainer.add(wordIdxEntry);
tmpContainers.add(wordIdxContainer);
// wordIndex.addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, System.currentTimeMillis(), entry));
}
//System.out.println("DEBUG: plasmaSearch.addPageIndex: added " + condenser.getWords().size() + " words, flushed " + c + " entries");
words = condenser.RESULT_SIMI_WORDS;
// transfering the index to the storage peer
String error = yacyClient.transferIndex(seed,(plasmaWordIndexEntity[])tmpEntities.toArray(new plasmaWordIndexEntity[tmpEntities.size()]),urlCache,true,120000);
String error = yacyClient.transferIndex(seed,(plasmaWordIndexEntryContainer[])tmpContainers.toArray(new plasmaWordIndexEntity[tmpContainers.size()]),urlCache,true,120000);
if (error != null) {
words = wordIndex.addPageIndex(entry.url(), urlHash, docDate, (int) entry.size(), condenser, plasmaWordIndexEntry.language(entry.url()), plasmaWordIndexEntry.docType(document.getMimeType()));
}
// cleanup
for (int j=0; j < tmpEntities.size(); j++) {
plasmaWordIndexEntity tmpEntity = (plasmaWordIndexEntity) tmpEntities.get(j);
try { tmpEntity.close(); } catch (Exception e) {}
}
tmpContainers = null;
}
storageEndTime = System.currentTimeMillis();

@@ -56,6 +56,7 @@ import java.util.Set;
import java.util.Date;
import java.net.URL;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.server.logging.serverLog;
@@ -136,15 +137,7 @@ public final class plasmaWordIndex {
public int addPageIndex(URL url, String urlHash, Date urlModified, int size, plasmaCondenser condenser, String language, char doctype) {
// this is called by the switchboard to put in a new page into the index
// use all the words in one condenser object to simultanous create index
// entries
// int age = microDateDays(urlModified);
int quality = 0;
try {
quality = condenser.RESULT_INFORMATION_VALUE;
} catch (NumberFormatException e) {
System.out.println("INTERNAL ERROR WITH CONDENSER.INFORMATION_VALUE: " + e.toString() + ": in URL " + url.toString());
}
// use all the words in one condenser object to simultanous create index entries
// iterate over all words
Iterator i = condenser.words();
@@ -153,6 +146,9 @@ public final class plasmaWordIndex {
plasmaWordIndexEntry ientry;
plasmaCondenser.wordStatProp wprop;
String wordHash;
int urlLength = url.toString().length();
int urlComps = htmlFilterContentScraper.urlComps(url.toString()).length;
while (i.hasNext()) {
wentry = (Map.Entry) i.next();
word = (String) wentry.getKey();
@@ -160,6 +156,7 @@ public final class plasmaWordIndex {
// if ((s.length() > 4) && (c > 1)) System.out.println("# " + s + ":" + c);
wordHash = plasmaWordIndexEntry.word2hash(word);
ientry = new plasmaWordIndexEntry(urlHash,
urlLength, urlComps,
wprop.count,
condenser.RESULT_SIMI_WORDS,
condenser.RESULT_SIMI_SENTENCES,
@@ -170,7 +167,10 @@ public final class plasmaWordIndex {
size,
urlModified.getTime(),
System.currentTimeMillis(),
quality, language, doctype, true);
condenser.RESULT_WORD_ENTROPHY,
language,
doctype,
true);
addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, System.currentTimeMillis(), ientry), false);
}
// System.out.println("DEBUG: plasmaSearch.addPageIndex: added " +
@@ -178,10 +178,43 @@ public final class plasmaWordIndex {
return condenser.RESULT_SIMI_WORDS;
}
public plasmaWordIndexEntryContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxTime) {
return ramCache.getContainer(wordHash, deleteIfEmpty, maxTime);
}
public plasmaWordIndexEntity getEntity(String wordHash, boolean deleteIfEmpty, long maxTime) {
return ramCache.getIndex(wordHash, deleteIfEmpty, maxTime);
return ramCache.getEntity(wordHash, deleteIfEmpty, maxTime);
}
public Set getContainers(Set wordHashes, boolean deleteIfEmpty, boolean interruptIfEmpty, long maxTime) {
// retrieve entities that belong to the hashes
HashSet containers = new HashSet();
String singleHash;
plasmaWordIndexEntryContainer singleContainer;
Iterator i = wordHashes.iterator();
long start = System.currentTimeMillis();
long remaining;
while (i.hasNext()) {
// check time
remaining = maxTime - (System.currentTimeMillis() - start);
//if ((maxTime > 0) && (remaining <= 0)) break;
// get next hash:
singleHash = (String) i.next();
// retrieve index
singleContainer = getContainer(singleHash, deleteIfEmpty, (maxTime < 0) ? -1 : remaining / (wordHashes.size() - containers.size()));
// check result
if (((singleContainer == null) || (singleContainer.size() == 0)) && (interruptIfEmpty)) return new HashSet();
containers.add(singleContainer);
}
return containers;
}
/*
public Set getEntities(Set wordHashes, boolean deleteIfEmpty, boolean interruptIfEmpty, long maxTime) {
// retrieve entities that belong to the hashes
@@ -203,12 +236,13 @@ public final class plasmaWordIndex {
singleEntity = getEntity(singleHash, deleteIfEmpty, (maxTime < 0) ? -1 : remaining / (wordHashes.size() - entities.size()));
// check result
if (((singleEntity == null) || (singleEntity.size() == 0)) && (interruptIfEmpty)) return null;
if (((singleEntity == null) || (singleEntity.size() == 0)) && (interruptIfEmpty)) return new HashSet();
entities.add(singleEntity);
}
return entities;
}
*/
public int size() {
return ramCache.size();
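
The time accounting in the new getContainers() divides what is left of maxTime evenly over the word hashes not yet fetched, so time unspent by a fast lookup rolls over to the remaining words. With illustrative numbers: a 600 ms budget over three hashes gives the first lookup 600/3 = 200 ms; if it returns after 150 ms, the second gets (600-150)/2 = 225 ms. The per-word limit reduces to this helper (a restatement of the expression above, with -1 meaning unlimited):

    final class BudgetSketch {
        static long perWordLimit(long maxTime, long elapsed, int totalHashes, int alreadyFetched) {
            if (maxTime < 0) return -1; // unlimited
            return (maxTime - elapsed) / (totalHashes - alreadyFetched);
        }
    }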

@@ -203,7 +203,7 @@ public final class plasmaWordIndexAssortmentCluster {
}
public plasmaWordIndexEntryContainer removeFromAll(String wordHash, long maxTime) {
// collect all records from all the assortments and return them
// removes all records from all the assortments and return them
plasmaWordIndexEntryContainer buffer, record = new plasmaWordIndexEntryContainer(wordHash);
long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
for (int i = 0; i < clusterCount; i++) {
@@ -214,6 +214,18 @@ public final class plasmaWordIndexAssortmentCluster {
return record;
}
public plasmaWordIndexEntryContainer getFromAll(String wordHash, long maxTime) {
// collect all records from all the assortments and return them
plasmaWordIndexEntryContainer buffer, record = new plasmaWordIndexEntryContainer(wordHash);
long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
for (int i = 0; i < clusterCount; i++) {
buffer = assortments[i].get(wordHash);
if (buffer != null) record.add(buffer);
if (System.currentTimeMillis() > limitTime) break;
}
return record;
}
public Iterator hashConjunction(String startWordHash, boolean up, boolean rot) {
HashSet iterators = new HashSet();
//if (rot) System.out.println("WARNING: kelondroMergeIterator does not work correctly when individual iterators rotate on their own!");

@@ -391,7 +391,18 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
}
}
public plasmaWordIndexEntity getIndex(String wordHash, boolean deleteIfEmpty, long maxTime) {
public plasmaWordIndexEntryContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxTime) {
long start = System.currentTimeMillis();
if (maxTime > 0) maxTime = 8 * maxTime / 10; // reserve time for later adding to backend
plasmaWordIndexEntryContainer container = assortmentCluster.getFromAll(wordHash, maxTime);
if (container == null) {
container = new plasmaWordIndexEntryContainer(wordHash);
}
container.add(backend.getContainer(wordHash, deleteIfEmpty, (maxTime < 0) ? -1 : System.currentTimeMillis() - start));
return container;
}
public plasmaWordIndexEntity getEntity(String wordHash, boolean deleteIfEmpty, long maxTime) {
// this possibly creates an index file in the back-end
// the index file is opened and returned as entity object
long start = System.currentTimeMillis();
@@ -406,7 +417,7 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
}
}
long r = maxTime - (System.currentTimeMillis() - start);
return backend.getIndex(wordHash, deleteIfEmpty, (r < 0) ? 0 : r);
return backend.getEntity(wordHash, deleteIfEmpty, (r < 0) ? 0 : r);
}
public long getUpdateTime(String wordHash) {

@@ -181,7 +181,24 @@ public class plasmaWordIndexClassicDB implements plasmaWordIndexInterface {
}
}
public plasmaWordIndexEntity getIndex(String wordHash, boolean deleteIfEmpty, long maxTime) {
public plasmaWordIndexEntryContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxTime) {
long start = System.currentTimeMillis();
if (plasmaWordIndexEntity.wordHash2path(databaseRoot, wordHash).exists()) {
plasmaWordIndexEntity entity = this.getEntity(wordHash, deleteIfEmpty, (maxTime < 0) ? -1 : maxTime * 9 / 10);
plasmaWordIndexEntryContainer container = new plasmaWordIndexEntryContainer(wordHash);
plasmaWordIndexEntry entry;
Iterator i = entity.elements(true);
while ((i.hasNext()) && ((maxTime < 0) || (System.currentTimeMillis() < start + maxTime))) {
entry = (plasmaWordIndexEntry) i.next();
container.add(entry);
}
return container;
} else {
return new plasmaWordIndexEntryContainer(wordHash, 0);
}
}
public plasmaWordIndexEntity getEntity(String wordHash, boolean deleteIfEmpty, long maxTime) {
return new plasmaWordIndexEntity(databaseRoot, wordHash, deleteIfEmpty);
}
@@ -190,7 +207,6 @@ public class plasmaWordIndexClassicDB implements plasmaWordIndexInterface {
if (f.exists()) return f.lastModified(); else return -1;
}
public void deleteIndex(String wordHash) {
plasmaWordIndexEntity.removePlasmaIndex(databaseRoot, wordHash);
}
@@ -200,7 +216,7 @@ public class plasmaWordIndexClassicDB implements plasmaWordIndexInterface {
plasmaWordIndexEntity pi = null;
int count = 0;
try {
pi = getIndex(wordHash, true, -1);
pi = getEntity(wordHash, true, -1);
for (int i = 0; i < urlHashes.length; i++)
if (pi.removeEntry(urlHashes[i], deleteComplete)) count++;
int size = pi.size();

@@ -201,33 +201,33 @@ public final class plasmaWordIndexDistribution {
// collect index
String startPointHash = selectTransferStart();
log.logFine("Selected hash " + startPointHash + " as start point for index distribution, distance = " + yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, startPointHash));
Object[] selectResult = selectTransferIndexes(startPointHash, indexCount, this.maxOpenFiles4Distribution);
plasmaWordIndexEntity[] indexEntities = (plasmaWordIndexEntity[]) selectResult[0];
Object[] selectResult = selectTransferContainers(startPointHash, indexCount, this.maxOpenFiles4Distribution);
plasmaWordIndexEntryContainer[] indexContainers = (plasmaWordIndexEntryContainer[]) selectResult[0];
//Integer openedFiles = (Integer) selectResult[2];
HashMap urlCache = (HashMap) selectResult[1]; // String (url-hash) / plasmaCrawlLURL.Entry
if ((indexEntities == null) || (indexEntities.length == 0)) {
if ((indexContainers == null) || (indexContainers.length == 0)) {
log.logFine("No index available for index transfer, hash start-point " + startPointHash);
return -1;
}
// count the indexes again, can be smaller as expected
indexCount = 0;
for (int i = 0; i < indexEntities.length; i++) {
indexCount += indexEntities[i].size();
for (int i = 0; i < indexContainers.length; i++) {
indexCount += indexContainers[i].size();
}
if (indexCount < 50) {
log.logFine("Too few (" + indexCount + ") indexes selected for transfer.");
closeTransferIndexes (indexEntities);
closeTransferIndexes(indexContainers);
return -1; // failed
}
// find start point for DHT-selection
String keyhash = indexEntities[indexEntities.length - 1].wordHash(); // DHT targets must have greater hashes
String keyhash = indexContainers[indexContainers.length - 1].wordHash(); // DHT targets must have greater hashes
// find a list of DHT-peers
yacySeed[] seeds = new yacySeed[peerCount + 10];
int hc0 = 0;
double ownDistance = Math.min(yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, indexEntities[0].wordHash()),
yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, indexEntities[indexEntities.length - 1].wordHash()));
double ownDistance = Math.min(yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, indexContainers[0].wordHash()),
yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, indexContainers[indexContainers.length - 1].wordHash()));
double maxDistance = Math.min(ownDistance, 0.4);
synchronized (yacyCore.dhtAgent) {
double avdist;
@@ -239,8 +239,8 @@ public final class plasmaWordIndexDistribution {
}
seeds[hc0] = (yacySeed) e.nextElement();
if (seeds[hc0] != null) {
avdist = Math.max(yacyDHTAction.dhtDistance(seeds[hc0].hash, indexEntities[0].wordHash()),
yacyDHTAction.dhtDistance(seeds[hc0].hash, indexEntities[indexEntities.length - 1].wordHash()));
avdist = Math.max(yacyDHTAction.dhtDistance(seeds[hc0].hash, indexContainers[0].wordHash()),
yacyDHTAction.dhtDistance(seeds[hc0].hash, indexContainers[indexContainers.length - 1].wordHash()));
if (avdist < maxDistance) {
log.logInfo("Selected " + ((hc0 < peerCount) ? "primary" : "reserve") + " DHT target peer " + seeds[hc0].getName() + ":" + seeds[hc0].hash + ", distance = " + avdist);
hc0++;
@@ -252,7 +252,7 @@ public final class plasmaWordIndexDistribution {
if (hc0 < peerCount) {
log.logWarning("found not enough (" + hc0 + ") peers for distribution");
closeTransferIndexes (indexEntities);
closeTransferIndexes(indexContainers);
return -1; // failed
}
@@ -267,9 +267,9 @@ public final class plasmaWordIndexDistribution {
return -1; // interrupted
}
start = System.currentTimeMillis();
error = yacyClient.transferIndex(seeds[i], indexEntities, urlCache, this.gzipBody4Distribution, this.timeout4Distribution);
error = yacyClient.transferIndex(seeds[i], indexContainers, urlCache, this.gzipBody4Distribution, this.timeout4Distribution);
if (error == null) {
log.logInfo("Index transfer of " + indexCount + " words [" + indexEntities[0].wordHash() + " .. " + indexEntities[indexEntities.length - 1].wordHash() + "] to peer " + seeds[i].getName() + ":" + seeds[i].hash + " in " + ((System.currentTimeMillis() - start) / 1000)
log.logInfo("Index transfer of " + indexCount + " words [" + indexContainers[0].wordHash() + " .. " + indexContainers[indexContainers.length - 1].wordHash() + "] to peer " + seeds[i].getName() + ":" + seeds[i].hash + " in " + ((System.currentTimeMillis() - start) / 1000)
+ " seconds successfull (" + (1000 * indexCount / (System.currentTimeMillis() - start + 1)) + " words/s)");
peerNames += ", " + seeds[i].getName();
hc1++;
@@ -286,8 +286,8 @@ public final class plasmaWordIndexDistribution {
// success
if (delete) {
try {
if (deleteTransferIndexes(indexEntities)) {
log.logFine("Deleted all " + indexEntities.length + " transferred whole-word indexes locally");
if (deleteTransferIndexes(indexContainers)) {
log.logFine("Deleted all " + indexContainers.length + " transferred whole-word indexes locally");
return indexCount;
} else {
log.logSevere("Deleted not all transferred whole-word indexes");
@@ -299,13 +299,13 @@ public final class plasmaWordIndexDistribution {
}
} else {
// simply close the indexEntities
closeTransferIndexes (indexEntities);
closeTransferIndexes(indexContainers);
}
return indexCount;
} else {
log.logSevere("Index distribution failed. Too few peers (" + hc1 + ") received the index, not deleted locally.");
// simply close the indexEntities
closeTransferIndexes (indexEntities);
closeTransferIndexes(indexContainers);
return -1;
}
}
@@ -322,15 +322,16 @@ public final class plasmaWordIndexDistribution {
return startPointHash;
}
Object[] /* of {plasmaWordIndexEntity[], HashMap(String, plasmaCrawlLURL.Entry)}*/
selectTransferIndexes(String hash, int count, int maxOpenFiles) {
Object[] /* of {plasmaWordIndexEntryContainer[], HashMap(String, plasmaCrawlLURL.Entry)}*/
selectTransferContainers(String hash, int count, int maxOpenFiles) {
// the hash is a start hash from where the indexes are picked
ArrayList tmpEntities = new ArrayList(count);
ArrayList tmpContainers = new ArrayList(count);
String nexthash = "";
try {
int currOpenFiles = 0;
Iterator wordHashIterator = this.wordIndex.wordHashes(hash, true, true);
plasmaWordIndexEntity indexEntity, tmpEntity;
plasmaWordIndexEntity indexEntity;
plasmaWordIndexEntryContainer indexContainer;
Iterator urlIter;
Iterator hashIter;
plasmaWordIndexEntry indexEntry;
@@ -343,56 +344,15 @@ public final class plasmaWordIndexDistribution {
(wordHashIterator.hasNext()) &&
((nexthash = (String) wordHashIterator.next()) != null) &&
(nexthash.trim().length() > 0) &&
((currOpenFiles == 0) || (yacyDHTAction.dhtDistance(nexthash,
((plasmaWordIndexEntity)tmpEntities.get(0)).wordHash()) < 0.2))
((currOpenFiles == 0) ||
(yacyDHTAction.dhtDistance(nexthash, ((plasmaWordIndexEntity)tmpContainers.get(0)).wordHash()) < 0.2))
) {
indexEntity = this.wordIndex.getEntity(nexthash, true, -1);
if (indexEntity.size() == 0) {
indexEntity.deleteComplete();
} else if ((indexEntity.size() <= count)|| // if we havn't exceeded the limit
(Math.abs(indexEntity.size() - count) <= 10)){ // or there are only at most 10 entries left
// take the whole entity
try {
// fist check if we know all urls
urlIter = indexEntity.elements(true);
unknownURLEntries.clear();
while (urlIter.hasNext()) {
indexEntry = (plasmaWordIndexEntry) urlIter.next();
try {
lurl = this.urlPool.loadedURL.getEntry(indexEntry.getUrlHash(), indexEntry);
if ((lurl == null) || (lurl.url() == null)) {
unknownURLEntries.add(indexEntry.getUrlHash());
} else {
knownURLs.put(indexEntry.getUrlHash(), lurl);
}
} catch (IOException e) {
unknownURLEntries.add(indexEntry.getUrlHash());
}
}
// now delete all entries that have no url entry
hashIter = unknownURLEntries.iterator();
while (hashIter.hasNext()) {
String nextUrlHash = (String) hashIter.next();
indexEntity.removeEntry(nextUrlHash, false);
this.urlPool.loadedURL.remove(nextUrlHash);
}
if (indexEntity.size() == 0) {
indexEntity.deleteComplete();
} else {
// use whats remaining
tmpEntities.add(indexEntity);
this.log.logFine("Selected whole index (" + indexEntity.size() + " URLs, " + unknownURLEntries.size() + " not bound) for word " + indexEntity.wordHash());
count -= indexEntity.size();
currOpenFiles++;
}
} catch (kelondroException e) {
this.log.logSevere("plasmaWordIndexDistribution/1: deleted DB for word " + indexEntity.wordHash(), e);
indexEntity.deleteComplete();
}
} else {
// make an on-the-fly entity and insert values
tmpEntity = new plasmaWordIndexEntity(indexEntity.wordHash());
indexContainer = new plasmaWordIndexEntryContainer(indexEntity.wordHash());
try {
urlIter = indexEntity.elements(true);
unknownURLEntries.clear();
@@ -404,7 +364,7 @@ public final class plasmaWordIndexDistribution {
unknownURLEntries.add(indexEntry.getUrlHash());
} else {
knownURLs.put(indexEntry.getUrlHash(), lurl);
tmpEntity.addEntry(indexEntry);
indexContainer.add(indexEntry);
count--;
}
} catch (IOException e) {
@@ -426,8 +386,8 @@ public final class plasmaWordIndexDistribution {
}
// use whats remaining
this.log.logFine("Selected partial index (" + tmpEntity.size() + " from " + indexEntity.size() +" URLs, " + unknownURLEntries.size() + " not bound) for word " + tmpEntity.wordHash());
tmpEntities.add(tmpEntity);
this.log.logFine("Selected partial index (" + indexContainer.size() + " from " + indexEntity.size() +" URLs, " + unknownURLEntries.size() + " not bound) for word " + indexContainer.wordHash());
tmpContainers.add(indexContainer);
} catch (kelondroException e) {
this.log.logSevere("plasmaWordIndexDistribution/2: deleted DB for word " + indexEntity.wordHash(), e);
indexEntity.deleteComplete();
@@ -438,8 +398,8 @@ public final class plasmaWordIndexDistribution {
}
// transfer to array
plasmaWordIndexEntity[] indexEntities = (plasmaWordIndexEntity[]) tmpEntities.toArray(new plasmaWordIndexEntity[tmpEntities.size()]);
return new Object[]{indexEntities, knownURLs, new Integer(currOpenFiles)};
plasmaWordIndexEntryContainer[] entryContainers = (plasmaWordIndexEntryContainer[]) tmpContainers.toArray(new plasmaWordIndexEntryContainer[tmpContainers.size()]);
return new Object[]{entryContainers, knownURLs, new Integer(currOpenFiles)};
} catch (IOException e) {
this.log.logSevere("selectTransferIndexes IO-Error (hash=" + nexthash + "): " + e.getMessage(), e);
return new Object[]{new plasmaWordIndexEntity[0], new HashMap(0)};
@@ -477,6 +437,40 @@ public final class plasmaWordIndexDistribution {
} catch (IOException ee) {}
}
void closeTransferIndexes(plasmaWordIndexEntryContainer[] indexContainers) {
for (int i = 0; i < indexContainers.length; i++) {
indexContainers[i] = null;
}
}
boolean deleteTransferIndexes(plasmaWordIndexEntryContainer[] indexContainers) throws IOException {
Iterator urlIter;
plasmaWordIndexEntry indexEntry;
plasmaWordIndexEntity indexEntity;
String[] urlHashes;
int sz;
boolean success = true;
for (int i = 0; i < indexContainers.length; i++) {
// delete entries separately
int c = 0;
urlHashes = new String[indexContainers[i].size()];
urlIter = indexContainers[i].entries();
while (urlIter.hasNext()) {
indexEntry = (plasmaWordIndexEntry) urlIter.next();
urlHashes[c++] = indexEntry.getUrlHash();
}
wordIndex.removeEntries(indexContainers[i].wordHash(), urlHashes, true);
indexEntity = wordIndex.getEntity(indexContainers[i].wordHash(), true, -1);
sz = indexEntity.size();
// indexEntity.close();
closeTransferIndex(indexEntity);
log.logFine("Deleted partial index (" + c + " URLs) for word " + indexContainers[i].wordHash() + "; " + sz + " entries left");
indexContainers[i] = null;
}
return success;
}
/*
boolean deleteTransferIndexes(plasmaWordIndexEntity[] indexEntities) throws IOException {
Iterator urlIter;
plasmaWordIndexEntry indexEntry;
@@ -500,13 +494,6 @@ public final class plasmaWordIndexDistribution {
// indexEntity.close();
closeTransferIndex(indexEntity);
log.logFine("Deleted partial index (" + c + " URLs) for word " + indexEntities[i].wordHash() + "; " + sz + " entries left");
// DEBUG: now try to delete the remaining index. If this works, this routine is fine
/*
if (wordIndex.getEntity(indexEntities[i].wordHash()).deleteComplete())
System.out.println("DEBUG: trial delete of partial word index " + indexEntities[i].wordHash() + " SUCCESSFULL");
else
System.out.println("DEBUG: trial delete of partial word index " + indexEntities[i].wordHash() + " FAILED");
*/
// end debug
indexEntities[i].close();
} else {
@@ -516,7 +503,7 @@ public final class plasmaWordIndexDistribution {
} else {
indexEntities[i].close();
// have another try...
if (!(plasmaWordIndexEntity.wordHash2path(wordIndex.getRoot() /*PLASMADB*/, indexEntities[i].wordHash()).delete())) {
if (!(plasmaWordIndexEntity.wordHash2path(wordIndex.getRoot(), indexEntities[i].wordHash()).delete())) {
success = false;
log.logSevere("Could not delete whole index for word " + indexEntities[i].wordHash());
}
@@ -526,6 +513,7 @@ public final class plasmaWordIndexDistribution {
}
return success;
}
*/
public void startTransferWholeIndex(yacySeed seed, boolean delete) {
if (transferIdxThread == null) {
@@ -573,14 +561,14 @@ public final class plasmaWordIndexDistribution {
// word chunk
private String endPointHash;
private String startPointHash;
plasmaWordIndexEntity[] indexEntities;
plasmaWordIndexEntryContainer[] indexContainers;
// other fields
HashMap urlCache;
public transferIndexWorkerThread(
yacySeed seed,
plasmaWordIndexEntity[] indexEntities,
plasmaWordIndexEntryContainer[] indexContainers,
HashMap urlCache,
boolean gzipBody,
int timeout,
@@ -594,7 +582,7 @@ public final class plasmaWordIndexDistribution {
this.timeout4Transfer = timeout;
this.iteration = iteration;
this.seed = seed;
this.indexEntities = indexEntities;
this.indexContainers = indexContainers;
this.urlCache = urlCache;
this.idxCount = idxCount;
this.chunkSize = chunkSize;
@@ -657,11 +645,11 @@ public final class plasmaWordIndexDistribution {
// transfering seleted words to remote peer
this.status = "Running: Transfering chunk " + iteration;
String error = yacyClient.transferIndex(seed, indexEntities, urlCache, gzipBody4Transfer, timeout4Transfer);
String error = yacyClient.transferIndex(seed, indexContainers, urlCache, gzipBody4Transfer, timeout4Transfer);
if (error == null) {
// words successfully transfered
transferTime = System.currentTimeMillis() - start;
plasmaWordIndexDistribution.this.log.logInfo("Index transfer of " + idxCount + " words [" + indexEntities[0].wordHash() + " .. " + indexEntities[indexEntities.length-1].wordHash() + "]" +
plasmaWordIndexDistribution.this.log.logInfo("Index transfer of " + idxCount + " words [" + indexContainers[0].wordHash() + " .. " + indexContainers[indexContainers.length-1].wordHash() + "]" +
" to peer " + seed.getName() + ":" + seed.hash + " in " + (transferTime/1000) + " seconds successfull (" +
(1000 * idxCount / (transferTime + 1)) + " words/s)");
retryCount = 0;
@@ -817,7 +805,7 @@ public final class plasmaWordIndexDistribution {
}
public void performTransferWholeIndex() {
plasmaWordIndexEntity[] newIndexEntities = null, oldIndexEntities = null;
plasmaWordIndexEntryContainer[] newIndexContainers = null, oldIndexContainers = null;
try {
// pausing the regular index distribution
// TODO: adding sync, to wait for a still running index distribution to finish
@@ -838,12 +826,12 @@ public final class plasmaWordIndexDistribution {
iteration++;
int idxCount = 0;
selectionStart = System.currentTimeMillis();
oldIndexEntities = newIndexEntities;
oldIndexContainers = newIndexContainers;
// selecting 500 words to transfer
this.status = "Running: Selecting chunk " + iteration;
Object[] selectResult = selectTransferIndexes(this.startPointHash, this.chunkSize, this.maxOpenFiles4Transfer - openedFiles.intValue());
newIndexEntities = (plasmaWordIndexEntity[]) selectResult[0];
Object[] selectResult = selectTransferContainers(this.startPointHash, this.chunkSize, this.maxOpenFiles4Transfer - openedFiles.intValue());
newIndexContainers = (plasmaWordIndexEntryContainer[]) selectResult[0];
HashMap urlCache = (HashMap) selectResult[1]; // String (url-hash) / plasmaCrawlLURL.Entry
openedFiles = (Integer) selectResult[2];
@@ -851,7 +839,7 @@ public final class plasmaWordIndexDistribution {
* a) no words are left in the index
* b) max open file limit was exceeded
*/
if ((newIndexEntities == null) || (newIndexEntities.length == 0)) {
if ((newIndexContainers == null) || (newIndexContainers.length == 0)) {
if (sb.wordIndex.size() > 0) {
// if there are still words in the index we try it again now
startPointHash = "------------";
@@ -863,15 +851,15 @@ public final class plasmaWordIndexDistribution {
}
} else {
// count the indexes again, can be smaller as expected
for (int i = 0; i < newIndexEntities.length; i++) idxCount += newIndexEntities[i].size();
for (int i = 0; i < newIndexContainers.length; i++) idxCount += newIndexContainers[i].size();
// getting start point for next DHT-selection
oldStartingPointHash = startPointHash;
startPointHash = newIndexEntities[newIndexEntities.length - 1].wordHash(); // DHT targets must have greater hashes
startPointHash = newIndexContainers[newIndexContainers.length - 1].wordHash(); // DHT targets must have greater hashes
selectionEnd = System.currentTimeMillis();
selectionTime = selectionEnd - selectionStart;
plasmaWordIndexDistribution.this.log.logInfo("Index selection of " + idxCount + " words [" + newIndexEntities[0].wordHash() + " .. " + newIndexEntities[newIndexEntities.length-1].wordHash() + "]" +
plasmaWordIndexDistribution.this.log.logInfo("Index selection of " + idxCount + " words [" + newIndexContainers[0].wordHash() + " .. " + newIndexContainers[newIndexContainers.length-1].wordHash() + "]" +
" in " +
(selectionTime / 1000) + " seconds (" +
(1000 * idxCount / (selectionTime+1)) + " words/s)");
@@ -886,10 +874,10 @@ public final class plasmaWordIndexDistribution {
this.status = "Aborted because of Transfer error:\n" + worker.getStatus();
// cleanup. closing all open files
closeEntities(oldIndexEntities);
oldIndexEntities = null;
closeEntities(newIndexEntities);
newIndexEntities = null;
closeContainers(oldIndexContainers);
oldIndexContainers = null;
closeContainers(newIndexContainers);
newIndexContainers = null;
// abort index transfer
return;
@@ -922,10 +910,10 @@ public final class plasmaWordIndexDistribution {
if (delete) {
this.status = "Running: Deleting chunk " + iteration;
try {
if (deleteTransferIndexes(oldIndexEntities)) {
plasmaWordIndexDistribution.this.log.logFine("Deleted all " + oldIndexEntities.length + " transferred whole-word indexes locally");
if (deleteTransferIndexes(oldIndexContainers)) {
plasmaWordIndexDistribution.this.log.logFine("Deleted all " + oldIndexContainers.length + " transferred whole-word indexes locally");
transferedEntryCount += idxCount;
transferedEntityCount += oldIndexEntities.length;
transferedEntityCount += oldIndexContainers.length;
} else {
plasmaWordIndexDistribution.this.log.logSevere("Deleted not all transferred whole-word indexes");
}
@@ -933,18 +921,18 @@ public final class plasmaWordIndexDistribution {
plasmaWordIndexDistribution.this.log.logSevere("Deletion of indexes not possible:" + ee.getMessage(), ee);
}
} else {
this.closeEntities(oldIndexEntities);
this.closeContainers(oldIndexContainers);
transferedEntryCount += idxCount;
transferedEntityCount += oldIndexEntities.length;
transferedEntityCount += oldIndexContainers.length;
}
oldIndexEntities = null;
oldIndexContainers = null;
}
this.worker = null;
}
// handover chunk to transfer worker
if (!((newIndexEntities == null) || (newIndexEntities.length == 0))) {
worker = new transferIndexWorkerThread(seed,newIndexEntities,urlCache,gzipBody4Transfer,timeout4Transfer,iteration,idxCount,idxCount,startPointHash,oldStartingPointHash);
if (!((newIndexContainers == null) || (newIndexContainers.length == 0))) {
worker = new transferIndexWorkerThread(seed,newIndexContainers,urlCache,gzipBody4Transfer,timeout4Transfer,iteration,idxCount,idxCount,startPointHash,oldStartingPointHash);
worker.start();
}
}
@@ -961,30 +949,21 @@ public final class plasmaWordIndexDistribution {
try {worker.join();}catch(Exception e){}
// worker = null;
}
if (oldIndexEntities != null) closeEntities(oldIndexEntities);
if (newIndexEntities != null) closeEntities(newIndexEntities);
if (oldIndexContainers != null) closeContainers(oldIndexContainers);
if (newIndexContainers != null) closeContainers(newIndexContainers);
plasmaWordIndexDistribution.this.paused = false;
}
}
private void closeEntities(plasmaWordIndexEntity[] indexEntities) {
if ((indexEntities == null)||(indexEntities.length ==0)) return;
private void closeContainers(plasmaWordIndexEntryContainer[] indexContainers) {
if ((indexContainers == null)||(indexContainers.length ==0)) return;
for (int i = 0; i < indexEntities.length; i++) try {
indexEntities[i].close();
} catch (IOException ee) {}
for (int i = 0; i < indexContainers.length; i++) {
indexContainers[i] = null;
}
/*
private boolean isAborted() {
if (finished || Thread.currentThread().isInterrupted()) {
this.status = "aborted";
return true;
}
return false;
}
*/
}
}

@@ -48,7 +48,6 @@ import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import java.util.TreeMap;
import java.util.Set;
import de.anomic.kelondro.kelondroRecords;
import de.anomic.kelondro.kelondroTree;
import de.anomic.kelondro.kelondroException;
@@ -111,6 +110,7 @@ public final class plasmaWordIndexEntity {
hash.substring(4,6) + "/" + hash + ".db");
}
/*
public plasmaWordIndexEntity(String wordHash) {
// this creates a nameless temporary index. It is needed for combined search
// and used to hold the intersection of two indexes
@@ -121,7 +121,7 @@ public final class plasmaWordIndexEntity {
theLocation = null;
theTmpMap = new TreeMap();
}
*/
public boolean isTMPEntity() {
return theTmpMap != null;
}
@@ -302,12 +302,6 @@ public final class plasmaWordIndexEntity {
else return "EMPTY";
}
// join methods
private static int log2(int x) {
int l = 0;
while (x > 0) {x = x >> 1; l++;}
return l;
}
public void merge(plasmaWordIndexEntity otherEntity, long time) throws IOException {
// this is a merge of another entity to this entity
@ -324,6 +318,14 @@ public final class plasmaWordIndexEntity {
}
}
/*
// join methods
private static int log2(int x) {
int l = 0;
while (x > 0) {x = x >> 1; l++;}
return l;
}
public static plasmaWordIndexEntity joinEntities(Set entities, long time) throws IOException {
// big problem here: there cannot be a time-out for join, since a time-out will leave the joined set too big.
@ -485,5 +487,5 @@ public final class plasmaWordIndexEntity {
}
return conj;
}
*/
}

@ -112,6 +112,9 @@ public final class plasmaWordIndexEntry {
public static final int AP_IMG = 9; // tag inside image references
public static final int AP_TAG = 10; // for tagged indexing (i.e. using mp3 tags)
public static final int AP_ANCHOR = 11; // anchor description
public static final int AP_BOLD = 12;
public static final int AP_ITALICS = 13;
public static final int AP_INVISIBLE = 14; // good for spam detection
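The three new appearance positions extend the AP_* list; how they are packed into an entry is not shown in this hunk. A minimal, hypothetical bit-set sketch (plain Java, all names invented here) of how such positions are typically set and tested:

// Hypothetical illustration only: this commit defines the positions, not the packing.
public class AppearanceFlagsDemo {
    // positions as defined above
    static final int AP_BOLD = 12;
    static final int AP_ITALICS = 13;
    static final int AP_INVISIBLE = 14;

    static int setFlag(int flags, int position) { return flags | (1 << position); }
    static boolean hasFlag(int flags, int position) { return (flags & (1 << position)) != 0; }

    public static void main(String[] args) {
        int flags = 0;
        flags = setFlag(flags, AP_INVISIBLE);
        // a word that appears only invisibly on a page is a spam indicator
        System.out.println("invisible: " + hasFlag(flags, AP_INVISIBLE)); // true
        System.out.println("bold: " + hasFlag(flags, AP_BOLD));           // false
    }
}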
// URL attributes
public static final int UA_LOCAL = 0; // URL was crawled locally
@ -208,6 +211,8 @@ public final class plasmaWordIndexEntry {
// the class instantiation can only be done by a plasmaStore method
// therefore they are all public
public plasmaWordIndexEntry(String urlHash,
int urlLength, // byte-length of complete URL
int urlComps, // number of path components
int hitcount, //*how often this word appears in the text
int wordcount, //*total number of words
int phrasecount, //*total number of phrases
@ -227,14 +232,9 @@ public final class plasmaWordIndexEntry {
// more needed attributes:
// - boolean: appearance attributes: title, appears in header, anchor-descr, image-tag etc
// - boolean: URL attributes
// - int: url-length (shorter are better)
// - int: url-number of components / length of path
// - int: length of description tag / title tag (longer is better)
// - int: number of chapters
// - int: # of outlinks to same domain
// - int: # of outlinks to outside domain
// - int: length of description
// - int: length of title
// - int: # of keywords
if ((language == null) || (language.length() != plasmaURL.urlLanguageLength)) language = "uk";

@ -54,12 +54,14 @@ package de.anomic.plasma;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Set;
import java.util.TreeMap;
import de.anomic.kelondro.kelondroBase64Order;
public final class plasmaWordIndexEntryContainer implements Comparable {
private final String wordHash;
private String wordHash;
private final HashMap container; // urlHash/plasmaWordIndexEntry - Mapping
private long updateTime;
@ -73,6 +75,15 @@ public final class plasmaWordIndexEntryContainer implements Comparable {
container = new HashMap(initContainerSize); // a urlhash/plasmaWordIndexEntry - relation
}
public void setWordHash(String newWordHash) {
// this is used to replicate a container for different word indexes during global search
this.wordHash = newWordHash;
}
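A hedged usage sketch for setWordHash: during a global search, one container of results can be merged and then re-labelled for another query word. The word hashes below are made up, and the add(container) overload used is the one shown further down in this file; this compiles only with the YaCy classes on the classpath:

import de.anomic.plasma.plasmaWordIndexEntryContainer;

public class ContainerReplicationSketch {
    public static void main(String[] args) {
        // assumed: a container fetched for the first query word (hash invented)
        plasmaWordIndexEntryContainer remoteResult = new plasmaWordIndexEntryContainer("AAAAAAAAAAAA");
        // replicate it under a second query word for the global search
        plasmaWordIndexEntryContainer copy = new plasmaWordIndexEntryContainer(null);
        copy.add(remoteResult);           // merges all entries (see add(container) below)
        copy.setWordHash("BBBBBBBBBBBB"); // re-label the copy for the other word index
    }
}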
public void clear() {
container.clear();
}
public int size() {
return container.size();
}
@ -85,14 +96,18 @@ public final class plasmaWordIndexEntryContainer implements Comparable {
return wordHash;
}
public int add(plasmaWordIndexEntry entry) {
return add(entry, System.currentTimeMillis());
}
public int add(plasmaWordIndexEntry entry, long updateTime) {
this.updateTime = java.lang.Math.max(this.updateTime, updateTime);
return (add(entry)) ? 1 : 0;
return (addi(entry)) ? 1 : 0;
}
public int add(plasmaWordIndexEntry[] entries, long updateTime) {
int c = 0;
for (int i = 0; i < entries.length; i++) if (add(entries[i])) c++;
for (int i = 0; i < entries.length; i++) if (addi(entries[i])) c++;
this.updateTime = java.lang.Math.max(this.updateTime, updateTime);
return c;
}
@ -102,13 +117,13 @@ public final class plasmaWordIndexEntryContainer implements Comparable {
Iterator i = c.entries();
int x = 0;
while (i.hasNext()) {
if (add((plasmaWordIndexEntry) i.next())) x++;
if (addi((plasmaWordIndexEntry) i.next())) x++;
}
this.updateTime = java.lang.Math.max(this.updateTime, c.updateTime);
return x;
}
private boolean add(plasmaWordIndexEntry entry) {
private boolean addi(plasmaWordIndexEntry entry) {
// returns true if the new entry was added, false if it already existed
return (container.put(entry.getUrlHash(), entry) == null);
}
@ -117,10 +132,18 @@ public final class plasmaWordIndexEntryContainer implements Comparable {
return container.containsKey(urlHash);
}
public plasmaWordIndexEntry get(String urlHash) {
return (plasmaWordIndexEntry) container.get(urlHash);
}
public plasmaWordIndexEntry[] getEntryArray() {
return (plasmaWordIndexEntry[]) container.values().toArray(new plasmaWordIndexEntry[container.size()]);
}
public plasmaWordIndexEntry remove(String urlHash) {
return (plasmaWordIndexEntry) container.remove(urlHash);
}
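One caveat on getEntryArray as corrected above: the no-argument Collection.toArray() returns Object[], and casting that result to a typed array fails at runtime; the typed-array overload must be used. A self-contained demonstration of the pitfall:

import java.util.ArrayList;
import java.util.List;

public class ToArrayDemo {
    public static void main(String[] args) {
        List list = new ArrayList();
        list.add("abc");
        // throws ClassCastException: Object[] cannot be cast to String[]
        // String[] wrong = (String[]) list.toArray();
        String[] right = (String[]) list.toArray(new String[list.size()]); // ok
        System.out.println(right[0]);
    }
}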
public Iterator entries() {
// returns an iterator of plasmaWordIndexEntry objects
return container.values().iterator();
@ -146,4 +169,126 @@ public final class plasmaWordIndexEntryContainer implements Comparable {
return (int) kelondroBase64Order.enhancedCoder.decodeLong(this.wordHash.substring(0, 4));
}
public static plasmaWordIndexEntryContainer joinContainer(Set containers, long time, int maxDistance) {
long stamp = System.currentTimeMillis();
// order containers by their size
TreeMap map = new TreeMap();
plasmaWordIndexEntryContainer singleContainer;
Iterator i = containers.iterator();
int count = 0;
while (i.hasNext()) {
// get next container:
singleContainer = (plasmaWordIndexEntryContainer) i.next();
// check result
if ((singleContainer == null) || (singleContainer.size() == 0)) return new plasmaWordIndexEntryContainer(null); // as this is a conjunction of searches, there is no result if any word is unknown
// store result in order of result size; the count term keeps equal-sized containers distinct
map.put(new Long(singleContainer.size() * 1000 + count), singleContainer);
count++;
}
// check if there is any result
if (map.size() == 0) return new plasmaWordIndexEntryContainer(null); // no result, nothing found
// the map now holds the search results in order of number of hits per word
// we now must pairwise build up a conjunction of these sets
Long k = (Long) map.firstKey(); // the smallest, i.e. the container with the fewest entries
plasmaWordIndexEntryContainer searchA, searchB, searchResult = (plasmaWordIndexEntryContainer) map.remove(k);
while ((map.size() > 0) && (searchResult.size() > 0)) {
// take the first element of map which is a result and combine it with result
k = (Long) map.firstKey(); // the next smallest...
time -= (System.currentTimeMillis() - stamp); stamp = System.currentTimeMillis();
searchA = searchResult;
searchB = (plasmaWordIndexEntryContainer) map.remove(k);
searchResult = plasmaWordIndexEntryContainer.joinConstructive(searchA, searchB, 2 * time / (map.size() + 1), maxDistance);
// free resources
searchA = null;
searchB = null;
}
// in 'searchResult' is now the combined search result
if (searchResult.size() == 0) return new plasmaWordIndexEntryContainer(null);
return searchResult;
}
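joinContainer intersects the containers smallest-first, so intermediate results can only shrink. A standalone sketch of the same ordering trick using plain java.util sets instead of the YaCy containers:

import java.util.*;

public class PairwiseJoinDemo {
    public static void main(String[] args) {
        List sets = new ArrayList();
        sets.add(new TreeSet(Arrays.asList(new String[] {"a", "b", "c", "d"})));
        sets.add(new TreeSet(Arrays.asList(new String[] {"b", "c"})));
        sets.add(new TreeSet(Arrays.asList(new String[] {"b", "c", "d"})));
        // order by size; size*1000+counter keeps equal-sized sets distinct,
        // mirroring the Long key built in joinContainer above
        TreeMap bySize = new TreeMap();
        for (int i = 0; i < sets.size(); i++) {
            Set s = (Set) sets.get(i);
            bySize.put(new Long(s.size() * 1000L + i), s);
        }
        Set result = (Set) bySize.remove(bySize.firstKey()); // smallest first
        while (!bySize.isEmpty() && !result.isEmpty()) {
            Set next = (Set) bySize.remove(bySize.firstKey());
            result.retainAll(next); // conjunction
        }
        System.out.println(result); // prints [b, c]
    }
}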
// join methods
private static int log2(int x) {
int l = 0;
while (x > 0) {x = x >> 1; l++;}
return l;
}
public static plasmaWordIndexEntryContainer joinConstructive(plasmaWordIndexEntryContainer i1, plasmaWordIndexEntryContainer i2, long time, int maxDistance) {
if ((i1 == null) || (i2 == null)) return null;
if ((i1.size() == 0) || (i2.size() == 0)) return new plasmaWordIndexEntryContainer(null);
// decide which method to use
int high = ((i1.size() > i2.size()) ? i1.size() : i2.size());
int low = ((i1.size() > i2.size()) ? i2.size() : i1.size());
int stepsEnum = 10 * (high + low - 1);
int stepsTest = 12 * log2(high) * low;
// start most efficient method
if (stepsEnum > stepsTest) {
if (i1.size() < i2.size())
return joinConstructiveByTest(i1, i2, time, maxDistance);
else
return joinConstructiveByTest(i2, i1, time, maxDistance);
} else {
return joinConstructiveByEnumeration(i1, i2, time, maxDistance);
}
}
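The step estimates decide between the two join strategies. With high = 10000 and low = 10, enumeration costs 10 * (10000 + 10 - 1) = 100090 steps while hash probing costs 12 * log2(10000) * 10 = 12 * 14 * 10 = 1680, so probing the large container with the small one wins; with high = low = 1000, enumeration costs 10 * 1999 = 19990 against 12 * 10 * 1000 = 120000, so enumeration wins. A small standalone check of the formulas as written above:

public class JoinCostDemo {
    // same bit-length log2 as in plasmaWordIndexEntryContainer
    static int log2(int x) { int l = 0; while (x > 0) { x = x >> 1; l++; } return l; }

    static String pick(int high, int low) {
        int stepsEnum = 10 * (high + low - 1);
        int stepsTest = 12 * log2(high) * low;
        return (stepsEnum > stepsTest) ? "test" : "enumeration";
    }

    public static void main(String[] args) {
        System.out.println(pick(10000, 10));  // test        (100090 > 1680)
        System.out.println(pick(1000, 1000)); // enumeration (19990 < 120000)
    }
}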
private static plasmaWordIndexEntryContainer joinConstructiveByTest(plasmaWordIndexEntryContainer small, plasmaWordIndexEntryContainer large, long time, int maxDistance) {
System.out.println("DEBUG: JOIN METHOD BY TEST");
plasmaWordIndexEntryContainer conj = new plasmaWordIndexEntryContainer(null); // start with empty search result
Iterator se = small.entries();
plasmaWordIndexEntry ie0, ie1;
long stamp = System.currentTimeMillis();
while ((se.hasNext()) && ((System.currentTimeMillis() - stamp) < time)) {
ie0 = (plasmaWordIndexEntry) se.next();
ie1 = large.get(ie0.getUrlHash());
if (ie1 != null) {
// this is a hit. Calculate word distance:
ie0.combineDistance(ie1);
if (ie0.worddistance() <= maxDistance) conj.add(ie0);
}
}
return conj;
}
private static plasmaWordIndexEntryContainer joinConstructiveByEnumeration(plasmaWordIndexEntryContainer i1, plasmaWordIndexEntryContainer i2, long time, int maxDistance) {
System.out.println("DEBUG: JOIN METHOD BY ENUMERATION");
plasmaWordIndexEntryContainer conj = new plasmaWordIndexEntryContainer(null); // start with empty search result
Iterator e1 = i1.entries();
Iterator e2 = i2.entries();
int c;
if ((e1.hasNext()) && (e2.hasNext())) {
plasmaWordIndexEntry ie1;
plasmaWordIndexEntry ie2;
ie1 = (plasmaWordIndexEntry) e1.next();
ie2 = (plasmaWordIndexEntry) e2.next();
long stamp = System.currentTimeMillis();
while ((System.currentTimeMillis() - stamp) < time) {
c = ie1.getUrlHash().compareTo(ie2.getUrlHash());
if (c < 0) {
if (e1.hasNext()) ie1 = (plasmaWordIndexEntry) e1.next(); else break;
} else if (c > 0) {
if (e2.hasNext()) ie2 = (plasmaWordIndexEntry) e2.next(); else break;
} else {
// we have found the same URLs in different searches!
ie1.combineDistance(ie2);
if (ie1.worddistance() <= maxDistance) conj.add(ie1);
if (e1.hasNext()) ie1 = (plasmaWordIndexEntry) e1.next(); else break;
if (e2.hasNext()) ie2 = (plasmaWordIndexEntry) e2.next(); else break;
}
}
}
return conj;
}
}
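The enumeration join is a textbook sorted merge: both cursors advance in key order and a match is emitted on equal keys. Note it only yields a complete result if entries() delivers URL hashes in sorted order, which a HashMap-backed container does not guarantee by itself; treat that ordering as an assumption here. A self-contained merge over two sorted key sets:

import java.util.Iterator;
import java.util.TreeSet;

public class MergeJoinDemo {
    public static void main(String[] args) {
        TreeSet a = new TreeSet(); a.add("h1"); a.add("h3"); a.add("h5");
        TreeSet b = new TreeSet(); b.add("h2"); b.add("h3"); b.add("h5");
        Iterator e1 = a.iterator(), e2 = b.iterator();
        String k1 = (String) e1.next(), k2 = (String) e2.next();
        while (true) {
            int c = k1.compareTo(k2);
            if (c < 0)      { if (e1.hasNext()) k1 = (String) e1.next(); else break; }
            else if (c > 0) { if (e2.hasNext()) k2 = (String) e2.next(); else break; }
            else {
                System.out.println("match: " + k1); // both sides hold this key
                if (e1.hasNext()) k1 = (String) e1.next(); else break;
                if (e2.hasNext()) k2 = (String) e2.next(); else break;
            }
        }
        // prints: match: h3, match: h5
    }
}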

@ -50,7 +50,8 @@ public interface plasmaWordIndexInterface {
public Iterator wordHashes(String startWordHash, boolean up);
public plasmaWordIndexEntity getIndex(String wordHash, boolean deleteIfEmpty, long maxTime);
public plasmaWordIndexEntity getEntity(String wordHash, boolean deleteIfEmpty, long maxTime);
public plasmaWordIndexEntryContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxTime);
public long getUpdateTime(String wordHash);
public void deleteIndex(String wordHash);

@ -50,12 +50,13 @@ import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.http.httpc;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaSnippetCache;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndexEntity;
import de.anomic.plasma.plasmaWordIndexEntry;
import de.anomic.plasma.plasmaWordIndexEntryContainer;
import de.anomic.plasma.plasmaURLPattern;
@ -349,10 +350,11 @@ public final class yacyClient {
public static int search(
String wordhashes,
int maxDistance,
boolean global,
yacySeed targetPeer,
plasmaCrawlLURL urlManager,
plasmaWordIndexEntity entityCache,
plasmaWordIndexEntryContainer containerCache,
plasmaURLPattern blacklist,
plasmaSnippetCache snippets,
plasmaSearchProfile profile
@ -403,6 +405,7 @@ public final class yacyClient {
obj.put("ttl", "0");
obj.put("duetime", Long.toString(duetime));
obj.put("profile", profile.targetToString()); // new duetimes splitted by specific search tasks
obj.put("maxdist", maxDistance);
obj.put(yacySeed.MYTIME, yacyCore.universalDateShortString(new Date()));
//yacyCore.log.logDebug("yacyClient.search url=" + url);
@ -460,6 +463,9 @@ public final class yacyClient {
// get one single search result
urlEntry = urlManager.newEntry((String) result.get("resource" + n), true);
if (urlEntry != null && blacklist.isListed(urlEntry.url().getHost().toLowerCase(), urlEntry.url().getPath())) { continue; } // block with backlist
int urlLength = urlEntry.url().toString().length();
int urlComps = htmlFilterContentScraper.urlComps(urlEntry.url().toString()).length;
urlManager.addEntry(urlEntry, yacyCore.seedDB.mySeed.hash, targetPeer.hash, 2);
// save the url entry
final plasmaWordIndexEntry entry;
@ -467,6 +473,7 @@ public final class yacyClient {
// the old way to define words
entry = new plasmaWordIndexEntry(
urlEntry.hash(),
urlLength, urlComps,
urlEntry.wordCount(),
0, 0, 0, 0, 0, 0,
urlEntry.size(),
@ -494,7 +501,7 @@ public final class yacyClient {
}
// finally insert the containers to the index
for (int m = 0; m < words; m++) { entityCache.addEntries(container[m]); }
for (int m = 0; m < words; m++) { containerCache.add(container[m]); }
// generate statistics
long searchtime;
@ -841,7 +848,7 @@ public final class yacyClient {
httpHeader requestHeader) throws IOException {
*/
public static String transferIndex(yacySeed targetSeed, plasmaWordIndexEntity[] indexes, HashMap urlCache, boolean gzipBody, int timeout) {
public static String transferIndex(yacySeed targetSeed, plasmaWordIndexEntryContainer[] indexes, HashMap urlCache, boolean gzipBody, int timeout) {
HashMap in = transferRWI(targetSeed, indexes, gzipBody, timeout);
if (in == null) { return "no_connection_1"; }
@ -875,7 +882,7 @@ public final class yacyClient {
return null;
}
private static HashMap transferRWI(yacySeed targetSeed, plasmaWordIndexEntity[] indexes, boolean gzipBody, int timeout) {
private static HashMap transferRWI(yacySeed targetSeed, plasmaWordIndexEntryContainer[] indexes, boolean gzipBody, int timeout) {
final String address = targetSeed.getAddress();
if (address == null) { return null; }
@ -903,7 +910,7 @@ public final class yacyClient {
Iterator eenum;
plasmaWordIndexEntry entry;
for (int i = 0; i < indexes.length; i++) {
eenum = indexes[i].elements(true);
eenum = indexes[i].entries();
while (eenum.hasNext()) {
entry = (plasmaWordIndexEntry) eenum.next();
entrypost.append(indexes[i].wordHash())

@ -52,8 +52,8 @@ import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaURLPattern;
import de.anomic.plasma.plasmaSnippetCache;
import de.anomic.plasma.plasmaWordIndexEntity;
import de.anomic.plasma.plasmaSearchProfile;
import de.anomic.plasma.plasmaWordIndexEntryContainer;
import de.anomic.server.logging.serverLog;
public class yacySearch extends Thread {
@ -61,29 +61,31 @@ public class yacySearch extends Thread {
final private Set wordhashes;
final private boolean global;
final private plasmaCrawlLURL urlManager;
final private plasmaWordIndexEntity entityCache;
final private plasmaWordIndexEntryContainer containerCache;
final private plasmaURLPattern blacklist;
final private plasmaSnippetCache snippetCache;
final private yacySeed targetPeer;
private int links;
private int maxDistance;
final private plasmaSearchProfile profile;
public yacySearch(Set wordhashes, boolean global, yacySeed targetPeer,
plasmaCrawlLURL urlManager, plasmaWordIndexEntity entityCache, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache, plasmaSearchProfile profile) {
public yacySearch(Set wordhashes, int maxDistance, boolean global, yacySeed targetPeer,
plasmaCrawlLURL urlManager, plasmaWordIndexEntryContainer containerCache, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache, plasmaSearchProfile profile) {
super("yacySearch_" + targetPeer.getName());
this.wordhashes = wordhashes;
this.global = global;
this.urlManager = urlManager;
this.entityCache = entityCache;
this.containerCache = containerCache;
this.blacklist = blacklist;
this.snippetCache = snippetCache;
this.targetPeer = targetPeer;
this.links = -1;
this.maxDistance = maxDistance;
this.profile = (plasmaSearchProfile) profile.clone();
}
public void run() {
this.links = yacyClient.search(set2string(wordhashes), global, targetPeer, urlManager, entityCache, blacklist, snippetCache, profile);
this.links = yacyClient.search(set2string(wordhashes), maxDistance, global, targetPeer, urlManager, containerCache, blacklist, snippetCache, profile);
if (links != 0) {
//yacyCore.log.logInfo("REMOTE SEARCH - remote peer " + targetPeer.hash + ":" + targetPeer.getName() + " contributed " + links + " links for word hash " + wordhashes);
yacyCore.seedDB.mySeed.incRI(links);
@ -172,7 +174,7 @@ public class yacySearch extends Thread {
return result;
}
public static yacySearch[] searchHashes(Set wordhashes, plasmaCrawlLURL urlManager, plasmaWordIndexEntity entityCache,
public static yacySearch[] searchHashes(Set wordhashes, int maxDist, plasmaCrawlLURL urlManager, plasmaWordIndexEntryContainer containerCache,
int targets, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache, plasmaSearchProfile profile) {
// check own peer status
if (yacyCore.seedDB.mySeed == null || yacyCore.seedDB.mySeed.getAddress() == null) { return null; }
@ -185,8 +187,8 @@ public class yacySearch extends Thread {
if (targets == 0) return null;
yacySearch[] searchThreads = new yacySearch[targets];
for (int i = 0; i < targets; i++) {
searchThreads[i]= new yacySearch(wordhashes, true, targetPeers[i],
urlManager, entityCache, blacklist, snippetCache, profile);
searchThreads[i]= new yacySearch(wordhashes, maxDist, true, targetPeers[i],
urlManager, containerCache, blacklist, snippetCache, profile);
searchThreads[i].start();
try {Thread.sleep(20);} catch (InterruptedException e) {}
@ -216,5 +218,4 @@ public class yacySearch extends Thread {
}
}
}
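searchHashes fans out one yacySearch thread per target peer, staggered by 20 ms, and collects the results afterwards. A generic, hedged sketch of that fan-out/join pattern with plain threads (the per-peer search logic is stubbed out):

public class FanOutDemo {
    public static void main(String[] args) throws InterruptedException {
        int targets = 3;
        Thread[] workers = new Thread[targets];
        for (int i = 0; i < targets; i++) {
            final int peer = i;
            workers[i] = new Thread(new Runnable() {
                public void run() {
                    // a remote search request would run here
                    System.out.println("asked peer " + peer);
                }
            });
            workers[i].start();
            Thread.sleep(20); // stagger the requests, as searchHashes does
        }
        for (int i = 0; i < targets; i++) workers[i].join(); // collect results
    }
}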
