enhancements, bugfixes and additions to word index attribute storage

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1392 6c8d7289-2bf4-0310-a012-ef5d649a1542
19 years ago · f14d49fae9
parent 0f750c2ed6
commit f14d49fae9
6 changed files with 80 additions and 41 deletions
--- a/htroot/htdocsdefault/dir.java
+++ b/htroot/htdocsdefault/dir.java
@@ -53,7 +53,8 @@ import java.io.IOException;
 import java.net.URL;
 import java.text.SimpleDateFormat;
 import java.util.Date;
-import java.util.Set;
+import java.util.Iterator;
+import java.util.Map;
 import java.net.InetAddress;
 import java.net.UnknownHostException;
 import de.anomic.http.httpHeader;
@@ -478,8 +479,12 @@ public class dir {
    public static void deletePhrase(plasmaSwitchboard switchboard, String urlstring, String phrase, String descr) {
        try {
            final String urlhash = plasmaURL.urlHash(new URL(urlstring));
-            final Set words = plasmaCondenser.getWords(("yacyshare " + phrase + " " + descr).getBytes("UTF-8"));
-            switchboard.removeReferences(urlhash, words);
+            final Iterator words = plasmaCondenser.getWords(("yacyshare " + phrase + " " + descr).getBytes("UTF-8"));
+            Map.Entry entry;
+            while (words.hasNext()) {
+                entry = (Map.Entry) words.next();
+                switchboard.wordIndex.removeEntries(plasmaWordIndexEntry.word2hash((String) entry.getKey()), new String[] {urlhash}, true);
+            }
            switchboard.urlPool.loadedURL.remove(urlhash);
        } catch (Exception e) {
            serverLog.logSevere("DIR", "INTERNAL ERROR in dir.deletePhrase", e);
--- a/source/de/anomic/plasma/plasmaCondenser.java
+++ b/source/de/anomic/plasma/plasmaCondenser.java
@@ -57,7 +57,6 @@ import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.Map;
-import java.util.Set;
 import java.util.TreeMap;
 import java.util.TreeSet;

@@ -71,7 +70,7 @@ public final class plasmaCondenser {
    private final static int numlength = 5;

    //private Properties analysis;
-    private TreeMap words; // a string (the words) to (statProp) - relation
+    private TreeMap words; // a string (the words) to (wordStatProp) - relation
    private HashMap sentences;
    private int wordminsize;
    private int wordcut;
@@ -109,10 +108,13 @@ public final class plasmaCondenser {
        return oldsize - words.size();
    }

-    public Set getWords() {
-        return words.keySet();
+    public Iterator words() {
+        // returns an entry set iterator
+        // key is a String (the word), value is a wordStatProp Object
+        return words.entrySet().iterator();
    }

+    /*
    public int wordCount(String word) {
        // number of occurrences of one word
        // if the word did not occur, this simply returns 0
@@ -144,7 +146,8 @@ public final class plasmaCondenser {
        if (sp == null) return 0;
        return sp.numOfPhrase;
    }
-
+    */
+    
    public static class wordStatProp {
        // object carries statistics for words and sentences
        
@@ -706,11 +709,11 @@ public final class plasmaCondenser {
    }
    */
    
-    public static Set getWords(byte[] text) {
+    public static Iterator getWords(byte[] text) {
        if (text == null) return null;
        ByteArrayInputStream buffer = new ByteArrayInputStream(text);
        plasmaCondenser condenser = new plasmaCondenser(buffer);
-        return condenser.getWords();
+        return condenser.words();
    }
        
    public static void main(String[] args) {
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -1313,9 +1313,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                        } else {
                            HashMap urlCache = new HashMap(1);
                            urlCache.put(newEntry.hash(),newEntry);
-                            
-                            ArrayList tmpEntities = new ArrayList(condenser.getWords().size());
-                            
+                            ArrayList tmpEntities = new ArrayList(condenser.RESULT_SIMI_WORDS);
+                            String language = plasmaWordIndexEntry.language(entry.url());
+                            char doctype = plasmaWordIndexEntry.docType(document.getMimeType());
                            int quality = 0;
                            try {
                                quality = condenser.RESULT_INFORMATION_VALUE;
@@ -1323,22 +1323,23 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                                System.out.println("INTERNAL ERROR WITH CONDENSER.INFORMATION_VALUE: " + e.toString() + ": in URL " + newEntry.url().toString());
                            }
                            
-                            String language = plasmaWordIndexEntry.language(entry.url());
-                            char doctype = plasmaWordIndexEntry.docType(document.getMimeType());
-                            
                            // iterate over all words
-                            Iterator i = condenser.getWords().iterator();
+                            Iterator i = condenser.words();
+                            Map.Entry wentry;
+                            plasmaCondenser.wordStatProp wordStat;
                            while (i.hasNext()) {
-                                String word = (String) i.next();
+                                wentry = (Map.Entry) i.next();
+                                String word = (String) wentry.getKey();
+                                wordStat = (plasmaCondenser.wordStatProp) wentry.getValue();
                                String wordHash = plasmaWordIndexEntry.word2hash(word);
                                plasmaWordIndexEntity wordIdxEntity = new plasmaWordIndexEntity(wordHash);
                                plasmaWordIndexEntry wordIdxEntry = new plasmaWordIndexEntry(urlHash,
-                                                                                             condenser.wordCount(word),
+                                                                                             wordStat.count,
                                                                                             condenser.RESULT_SIMI_WORDS,
                                                                                             condenser.RESULT_SIMI_SENTENCES,
-                                                                                             condenser.wordPositionInText(word),
-                                                                                             condenser.wordPositionInPhrase(word),
-                                                                                             condenser.wordNumberOfPhrase(word),
+                                                                                             wordStat.posInText,
+                                                                                             wordStat.posInPhrase,
+                                                                                             wordStat.numOfPhrase,
                                                                                             0,
                                                                                             docDate.getTime(),
                                                                                             quality, language, doctype, true);
@@ -1347,7 +1348,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                                // wordIndex.addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, System.currentTimeMillis(), entry));
                            }
                            //System.out.println("DEBUG: plasmaSearch.addPageIndex: added " + condenser.getWords().size() + " words, flushed " + c + " entries");
-                            words = condenser.getWords().size();
+                            words = condenser.RESULT_SIMI_WORDS;
                            
                            // transfering the index to the storage peer
                            String error = yacyClient.transferIndex(seed,(plasmaWordIndexEntity[])tmpEntities.toArray(new plasmaWordIndexEntity[tmpEntities.size()]),urlCache,true,120000);
@@ -1805,9 +1806,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
            // get set of words
            // Set words = plasmaCondenser.getWords(getText(getResource(url,
            // fetchOnline)));
-            Set words = plasmaCondenser.getWords(snippetCache.parseDocument(url, snippetCache.getResource(url, fetchOnline)).getText());
+            Iterator witer = plasmaCondenser.getWords(snippetCache.parseDocument(url, snippetCache.getResource(url, fetchOnline)).getText());
            // delete all word references
-            int count = removeReferences(urlhash, words);
+            int count = removeReferences(urlhash, witer);
            // finally delete the url entry itself
            urlPool.loadedURL.remove(urlhash);
            return count;
@@ -1834,7 +1835,23 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
        }
        return count;
    }
-    
+
+    public int removeReferences(final String urlhash, final Iterator wordStatPropIterator) {
+        // sequentially delete all word references
+        // returns number of deletions
+        Map.Entry entry;
+        String word;
+        final String[] urlEntries = new String[] {urlhash};
+        int count = 0;
+        while (wordStatPropIterator.hasNext()) {
+            entry = (Map.Entry) wordStatPropIterator.next();
+            word = (String) entry.getKey();
+            // delete the URL reference in this word index
+            count += wordIndex.removeEntries(plasmaWordIndexEntry.word2hash(word), urlEntries, true);
+        }
+        return count;
+    }
+
    public int adminAuthenticated(httpHeader header) {
        String adminAccountBase64MD5 = getConfig("adminAccountBase64MD5", "");
        if (adminAccountBase64MD5.length() == 0) return 2; // not necessary
--- a/source/de/anomic/plasma/plasmaWordIndex.java
+++ b/source/de/anomic/plasma/plasmaWordIndex.java
@@ -51,6 +51,7 @@ import java.io.File;
 import java.util.ArrayList;
 import java.util.Comparator;
 import java.util.Iterator;
+import java.util.Map;
 import java.util.TreeSet;
 import java.util.HashSet;
 import java.util.Set;
@@ -150,28 +151,33 @@ public final class plasmaWordIndex {
        }

        // iterate over all words
-        Iterator i = condenser.getWords().iterator();
+        Iterator i = condenser.words();
+        Map.Entry wentry;
        String word;
-        plasmaWordIndexEntry entry;
+        plasmaWordIndexEntry ientry;
+        plasmaCondenser.wordStatProp wprop;
        String wordHash;
        while (i.hasNext()) {
-            word = (String) i.next();
+            wentry = (Map.Entry) i.next();
+            word = (String) wentry.getKey();
+            wprop = (plasmaCondenser.wordStatProp) wentry.getValue();
            // if ((s.length() > 4) && (c > 1)) System.out.println("# " + s + ":" + c);
            wordHash = plasmaWordIndexEntry.word2hash(word);
-            entry = new plasmaWordIndexEntry(urlHash,
-                                             condenser.wordCount(word),
+            ientry = new plasmaWordIndexEntry(urlHash,
+                                             wprop.count,
                                             condenser.RESULT_SIMI_WORDS,
                                             condenser.RESULT_SIMI_SENTENCES,
-                                             condenser.wordPositionInText(word),
-                                             condenser.wordPositionInPhrase(word),
-                                             condenser.wordNumberOfPhrase(word),
+                                             wprop.posInText,
+                                             wprop.posInPhrase,
+                                             wprop.numOfPhrase,
                                             0,
-                                             urlModified.getTime(), quality, language, doctype, true);
-            addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, System.currentTimeMillis(), entry), false);
+                                             urlModified.getTime(),
+                                             quality, language, doctype, true);
+            addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, System.currentTimeMillis(), ientry), false);
        }
        // System.out.println("DEBUG: plasmaSearch.addPageIndex: added " +
        // condenser.getWords().size() + " words, flushed " + c + " entries");
-        return condenser.getWords().size();
+        return condenser.RESULT_SIMI_WORDS;
    }
    
    public plasmaWordIndexEntity getEntity(String wordHash, boolean deleteIfEmpty, long maxTime) {
--- a/source/de/anomic/plasma/plasmaWordIndexEntry.java
+++ b/source/de/anomic/plasma/plasmaWordIndexEntry.java
@@ -209,7 +209,7 @@ public final class plasmaWordIndexEntry {
                                int posinphrase, // position of word in its phrase
                                int posofphrase, // number of the phrase where word appears
                                int distance,    // word distance; this is 0 by default, and set to the difference of posintext from two indexes if these are combined (simultanous search). If stored, this shows that the result was obtained by remote search
-                                long time,       // last-modified time of the document where word appears
+                                long lastmodified, // last-modified time of the document where word appears
                                int quality,     // 
                                String language, //
                                char doctype,    //
@@ -232,7 +232,7 @@ public final class plasmaWordIndexEntry {
        this.posinphrase = posinphrase;
        this.posofphrase = posofphrase;
        this.worddistance = distance;
-        this.lastModified = time;
+        this.lastModified = lastmodified;
        this.quality = quality;
        this.language = language.getBytes();
        this.doctype = doctype;
@@ -325,6 +325,10 @@ public final class plasmaWordIndexEntry {
    
    public void combineDistance(plasmaWordIndexEntry oe) {
        this.worddistance = this.worddistance + oe.worddistance + Math.abs(this.posintext - oe.posintext);
+        this.posintext = Math.min(this.posintext, oe.posintext);
+        if (this.posofphrase != oe.posofphrase) this.posinphrase = 0; // (unknown)
+        this.posofphrase = Math.min(this.posofphrase, oe.posofphrase);
+        this.wordcount = (this.wordcount + oe.wordcount) / 2;
    }
    
    public String getUrlHash() { return urlHash; }
--- a/source/de/anomic/yacy/yacyClient.java
+++ b/source/de/anomic/yacy/yacyClient.java
@@ -457,7 +457,8 @@ public final class yacyClient {
                urlManager.addEntry(urlEntry, yacyCore.seedDB.mySeed.hash, targetPeer.hash, 2);
                // save the url entry
                final plasmaWordIndexEntry entry;
-                if (urlEntry.word() == null)
+                if (urlEntry.word() == null) {
+                    // the old way to define words
                    entry = new plasmaWordIndexEntry(
                                                     urlEntry.hash(),
                                                     urlEntry.wordCount(),
@@ -468,7 +469,10 @@ public final class yacyClient {
                                                     urlEntry.doctype(),
                                                     false
                                                    );
-                else entry = urlEntry.word();
+                } else {
+                    // the new way: the search-result-url transports all the attributes of word indexes
+                    entry = urlEntry.word();
+                }
                if (urlEntry.snippet() != null) {
                    // we don't store the snippets along the url entry, because they are search-specific.
                    // instead, they are placed in a snipped-search cache.