Update to language statistics evaluation:

- the condenser no longer discards too-short words before feeding the language statistics - for text indexing, URLs are no longer used to feed the index (doing so was wrong; in contrast, indexing URLs for media search is necessary) - URLs are no longer used to feed the language statistics git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5197 6c8d7289-2bf4-0310-a012-ef5d649a1542
17 years ago · 8e0de7f180
parent 2ff6f0360e
commit 8e0de7f180
2 changed files with 28 additions and 27 deletions
--- a/source/de/anomic/plasma/plasmaCondenser.java
+++ b/source/de/anomic/plasma/plasmaCondenser.java
@ -125,7 +125,8 @@ public final class plasmaCondenser {
        
        //System.out.println("DEBUG: condensing " + document.getMainLongTitle() + ", indexText=" + Boolean.toString(indexText) + ", indexMedia=" + Boolean.toString(indexMedia));

-        insertTextToWords(document.dc_source().toNormalform(false, true), 0, indexRWIEntry.flag_app_dc_identifier, RESULT_FLAGS);
+        // add the URL components to the word list
+        insertTextToWords(document.dc_source().toNormalform(false, true), 0, indexRWIEntry.flag_app_dc_identifier, RESULT_FLAGS, false);
        
        Map.Entry<yacyURL, String> entry;
        if (indexText) {
@ -142,22 +143,21 @@ public final class plasmaCondenser {
            // phrase  99 is taken from the media Link url and anchor description
            // phrase 100 and above are lines from the text
      
-            insertTextToWords(document.dc_title(),    1, indexRWIEntry.flag_app_dc_title, RESULT_FLAGS);
-            insertTextToWords(document.dc_description(), 3, indexRWIEntry.flag_app_dc_description, RESULT_FLAGS);
-            insertTextToWords(document.dc_creator(),   4, indexRWIEntry.flag_app_dc_creator, RESULT_FLAGS);
+            insertTextToWords(document.dc_title(),    1, indexRWIEntry.flag_app_dc_title, RESULT_FLAGS, true);
+            insertTextToWords(document.dc_description(), 3, indexRWIEntry.flag_app_dc_description, RESULT_FLAGS, true);
+            insertTextToWords(document.dc_creator(),   4, indexRWIEntry.flag_app_dc_creator, RESULT_FLAGS, true);
            // missing: tags!
            final String[] titles = document.getSectionTitles();
            for (int i = 0; i < titles.length; i++) {
-                insertTextToWords(titles[i], i + 10, indexRWIEntry.flag_app_emphasized, RESULT_FLAGS);
+                insertTextToWords(titles[i], i + 10, indexRWIEntry.flag_app_emphasized, RESULT_FLAGS, true);
            }
            
-            // anchors
+            // anchors: for text indexing we add only the anchor description
            final Iterator<Map.Entry<yacyURL, String>> i = document.getAnchors().entrySet().iterator();
            while (i.hasNext()) {
                entry = i.next();
                if ((entry == null) || (entry.getKey() == null)) continue;
-                insertTextToWords(entry.getKey().toNormalform(false, false), 98, indexRWIEntry.flag_app_dc_identifier, RESULT_FLAGS);
-                insertTextToWords(entry.getValue(), 98, indexRWIEntry.flag_app_dc_description, RESULT_FLAGS);
+                insertTextToWords(entry.getValue(), 98, indexRWIEntry.flag_app_dc_description, RESULT_FLAGS, true);
            }
        } else {
            this.RESULT_NUMB_WORDS = 0;
@ -167,28 +167,29 @@ public final class plasmaCondenser {
        }
        
        if (indexMedia) {
+            // add anchor descriptions: here, we also add the url components
            // audio
            Iterator<Map.Entry<yacyURL, String>> i = document.getAudiolinks().entrySet().iterator();
            while (i.hasNext()) {
                entry = i.next();
-                insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasaudio, RESULT_FLAGS);
-                insertTextToWords(entry.getValue(), 99, flag_cat_hasaudio, RESULT_FLAGS);
+                insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasaudio, RESULT_FLAGS, false);
+                insertTextToWords(entry.getValue(), 99, flag_cat_hasaudio, RESULT_FLAGS, true);
            }

            // video
            i = document.getVideolinks().entrySet().iterator();
            while (i.hasNext()) {
                entry = i.next();
-                insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasvideo, RESULT_FLAGS);
-                insertTextToWords(entry.getValue(), 99, flag_cat_hasvideo, RESULT_FLAGS);
+                insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasvideo, RESULT_FLAGS, false);
+                insertTextToWords(entry.getValue(), 99, flag_cat_hasvideo, RESULT_FLAGS, true);
            }

            // applications
            i = document.getApplinks().entrySet().iterator();
            while (i.hasNext()) {
                entry = i.next();
-                insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasapp, RESULT_FLAGS);
-                insertTextToWords(entry.getValue(), 99, flag_cat_hasapp, RESULT_FLAGS);
+                insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasapp, RESULT_FLAGS, false);
+                insertTextToWords(entry.getValue(), 99, flag_cat_hasapp, RESULT_FLAGS, true);
            }

            // images
@ -196,8 +197,8 @@ public final class plasmaCondenser {
            htmlFilterImageEntry ientry;
            while (j.hasNext()) {
                ientry = j.next();
-                insertTextToWords(ientry.url().toNormalform(false, false), 99, flag_cat_hasimage, RESULT_FLAGS);
-                insertTextToWords(ientry.alt(), 99, flag_cat_hasimage, RESULT_FLAGS);
+                insertTextToWords(ientry.url().toNormalform(false, false), 99, flag_cat_hasimage, RESULT_FLAGS, false);
+                insertTextToWords(ientry.alt(), 99, flag_cat_hasimage, RESULT_FLAGS, true);
            }
        
            // finally check all words for missing flag entry
@ -221,19 +222,20 @@ public final class plasmaCondenser {
        if (document.getApplinks().size()   > 0) RESULT_FLAGS.set(flag_cat_hasapp,   true);
    }
    
-    private void insertTextToWords(final String text, final int phrase, final int flagpos, final kelondroBitfield flagstemplate) {
+    private void insertTextToWords(final String text, final int phrase, final int flagpos, final kelondroBitfield flagstemplate, boolean useForLanguageIdentification) {
        String word;
        indexWord wprop;
        sievedWordsEnum wordenum;
        try {
-            wordenum = new sievedWordsEnum(new ByteArrayInputStream(text.getBytes()), "UTF-8", 3);
+            wordenum = new sievedWordsEnum(new ByteArrayInputStream(text.getBytes()), "UTF-8");
        } catch (final UnsupportedEncodingException e) {
            return;
        }
        int pip = 0;
        while (wordenum.hasMoreElements()) {
            word = (new String(wordenum.nextElement())).toLowerCase();
-            languageIdentificator.add(word);
+            if (useForLanguageIdentification) languageIdentificator.add(word);
+            if (word.length() < 3) continue;
            wprop = words.get(word);
            if (wprop == null) wprop = new indexWord(0, pip, phrase);
            if (wprop.flags == null) wprop.flags = flagstemplate.clone();
@ -309,9 +311,11 @@ public final class plasmaCondenser {
        }
        
        // read source
-        final sievedWordsEnum wordenum = new sievedWordsEnum(is, charset, wordminsize);
+        final sievedWordsEnum wordenum = new sievedWordsEnum(is, charset);
        while (wordenum.hasMoreElements()) {
            word = (new String(wordenum.nextElement())).toLowerCase(); // TODO: does toLowerCase work for non ISO-8859-1 chars?
+            languageIdentificator.add(word);
+            if (word.length() < wordminsize) continue;
            //System.out.println("PARSED-WORD " + word);
            
            //This is useful for testing what YaCy "sees" of a website.
@ -483,9 +487,9 @@ public final class plasmaCondenser {
        return invisibleChar[c - ' '];
    }

-    public static Enumeration<StringBuffer> wordTokenizer(final String s, final String charset, final int minLength) {
+    public static Enumeration<StringBuffer> wordTokenizer(final String s, final String charset) {
        try {
-            return new sievedWordsEnum(new ByteArrayInputStream(s.getBytes()), charset, minLength);
+            return new sievedWordsEnum(new ByteArrayInputStream(s.getBytes()), charset);
        } catch (final Exception e) {
            return null;
        }
@ -496,12 +500,10 @@ public final class plasmaCondenser {
        
        StringBuffer buffer = null;
        unsievedWordsEnum e;
-        int ml;

-        public sievedWordsEnum(final InputStream is, final String charset, final int minLength) throws UnsupportedEncodingException {
+        public sievedWordsEnum(final InputStream is, final String charset) throws UnsupportedEncodingException {
            e = new unsievedWordsEnum(is, charset);
            buffer = nextElement0();
-            ml = minLength;
        }

        public void pre(final boolean x) {
@ -514,7 +516,6 @@ public final class plasmaCondenser {
            loop: while (e.hasMoreElements()) {
                s = e.nextElement();
                if ((s.length() == 1) && (htmlFilterContentScraper.punctuation(s.charAt(0)))) return s;
-                if ((s.length() < ml) && (!(s.toString().equals("of")))) continue loop;
                for (int i = 0; i < s.length(); i++) {
                    c = s.charAt(i);
                    // TODO: Bugfix needed for UTF-8
--- a/source/de/anomic/plasma/plasmaSnippetCache.java
+++ b/source/de/anomic/plasma/plasmaSnippetCache.java
@ -732,7 +732,7 @@ public class plasmaSnippetCache {
    private static HashMap<String, Integer> hashSentence(final String sentence) {
        // generates a word-wordPos mapping
        final HashMap<String, Integer> map = new HashMap<String, Integer>();
-        final Enumeration<StringBuffer> words = plasmaCondenser.wordTokenizer(sentence, "UTF-8", 0);
+        final Enumeration<StringBuffer> words = plasmaCondenser.wordTokenizer(sentence, "UTF-8");
        int pos = 0;
        StringBuffer word;
        String hash;