From 8e0de7f1800dc4c5ddc93847b32cd59b5772bf43 Mon Sep 17 00:00:00 2001
From: orbiter <orbiter@6c8d7289-2bf4-0310-a012-ef5d649a1542>
Date: Sun, 21 Sep 2008 20:25:47 +0000
Subject: [PATCH] update to language statistics evaluation: - the condenser no
 longer discards too-short words before feeding the statistics - for text
 indexing, URLs are no longer used to feed the index (this was wrong; in
 contrast, the indexing of URLs for media search is necessary) - URLs are no
 longer used to feed the statistics

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5197 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 source/de/anomic/plasma/plasmaCondenser.java  | 53 ++++++++++---------
 .../de/anomic/plasma/plasmaSnippetCache.java  |  2 +-
 2 files changed, 28 insertions(+), 27 deletions(-)

diff --git a/source/de/anomic/plasma/plasmaCondenser.java b/source/de/anomic/plasma/plasmaCondenser.java
index 057560230..b73490098 100644
--- a/source/de/anomic/plasma/plasmaCondenser.java
+++ b/source/de/anomic/plasma/plasmaCondenser.java
@@ -125,7 +125,8 @@ public final class plasmaCondenser {
         
         //System.out.println("DEBUG: condensing " + document.getMainLongTitle() + ", indexText=" + Boolean.toString(indexText) + ", indexMedia=" + Boolean.toString(indexMedia));
 
-        insertTextToWords(document.dc_source().toNormalform(false, true), 0, indexRWIEntry.flag_app_dc_identifier, RESULT_FLAGS);
+        // add the URL components to the word list
+        insertTextToWords(document.dc_source().toNormalform(false, true), 0, indexRWIEntry.flag_app_dc_identifier, RESULT_FLAGS, false);
         
         Map.Entry<yacyURL, String> entry;
         if (indexText) {
@@ -142,22 +143,21 @@ public final class plasmaCondenser {
             // phrase  99 is taken from the media Link url and anchor description
             // phrase 100 and above are lines from the text
       
-            insertTextToWords(document.dc_title(),    1, indexRWIEntry.flag_app_dc_title, RESULT_FLAGS);
-            insertTextToWords(document.dc_description(), 3, indexRWIEntry.flag_app_dc_description, RESULT_FLAGS);
-            insertTextToWords(document.dc_creator(),   4, indexRWIEntry.flag_app_dc_creator, RESULT_FLAGS);
+            insertTextToWords(document.dc_title(),    1, indexRWIEntry.flag_app_dc_title, RESULT_FLAGS, true);
+            insertTextToWords(document.dc_description(), 3, indexRWIEntry.flag_app_dc_description, RESULT_FLAGS, true);
+            insertTextToWords(document.dc_creator(),   4, indexRWIEntry.flag_app_dc_creator, RESULT_FLAGS, true);
             // missing: tags!
             final String[] titles = document.getSectionTitles();
             for (int i = 0; i < titles.length; i++) {
-                insertTextToWords(titles[i], i + 10, indexRWIEntry.flag_app_emphasized, RESULT_FLAGS);
+                insertTextToWords(titles[i], i + 10, indexRWIEntry.flag_app_emphasized, RESULT_FLAGS, true);
             }
             
-            // anchors
+            // anchors: for text indexing we add only the anchor description
             final Iterator<Map.Entry<yacyURL, String>> i = document.getAnchors().entrySet().iterator();
             while (i.hasNext()) {
                 entry = i.next();
                 if ((entry == null) || (entry.getKey() == null)) continue;
-                insertTextToWords(entry.getKey().toNormalform(false, false), 98, indexRWIEntry.flag_app_dc_identifier, RESULT_FLAGS);
-                insertTextToWords(entry.getValue(), 98, indexRWIEntry.flag_app_dc_description, RESULT_FLAGS);
+                insertTextToWords(entry.getValue(), 98, indexRWIEntry.flag_app_dc_description, RESULT_FLAGS, true);
             }
         } else {
             this.RESULT_NUMB_WORDS = 0;
@@ -167,28 +167,29 @@ public final class plasmaCondenser {
         }
         
         if (indexMedia) {
+            // add anchor descriptions: here, we also add the url components
             // audio
             Iterator<Map.Entry<yacyURL, String>> i = document.getAudiolinks().entrySet().iterator();
             while (i.hasNext()) {
                 entry = i.next();
-                insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasaudio, RESULT_FLAGS);
-                insertTextToWords(entry.getValue(), 99, flag_cat_hasaudio, RESULT_FLAGS);
+                insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasaudio, RESULT_FLAGS, false);
+                insertTextToWords(entry.getValue(), 99, flag_cat_hasaudio, RESULT_FLAGS, true);
             }
 
             // video
             i = document.getVideolinks().entrySet().iterator();
             while (i.hasNext()) {
                 entry = i.next();
-                insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasvideo, RESULT_FLAGS);
-                insertTextToWords(entry.getValue(), 99, flag_cat_hasvideo, RESULT_FLAGS);
+                insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasvideo, RESULT_FLAGS, false);
+                insertTextToWords(entry.getValue(), 99, flag_cat_hasvideo, RESULT_FLAGS, true);
             }
 
             // applications
             i = document.getApplinks().entrySet().iterator();
             while (i.hasNext()) {
                 entry = i.next();
-                insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasapp, RESULT_FLAGS);
-                insertTextToWords(entry.getValue(), 99, flag_cat_hasapp, RESULT_FLAGS);
+                insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasapp, RESULT_FLAGS, false);
+                insertTextToWords(entry.getValue(), 99, flag_cat_hasapp, RESULT_FLAGS, true);
             }
 
             // images
@@ -196,8 +197,8 @@ public final class plasmaCondenser {
             htmlFilterImageEntry ientry;
             while (j.hasNext()) {
                 ientry = j.next();
-                insertTextToWords(ientry.url().toNormalform(false, false), 99, flag_cat_hasimage, RESULT_FLAGS);
-                insertTextToWords(ientry.alt(), 99, flag_cat_hasimage, RESULT_FLAGS);
+                insertTextToWords(ientry.url().toNormalform(false, false), 99, flag_cat_hasimage, RESULT_FLAGS, false);
+                insertTextToWords(ientry.alt(), 99, flag_cat_hasimage, RESULT_FLAGS, true);
             }
         
             // finally check all words for missing flag entry
@@ -221,19 +222,20 @@ public final class plasmaCondenser {
         if (document.getApplinks().size()   > 0) RESULT_FLAGS.set(flag_cat_hasapp,   true);
     }
     
-    private void insertTextToWords(final String text, final int phrase, final int flagpos, final kelondroBitfield flagstemplate) {
+    private void insertTextToWords(final String text, final int phrase, final int flagpos, final kelondroBitfield flagstemplate, boolean useForLanguageIdentification) {
         String word;
         indexWord wprop;
         sievedWordsEnum wordenum;
         try {
-            wordenum = new sievedWordsEnum(new ByteArrayInputStream(text.getBytes()), "UTF-8", 3);
+            wordenum = new sievedWordsEnum(new ByteArrayInputStream(text.getBytes()), "UTF-8");
         } catch (final UnsupportedEncodingException e) {
             return;
         }
         int pip = 0;
         while (wordenum.hasMoreElements()) {
             word = (new String(wordenum.nextElement())).toLowerCase();
-            languageIdentificator.add(word);
+            if (useForLanguageIdentification) languageIdentificator.add(word);
+            if (word.length() < 3) continue;
             wprop = words.get(word);
             if (wprop == null) wprop = new indexWord(0, pip, phrase);
             if (wprop.flags == null) wprop.flags = flagstemplate.clone();
@@ -309,9 +311,11 @@ public final class plasmaCondenser {
         }
         
         // read source
-        final sievedWordsEnum wordenum = new sievedWordsEnum(is, charset, wordminsize);
+        final sievedWordsEnum wordenum = new sievedWordsEnum(is, charset);
         while (wordenum.hasMoreElements()) {
             word = (new String(wordenum.nextElement())).toLowerCase(); // TODO: does toLowerCase work for non ISO-8859-1 chars?
+            languageIdentificator.add(word);
+            if (word.length() < wordminsize) continue;
             //System.out.println("PARSED-WORD " + word);
             
             //This is useful for testing what YaCy "sees" of a website.
@@ -483,9 +487,9 @@ public final class plasmaCondenser {
         return invisibleChar[c - ' '];
     }
 
-    public static Enumeration<StringBuffer> wordTokenizer(final String s, final String charset, final int minLength) {
+    public static Enumeration<StringBuffer> wordTokenizer(final String s, final String charset) {
         try {
-            return new sievedWordsEnum(new ByteArrayInputStream(s.getBytes()), charset, minLength);
+            return new sievedWordsEnum(new ByteArrayInputStream(s.getBytes()), charset);
         } catch (final Exception e) {
             return null;
         }
@@ -496,12 +500,10 @@ public final class plasmaCondenser {
         
         StringBuffer buffer = null;
         unsievedWordsEnum e;
-        int ml;
 
-        public sievedWordsEnum(final InputStream is, final String charset, final int minLength) throws UnsupportedEncodingException {
+        public sievedWordsEnum(final InputStream is, final String charset) throws UnsupportedEncodingException {
             e = new unsievedWordsEnum(is, charset);
             buffer = nextElement0();
-            ml = minLength;
         }
 
         public void pre(final boolean x) {
@@ -514,7 +516,6 @@ public final class plasmaCondenser {
             loop: while (e.hasMoreElements()) {
                 s = e.nextElement();
                 if ((s.length() == 1) && (htmlFilterContentScraper.punctuation(s.charAt(0)))) return s;
-                if ((s.length() < ml) && (!(s.toString().equals("of")))) continue loop;
                 for (int i = 0; i < s.length(); i++) {
                     c = s.charAt(i);
                     // TODO: Bugfix needed for UTF-8
diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java
index ab7d4f7cc..b22f30fdf 100644
--- a/source/de/anomic/plasma/plasmaSnippetCache.java
+++ b/source/de/anomic/plasma/plasmaSnippetCache.java
@@ -732,7 +732,7 @@ public class plasmaSnippetCache {
     private static HashMap<String, Integer> hashSentence(final String sentence) {
         // generates a word-wordPos mapping
         final HashMap<String, Integer> map = new HashMap<String, Integer>();
-        final Enumeration<StringBuffer> words = plasmaCondenser.wordTokenizer(sentence, "UTF-8", 0);
+        final Enumeration<StringBuffer> words = plasmaCondenser.wordTokenizer(sentence, "UTF-8");
         int pos = 0;
         StringBuffer word;
         String hash;