From 8e0de7f1800dc4c5ddc93847b32cd59b5772bf43 Mon Sep 17 00:00:00 2001 From: orbiter Date: Sun, 21 Sep 2008 20:25:47 +0000 Subject: [PATCH] update to language statistics evaluation: - the condenser no longer discards too-short words before feeding the statistics - for text indexing, URLs are no longer used to feed the index (that was wrong; in contrast, indexing URLs for media search is necessary) - URLs are no longer used to feed the statistics git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5197 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- source/de/anomic/plasma/plasmaCondenser.java | 53 ++++++++++--------- .../de/anomic/plasma/plasmaSnippetCache.java | 2 +- 2 files changed, 28 insertions(+), 27 deletions(-) diff --git a/source/de/anomic/plasma/plasmaCondenser.java b/source/de/anomic/plasma/plasmaCondenser.java index 057560230..b73490098 100644 --- a/source/de/anomic/plasma/plasmaCondenser.java +++ b/source/de/anomic/plasma/plasmaCondenser.java @@ -125,7 +125,8 @@ public final class plasmaCondenser { //System.out.println("DEBUG: condensing " + document.getMainLongTitle() + ", indexText=" + Boolean.toString(indexText) + ", indexMedia=" + Boolean.toString(indexMedia)); - insertTextToWords(document.dc_source().toNormalform(false, true), 0, indexRWIEntry.flag_app_dc_identifier, RESULT_FLAGS); + // add the URL components to the word list + insertTextToWords(document.dc_source().toNormalform(false, true), 0, indexRWIEntry.flag_app_dc_identifier, RESULT_FLAGS, false); Map.Entry entry; if (indexText) { @@ -142,22 +143,21 @@ public final class plasmaCondenser { // phrase 99 is taken from the media Link url and anchor description // phrase 100 and above are lines from the text - insertTextToWords(document.dc_title(), 1, indexRWIEntry.flag_app_dc_title, RESULT_FLAGS); - insertTextToWords(document.dc_description(), 3, indexRWIEntry.flag_app_dc_description, RESULT_FLAGS); - insertTextToWords(document.dc_creator(), 4, indexRWIEntry.flag_app_dc_creator, 
RESULT_FLAGS); + insertTextToWords(document.dc_title(), 1, indexRWIEntry.flag_app_dc_title, RESULT_FLAGS, true); + insertTextToWords(document.dc_description(), 3, indexRWIEntry.flag_app_dc_description, RESULT_FLAGS, true); + insertTextToWords(document.dc_creator(), 4, indexRWIEntry.flag_app_dc_creator, RESULT_FLAGS, true); // missing: tags! final String[] titles = document.getSectionTitles(); for (int i = 0; i < titles.length; i++) { - insertTextToWords(titles[i], i + 10, indexRWIEntry.flag_app_emphasized, RESULT_FLAGS); + insertTextToWords(titles[i], i + 10, indexRWIEntry.flag_app_emphasized, RESULT_FLAGS, true); } - // anchors + // anchors: for text indexing we add only the anchor description final Iterator> i = document.getAnchors().entrySet().iterator(); while (i.hasNext()) { entry = i.next(); if ((entry == null) || (entry.getKey() == null)) continue; - insertTextToWords(entry.getKey().toNormalform(false, false), 98, indexRWIEntry.flag_app_dc_identifier, RESULT_FLAGS); - insertTextToWords(entry.getValue(), 98, indexRWIEntry.flag_app_dc_description, RESULT_FLAGS); + insertTextToWords(entry.getValue(), 98, indexRWIEntry.flag_app_dc_description, RESULT_FLAGS, true); } } else { this.RESULT_NUMB_WORDS = 0; @@ -167,28 +167,29 @@ public final class plasmaCondenser { } if (indexMedia) { + // add anchor descriptions: here, we also add the url components // audio Iterator> i = document.getAudiolinks().entrySet().iterator(); while (i.hasNext()) { entry = i.next(); - insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasaudio, RESULT_FLAGS); - insertTextToWords(entry.getValue(), 99, flag_cat_hasaudio, RESULT_FLAGS); + insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasaudio, RESULT_FLAGS, false); + insertTextToWords(entry.getValue(), 99, flag_cat_hasaudio, RESULT_FLAGS, true); } // video i = document.getVideolinks().entrySet().iterator(); while (i.hasNext()) { entry = i.next(); - 
insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasvideo, RESULT_FLAGS); - insertTextToWords(entry.getValue(), 99, flag_cat_hasvideo, RESULT_FLAGS); + insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasvideo, RESULT_FLAGS, false); + insertTextToWords(entry.getValue(), 99, flag_cat_hasvideo, RESULT_FLAGS, true); } // applications i = document.getApplinks().entrySet().iterator(); while (i.hasNext()) { entry = i.next(); - insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasapp, RESULT_FLAGS); - insertTextToWords(entry.getValue(), 99, flag_cat_hasapp, RESULT_FLAGS); + insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasapp, RESULT_FLAGS, false); + insertTextToWords(entry.getValue(), 99, flag_cat_hasapp, RESULT_FLAGS, true); } // images @@ -196,8 +197,8 @@ public final class plasmaCondenser { htmlFilterImageEntry ientry; while (j.hasNext()) { ientry = j.next(); - insertTextToWords(ientry.url().toNormalform(false, false), 99, flag_cat_hasimage, RESULT_FLAGS); - insertTextToWords(ientry.alt(), 99, flag_cat_hasimage, RESULT_FLAGS); + insertTextToWords(ientry.url().toNormalform(false, false), 99, flag_cat_hasimage, RESULT_FLAGS, false); + insertTextToWords(ientry.alt(), 99, flag_cat_hasimage, RESULT_FLAGS, true); } // finally check all words for missing flag entry @@ -221,19 +222,20 @@ public final class plasmaCondenser { if (document.getApplinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasapp, true); } - private void insertTextToWords(final String text, final int phrase, final int flagpos, final kelondroBitfield flagstemplate) { + private void insertTextToWords(final String text, final int phrase, final int flagpos, final kelondroBitfield flagstemplate, boolean useForLanguageIdentification) { String word; indexWord wprop; sievedWordsEnum wordenum; try { - wordenum = new sievedWordsEnum(new ByteArrayInputStream(text.getBytes()), "UTF-8", 3); + wordenum = new 
sievedWordsEnum(new ByteArrayInputStream(text.getBytes()), "UTF-8"); } catch (final UnsupportedEncodingException e) { return; } int pip = 0; while (wordenum.hasMoreElements()) { word = (new String(wordenum.nextElement())).toLowerCase(); - languageIdentificator.add(word); + if (useForLanguageIdentification) languageIdentificator.add(word); + if (word.length() < 3) continue; wprop = words.get(word); if (wprop == null) wprop = new indexWord(0, pip, phrase); if (wprop.flags == null) wprop.flags = flagstemplate.clone(); @@ -309,9 +311,11 @@ public final class plasmaCondenser { } // read source - final sievedWordsEnum wordenum = new sievedWordsEnum(is, charset, wordminsize); + final sievedWordsEnum wordenum = new sievedWordsEnum(is, charset); while (wordenum.hasMoreElements()) { word = (new String(wordenum.nextElement())).toLowerCase(); // TODO: does toLowerCase work for non ISO-8859-1 chars? + languageIdentificator.add(word); + if (word.length() < wordminsize) continue; //System.out.println("PARSED-WORD " + word); //This is useful for testing what YaCy "sees" of a website. 
@@ -483,9 +487,9 @@ public final class plasmaCondenser { return invisibleChar[c - ' ']; } - public static Enumeration wordTokenizer(final String s, final String charset, final int minLength) { + public static Enumeration wordTokenizer(final String s, final String charset) { try { - return new sievedWordsEnum(new ByteArrayInputStream(s.getBytes()), charset, minLength); + return new sievedWordsEnum(new ByteArrayInputStream(s.getBytes()), charset); } catch (final Exception e) { return null; } @@ -496,12 +500,10 @@ public final class plasmaCondenser { StringBuffer buffer = null; unsievedWordsEnum e; - int ml; - public sievedWordsEnum(final InputStream is, final String charset, final int minLength) throws UnsupportedEncodingException { + public sievedWordsEnum(final InputStream is, final String charset) throws UnsupportedEncodingException { e = new unsievedWordsEnum(is, charset); buffer = nextElement0(); - ml = minLength; } public void pre(final boolean x) { @@ -514,7 +516,6 @@ public final class plasmaCondenser { loop: while (e.hasMoreElements()) { s = e.nextElement(); if ((s.length() == 1) && (htmlFilterContentScraper.punctuation(s.charAt(0)))) return s; - if ((s.length() < ml) && (!(s.toString().equals("of")))) continue loop; for (int i = 0; i < s.length(); i++) { c = s.charAt(i); // TODO: Bugfix needed for UTF-8 diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java index ab7d4f7cc..b22f30fdf 100644 --- a/source/de/anomic/plasma/plasmaSnippetCache.java +++ b/source/de/anomic/plasma/plasmaSnippetCache.java @@ -732,7 +732,7 @@ public class plasmaSnippetCache { private static HashMap hashSentence(final String sentence) { // generates a word-wordPos mapping final HashMap map = new HashMap(); - final Enumeration words = plasmaCondenser.wordTokenizer(sentence, "UTF-8", 0); + final Enumeration words = plasmaCondenser.wordTokenizer(sentence, "UTF-8"); int pos = 0; StringBuffer word; String hash;