From 7829480b825aa63b261968cb273e03f9283fe415 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Wed, 1 Jul 2015 18:28:18 +0200 Subject: [PATCH] refactoring: separated condenser and tokenizer --- source/net/yacy/document/Condenser.java | 230 +--------------------- source/net/yacy/document/Tokenizer.java | 241 ++++++++++++++++++++++++ 2 files changed, 244 insertions(+), 227 deletions(-) create mode 100644 source/net/yacy/document/Tokenizer.java diff --git a/source/net/yacy/document/Condenser.java b/source/net/yacy/document/Condenser.java index 6cf125d17..c59de2b7d 100644 --- a/source/net/yacy/document/Condenser.java +++ b/source/net/yacy/document/Condenser.java @@ -24,19 +24,14 @@ import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; -import java.util.ArrayList; import java.util.Date; import java.util.HashMap; -import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashSet; -import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Properties; -import java.util.Set; import java.util.SortedSet; -import java.util.TreeMap; import org.apache.solr.common.params.MapSolrParams; @@ -45,11 +40,8 @@ import net.yacy.cora.document.analysis.Classification.ContentDomain; import net.yacy.cora.document.analysis.EnhancedTextProfileSignature; import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.id.AnchorURL; -import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.federate.solr.Ranking; -import net.yacy.cora.language.synonyms.SynonymLibrary; -import net.yacy.cora.lod.vocabulary.Tagging; import net.yacy.cora.util.CommonPattern; import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.language.Identificator; @@ -59,34 +51,11 @@ import net.yacy.kelondro.data.word.WordReferenceRow; import net.yacy.kelondro.util.Bitfield; import net.yacy.kelondro.util.SetTools; +public final 
class Condenser extends Tokenizer { -public final class Condenser { - - // this is the page analysis class - public final static boolean pseudostemming = false; // switch for removal of words that appear in shortened form - public final static int wordminsize = 2; - public final static int wordcut = 2; - - // category flags that show how the page can be distinguished in different interest groups - public static final int flag_cat_indexof = 0; // a directory listing page (i.e. containing 'index of') - public static final int flag_cat_haslocation = 19; // the page has a location metadata attached - public static final int flag_cat_hasimage = 20; // the page refers to (at least one) images - public static final int flag_cat_hasaudio = 21; // the page refers to (at least one) audio file - public static final int flag_cat_hasvideo = 22; // the page refers to (at least one) videos - public static final int flag_cat_hasapp = 23; // the page refers to (at least one) application file - - //private Properties analysis; - private final Map words; // a string (the words) to (indexWord) - relation - private final Map> tags = new HashMap>(); // a set of tags, discovered from Autotagging - private final Set synonyms; // a set of synonyms to the words private long fuzzy_signature = 0, exact_signature = 0; // signatures for double-check detection private String fuzzy_signature_text = null; // signatures for double-check detection - public int RESULT_NUMB_WORDS = -1; - //public int RESULT_DIFF_WORDS = -1; - public int RESULT_NUMB_SENTENCES = -1; - //public int RESULT_DIFF_SENTENCES = -1; - public Bitfield RESULT_FLAGS = new Bitfield(4); private final Identificator languageIdentificator; public LinkedHashSet dates_in_content; @@ -100,12 +69,11 @@ public final class Condenser { final boolean findDatesInContent, final int timezoneOffset ) { + super(document.dc_source(), indexText ? 
document.getTextString() : "", meaningLib, doAutotagging, scraper); + Thread.currentThread().setName("condenser-" + document.dc_identifier()); // for debugging // if addMedia == true, then all the media links are also parsed and added to the words // added media words are flagged with the appropriate media flag - this.words = new HashMap(); - this.synonyms = new LinkedHashSet(); - this.RESULT_FLAGS = new Bitfield(4); this.dates_in_content = new LinkedHashSet(); // construct flag set for document @@ -125,7 +93,6 @@ public final class Condenser { if (indexText) { String text = document.getTextString(); if (findDatesInContent) this.dates_in_content = DateDetection.parse(text, timezoneOffset); - createCondensement(document.dc_source(), text, meaningLib, doAutotagging, scraper); // the phrase counter: // phrase 0 are words taken from the URL // phrase 1 is the MainTitle @@ -167,9 +134,7 @@ public final class Condenser { */ } else { this.RESULT_NUMB_WORDS = 0; - //this.RESULT_DIFF_WORDS = 0; this.RESULT_NUMB_SENTENCES = 0; - //this.RESULT_DIFF_SENTENCES = 0; } if (indexMedia) { @@ -229,14 +194,6 @@ public final class Condenser { document.addMetatags(this.tags); } - // create the synonyms set - if (SynonymLibrary.size() > 0) { - for (String word: this.words.keySet()) { - Set syms = SynonymLibrary.getSynonyms(word); - if (syms != null) this.synonyms.addAll(syms); - } - } - String text = document.getTextString(); // create hashes for duplicate detection @@ -252,14 +209,6 @@ public final class Condenser { this.exact_signature = EnhancedTextProfileSignature.getSignatureLong(text); } - private Condenser(final DigestURL root, final String text, final WordCache meaningLib, final boolean doAutotagging, final VocabularyScraper scraper) { - this.languageIdentificator = null; // we don't need that here - // analysis = new Properties(); - this.words = new TreeMap(); - this.synonyms = new HashSet(); - createCondensement(root, text, meaningLib, doAutotagging, scraper); - } - private 
void insertTextToWords( final SentenceReader text, final int phrase, @@ -300,17 +249,6 @@ public final class Condenser { return oldsize - this.words.size(); } - public Map words() { - // returns the words as word/indexWord relation map - return this.words; - } - - public List synonyms() { - ArrayList l = new ArrayList(this.synonyms.size()); - for (String s: this.synonyms) l.add(s); - return l; - } - public long fuzzySignature() { return this.fuzzy_signature; } @@ -327,168 +265,6 @@ public final class Condenser { return this.languageIdentificator.getLanguage(); } - private void createCondensement(final DigestURL root, final String text, final WordCache meaningLib, boolean doAutotagging, final VocabularyScraper scraper) { - assert text != null; - final Set currsentwords = new HashSet(); - String word = ""; - String[] wordcache = new String[LibraryProvider.autotagging.getMaxWordsInTerm() - 1]; - for (int i = 0; i < wordcache.length; i++) wordcache[i] = ""; - String k; - Tagging.Metatag tag; - int wordlen; - Word wsp; - final Word wsp1; - int wordHandle; - int wordHandleCount = 0; - //final int sentenceHandleCount = 0; - int allwordcounter = 0; - final int allsentencecounter = 0; - int wordInSentenceCounter = 1; - boolean comb_indexof = false, last_last = false, last_index = false; - //final Map sentences = new HashMap(100); - if (LibraryProvider.autotagging.isEmpty()) doAutotagging = false; - - // read source - WordTokenizer wordenum = new WordTokenizer(new SentenceReader(text), meaningLib); - try { - while (wordenum.hasMoreElements()) { - word = wordenum.nextElement().toString().toLowerCase(Locale.ENGLISH); - if (this.languageIdentificator != null) this.languageIdentificator.add(word); - if (word.length() < wordminsize) continue; - - // get tags from autotagging - if (doAutotagging) { - Set vocabularyNames = LibraryProvider.autotagging.getVocabularyNames(); - //Collection vocabularies = LibraryProvider.autotagging.getVocabularies(); - //assert vocabularyNames.size() 
== vocabularies.size(); - Map vocMap = scraper == null ? null : scraper.removeVocMap(root); - if (vocMap != null && vocMap.size() > 0) { - for (Map.Entry entry: vocMap.entrySet()) { - String navigatorName = entry.getKey(); - String term = entry.getValue(); - vocabularyNames.remove(navigatorName); // prevent that this is used again for auto-annotation - Tagging vocabulary = LibraryProvider.autotagging.getVocabulary(navigatorName); - if (vocabulary != null) { - // extend the vocabulary - String obj = vocabulary.getObjectlink(term); - if (obj == null) try {vocabulary.put(term, "", root.toNormalform(true));} catch (IOException e) {} // this makes IO, be careful! - // create annotation - tag = vocabulary.getMetatagFromTerm(term); - Set tagset = new HashSet<>(); - tagset.add(tag); - this.tags.put(navigatorName, tagset); - } - } - } - if (vocabularyNames.size() > 0) for (int wordc = 1; wordc <= wordcache.length + 1; wordc++) { - // wordc is number of words that are tested - StringBuilder sb = new StringBuilder(); - if (wordc == 1) { - sb.append(word); - } else { - for (int w = 0; w < wordc - 1; w++) { - sb.append(wordcache[wordcache.length - wordc + w + 1]).append(' '); - } - sb.append(word); - } - String testterm = sb.toString().trim(); - //System.out.println("Testing: " + testterm); - tag = LibraryProvider.autotagging.getTagFromTerm(vocabularyNames, testterm); - if (tag != null) { - String navigatorName = tag.getVocabularyName(); - Set tagset = this.tags.get(navigatorName); - if (tagset == null) { - tagset = new HashSet(); - this.tags.put(navigatorName, tagset); - } - tagset.add(tag); - } - } - } - // shift wordcache - System.arraycopy(wordcache, 1, wordcache, 0, wordcache.length - 1); - wordcache[wordcache.length - 1] = word; - - // distinguish punctuation and words - wordlen = word.length(); - if (wordlen == 1 && SentenceReader.punctuation(word.charAt(0))) { // TODO: wordlen == 1 never true (see earlier if < wordminsize ) - // store sentence - currsentwords.clear(); - 
wordInSentenceCounter = 1; - } else { - // check index.of detection - if (last_last && comb_indexof && word.equals("modified")) { - this.RESULT_FLAGS.set(flag_cat_indexof, true); - wordenum.pre(true); // parse lines as they come with CRLF - } - if (last_index && (wordminsize > 2 || word.equals("of"))) comb_indexof = true; - last_last = word.equals("last"); - last_index = word.equals("index"); - - // store word - allwordcounter++; - currsentwords.add(word); - wsp = this.words.get(word); - if (wsp != null) { - // word already exists - wordHandle = wsp.posInText; - wsp.inc(); - } else { - // word does not yet exist, create new word entry - wordHandle = wordHandleCount++; - wsp = new Word(wordHandle, wordInSentenceCounter, /* sentences.size() + */ 100); - wsp.flags = this.RESULT_FLAGS.clone(); - this.words.put(word.toLowerCase(), wsp); - } - // we now have the unique handle of the word, put it into the sentence: - wordInSentenceCounter++; - } - } - } finally { - wordenum.close(); - wordenum = null; - } - - if (pseudostemming) { - Map.Entry entry; - // we search for similar words and reorganize the corresponding sentences - // a word is similar, if a shortened version is equal - final Iterator> wi = this.words.entrySet().iterator(); // enumerates the keys in descending order - wordsearch: while (wi.hasNext()) { - entry = wi.next(); - word = entry.getKey(); - wordlen = word.length(); - wsp = entry.getValue(); - for (int i = wordcut; i > 0; i--) { - if (wordlen > i) { - k = word.substring(0, wordlen - i); - if (this.words.containsKey(k)) { - // update word counter - wsp1.count = wsp1.count + wsp.count; - this.words.put(k, wsp1); - // remove current word - wi.remove(); - continue wordsearch; - } - } - } - } - } - - // store result - //this.RESULT_NUMB_TEXT_BYTES = wordenum.count(); - this.RESULT_NUMB_WORDS = allwordcounter; - //this.RESULT_DIFF_WORDS = wordHandleCount; - this.RESULT_NUMB_SENTENCES = allsentencecounter; - //this.RESULT_DIFF_SENTENCES = sentenceHandleCount; 
- } - - public static Map getWords(final String text, final WordCache meaningLib) { - // returns a word/indexWord relation map - if (text == null) return null; - return new Condenser(null, text, meaningLib, false, null).words(); - } - public static void main(final String[] args) { // read a property file and convert them into configuration lines try { diff --git a/source/net/yacy/document/Tokenizer.java b/source/net/yacy/document/Tokenizer.java new file mode 100644 index 000000000..ed3b0fd0d --- /dev/null +++ b/source/net/yacy/document/Tokenizer.java @@ -0,0 +1,241 @@ +/** + * Tokenizer.java + * Copyright 2004 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany + * First released 09.01.2004 at http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see <http://www.gnu.org/licenses/>. 
+ */ + +package net.yacy.document; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Set; +import java.util.TreeMap; + +import net.yacy.cora.document.WordCache; +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.language.synonyms.SynonymLibrary; +import net.yacy.cora.lod.vocabulary.Tagging; +import net.yacy.cora.order.NaturalOrder; +import net.yacy.kelondro.data.word.Word; +import net.yacy.kelondro.util.Bitfield; + +public class Tokenizer { + + // this is the page analysis class + public final static boolean pseudostemming = false; // switch for removal of words that appear in shortened form + public final static int wordminsize = 2; + public final static int wordcut = 2; + + // category flags that show how the page can be distinguished in different interest groups + public static final int flag_cat_indexof = 0; // a directory listing page (i.e. 
containing 'index of') + public static final int flag_cat_haslocation = 19; // the page has a location metadata attached + public static final int flag_cat_hasimage = 20; // the page refers to (at least one) images + public static final int flag_cat_hasaudio = 21; // the page refers to (at least one) audio file + public static final int flag_cat_hasvideo = 22; // the page refers to (at least one) videos + public static final int flag_cat_hasapp = 23; // the page refers to (at least one) application file + + //private Properties analysis; + protected final Map words; // a string (the words) to (indexWord) - relation + private final Set synonyms; // a set of synonyms to the words + protected final Map> tags = new HashMap>(); // a set of tags, discovered from Autotagging + + public int RESULT_NUMB_WORDS = -1; + public int RESULT_NUMB_SENTENCES = -1; + public Bitfield RESULT_FLAGS = new Bitfield(4); + + public Tokenizer(final DigestURL root, final String text, final WordCache meaningLib, boolean doAutotagging, final VocabularyScraper scraper) { + this.words = new TreeMap(NaturalOrder.naturalComparator); + this.synonyms = new LinkedHashSet(); + assert text != null; + final Set currsentwords = new HashSet(); + String[] wordcache = new String[LibraryProvider.autotagging.getMaxWordsInTerm() - 1]; + for (int i = 0; i < wordcache.length; i++) wordcache[i] = ""; + String k; + Tagging.Metatag tag; + int wordlen; + int wordHandle; + int wordHandleCount = 0; + //final int sentenceHandleCount = 0; + int allwordcounter = 0; + final int allsentencecounter = 0; + int wordInSentenceCounter = 1; + boolean comb_indexof = false, last_last = false, last_index = false; + //final Map sentences = new HashMap(100); + if (LibraryProvider.autotagging.isEmpty()) doAutotagging = false; + + // read source + WordTokenizer wordenum = new WordTokenizer(new SentenceReader(text), meaningLib); + try { + while (wordenum.hasMoreElements()) { + String word = 
wordenum.nextElement().toString().toLowerCase(Locale.ENGLISH); + if (word.length() < wordminsize) continue; + + // get tags from autotagging + if (doAutotagging) { + Set vocabularyNames = LibraryProvider.autotagging.getVocabularyNames(); + //Collection vocabularies = LibraryProvider.autotagging.getVocabularies(); + //assert vocabularyNames.size() == vocabularies.size(); + Map vocMap = scraper == null ? null : scraper.removeVocMap(root); + if (vocMap != null && vocMap.size() > 0) { + for (Map.Entry entry: vocMap.entrySet()) { + String navigatorName = entry.getKey(); + String term = entry.getValue(); + vocabularyNames.remove(navigatorName); // prevent that this is used again for auto-annotation + Tagging vocabulary = LibraryProvider.autotagging.getVocabulary(navigatorName); + if (vocabulary != null) { + // extend the vocabulary + String obj = vocabulary.getObjectlink(term); + if (obj == null) try {vocabulary.put(term, "", root.toNormalform(true));} catch (IOException e) {} // this makes IO, be careful! 
+ // create annotation + tag = vocabulary.getMetatagFromTerm(term); + Set tagset = new HashSet<>(); + tagset.add(tag); + this.tags.put(navigatorName, tagset); + } + } + } + if (vocabularyNames.size() > 0) for (int wordc = 1; wordc <= wordcache.length + 1; wordc++) { + // wordc is number of words that are tested + StringBuilder sb = new StringBuilder(); + if (wordc == 1) { + sb.append(word); + } else { + for (int w = 0; w < wordc - 1; w++) { + sb.append(wordcache[wordcache.length - wordc + w + 1]).append(' '); + } + sb.append(word); + } + String testterm = sb.toString().trim(); + //System.out.println("Testing: " + testterm); + tag = LibraryProvider.autotagging.getTagFromTerm(vocabularyNames, testterm); + if (tag != null) { + String navigatorName = tag.getVocabularyName(); + Set tagset = this.tags.get(navigatorName); + if (tagset == null) { + tagset = new HashSet(); + this.tags.put(navigatorName, tagset); + } + tagset.add(tag); + } + } + } + // shift wordcache + System.arraycopy(wordcache, 1, wordcache, 0, wordcache.length - 1); + wordcache[wordcache.length - 1] = word; + + // distinguish punctuation and words + wordlen = word.length(); + if (wordlen == 1 && SentenceReader.punctuation(word.charAt(0))) { // TODO: wordlen == 1 never true (see earlier if < wordminsize ) + // store sentence + currsentwords.clear(); + wordInSentenceCounter = 1; + } else { + // check index.of detection + if (last_last && comb_indexof && word.equals("modified")) { + this.RESULT_FLAGS.set(flag_cat_indexof, true); + wordenum.pre(true); // parse lines as they come with CRLF + } + if (last_index && (wordminsize > 2 || word.equals("of"))) comb_indexof = true; + last_last = word.equals("last"); + last_index = word.equals("index"); + + // store word + allwordcounter++; + currsentwords.add(word); + Word wsp = this.words.get(word); + if (wsp != null) { + // word already exists + wordHandle = wsp.posInText; + wsp.inc(); + } else { + // word does not yet exist, create new word entry + wordHandle = 
wordHandleCount++; + wsp = new Word(wordHandle, wordInSentenceCounter, /* sentences.size() + */ 100); + wsp.flags = this.RESULT_FLAGS.clone(); + this.words.put(word.toLowerCase(), wsp); + } + // we now have the unique handle of the word, put it into the sentence: + wordInSentenceCounter++; + } + } + } finally { + wordenum.close(); + wordenum = null; + } + + if (pseudostemming) { + // we search for similar words and reorganize the corresponding sentences + // a word is similar, if a shortened version is equal + Iterator> wi = this.words.entrySet().iterator(); // enumerates the keys in descending order? + Map.Entry entry; + wordsearch: while (wi.hasNext()) { + entry = wi.next(); + String word = entry.getKey(); + wordlen = word.length(); + Word wsp = entry.getValue(); + for (int i = wordcut; i > 0; i--) { + if (wordlen > i) { + k = word.substring(0, wordlen - i); + Word wsp1 = this.words.get(k); + if (wsp1 != null) { + wsp1.count = wsp1.count + wsp.count; // update word counter + wi.remove(); // remove current word + continue wordsearch; + } + } + } + } + } + + // create the synonyms set + if (SynonymLibrary.size() > 0) { + for (String word: this.words.keySet()) { + Set syms = SynonymLibrary.getSynonyms(word); + if (syms != null) this.synonyms.addAll(syms); + } + } + + // store result + this.RESULT_NUMB_WORDS = allwordcounter; + this.RESULT_NUMB_SENTENCES = allsentencecounter; + } + + public Map words() { + // returns the words as word/indexWord relation map + return this.words; + } + + public static Map getWords(final String text, final WordCache meaningLib) { + // returns a word/indexWord relation map + if (text == null) return null; + return new Tokenizer(null, text, meaningLib, false, null).words(); + } + + public List synonyms() { + ArrayList l = new ArrayList(this.synonyms.size()); + for (String s: this.synonyms) l.add(s); + return l; + } + +}