yacy_search_server/source/net/yacy/document/Tokenizer.java

/**
 *  Tokenizer.java
 *  Copyright 2004 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
 *  First released 09.01.2004 at http://yacy.net
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program in the file lgpl21.txt
 *  If not, see <http://www.gnu.org/licenses/>.
 */

package net.yacy.document;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

import net.yacy.cora.document.WordCache;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.language.synonyms.SynonymLibrary;
import net.yacy.cora.lod.vocabulary.Tagging;
import net.yacy.cora.order.NaturalOrder;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.util.Bitfield;

public class Tokenizer {

    // this is the page analysis class
    public final static boolean pseudostemming = false; // switch for removal of words that appear in shortened form
    public final static int wordminsize = 2;
    public final static int wordcut = 2;

    // category flags that show how the page can be distinguished in different interest groups
    public  static final int flag_cat_indexof       =  0; // a directory listing page (i.e. containing 'index of')
    public  static final int flag_cat_haslocation   = 19; // the page has a location metadata attached
    public  static final int flag_cat_hasimage      = 20; // the page refers to (at least one) images
    public  static final int flag_cat_hasaudio      = 21; // the page refers to (at least one) audio file
    public  static final int flag_cat_hasvideo      = 22; // the page refers to (at least one) videos
    public  static final int flag_cat_hasapp        = 23; // the page refers to (at least one) application file

    //private Properties analysis;
    protected final Map<String, Word> words; // a string (the words) to (indexWord) - relation
    private final Set<String> synonyms; // a set of synonyms to the words
    protected final Map<String, Set<Tagging.Metatag>> tags = new HashMap<String, Set<Tagging.Metatag>>(); // a set of tags, discovered from Autotagging
    
    public int RESULT_NUMB_WORDS = -1;
    public int RESULT_NUMB_SENTENCES = -1;
    public Bitfield RESULT_FLAGS = new Bitfield(4);

    public Tokenizer(final DigestURL root, final String text, final WordCache meaningLib, boolean doAutotagging, final VocabularyScraper scraper) {
        this.words = new TreeMap<String, Word>(NaturalOrder.naturalComparator);
        this.synonyms = new LinkedHashSet<String>();
        assert text != null;
        final Set<String> currsentwords = new HashSet<String>();
        String[] wordcache = new String[LibraryProvider.autotagging.getMaxWordsInTerm() - 1];
        for (int i = 0; i < wordcache.length; i++) wordcache[i] = "";
        String k;
        Tagging.Metatag tag;
        int wordlen;
        int wordHandle;
        int wordHandleCount = 0;
        //final int sentenceHandleCount = 0;
        int allwordcounter = 0;
        final int allsentencecounter = 0;
        int wordInSentenceCounter = 1;
        boolean comb_indexof = false, last_last = false, last_index = false;
        //final Map<StringBuilder, Phrase> sentences = new HashMap<StringBuilder, Phrase>(100);
        if (LibraryProvider.autotagging.isEmpty()) doAutotagging = false;

        // read source
        WordTokenizer wordenum = new WordTokenizer(new SentenceReader(text), meaningLib);
        try {
            while (wordenum.hasMoreElements()) {
                String word = wordenum.nextElement().toString().toLowerCase(Locale.ENGLISH);
                if (word.length() < wordminsize) continue;

                // get tags from autotagging
                if (doAutotagging) {
                    Set<String> vocabularyNames = LibraryProvider.autotagging.getVocabularyNames();
                    //Collection<Tagging> vocabularies = LibraryProvider.autotagging.getVocabularies();
                    //assert vocabularyNames.size() == vocabularies.size();
                    Map<String, String> vocMap = scraper == null ? null : scraper.removeVocMap(root);
                    if (vocMap != null && vocMap.size() > 0) {
                        for (Map.Entry<String, String> entry: vocMap.entrySet()) {
                            String navigatorName = entry.getKey();
                            String term = entry.getValue();
                            vocabularyNames.remove(navigatorName); // prevent that this is used again for auto-annotation
                            Tagging vocabulary = LibraryProvider.autotagging.getVocabulary(navigatorName);
                            if (vocabulary != null) {
                                // extend the vocabulary
                                String obj = vocabulary.getObjectlink(term);
                                if (obj == null) try {vocabulary.put(term, "", root.toNormalform(true));} catch (IOException e) {} // this makes IO, be careful!
                                // create annotation
                                tag = vocabulary.getMetatagFromTerm(term);
                                Set<Tagging.Metatag> tagset = new HashSet<>();
                                tagset.add(tag);
                                this.tags.put(navigatorName, tagset);
                            }
                        }
                    }
                    if (vocabularyNames.size() > 0) for (int wordc = 1; wordc <= wordcache.length + 1; wordc++) {
                        // wordc is number of words that are tested
                        StringBuilder sb = new StringBuilder();
                        if (wordc == 1) {
                            sb.append(word);
                        } else {
                            for (int w = 0; w < wordc - 1; w++) {
                                sb.append(wordcache[wordcache.length - wordc + w + 1]).append(' ');
                            }
                            sb.append(word);
                        }
                        String testterm = sb.toString().trim();
                        //System.out.println("Testing: " + testterm);
                        tag = LibraryProvider.autotagging.getTagFromTerm(vocabularyNames, testterm);
                        if (tag != null) {
                            String navigatorName = tag.getVocabularyName();
                            Set<Tagging.Metatag> tagset = this.tags.get(navigatorName);
                            if (tagset == null) {
                                tagset = new HashSet<Tagging.Metatag>();
                                this.tags.put(navigatorName, tagset);
                            }
                            tagset.add(tag);
                        }
                    }
                }
                // shift wordcache
                System.arraycopy(wordcache, 1, wordcache, 0, wordcache.length - 1);
                wordcache[wordcache.length - 1] = word;

                // distinguish punctuation and words
                wordlen = word.length();
                if (wordlen == 1 && SentenceReader.punctuation(word.charAt(0))) { // TODO: wordlen == 1 never true (see earlier if < wordminsize )
                    // store sentence
                    currsentwords.clear();
                    wordInSentenceCounter = 1;
                } else {
                    // check index.of detection
                    if (last_last && comb_indexof && word.equals("modified")) {
                        this.RESULT_FLAGS.set(flag_cat_indexof, true);
                        wordenum.pre(true); // parse lines as they come with CRLF
                    }
                    if (last_index && (wordminsize > 2 || word.equals("of"))) comb_indexof = true;
                    last_last = word.equals("last");
                    last_index = word.equals("index");

                    // store word
                    allwordcounter++;
                    currsentwords.add(word);
                    Word wsp = this.words.get(word);
                    if (wsp != null) {
                        // word already exists
                        wordHandle = wsp.posInText;
                        wsp.inc();
                    } else {
                        // word does not yet exist, create new word entry
                        wordHandle = wordHandleCount++;
                        wsp = new Word(wordHandle, wordInSentenceCounter, /* sentences.size() + */ 100);
                        wsp.flags = this.RESULT_FLAGS.clone();
                        this.words.put(word.toLowerCase(), wsp);
                    }
                    // we now have the unique handle of the word, put it into the sentence:
                    wordInSentenceCounter++;
                }
            }
        } finally {
            wordenum.close();
            wordenum = null;
        }

        if (pseudostemming) {
            // we search for similar words and reorganize the corresponding sentences
            // a word is similar, if a shortened version is equal
            Iterator<Map.Entry<String, Word>> wi = this.words.entrySet().iterator(); // enumerates the keys in descending order?
            Map.Entry<String, Word> entry;
            wordsearch: while (wi.hasNext()) {
                entry = wi.next();
                String word = entry.getKey();
                wordlen = word.length();
                Word wsp = entry.getValue();
                for (int i = wordcut; i > 0; i--) {
                    if (wordlen > i) {
                        k = word.substring(0, wordlen - i);
                        Word wsp1 = this.words.get(k);
                        if (wsp1 != null) {
                            wsp1.count = wsp1.count + wsp.count; // update word counter
                            wi.remove(); // remove current word
                            continue wordsearch;
                        }
                    }
                }
            }
        }

        // create the synonyms set
        if (SynonymLibrary.size() > 0) {
            for (String word: this.words.keySet()) {
                Set<String> syms = SynonymLibrary.getSynonyms(word);
                if (syms != null) this.synonyms.addAll(syms);
            }
        }
        
        // store result
        this.RESULT_NUMB_WORDS = allwordcounter;
        this.RESULT_NUMB_SENTENCES = allsentencecounter;
    }
    
    public Map<String, Word> words() {
        // returns the words as word/indexWord relation map
        return this.words;
    }
    
    public static Map<String, Word> getWords(final String text, final WordCache meaningLib) {
        // returns a word/indexWord relation map
        if (text == null) return null;
        return new Tokenizer(null, text, meaningLib, false, null).words();
    }

    public List<String> synonyms() {
        ArrayList<String> l = new ArrayList<String>(this.synonyms.size());
        for (String s: this.synonyms) l.add(s);
        return l;
    }

}
refactoring: separated condenser and tokenizer 10 years ago			`/**`
			`* Annotation.java`
			`* Copyright 2004 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany`
			`* First released 09.01.2004 at http://yacy.net`
			`*`
			`* This library is free software; you can redistribute it and/or`
			`* modify it under the terms of the GNU Lesser General private`
			`* License as published by the Free Software Foundation; either`
			`* version 2.1 of the License, or (at your option) any later version.`
			`*`
			`* This library is distributed in the hope that it will be useful,`
			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`* Lesser General private License for more details.`
			`*`
			`* You should have received a copy of the GNU Lesser General private License`
			`* along with this program in the file lgpl21.txt`
			`* If not, see <http://www.gnu.org/licenses/>.`
			`*/`

			`package net.yacy.document;`

			`import java.io.IOException;`
			`import java.util.ArrayList;`
			`import java.util.HashMap;`
			`import java.util.HashSet;`
			`import java.util.Iterator;`
			`import java.util.LinkedHashSet;`
			`import java.util.List;`
			`import java.util.Locale;`
			`import java.util.Map;`
			`import java.util.Set;`
			`import java.util.TreeMap;`

			`import net.yacy.cora.document.WordCache;`
			`import net.yacy.cora.document.id.DigestURL;`
			`import net.yacy.cora.language.synonyms.SynonymLibrary;`
			`import net.yacy.cora.lod.vocabulary.Tagging;`
			`import net.yacy.cora.order.NaturalOrder;`
			`import net.yacy.kelondro.data.word.Word;`
			`import net.yacy.kelondro.util.Bitfield;`

			`public class Tokenizer {`

			`// this is the page analysis class`
			`public final static boolean pseudostemming = false; // switch for removal of words that appear in shortened form`
			`public final static int wordminsize = 2;`
			`public final static int wordcut = 2;`

			`// category flags that show how the page can be distinguished in different interest groups`
			`public static final int flag_cat_indexof = 0; // a directory listing page (i.e. containing 'index of')`
			`public static final int flag_cat_haslocation = 19; // the page has a location metadata attached`
			`public static final int flag_cat_hasimage = 20; // the page refers to (at least one) images`
			`public static final int flag_cat_hasaudio = 21; // the page refers to (at least one) audio file`
			`public static final int flag_cat_hasvideo = 22; // the page refers to (at least one) videos`
			`public static final int flag_cat_hasapp = 23; // the page refers to (at least one) application file`

			`//private Properties analysis;`
			`protected final Map<String, Word> words; // a string (the words) to (indexWord) - relation`
			`private final Set<String> synonyms; // a set of synonyms to the words`
			`protected final Map<String, Set<Tagging.Metatag>> tags = new HashMap<String, Set<Tagging.Metatag>>(); // a set of tags, discovered from Autotagging`

			`public int RESULT_NUMB_WORDS = -1;`
			`public int RESULT_NUMB_SENTENCES = -1;`
			`public Bitfield RESULT_FLAGS = new Bitfield(4);`

			`public Tokenizer(final DigestURL root, final String text, final WordCache meaningLib, boolean doAutotagging, final VocabularyScraper scraper) {`
			`this.words = new TreeMap<String, Word>(NaturalOrder.naturalComparator);`
			`this.synonyms = new LinkedHashSet<String>();`
			`assert text != null;`
			`final Set<String> currsentwords = new HashSet<String>();`
			`String[] wordcache = new String[LibraryProvider.autotagging.getMaxWordsInTerm() - 1];`
			`for (int i = 0; i < wordcache.length; i++) wordcache[i] = "";`
			`String k;`
			`Tagging.Metatag tag;`
			`int wordlen;`
			`int wordHandle;`
			`int wordHandleCount = 0;`
			`//final int sentenceHandleCount = 0;`
			`int allwordcounter = 0;`
			`final int allsentencecounter = 0;`
			`int wordInSentenceCounter = 1;`
			`boolean comb_indexof = false, last_last = false, last_index = false;`
			`//final Map<StringBuilder, Phrase> sentences = new HashMap<StringBuilder, Phrase>(100);`
			`if (LibraryProvider.autotagging.isEmpty()) doAutotagging = false;`

			`// read source`
			`WordTokenizer wordenum = new WordTokenizer(new SentenceReader(text), meaningLib);`
			`try {`
			`while (wordenum.hasMoreElements()) {`
			`String word = wordenum.nextElement().toString().toLowerCase(Locale.ENGLISH);`
			`if (word.length() < wordminsize) continue;`

			`// get tags from autotagging`
			`if (doAutotagging) {`
			`Set<String> vocabularyNames = LibraryProvider.autotagging.getVocabularyNames();`
			`//Collection<Tagging> vocabularies = LibraryProvider.autotagging.getVocabularies();`
			`//assert vocabularyNames.size() == vocabularies.size();`
			`Map<String, String> vocMap = scraper == null ? null : scraper.removeVocMap(root);`
			`if (vocMap != null && vocMap.size() > 0) {`
			`for (Map.Entry<String, String> entry: vocMap.entrySet()) {`
			`String navigatorName = entry.getKey();`
			`String term = entry.getValue();`
			`vocabularyNames.remove(navigatorName); // prevent that this is used again for auto-annotation`
			`Tagging vocabulary = LibraryProvider.autotagging.getVocabulary(navigatorName);`
			`if (vocabulary != null) {`
			`// extend the vocabulary`
			`String obj = vocabulary.getObjectlink(term);`
			`if (obj == null) try {vocabulary.put(term, "", root.toNormalform(true));} catch (IOException e) {} // this makes IO, be careful!`
			`// create annotation`
			`tag = vocabulary.getMetatagFromTerm(term);`
			`Set<Tagging.Metatag> tagset = new HashSet<>();`
			`tagset.add(tag);`
			`this.tags.put(navigatorName, tagset);`
			`}`
			`}`
			`}`
			`if (vocabularyNames.size() > 0) for (int wordc = 1; wordc <= wordcache.length + 1; wordc++) {`
			`// wordc is number of words that are tested`
			`StringBuilder sb = new StringBuilder();`
			`if (wordc == 1) {`
			`sb.append(word);`
			`} else {`
			`for (int w = 0; w < wordc - 1; w++) {`
			`sb.append(wordcache[wordcache.length - wordc + w + 1]).append(' ');`
			`}`
			`sb.append(word);`
			`}`
			`String testterm = sb.toString().trim();`
			`//System.out.println("Testing: " + testterm);`
			`tag = LibraryProvider.autotagging.getTagFromTerm(vocabularyNames, testterm);`
			`if (tag != null) {`
			`String navigatorName = tag.getVocabularyName();`
			`Set<Tagging.Metatag> tagset = this.tags.get(navigatorName);`
			`if (tagset == null) {`
			`tagset = new HashSet<Tagging.Metatag>();`
			`this.tags.put(navigatorName, tagset);`
			`}`
			`tagset.add(tag);`
			`}`
			`}`
			`}`
			`// shift wordcache`
			`System.arraycopy(wordcache, 1, wordcache, 0, wordcache.length - 1);`
			`wordcache[wordcache.length - 1] = word;`

			`// distinguish punctuation and words`
			`wordlen = word.length();`
			`if (wordlen == 1 && SentenceReader.punctuation(word.charAt(0))) { // TODO: wordlen == 1 never true (see earlier if < wordminsize )`
			`// store sentence`
			`currsentwords.clear();`
			`wordInSentenceCounter = 1;`
			`} else {`
			`// check index.of detection`
			`if (last_last && comb_indexof && word.equals("modified")) {`
			`this.RESULT_FLAGS.set(flag_cat_indexof, true);`
			`wordenum.pre(true); // parse lines as they come with CRLF`
			`}`
			`if (last_index && (wordminsize > 2 \|\| word.equals("of"))) comb_indexof = true;`
			`last_last = word.equals("last");`
			`last_index = word.equals("index");`

			`// store word`
			`allwordcounter++;`
			`currsentwords.add(word);`
			`Word wsp = this.words.get(word);`
			`if (wsp != null) {`
			`// word already exists`
			`wordHandle = wsp.posInText;`
			`wsp.inc();`
			`} else {`
			`// word does not yet exist, create new word entry`
			`wordHandle = wordHandleCount++;`
			`wsp = new Word(wordHandle, wordInSentenceCounter, /* sentences.size() + */ 100);`
			`wsp.flags = this.RESULT_FLAGS.clone();`
			`this.words.put(word.toLowerCase(), wsp);`
			`}`
			`// we now have the unique handle of the word, put it into the sentence:`
			`wordInSentenceCounter++;`
			`}`
			`}`
			`} finally {`
			`wordenum.close();`
			`wordenum = null;`
			`}`

			`if (pseudostemming) {`
			`// we search for similar words and reorganize the corresponding sentences`
			`// a word is similar, if a shortened version is equal`
			`Iterator<Map.Entry<String, Word>> wi = this.words.entrySet().iterator(); // enumerates the keys in descending order?`
			`Map.Entry<String, Word> entry;`
			`wordsearch: while (wi.hasNext()) {`
			`entry = wi.next();`
			`String word = entry.getKey();`
			`wordlen = word.length();`
			`Word wsp = entry.getValue();`
			`for (int i = wordcut; i > 0; i--) {`
			`if (wordlen > i) {`
			`k = word.substring(0, wordlen - i);`
			`Word wsp1 = this.words.get(k);`
			`if (wsp1 != null) {`
			`wsp1.count = wsp1.count + wsp.count; // update word counter`
			`wi.remove(); // remove current word`
			`continue wordsearch;`
			`}`
			`}`
			`}`
			`}`
			`}`

			`// create the synonyms set`
			`if (SynonymLibrary.size() > 0) {`
			`for (String word: this.words.keySet()) {`
			`Set<String> syms = SynonymLibrary.getSynonyms(word);`
			`if (syms != null) this.synonyms.addAll(syms);`
			`}`
			`}`

			`// store result`
			`this.RESULT_NUMB_WORDS = allwordcounter;`
			`this.RESULT_NUMB_SENTENCES = allsentencecounter;`
			`}`

			`public Map<String, Word> words() {`
			`// returns the words as word/indexWord relation map`
			`return this.words;`
			`}`

			`public static Map<String, Word> getWords(final String text, final WordCache meaningLib) {`
			`// returns a word/indexWord relation map`
			`if (text == null) return null;`
			`return new Tokenizer(null, text, meaningLib, false, null).words();`
			`}`

			`public List<String> synonyms() {`
			`ArrayList<String> l = new ArrayList<String>(this.synonyms.size());`
			`for (String s: this.synonyms) l.add(s);`
			`return l;`
			`}`

			`}`