added recognition of multi-word terms in vocabulary matching

this makes the PND usable: it is now possible to recognize persons and navigate with a 'Persons' facet.
13 years ago · 94d54e2d91
parent 64c0268b2b
commit 94d54e2d91
2 changed files with 46 additions and 15 deletions
--- a/source/net/yacy/document/Autotagging.java
+++ b/source/net/yacy/document/Autotagging.java
@ -149,18 +149,31 @@ public class Autotagging {
        final WordTokenizer tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(text)), LibraryProvider.dymLib);
        String tag;
        while (tokens.hasMoreElements()) {
-            tag = getTagFromWord(tokens.nextElement().toString()).toString();
+            tag = getTagFromTerm(tokens.nextElement().toString()).toString();
            if (tag != null) as.add(tag);
        }
        return as;
    }
    /**
     * Returns the number of vocabularies currently held by this instance.
     * @return the size of the vocabularies map
     */
    public int size() {
    	return this.vocabularies.size();
    }
    /**
     * Returns the maximum number of words that a single (compound) term may consist of.
     * The value is currently a hard-coded constant (see the TODO in the method body).
     * NOTE(review): the Condenser allocates a word cache of getMaxWordsInTerm() - 1 entries
     * and shifts it for every word, so this value presumably must not drop below 2 - confirm.
     * @return the maximum number of words in one term; currently always 4
     */
    public int getMaxWordsInTerm() {
    	//TODO: calculate from database
    	return 4;
    }
-    public Tagging.Metatag getTagFromWord(String word) {
+    public Tagging.Metatag getTagFromTerm(String term) {
        if (this.vocabularies.isEmpty()) return null;
        Tagging.Metatag tag;
-        word = Tagging.normalizeWord(word);
+        term = Tagging.normalizeWord(term);
        for (Map.Entry<String, Tagging> v: this.vocabularies.entrySet()) {
-            tag = v.getValue().getMetatagFromSynonym(this.prefixChar, word);
+            tag = v.getValue().getMetatagFromSynonym(this.prefixChar, term);
            if (tag != null) return tag;
        }
        return null;
--- a/source/net/yacy/document/Condenser.java
+++ b/source/net/yacy/document/Condenser.java
@ -299,6 +299,8 @@ public final class Condenser {
        assert is != null;
        final Set<String> currsentwords = new HashSet<String>();
        String word = "";
        String[] wordcache = new String[LibraryProvider.autotagging.getMaxWordsInTerm() - 1];
        for (int i = 0; i < wordcache.length; i++) wordcache[i] = "";
        String k;
        Tagging.Metatag tag;
        int wordlen;
@ -312,7 +314,8 @@ public final class Condenser {
        int wordInSentenceCounter = 1;
        boolean comb_indexof = false, last_last = false, last_index = false;
        final Map<StringBuilder, Phrase> sentences = new HashMap<StringBuilder, Phrase>(100);
-
+        if (LibraryProvider.autotagging.size() == 0) doAutotagging = false;
        // read source
        final WordTokenizer wordenum = new WordTokenizer(is, meaningLib);
        try {
@ -323,18 +326,33 @@ public final class Condenser {
 	            // get tags from autotagging
 	            if (doAutotagging) {
-	                tag = LibraryProvider.autotagging.getTagFromWord(word);
+	            	for (int wordc = 1; wordc <= wordcache.length + 1; wordc++) {
-	                if (tag != null) {
+	            		// wordc is number of words that are tested
-	                    Set<Tagging.Metatag> tagset = this.tags.get(tag.getVocabularyName());
+	            		StringBuilder sb = new StringBuilder();
-	                    if (tagset == null) {
+	            		if (wordc == 1) {
-	                        tagset = new HashSet<Tagging.Metatag>();
+	            			sb.append(word);
-	                        tagset.add(tag);
+	            		} else {
-	                        this.tags.put(tag.getVocabularyName(), tagset);
+	            			for (int w = 0; w < wordc - 1; w++) {
-	                    } else {
+	            				sb.append(wordcache[wordcache.length - wordc + w + 1]).append(' ');
 	            			}
 	            			sb.append(word);
 	            		}
 	            		String testterm = sb.toString().trim();
 	            		//System.out.println("Testing: " + testterm);
 		                tag = LibraryProvider.autotagging.getTagFromTerm(testterm);
 		                if (tag != null) {
 		                    Set<Tagging.Metatag> tagset = this.tags.get(tag.getVocabularyName());
 		                    if (tagset == null) {
 		                        tagset = new HashSet<Tagging.Metatag>();
 		                        this.tags.put(tag.getVocabularyName(), tagset);
 		                    }
 	                        tagset.add(tag);
-	                    }
+		                }
-	                }
+	            	}
 	            }
 	            // shift wordcache
 	            System.arraycopy(wordcache, 1, wordcache, 0, wordcache.length - 1);
 	            wordcache[wordcache.length - 1] = word;
 	            // distinguish punctuation and words
 	            wordlen = word.length();