added autotagging to document condenser:

- automatically generated tags now enrich dc:subject
- auto-generated tags have a '$' at the beginning of the tag
- auto-generated tags are prefixed with a vocabulary name; each tag has the form $<vocabulary-name>:<tag-printname-with-spaces-replaced-by-'_'>
13 years ago · a58dc4a91f
parent 0d6176804b
commit a58dc4a91f
4 changed files with 77 additions and 17 deletions
--- a/source/net/yacy/document/Autotagging.java
+++ b/source/net/yacy/document/Autotagging.java
@@ -20,6 +20,7 @@

 package net.yacy.document;

+import java.io.ByteArrayInputStream;
 import java.io.File;
 import java.io.IOException;
 import java.util.ArrayList;
@@ -27,7 +28,9 @@ import java.util.HashSet;
 import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.ConcurrentHashMap;
+import java.util.regex.Pattern;

+import net.yacy.cora.document.UTF8;
 import net.yacy.document.WordCache.Dictionary;
 import net.yacy.document.geolocalization.Localization;
 import net.yacy.kelondro.logging.Log;
@@ -117,13 +120,30 @@ public class Autotagging {
     * @param text
     * @return
     */
-    public Set<String> tags(String text) {
+    public Set<String> getPrintTagsFromText(String text) {
        Set<String> as = new HashSet<String>();
-
+        if (this.vocabularies.isEmpty()) return as;
+        final WordTokenizer tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(text)), LibraryProvider.dymLib);
+        String tag;
+        while (tokens.hasMoreElements()) {
+            tag = getPrintTagFromWord(tokens.nextElement().toString());
+            if (tag != null) as.add(tag);
+        }
        return as;
    }

-    public static class Vocabulary {
+    public String getPrintTagFromWord(String word) {
+        if (this.vocabularies.isEmpty()) return null;
+        Metatag tag;
+        word = normalizeWord(word);
+        for (Map.Entry<String, Vocabulary> v: this.vocabularies.entrySet()) {
+            tag = v.getValue().getMetatag(word);
+            if (tag != null) return tag.getMetatag();
+        }
+        return null;
+    }
+
+    public class Vocabulary {

        final String navigatorName;
        final Map<String, String> tag2print, print2tag;
@@ -137,7 +157,7 @@ public class Autotagging {
        public Vocabulary(String name, File propFile) throws IOException {
            this(name);
            ArrayList<String> list = FileUtils.getListArray(propFile);
-            String k, v;
+            String k, kn, v;
            String[] tags;
            int p;
            vocloop: for (String line: list) {
@@ -161,15 +181,16 @@ public class Autotagging {
                v = line.substring(p + 1);
                tags = v.split(",");
                tagloop: for (String t: tags) {
-                    t = t.trim().toLowerCase();
+                    t = normalizeWord(t);
                    if (t.length() == 0) {
                        continue tagloop;
                    }
                    this.tag2print.put(t, k);
                    this.print2tag.put(k, t);
                }
-                this.tag2print.put(k.toLowerCase(), k);
-                this.print2tag.put(k, k.toLowerCase());
+                kn = normalizeWord(k);
+                this.tag2print.put(kn, k);
+                this.print2tag.put(k, kn);
            }
        }

@@ -197,12 +218,10 @@ public class Autotagging {
            return this.navigatorName;
        }

-        public String getPrint(final String tag) {
-            return this.tag2print.get(tag);
-        }
-
-        public String getTag(final String print) {
-            return this.print2tag.get(print);
+        public Metatag getMetatag(final String word) {
+            String printname = this.tag2print.get(word);
+            if (printname == null) return null;
+            return metatag(this.navigatorName, printname);
        }

        public Set<String> tags() {
@@ -215,6 +234,20 @@ public class Autotagging {
        }
    }

+    private final static Pattern PATTERN_AE = Pattern.compile("\u00E4"); // german umlaute hack for better matching
+    private final static Pattern PATTERN_OE = Pattern.compile("\u00F6");
+    private final static Pattern PATTERN_UE = Pattern.compile("\u00FC");
+    private final static Pattern PATTERN_SZ = Pattern.compile("\u00DF");
+
+    private static final String normalizeWord(String word) {
+        word = word.trim().toLowerCase();
+        word = PATTERN_AE.matcher(word).replaceAll("ae");
+        word = PATTERN_OE.matcher(word).replaceAll("oe");
+        word = PATTERN_UE.matcher(word).replaceAll("ue");
+        word = PATTERN_SZ.matcher(word).replaceAll("ss");
+        return word;
+    }
+
    public class Metatag {
        private final String vocName;
        private final String print;
@@ -253,6 +286,8 @@ public class Autotagging {
        for (Map.Entry<String, Vocabulary> entry: a.vocabularies.entrySet()) {
            System.out.println(entry);
        }
+        Set<String> tags = a.getPrintTagsFromText("In die Tueren und Fluchttueren muessen noch Schloesser eingebaut werden");
+        System.out.println(tags);
    }

 }
--- a/source/net/yacy/document/Condenser.java
+++ b/source/net/yacy/document/Condenser.java
@@ -86,6 +86,7 @@ public final class Condenser {

    //private Properties analysis;
    private final Map<String, Word> words; // a string (the words) to (indexWord) - relation
+    private final Set<String> tags = new HashSet<String>(); // a set of tags, discovered from Autotagging

    //public int RESULT_NUMB_TEXT_BYTES = -1;
    public int RESULT_NUMB_WORDS = -1;
@@ -222,6 +223,11 @@ public final class Condenser {
                }
            }
        }
+
+        // extend the tags in the document object with autotagging tags
+        if (!this.tags.isEmpty()) {
+            document.addTags(this.tags);
+        }
    }

    private void insertTextToWords(
@@ -283,7 +289,7 @@ public final class Condenser {
        assert is != null;
        final Set<String> currsentwords = new HashSet<String>();
        String word = "";
-        String k;
+        String k, tag;
        int wordlen;
        Word wsp;
        final Word wsp1;
@@ -304,6 +310,10 @@ public final class Condenser {
 	            if (this.languageIdentificator != null) this.languageIdentificator.add(word);
 	            if (word.length() < wordminsize) continue;

+	            // get tags from autotagging
+	            tag = LibraryProvider.autotagging.getPrintTagFromWord(word);
+	            if (tag != null) this.tags.add(tag);
+
 	            // distinguish punctuation and words
 	            wordlen = word.length();
 	            if (wordlen == 1 && SentenceReader.punctuation(word.charAt(0))) {
--- a/source/net/yacy/document/Document.java
+++ b/source/net/yacy/document/Document.java
@@ -103,7 +103,8 @@ public class Document {
        this.mimeType = (mimeType == null) ? "application/octet-stream" : mimeType;
        this.charset = charset;
        this.parserObject = parserObject;
-        this.keywords = (keywords == null) ? new LinkedList<String>() : Arrays.asList(keywords);
+        this.keywords = new LinkedList<String>();
+        if (keywords != null) this.keywords.addAll(Arrays.asList(keywords));
        this.title = (title == null) ? new StringBuilder(0) : new StringBuilder(title);
        this.creator = (author == null) ? new StringBuilder(0) : new StringBuilder(author);
        this.sections = (sections == null) ? new LinkedList<String>() : Arrays.asList(sections);
@@ -188,6 +189,20 @@ dc_rights
        return (this.creator == null) ? "" : this.creator.toString();
    }

+    /**
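+     * Adds the given tags to the keywords of this document, skipping tags that are already keywords.
+     * These keywords will appear in dc_subject. Note: keywords already present are removed from the passed set.
+     * @param tags the tags to add; this set is modified by the call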
+     */
+    public void addTags(Set<String> tags) {
+        for (String s: this.keywords) {
+            tags.remove(s);
+        }
+        for (String s: tags) {
+            this.keywords.add(s);
+        }
+    }
+
    public String[] dc_subject() {
        // sort out doubles and empty words
        final TreeSet<String> hs = new TreeSet<String>();
@@ -195,7 +210,7 @@ dc_rights
        for (int i = 0; i < this.keywords.size(); i++) {
            if (this.keywords.get(i) == null) continue;
            s = (this.keywords.get(i)).trim();
-            if (s.length() > 0) hs.add(s.toLowerCase());
+            if (s.length() > 0) hs.add(s);
        }
        final String[] t = new String[hs.size()];
        int i = 0;
--- a/source/net/yacy/document/LibraryProvider.java
+++ b/source/net/yacy/document/LibraryProvider.java
@@ -115,7 +115,7 @@ public class LibraryProvider
        Set<String> allTags = new HashSet<String>() ;
        allTags.addAll(autotagging.allTags()); // we must copy this into a clone to prevent circularity
        autotagging.addLocalization(geoLoc);
-        autotagging.addDictionaries(dymLib.getDictionaries());
+        //autotagging.addDictionaries(dymLib.getDictionaries()); // strange results with this: normal word lists are 'too full'
        WordCache.learn(allTags);
    }