diff --git a/source/net/yacy/document/Autotagging.java b/source/net/yacy/document/Autotagging.java index c3c220cdd..08b0dfeeb 100644 --- a/source/net/yacy/document/Autotagging.java +++ b/source/net/yacy/document/Autotagging.java @@ -20,6 +20,7 @@ package net.yacy.document; +import java.io.ByteArrayInputStream; import java.io.File; import java.io.IOException; import java.util.ArrayList; @@ -27,7 +28,9 @@ import java.util.HashSet; import java.util.Map; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; +import java.util.regex.Pattern; +import net.yacy.cora.document.UTF8; import net.yacy.document.WordCache.Dictionary; import net.yacy.document.geolocalization.Localization; import net.yacy.kelondro.logging.Log; @@ -117,13 +120,30 @@ public class Autotagging { * @param text * @return */ - public Set tags(String text) { + public Set getPrintTagsFromText(String text) { Set as = new HashSet(); - + if (this.vocabularies.isEmpty()) return as; + final WordTokenizer tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(text)), LibraryProvider.dymLib); + String tag; + while (tokens.hasMoreElements()) { + tag = getPrintTagFromWord(tokens.nextElement().toString()); + if (tag != null) as.add(tag); + } return as; } - public static class Vocabulary { + public String getPrintTagFromWord(String word) { + if (this.vocabularies.isEmpty()) return null; + Metatag tag; + word = normalizeWord(word); + for (Map.Entry v: this.vocabularies.entrySet()) { + tag = v.getValue().getMetatag(word); + if (tag != null) return tag.getMetatag(); + } + return null; + } + + public class Vocabulary { final String navigatorName; final Map tag2print, print2tag; @@ -137,7 +157,7 @@ public class Autotagging { public Vocabulary(String name, File propFile) throws IOException { this(name); ArrayList list = FileUtils.getListArray(propFile); - String k, v; + String k, kn, v; String[] tags; int p; vocloop: for (String line: list) { @@ -161,15 +181,16 @@ public class Autotagging { v = 
line.substring(p + 1); tags = v.split(","); tagloop: for (String t: tags) { - t = t.trim().toLowerCase(); + t = normalizeWord(t); if (t.length() == 0) { continue tagloop; } this.tag2print.put(t, k); this.print2tag.put(k, t); } - this.tag2print.put(k.toLowerCase(), k); - this.print2tag.put(k, k.toLowerCase()); + kn = normalizeWord(k); + this.tag2print.put(kn, k); + this.print2tag.put(k, kn); } } @@ -197,12 +218,10 @@ public class Autotagging { return this.navigatorName; } - public String getPrint(final String tag) { - return this.tag2print.get(tag); - } - - public String getTag(final String print) { - return this.print2tag.get(print); + public Metatag getMetatag(final String word) { + String printname = this.tag2print.get(word); + if (printname == null) return null; + return metatag(this.navigatorName, printname); } public Set tags() { @@ -215,6 +234,20 @@ public class Autotagging { } } + private final static Pattern PATTERN_AE = Pattern.compile("\u00E4"); // german umlaute hack for better matching + private final static Pattern PATTERN_OE = Pattern.compile("\u00F6"); + private final static Pattern PATTERN_UE = Pattern.compile("\u00FC"); + private final static Pattern PATTERN_SZ = Pattern.compile("\u00DF"); + + private static final String normalizeWord(String word) { + word = word.trim().toLowerCase(); + word = PATTERN_AE.matcher(word).replaceAll("ae"); + word = PATTERN_OE.matcher(word).replaceAll("oe"); + word = PATTERN_UE.matcher(word).replaceAll("ue"); + word = PATTERN_SZ.matcher(word).replaceAll("ss"); + return word; + } + public class Metatag { private final String vocName; private final String print; @@ -253,6 +286,8 @@ public class Autotagging { for (Map.Entry entry: a.vocabularies.entrySet()) { System.out.println(entry); } + Set tags = a.getPrintTagsFromText("In die Tueren und Fluchttueren muessen noch Schloesser eingebaut werden"); + System.out.println(tags); } } diff --git a/source/net/yacy/document/Condenser.java b/source/net/yacy/document/Condenser.java 
index 54d265c65..9915a052d 100644 --- a/source/net/yacy/document/Condenser.java +++ b/source/net/yacy/document/Condenser.java @@ -86,6 +86,7 @@ public final class Condenser { //private Properties analysis; private final Map words; // a string (the words) to (indexWord) - relation + private final Set tags = new HashSet(); // a set of tags, discovered from Autotagging //public int RESULT_NUMB_TEXT_BYTES = -1; public int RESULT_NUMB_WORDS = -1; @@ -222,6 +223,11 @@ public final class Condenser { } } } + + // extend the tags in the document object with autotagging tags + if (!this.tags.isEmpty()) { + document.addTags(this.tags); + } } private void insertTextToWords( @@ -283,7 +289,7 @@ public final class Condenser { assert is != null; final Set currsentwords = new HashSet(); String word = ""; - String k; + String k, tag; int wordlen; Word wsp; final Word wsp1; @@ -304,6 +310,10 @@ public final class Condenser { if (this.languageIdentificator != null) this.languageIdentificator.add(word); if (word.length() < wordminsize) continue; + // get tags from autotagging + tag = LibraryProvider.autotagging.getPrintTagFromWord(word); + if (tag != null) this.tags.add(tag); + // distinguish punctuation and words wordlen = word.length(); if (wordlen == 1 && SentenceReader.punctuation(word.charAt(0))) { diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java index 5b90c888a..dfa8802db 100644 --- a/source/net/yacy/document/Document.java +++ b/source/net/yacy/document/Document.java @@ -103,7 +103,8 @@ public class Document { this.mimeType = (mimeType == null) ? "application/octet-stream" : mimeType; this.charset = charset; this.parserObject = parserObject; - this.keywords = (keywords == null) ? new LinkedList() : Arrays.asList(keywords); + this.keywords = new LinkedList(); + if (keywords != null) this.keywords.addAll(Arrays.asList(keywords)); this.title = (title == null) ? 
new StringBuilder(0) : new StringBuilder(title); this.creator = (author == null) ? new StringBuilder(0) : new StringBuilder(author); this.sections = (sections == null) ? new LinkedList() : Arrays.asList(sections); @@ -188,6 +189,20 @@ dc_rights return (this.creator == null) ? "" : this.creator.toString(); } + /** + * Add the given words to the list of keywords, skipping words that are already present. + * These keywords will appear in dc_subject. The passed set is not modified. + * @param tags the tags to add; must not be null + */ + public void addTags(Set tags) { + for (String s: tags) { + /* look the tag up in the keywords instead of removing keywords from the caller's set */ + if (!this.keywords.contains(s)) { + this.keywords.add(s); + } + } + } + public String[] dc_subject() { // sort out doubles and empty words final TreeSet hs = new TreeSet(); @@ -195,7 +210,7 @@ dc_rights for (int i = 0; i < this.keywords.size(); i++) { if (this.keywords.get(i) == null) continue; s = (this.keywords.get(i)).trim(); - if (s.length() > 0) hs.add(s.toLowerCase()); + if (s.length() > 0) hs.add(s); } final String[] t = new String[hs.size()]; int i = 0; diff --git a/source/net/yacy/document/LibraryProvider.java b/source/net/yacy/document/LibraryProvider.java index 37994fc03..54a597d75 100644 --- a/source/net/yacy/document/LibraryProvider.java +++ b/source/net/yacy/document/LibraryProvider.java @@ -115,7 +115,7 @@ public class LibraryProvider Set allTags = new HashSet() ; allTags.addAll(autotagging.allTags()); // we must copy this into a clone to prevent circularity autotagging.addLocalization(geoLoc); - autotagging.addDictionaries(dymLib.getDictionaries()); + //autotagging.addDictionaries(dymLib.getDictionaries()); // strange results with this: normal word lists are 'too full' WordCache.learn(allTags); }