added autotagging to document condenser:

- tags that are automatically generated now enrich the dc:subject
- auto-generated tags have a '$' at the beginning of the tag
- auto-generated tags prefix the tag name with the name of their vocabulary;
each tag has the form
$<vocabulary-name>:<tag-printname-space-replaced-by-'_'>
pull/1/head
Michael Peter Christen 13 years ago
parent 0d6176804b
commit a58dc4a91f

@ -20,6 +20,7 @@
package net.yacy.document;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
@ -27,7 +28,9 @@ import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern;
import net.yacy.cora.document.UTF8;
import net.yacy.document.WordCache.Dictionary;
import net.yacy.document.geolocalization.Localization;
import net.yacy.kelondro.logging.Log;
@ -117,13 +120,30 @@ public class Autotagging {
* @param text
* @return
*/
public Set<String> tags(String text) {
public Set<String> getPrintTagsFromText(String text) {
    // Result set; stays empty when no vocabulary is loaded.
    final Set<String> found = new HashSet<String>();
    if (!this.vocabularies.isEmpty()) {
        // Tokenize the UTF-8 bytes of the text and test every token against the vocabularies.
        final ByteArrayInputStream source = new ByteArrayInputStream(UTF8.getBytes(text));
        final WordTokenizer tokenizer = new WordTokenizer(source, LibraryProvider.dymLib);
        while (tokenizer.hasMoreElements()) {
            final String printTag = getPrintTagFromWord(tokenizer.nextElement().toString());
            if (printTag == null) continue; // token matched no vocabulary entry
            found.add(printTag);
        }
    }
    return found;
}
public static class Vocabulary {
/**
 * Look up a single word in the loaded vocabularies.
 * The word is normalized with normalizeWord before the lookup; the first
 * vocabulary that yields a metatag for it wins.
 * @param word the word to look up
 * @return the metatag string of the first match, or null if no vocabulary
 *         is loaded or none of them knows the word
 */
public String getPrintTagFromWord(String word) {
    if (this.vocabularies.isEmpty()) return null;
    final String normalized = normalizeWord(word);
    for (final Vocabulary vocabulary: this.vocabularies.values()) {
        final Metatag match = vocabulary.getMetatag(normalized);
        if (match != null) return match.getMetatag();
    }
    return null;
}
public class Vocabulary {
final String navigatorName;
final Map<String, String> tag2print, print2tag;
@ -137,7 +157,7 @@ public class Autotagging {
public Vocabulary(String name, File propFile) throws IOException {
this(name);
ArrayList<String> list = FileUtils.getListArray(propFile);
String k, v;
String k, kn, v;
String[] tags;
int p;
vocloop: for (String line: list) {
@ -161,15 +181,16 @@ public class Autotagging {
v = line.substring(p + 1);
tags = v.split(",");
tagloop: for (String t: tags) {
t = t.trim().toLowerCase();
t = normalizeWord(t);
if (t.length() == 0) {
continue tagloop;
}
this.tag2print.put(t, k);
this.print2tag.put(k, t);
}
this.tag2print.put(k.toLowerCase(), k);
this.print2tag.put(k, k.toLowerCase());
kn = normalizeWord(k);
this.tag2print.put(kn, k);
this.print2tag.put(k, kn);
}
}
@ -197,12 +218,10 @@ public class Autotagging {
return this.navigatorName;
}
/**
 * Look up the given tag in the tag2print map.
 * @param tag the key to look up
 *        NOTE(review): presumably this must be in the same normalized form
 *        that was used as key when the vocabulary was loaded - confirm
 * @return the value stored in tag2print for the tag, or null if the map has no entry for it
 */
public String getPrint(final String tag) {
return this.tag2print.get(tag);
}
public String getTag(final String print) {
return this.print2tag.get(print);
/**
 * Produce the metatag for a word of this vocabulary.
 * The word is used as key into tag2print; if a print name is found, the
 * metatag is built from this vocabulary's navigator name and that print name.
 * @param word the key to look up in tag2print
 * @return the metatag for the word, or null if tag2print has no print name for it
 */
public Metatag getMetatag(final String word) {
    final String print = this.tag2print.get(word);
    return (print == null) ? null : metatag(this.navigatorName, print);
}
public Set<String> tags() {
@ -215,6 +234,20 @@ public class Autotagging {
}
}
private final static Pattern PATTERN_AE = Pattern.compile("\u00E4"); // german umlaute hack for better matching
private final static Pattern PATTERN_OE = Pattern.compile("\u00F6");
private final static Pattern PATTERN_UE = Pattern.compile("\u00FC");
private final static Pattern PATTERN_SZ = Pattern.compile("\u00DF");

/**
 * Bring a word into the canonical form that is used for vocabulary matching:
 * surrounding whitespace is trimmed, the word is lower-cased and the German
 * umlauts and the sharp s are transliterated (ae, oe, ue, ss).
 * Lower-casing is done with Locale.ROOT so that the result does not depend on
 * the default locale of the JVM; with a Turkish default locale the
 * locale-sensitive variant would map 'I' to a dotless i and normalized words
 * would no longer match.
 * @param word the word to normalize, must not be null
 * @return the normalized word
 */
private static final String normalizeWord(String word) {
    word = word.trim().toLowerCase(java.util.Locale.ROOT);
    // upper-case umlauts are already lower-cased here, so one pattern per letter is enough
    word = PATTERN_AE.matcher(word).replaceAll("ae");
    word = PATTERN_OE.matcher(word).replaceAll("oe");
    word = PATTERN_UE.matcher(word).replaceAll("ue");
    word = PATTERN_SZ.matcher(word).replaceAll("ss");
    return word;
}
public class Metatag {
private final String vocName;
private final String print;
@ -253,6 +286,8 @@ public class Autotagging {
for (Map.Entry<String, Vocabulary> entry: a.vocabularies.entrySet()) {
System.out.println(entry);
}
Set<String> tags = a.getPrintTagsFromText("In die Tueren und Fluchttueren muessen noch Schloesser eingebaut werden");
System.out.println(tags);
}
}

@ -86,6 +86,7 @@ public final class Condenser {
//private Properties analysis;
private final Map<String, Word> words; // a string (the words) to (indexWord) - relation
private final Set<String> tags = new HashSet<String>(); // a set of tags, discovered from Autotagging
//public int RESULT_NUMB_TEXT_BYTES = -1;
public int RESULT_NUMB_WORDS = -1;
@ -222,6 +223,11 @@ public final class Condenser {
}
}
}
// extend the tags in the document object with autotagging tags
if (!this.tags.isEmpty()) {
document.addTags(this.tags);
}
}
private void insertTextToWords(
@ -283,7 +289,7 @@ public final class Condenser {
assert is != null;
final Set<String> currsentwords = new HashSet<String>();
String word = "";
String k;
String k, tag;
int wordlen;
Word wsp;
final Word wsp1;
@ -304,6 +310,10 @@ public final class Condenser {
if (this.languageIdentificator != null) this.languageIdentificator.add(word);
if (word.length() < wordminsize) continue;
// get tags from autotagging
tag = LibraryProvider.autotagging.getPrintTagFromWord(word);
if (tag != null) this.tags.add(tag);
// distinguish punctuation and words
wordlen = word.length();
if (wordlen == 1 && SentenceReader.punctuation(word.charAt(0))) {

@ -103,7 +103,8 @@ public class Document {
this.mimeType = (mimeType == null) ? "application/octet-stream" : mimeType;
this.charset = charset;
this.parserObject = parserObject;
this.keywords = (keywords == null) ? new LinkedList<String>() : Arrays.asList(keywords);
this.keywords = new LinkedList<String>();
if (keywords != null) this.keywords.addAll(Arrays.asList(keywords));
this.title = (title == null) ? new StringBuilder(0) : new StringBuilder(title);
this.creator = (author == null) ? new StringBuilder(0) : new StringBuilder(author);
this.sections = (sections == null) ? new LinkedList<String>() : Arrays.asList(sections);
@ -188,6 +189,20 @@ dc_rights
return (this.creator == null) ? "" : this.creator.toString();
}
/**
 * Add the given tags to the list of keywords.
 * The keywords list is what dc_subject() reads from.
 * Tags that are already contained in the keywords list are not added again.
 * The given set is only read: it is not modified by this method, so the
 * caller can keep using it afterwards and may also pass an unmodifiable set.
 * @param tags the tags to add; null or an empty set leaves the keywords unchanged
 */
public void addTags(Set<String> tags) {
    if (tags == null || tags.isEmpty()) return;
    for (String tag: tags) {
        // skip tags that are already known as keyword instead of removing them from the caller's set
        if (!this.keywords.contains(tag)) this.keywords.add(tag);
    }
}
public String[] dc_subject() {
// sort out doubles and empty words
final TreeSet<String> hs = new TreeSet<String>();
@ -195,7 +210,7 @@ dc_rights
for (int i = 0; i < this.keywords.size(); i++) {
if (this.keywords.get(i) == null) continue;
s = (this.keywords.get(i)).trim();
if (s.length() > 0) hs.add(s.toLowerCase());
if (s.length() > 0) hs.add(s);
}
final String[] t = new String[hs.size()];
int i = 0;

@ -115,7 +115,7 @@ public class LibraryProvider
Set<String> allTags = new HashSet<String>() ;
allTags.addAll(autotagging.allTags()); // we must copy this into a clone to prevent circularity
autotagging.addLocalization(geoLoc);
autotagging.addDictionaries(dymLib.getDictionaries());
//autotagging.addDictionaries(dymLib.getDictionaries()); // strange results with this: normal word lists are 'too full'
WordCache.learn(allTags);
}

Loading…
Cancel
Save