Added recognition of multi-word terms in vocabulary matching.

This makes the PND usable: it is now possible to recognize persons and
navigate with a 'Persons' facet.
pull/1/head
Michael Peter Christen 13 years ago
parent 64c0268b2b
commit 94d54e2d91

@ -149,18 +149,31 @@ public class Autotagging {
final WordTokenizer tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(text)), LibraryProvider.dymLib); final WordTokenizer tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(text)), LibraryProvider.dymLib);
String tag; String tag;
while (tokens.hasMoreElements()) { while (tokens.hasMoreElements()) {
tag = getTagFromWord(tokens.nextElement().toString()).toString(); tag = getTagFromTerm(tokens.nextElement().toString()).toString();
if (tag != null) as.add(tag); if (tag != null) as.add(tag);
} }
return as; return as;
} }
/**
 * Returns the number of entries currently held in {@code this.vocabularies}.
 *
 * @return the size of the vocabularies collection
 */
public int size() {
return this.vocabularies.size();
}
/**
 * Maximum number of words that a single compound tag (a multi-word term) may consist of.
 * The value is currently hard-coded; see the TODO in the method body.
 *
 * @return the maximum number of words in one term, currently always 4
 */
public int getMaxWordsInTerm() {
//TODO: calculate from database
return 4;
}
public Tagging.Metatag getTagFromWord(String word) { public Tagging.Metatag getTagFromTerm(String term) {
if (this.vocabularies.isEmpty()) return null; if (this.vocabularies.isEmpty()) return null;
Tagging.Metatag tag; Tagging.Metatag tag;
word = Tagging.normalizeWord(word); term = Tagging.normalizeWord(term);
for (Map.Entry<String, Tagging> v: this.vocabularies.entrySet()) { for (Map.Entry<String, Tagging> v: this.vocabularies.entrySet()) {
tag = v.getValue().getMetatagFromSynonym(this.prefixChar, word); tag = v.getValue().getMetatagFromSynonym(this.prefixChar, term);
if (tag != null) return tag; if (tag != null) return tag;
} }
return null; return null;

@ -299,6 +299,8 @@ public final class Condenser {
assert is != null; assert is != null;
final Set<String> currsentwords = new HashSet<String>(); final Set<String> currsentwords = new HashSet<String>();
String word = ""; String word = "";
String[] wordcache = new String[LibraryProvider.autotagging.getMaxWordsInTerm() - 1];
for (int i = 0; i < wordcache.length; i++) wordcache[i] = "";
String k; String k;
Tagging.Metatag tag; Tagging.Metatag tag;
int wordlen; int wordlen;
@ -312,7 +314,8 @@ public final class Condenser {
int wordInSentenceCounter = 1; int wordInSentenceCounter = 1;
boolean comb_indexof = false, last_last = false, last_index = false; boolean comb_indexof = false, last_last = false, last_index = false;
final Map<StringBuilder, Phrase> sentences = new HashMap<StringBuilder, Phrase>(100); final Map<StringBuilder, Phrase> sentences = new HashMap<StringBuilder, Phrase>(100);
if (LibraryProvider.autotagging.size() == 0) doAutotagging = false;
// read source // read source
final WordTokenizer wordenum = new WordTokenizer(is, meaningLib); final WordTokenizer wordenum = new WordTokenizer(is, meaningLib);
try { try {
@ -323,18 +326,33 @@ public final class Condenser {
// get tags from autotagging // get tags from autotagging
if (doAutotagging) { if (doAutotagging) {
tag = LibraryProvider.autotagging.getTagFromWord(word); for (int wordc = 1; wordc <= wordcache.length + 1; wordc++) {
if (tag != null) { // wordc is number of words that are tested
Set<Tagging.Metatag> tagset = this.tags.get(tag.getVocabularyName()); StringBuilder sb = new StringBuilder();
if (tagset == null) { if (wordc == 1) {
tagset = new HashSet<Tagging.Metatag>(); sb.append(word);
tagset.add(tag); } else {
this.tags.put(tag.getVocabularyName(), tagset); for (int w = 0; w < wordc - 1; w++) {
} else { sb.append(wordcache[wordcache.length - wordc + w + 1]).append(' ');
}
sb.append(word);
}
String testterm = sb.toString().trim();
//System.out.println("Testing: " + testterm);
tag = LibraryProvider.autotagging.getTagFromTerm(testterm);
if (tag != null) {
Set<Tagging.Metatag> tagset = this.tags.get(tag.getVocabularyName());
if (tagset == null) {
tagset = new HashSet<Tagging.Metatag>();
this.tags.put(tag.getVocabularyName(), tagset);
}
tagset.add(tag); tagset.add(tag);
} }
} }
} }
// shift wordcache
System.arraycopy(wordcache, 1, wordcache, 0, wordcache.length - 1);
wordcache[wordcache.length - 1] = word;
// distinguish punctuation and words // distinguish punctuation and words
wordlen = word.length(); wordlen = word.length();

Loading…
Cancel
Save