Refactoring: documented and extracted autotagging processing functions.

Branch: pull/167/head
Author: luccioman (7 years ago)
Parent: 58b9834729
Commit: 5a14d34a7d
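In short: the inline autotagging block in the Tokenizer constructor is split into two documented protected methods, extendVocabularies() and extractAutoTagsFromText(), with no intended behavior change, and TokenizerTest is cleaned up to match.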

@@ -68,10 +68,11 @@ public class Tokenizer {
         this.words = new TreeMap<String, Word>(NaturalOrder.naturalComparator);
         this.synonyms = new LinkedHashSet<String>();
         assert text != null;
-        String[] wordcache = new String[LibraryProvider.autotagging.getMaxWordsInTerm() - 1];
-        for (int i = 0; i < wordcache.length; i++) wordcache[i] = "";
+        final String[] wordcache = new String[LibraryProvider.autotagging.getMaxWordsInTerm() - 1];
+        for (int i = 0; i < wordcache.length; i++) {
+            wordcache[i] = "";
+        }
         String k;
-        Tagging.Metatag tag;
         int wordlen;
         int allwordcounter = 0;
         int allsentencecounter = 0;
@@ -98,51 +99,9 @@ public class Tokenizer {
             // get tags from autotagging
             if (doAutotagging) {
                 Set<String> vocabularyNames = LibraryProvider.autotagging.getVocabularyNames();
-                //Collection<Tagging> vocabularies = LibraryProvider.autotagging.getVocabularies();
-                //assert vocabularyNames.size() == vocabularies.size();
-                Map<String, String> vocMap = scraper == null ? null : scraper.removeVocMap(root);
-                if (vocMap != null && vocMap.size() > 0) {
-                    for (Map.Entry<String, String> entry: vocMap.entrySet()) {
-                        String navigatorName = entry.getKey();
-                        String term = entry.getValue();
-                        vocabularyNames.remove(navigatorName); // prevent that this is used again for auto-annotation
-                        Tagging vocabulary = LibraryProvider.autotagging.getVocabulary(navigatorName);
-                        if (vocabulary != null) {
-                            // extend the vocabulary
-                            String obj = vocabulary.getObjectlink(term);
-                            if (obj == null) try {vocabulary.put(term, "", root.toNormalform(true));} catch (IOException e) {} // this makes IO, be careful!
-                            // create annotation
-                            tag = vocabulary.getMetatagFromTerm(term);
-                            Set<Tagging.Metatag> tagset = new HashSet<>();
-                            tagset.add(tag);
-                            this.tags.put(navigatorName, tagset);
-                        }
-                    }
-                }
-                if (vocabularyNames.size() > 0) for (int wordc = 1; wordc <= wordcache.length + 1; wordc++) {
-                    // wordc is number of words that are tested
-                    StringBuilder sb = new StringBuilder();
-                    if (wordc == 1) {
-                        sb.append(word);
-                    } else {
-                        for (int w = 0; w < wordc - 1; w++) {
-                            sb.append(wordcache[wordcache.length - wordc + w + 1]).append(' ');
-                        }
-                        sb.append(word);
-                    }
-                    String testterm = sb.toString().trim();
-                    //System.out.println("Testing: " + testterm);
-                    tag = LibraryProvider.autotagging.getTagFromTerm(vocabularyNames, testterm);
-                    if (tag != null) {
-                        String navigatorName = tag.getVocabularyName();
-                        Set<Tagging.Metatag> tagset = this.tags.get(navigatorName);
-                        if (tagset == null) {
-                            tagset = new HashSet<Tagging.Metatag>();
-                            this.tags.put(navigatorName, tagset);
-                        }
-                        tagset.add(tag);
-                    }
-                }
+                extendVocabularies(root, scraper, vocabularyNames);
+                extractAutoTagsFromText(wordcache, word, vocabularyNames);
             }
             // shift wordcache
             System.arraycopy(wordcache, 1, wordcache, 0, wordcache.length - 1);
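After the two extracted calls, the loop ages the word window: the System.arraycopy shifts every cached word one slot to the left, dropping the oldest, so the last slot can take the word just processed (that assignment sits outside the hunk shown). A tiny runnable illustration, with made-up values:

    public class WordcacheShiftSketch {
        public static void main(String[] args) {
            String[] wordcache = {"one", "two", "three"};
            // shift left by one: the oldest word "one" falls out
            System.arraycopy(wordcache, 1, wordcache, 0, wordcache.length - 1);
            // wordcache is now {"two", "three", "three"}; the tokenizer loop
            // presumably refills the last slot with the current word
            wordcache[wordcache.length - 1] = "four";
            System.out.println(String.join(" ", wordcache)); // prints: two three four
        }
    }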
@@ -215,6 +174,89 @@ public class Tokenizer {
         this.RESULT_NUMB_SENTENCES = allsentencecounter + (wordInSentenceCounter > 1 ? 1 : 0);
     }
 
+    /**
+     * Check whether a single word or a combination of words matches tags from
+     * the given autotagging vocabularies, and fill this instance's "tags" map
+     * with any matching tags found.
+     *
+     * @param wordcache
+     *            the previously seen words, checked for a tag match alone or
+     *            in combination
+     * @param word
+     *            an additional word to be considered for tag matching
+     * @param vocabularyNames
+     *            names of the autotagging vocabularies to check
+     */
+    protected void extractAutoTagsFromText(final String[] wordcache, final String word, final Set<String> vocabularyNames) {
+        Tagging.Metatag tag;
+        if (vocabularyNames.size() > 0) {
+            for (int wordc = 1; wordc <= wordcache.length + 1; wordc++) {
+                // wordc is the number of words that are tested
+                StringBuilder sb = new StringBuilder();
+                if (wordc == 1) {
+                    sb.append(word);
+                } else {
+                    for (int w = 0; w < wordc - 1; w++) {
+                        sb.append(wordcache[wordcache.length - wordc + w + 1]).append(' ');
+                    }
+                    sb.append(word);
+                }
+                String testterm = sb.toString().trim();
+                tag = LibraryProvider.autotagging.getTagFromTerm(vocabularyNames, testterm);
+                if (tag != null) {
+                    String navigatorName = tag.getVocabularyName();
+                    Set<Tagging.Metatag> tagset = this.tags.get(navigatorName);
+                    if (tagset == null) {
+                        tagset = new HashSet<Tagging.Metatag>();
+                        this.tags.put(navigatorName, tagset);
+                    }
+                    tagset.add(tag);
+                }
+            }
+        }
+    }
+
+    /**
+     * Extend the given vocabularies with terms possibly found by the
+     * vocabulary scraper for these vocabularies. The scraper entry is removed
+     * after processing, and the names of the extended vocabularies are removed
+     * from vocabularyNames.
+     *
+     * @param root
+     *            the document URL
+     * @param scraper
+     *            the vocabulary scraper, possibly containing new terms scraped
+     *            for the registered vocabularies
+     * @param vocabularyNames
+     *            names of the vocabularies to be extended
+     */
+    protected void extendVocabularies(final DigestURL root, final VocabularyScraper scraper,
+            final Set<String> vocabularyNames) {
+        Tagging.Metatag tag;
+        Map<String, String> vocMap = scraper == null ? null : scraper.removeVocMap(root);
+        if (vocMap != null && vocMap.size() > 0) {
+            for (Map.Entry<String, String> entry : vocMap.entrySet()) {
+                String navigatorName = entry.getKey();
+                String term = entry.getValue();
+                vocabularyNames.remove(navigatorName); // prevent that this is used again for auto-annotation
+                Tagging vocabulary = LibraryProvider.autotagging.getVocabulary(navigatorName);
+                if (vocabulary != null) {
+                    // extend the vocabulary
+                    String obj = vocabulary.getObjectlink(term);
+                    if (obj == null) {
+                        try {
+                            vocabulary.put(term, "", root.toNormalform(true));
+                        } catch (IOException e) {} // this makes IO, be careful!
+                    }
+                    // create annotation
+                    tag = vocabulary.getMetatagFromTerm(term);
+                    Set<Tagging.Metatag> tagset = new HashSet<>();
+                    tagset.add(tag);
+                    this.tags.put(navigatorName, tagset);
+                }
+            }
+        }
+    }
+
     /**
      * @return returns the words as word/indexWord relation map. All words are lowercase.
      */
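Taken together, extendVocabularies() first consumes terms delivered by the VocabularyScraper, then extractAutoTagsFromText() slides a window of up to getMaxWordsInTerm() words over the text. The window arithmetic is easiest to see in isolation; below is a minimal, self-contained sketch of just that composition step (the word values are made up, and the vocabulary lookup is reduced to a comment):

    public class WordWindowSketch {
        public static void main(String[] args) {
            // wordcache holds the previously seen words, oldest first;
            // its length is getMaxWordsInTerm() - 1 in the real Tokenizer
            String[] wordcache = {"federated", "search"};
            String word = "engine"; // the word currently being tokenized

            // wordc is the number of words in the candidate term
            for (int wordc = 1; wordc <= wordcache.length + 1; wordc++) {
                StringBuilder sb = new StringBuilder();
                if (wordc > 1) {
                    // take the last (wordc - 1) cached words, in order
                    for (int w = 0; w < wordc - 1; w++) {
                        sb.append(wordcache[wordcache.length - wordc + w + 1]).append(' ');
                    }
                }
                sb.append(word);
                String testterm = sb.toString().trim();
                // the real code now calls
                // LibraryProvider.autotagging.getTagFromTerm(vocabularyNames, testterm)
                System.out.println(testterm);
            }
            // prints: engine / search engine / federated search engine
        }
    }

Each candidate term is looked up against all remaining vocabulary names, so a multi-word vocabulary term like "federated search engine" matches as soon as its last word passes through the tokenizer.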

@@ -1,7 +1,6 @@
 package net.yacy.document;
 
-import java.net.MalformedURLException;
 import java.util.HashSet;
 import java.util.Map;
 import java.util.Set;
@@ -17,7 +16,7 @@ public class TokenizerTest {
      * Test of words method, of class Tokenizer.
      */
     @Test
-    public void testWords() throws MalformedURLException {
+    public void testWords() {
         // pos = 1 2 3 4 5 6 7 8 9 10 // 1-letter words don't count
         String text = "One word is not a sentence because words are just words.";
         WordCache meaningLib = new WordCache(null);
@@ -43,7 +42,7 @@ public class TokenizerTest {
      */
     @Test
     public void testNumberOfSentences() {
-        Set<String> testText = new HashSet();
+        Set<String> testText = new HashSet<>();
         // text with 5 sentences
         testText.add("Sentence One. Sentence Two. Comment on this. This is sentence four! Good By................");
         testText.add("Sentence One. Sentence two. Sentence 3? Sentence 4! Sentence w/o punktuation at end of text");
