Refactoring: documented and extracted autotagging processing functions.

Branch: pull/167/head
Author: luccioman (7 years ago)
Parent: 58b9834729
Commit: 5a14d34a7d
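In short: the inline autotagging block in the Tokenizer constructor is split into two documented protected methods, extendVocabularies() and extractAutoTagsFromText(), with no intended behavior change, and TokenizerTest is cleaned up to match.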

@@ -68,10 +68,11 @@ public class Tokenizer {
         this.words = new TreeMap<String, Word>(NaturalOrder.naturalComparator);
         this.synonyms = new LinkedHashSet<String>();
         assert text != null;
-        String[] wordcache = new String[LibraryProvider.autotagging.getMaxWordsInTerm() - 1];
-        for (int i = 0; i < wordcache.length; i++) wordcache[i] = "";
+        final String[] wordcache = new String[LibraryProvider.autotagging.getMaxWordsInTerm() - 1];
+        for (int i = 0; i < wordcache.length; i++) {
+            wordcache[i] = "";
+        }
         String k;
-        Tagging.Metatag tag;
         int wordlen;
         int allwordcounter = 0;
         int allsentencecounter = 0;
@@ -98,51 +99,9 @@ public class Tokenizer {
             // get tags from autotagging
             if (doAutotagging) {
                 Set<String> vocabularyNames = LibraryProvider.autotagging.getVocabularyNames();
-                //Collection<Tagging> vocabularies = LibraryProvider.autotagging.getVocabularies();
-                //assert vocabularyNames.size() == vocabularies.size();
-                Map<String, String> vocMap = scraper == null ? null : scraper.removeVocMap(root);
-                if (vocMap != null && vocMap.size() > 0) {
-                    for (Map.Entry<String, String> entry: vocMap.entrySet()) {
-                        String navigatorName = entry.getKey();
-                        String term = entry.getValue();
-                        vocabularyNames.remove(navigatorName); // prevent that this is used again for auto-annotation
-                        Tagging vocabulary = LibraryProvider.autotagging.getVocabulary(navigatorName);
-                        if (vocabulary != null) {
-                            // extend the vocabulary
-                            String obj = vocabulary.getObjectlink(term);
-                            if (obj == null) try {vocabulary.put(term, "", root.toNormalform(true));} catch (IOException e) {} // this makes IO, be careful!
-                            // create annotation
-                            tag = vocabulary.getMetatagFromTerm(term);
-                            Set<Tagging.Metatag> tagset = new HashSet<>();
-                            tagset.add(tag);
-                            this.tags.put(navigatorName, tagset);
-                        }
-                    }
-                }
-                if (vocabularyNames.size() > 0) for (int wordc = 1; wordc <= wordcache.length + 1; wordc++) {
-                    // wordc is number of words that are tested
-                    StringBuilder sb = new StringBuilder();
-                    if (wordc == 1) {
-                        sb.append(word);
-                    } else {
-                        for (int w = 0; w < wordc - 1; w++) {
-                            sb.append(wordcache[wordcache.length - wordc + w + 1]).append(' ');
-                        }
-                        sb.append(word);
-                    }
-                    String testterm = sb.toString().trim();
-                    //System.out.println("Testing: " + testterm);
-                    tag = LibraryProvider.autotagging.getTagFromTerm(vocabularyNames, testterm);
-                    if (tag != null) {
-                        String navigatorName = tag.getVocabularyName();
-                        Set<Tagging.Metatag> tagset = this.tags.get(navigatorName);
-                        if (tagset == null) {
-                            tagset = new HashSet<Tagging.Metatag>();
-                            this.tags.put(navigatorName, tagset);
-                        }
-                        tagset.add(tag);
-                    }
-                }
+                extendVocabularies(root, scraper, vocabularyNames);
+                extractAutoTagsFromText(wordcache, word, vocabularyNames);
             }
             // shift wordcache
             System.arraycopy(wordcache, 1, wordcache, 0, wordcache.length - 1);
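After the two extracted calls, the loop ages the word window: the System.arraycopy shifts every cached word one slot to the left, dropping the oldest, so the last slot can take the word just processed (that assignment sits outside the hunk shown). A tiny runnable illustration, with made-up values:

    public class WordcacheShiftSketch {
        public static void main(String[] args) {
            String[] wordcache = {"one", "two", "three"};
            // shift left by one: the oldest word "one" falls out
            System.arraycopy(wordcache, 1, wordcache, 0, wordcache.length - 1);
            // wordcache is now {"two", "three", "three"}; the tokenizer loop
            // presumably refills the last slot with the current word
            wordcache[wordcache.length - 1] = "four";
            System.out.println(String.join(" ", wordcache)); // prints: two three four
        }
    }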
@@ -215,6 +174,89 @@ public class Tokenizer {
         this.RESULT_NUMB_SENTENCES = allsentencecounter + (wordInSentenceCounter > 1 ? 1 : 0);
     }
 
+    /**
+     * Check whether a single word or a combination of words matches tags from
+     * the given autotagging vocabularies, and fill this instance's "tags" map
+     * with any matching tags found.
+     *
+     * @param wordcache
+     *            the previously seen words, checked for a tag match alone or
+     *            in combination
+     * @param word
+     *            an additional word to be considered for tag matching
+     * @param vocabularyNames
+     *            names of the autotagging vocabularies to check
+     */
+    protected void extractAutoTagsFromText(final String[] wordcache, final String word, final Set<String> vocabularyNames) {
+        Tagging.Metatag tag;
+        if (vocabularyNames.size() > 0) {
+            for (int wordc = 1; wordc <= wordcache.length + 1; wordc++) {
+                // wordc is the number of words that are tested
+                StringBuilder sb = new StringBuilder();
+                if (wordc == 1) {
+                    sb.append(word);
+                } else {
+                    for (int w = 0; w < wordc - 1; w++) {
+                        sb.append(wordcache[wordcache.length - wordc + w + 1]).append(' ');
+                    }
+                    sb.append(word);
+                }
+                String testterm = sb.toString().trim();
+                tag = LibraryProvider.autotagging.getTagFromTerm(vocabularyNames, testterm);
+                if (tag != null) {
+                    String navigatorName = tag.getVocabularyName();
+                    Set<Tagging.Metatag> tagset = this.tags.get(navigatorName);
+                    if (tagset == null) {
+                        tagset = new HashSet<Tagging.Metatag>();
+                        this.tags.put(navigatorName, tagset);
+                    }
+                    tagset.add(tag);
+                }
+            }
+        }
+    }
+
+    /**
+     * Extend the given vocabularies with terms possibly found by the
+     * vocabulary scraper for these vocabularies. The scraper entry is removed
+     * after processing, and the names of the extended vocabularies are removed
+     * from vocabularyNames.
+     *
+     * @param root
+     *            the document URL
+     * @param scraper
+     *            the vocabulary scraper, possibly containing new terms scraped
+     *            for the registered vocabularies
+     * @param vocabularyNames
+     *            names of the vocabularies to be extended
+     */
+    protected void extendVocabularies(final DigestURL root, final VocabularyScraper scraper,
+            final Set<String> vocabularyNames) {
+        Tagging.Metatag tag;
+        Map<String, String> vocMap = scraper == null ? null : scraper.removeVocMap(root);
+        if (vocMap != null && vocMap.size() > 0) {
+            for (Map.Entry<String, String> entry : vocMap.entrySet()) {
+                String navigatorName = entry.getKey();
+                String term = entry.getValue();
+                vocabularyNames.remove(navigatorName); // prevent that this is used again for auto-annotation
+                Tagging vocabulary = LibraryProvider.autotagging.getVocabulary(navigatorName);
+                if (vocabulary != null) {
+                    // extend the vocabulary
+                    String obj = vocabulary.getObjectlink(term);
+                    if (obj == null) {
+                        try {
+                            vocabulary.put(term, "", root.toNormalform(true));
+                        } catch (IOException e) {} // this makes IO, be careful!
+                    }
+                    // create annotation
+                    tag = vocabulary.getMetatagFromTerm(term);
+                    Set<Tagging.Metatag> tagset = new HashSet<>();
+                    tagset.add(tag);
+                    this.tags.put(navigatorName, tagset);
+                }
+            }
+        }
+    }
+
     /**
      * @return returns the words as word/indexWord relation map. All words are lowercase.
      */
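Taken together, extendVocabularies() first consumes terms delivered by the VocabularyScraper, then extractAutoTagsFromText() slides a window of up to getMaxWordsInTerm() words over the text. The window arithmetic is easiest to see in isolation; below is a minimal, self-contained sketch of just that composition step (the word values are made up, and the vocabulary lookup is reduced to a comment):

    public class WordWindowSketch {
        public static void main(String[] args) {
            // wordcache holds the previously seen words, oldest first;
            // its length is getMaxWordsInTerm() - 1 in the real Tokenizer
            String[] wordcache = {"federated", "search"};
            String word = "engine"; // the word currently being tokenized

            // wordc is the number of words in the candidate term
            for (int wordc = 1; wordc <= wordcache.length + 1; wordc++) {
                StringBuilder sb = new StringBuilder();
                if (wordc > 1) {
                    // take the last (wordc - 1) cached words, in order
                    for (int w = 0; w < wordc - 1; w++) {
                        sb.append(wordcache[wordcache.length - wordc + w + 1]).append(' ');
                    }
                }
                sb.append(word);
                String testterm = sb.toString().trim();
                // the real code now calls
                // LibraryProvider.autotagging.getTagFromTerm(vocabularyNames, testterm)
                System.out.println(testterm);
            }
            // prints: engine / search engine / federated search engine
        }
    }

Each candidate term is looked up against all remaining vocabulary names, so a multi-word vocabulary term like "federated search engine" matches as soon as its last word passes through the tokenizer.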

@@ -1,7 +1,6 @@
 package net.yacy.document;
 
-import java.net.MalformedURLException;
 import java.util.HashSet;
 import java.util.Map;
 import java.util.Set;
@@ -17,7 +16,7 @@ public class TokenizerTest {
      * Test of words method, of class Tokenizer.
      */
     @Test
-    public void testWords() throws MalformedURLException {
+    public void testWords() {
         // pos = 1 2 3 4 5 6 7 8 9 10 // 1-letter words don't count
         String text = "One word is not a sentence because words are just words.";
         WordCache meaningLib = new WordCache(null);
@@ -43,7 +42,7 @@ public class TokenizerTest {
      */
     @Test
     public void testNumberOfSentences() {
-        Set<String> testText = new HashSet();
+        Set<String> testText = new HashSet<>();
         // text with 5 sentences
         testText.add("Sentence One. Sentence Two. Comment on this. This is sentence four! Good By................");
         testText.add("Sentence One. Sentence two. Sentence 3? Sentence 4! Sentence w/o punktuation at end of text");
